1/* Run a stand-alone AMD GCN kernel.
2
3   Copyright 2017 Mentor Graphics Corporation
4   Copyright (C) 2018-2020 Free Software Foundation, Inc.
5
6   This program is free software: you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation, either version 3 of the License, or
9   (at your option) any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
18
19/* This program will run a compiled stand-alone GCN kernel on a GPU.
20
21   The kernel entry point's signature must use a standard main signature:
22
23     int main(int argc, char **argv)
24*/
25
26#include <stdint.h>
27#include <stdbool.h>
28#include <stdlib.h>
29#include <malloc.h>
30#include <stdio.h>
31#include <string.h>
32#include <dlfcn.h>
33#include <unistd.h>
34#include <elf.h>
35#include <signal.h>
36
37/* These probably won't be in elf.h for a while.  */
38#ifndef R_AMDGPU_NONE
39#define R_AMDGPU_NONE		0
40#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
41#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
42#define R_AMDGPU_ABS64		3	/* S + A  */
43#define R_AMDGPU_REL32		4	/* S + A - P  */
44#define R_AMDGPU_REL64		5	/* S + A - P  */
45#define R_AMDGPU_ABS32		6	/* S + A  */
46#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
47#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
48#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
49#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
50#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
51#define reserved		12
52#define R_AMDGPU_RELATIVE64	13	/* B + A  */
53#endif
54
55#include "hsa.h"
56
57#ifndef HSA_RUNTIME_LIB
58#define HSA_RUNTIME_LIB "libhsa-runtime64.so"
59#endif
60
61#ifndef VERSION_STRING
62#define VERSION_STRING "(version unknown)"
63#endif
64
65bool debug = false;
66
67hsa_agent_t device = { 0 };
68hsa_queue_t *queue = NULL;
69uint64_t init_array_kernel = 0;
70uint64_t fini_array_kernel = 0;
71uint64_t main_kernel = 0;
72hsa_executable_t executable = { 0 };
73
74hsa_region_t kernargs_region = { 0 };
75hsa_region_t heap_region = { 0 };
76uint32_t kernarg_segment_size = 0;
77uint32_t group_segment_size = 0;
78uint32_t private_segment_size = 0;
79
80static void
81usage (const char *progname)
82{
83  printf ("Usage: %s [options] kernel [kernel-args]\n\n"
84	  "Options:\n"
85	  "  --help\n"
86	  "  --version\n"
87	  "  --debug\n", progname);
88}
89
90static void
91version (const char *progname)
92{
93  printf ("%s " VERSION_STRING "\n", progname);
94}
95
96/* As an HSA runtime is dlopened, following structure defines the necessary
97   function pointers.
98   Code adapted from libgomp.  */
99
100struct hsa_runtime_fn_info
101{
102  /* HSA runtime.  */
103  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
104					const char **status_string);
105  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
106					 hsa_agent_info_t attribute,
107					 void *value);
108  hsa_status_t (*hsa_init_fn) (void);
109  hsa_status_t (*hsa_iterate_agents_fn)
110    (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
111  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
112					  hsa_region_info_t attribute,
113					  void *value);
114  hsa_status_t (*hsa_queue_create_fn)
115    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
116     void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
117     void *data, uint32_t private_segment_size,
118     uint32_t group_segment_size, hsa_queue_t **queue);
119  hsa_status_t (*hsa_agent_iterate_regions_fn)
120    (hsa_agent_t agent,
121     hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
122  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
123  hsa_status_t (*hsa_executable_create_fn)
124    (hsa_profile_t profile, hsa_executable_state_t executable_state,
125     const char *options, hsa_executable_t *executable);
126  hsa_status_t (*hsa_executable_global_variable_define_fn)
127    (hsa_executable_t executable, const char *variable_name, void *address);
128  hsa_status_t (*hsa_executable_load_code_object_fn)
129    (hsa_executable_t executable, hsa_agent_t agent,
130     hsa_code_object_t code_object, const char *options);
131  hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
132					    const char *options);
133  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
134					uint32_t num_consumers,
135					const hsa_agent_t *consumers,
136					hsa_signal_t *signal);
137  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
138					  void **ptr);
139  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
140					      hsa_access_permission_t access);
141  hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src,
142				      size_t size);
143  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
144  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
145  hsa_status_t (*hsa_executable_get_symbol_fn)
146    (hsa_executable_t executable, const char *module_name,
147     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
148     hsa_executable_symbol_t *symbol);
149  hsa_status_t (*hsa_executable_symbol_get_info_fn)
150    (hsa_executable_symbol_t executable_symbol,
151     hsa_executable_symbol_info_t attribute, void *value);
152  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
153				       hsa_signal_value_t value);
154  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
155    (hsa_signal_t signal, hsa_signal_condition_t condition,
156     hsa_signal_value_t compare_value, uint64_t timeout_hint,
157     hsa_wait_state_t wait_state_hint);
158  hsa_signal_value_t (*hsa_signal_wait_relaxed_fn)
159    (hsa_signal_t signal, hsa_signal_condition_t condition,
160     hsa_signal_value_t compare_value, uint64_t timeout_hint,
161     hsa_wait_state_t wait_state_hint);
162  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
163  hsa_status_t (*hsa_code_object_deserialize_fn)
164    (void *serialized_code_object, size_t serialized_code_object_size,
165     const char *options, hsa_code_object_t *code_object);
166  uint64_t (*hsa_queue_load_write_index_relaxed_fn)
167    (const hsa_queue_t *queue);
168  void (*hsa_queue_store_write_index_relaxed_fn)
169    (const hsa_queue_t *queue, uint64_t value);
170  hsa_status_t (*hsa_shut_down_fn) ();
171};
172
173/* HSA runtime functions that are initialized in init_hsa_context.
174   Code adapted from libgomp.  */
175
176static struct hsa_runtime_fn_info hsa_fns;
177
178#define DLSYM_FN(function)					 \
179  *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \
180  if (hsa_fns.function##_fn == NULL)				 \
181    goto fail;
182
183static void
184init_hsa_runtime_functions (void)
185{
186  void *handle = dlopen (HSA_RUNTIME_LIB, RTLD_LAZY);
187  if (handle == NULL)
188    {
189      fprintf (stderr,
190	       "The HSA runtime is required to run GCN kernels on hardware.\n"
191	       "%s: File not found or could not be opened\n",
192	       HSA_RUNTIME_LIB);
193      exit (1);
194    }
195
196  DLSYM_FN (hsa_status_string)
197  DLSYM_FN (hsa_agent_get_info)
198  DLSYM_FN (hsa_init)
199  DLSYM_FN (hsa_iterate_agents)
200  DLSYM_FN (hsa_region_get_info)
201  DLSYM_FN (hsa_queue_create)
202  DLSYM_FN (hsa_agent_iterate_regions)
203  DLSYM_FN (hsa_executable_destroy)
204  DLSYM_FN (hsa_executable_create)
205  DLSYM_FN (hsa_executable_global_variable_define)
206  DLSYM_FN (hsa_executable_load_code_object)
207  DLSYM_FN (hsa_executable_freeze)
208  DLSYM_FN (hsa_signal_create)
209  DLSYM_FN (hsa_memory_allocate)
210  DLSYM_FN (hsa_memory_assign_agent)
211  DLSYM_FN (hsa_memory_copy)
212  DLSYM_FN (hsa_memory_free)
213  DLSYM_FN (hsa_signal_destroy)
214  DLSYM_FN (hsa_executable_get_symbol)
215  DLSYM_FN (hsa_executable_symbol_get_info)
216  DLSYM_FN (hsa_signal_wait_acquire)
217  DLSYM_FN (hsa_signal_wait_relaxed)
218  DLSYM_FN (hsa_signal_store_relaxed)
219  DLSYM_FN (hsa_queue_destroy)
220  DLSYM_FN (hsa_code_object_deserialize)
221  DLSYM_FN (hsa_queue_load_write_index_relaxed)
222  DLSYM_FN (hsa_queue_store_write_index_relaxed)
223  DLSYM_FN (hsa_shut_down)
224
225  return;
226
227fail:
228  fprintf (stderr, "Failed to find HSA functions in " HSA_RUNTIME_LIB "\n");
229  exit (1);
230}
231
232#undef DLSYM_FN
233
234/* Report a fatal error STR together with the HSA error corresponding to
235   STATUS and terminate execution of the current process.  */
236
237static void
238hsa_fatal (const char *str, hsa_status_t status)
239{
240  const char *hsa_error_msg;
241  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
242  fprintf (stderr, "%s: FAILED\nHSA Runtime message: %s\n", str,
243	   hsa_error_msg);
244  exit (1);
245}
246
247/* Helper macros to ensure we check the return values from the HSA Runtime.
248   These just keep the rest of the code a bit cleaner.  */
249
250#define XHSA_CMP(FN, CMP, MSG)		   \
251  do {					   \
252    hsa_status_t status = (FN);		   \
253    if (!(CMP))				   \
254      hsa_fatal ((MSG), status);	   \
255    else if (debug)			   \
256      fprintf (stderr, "%s: OK\n", (MSG)); \
257  } while (0)
258#define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG)
259
260/* Callback of hsa_iterate_agents.
261   Called once for each available device, and returns "break" when a
262   suitable one has been found.  */
263
264static hsa_status_t
265get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused)))
266{
267  hsa_device_type_t device_type;
268  XHSA (hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
269				       &device_type),
270	"Get agent type");
271
272  /* Select only GPU devices.  */
273  /* TODO: support selecting from multiple GPUs.  */
274  if (HSA_DEVICE_TYPE_GPU == device_type)
275    {
276      device = agent;
277      return HSA_STATUS_INFO_BREAK;
278    }
279
280  /* The device was not suitable.  */
281  return HSA_STATUS_SUCCESS;
282}
283
284/* Callback of hsa_iterate_regions.
285   Called once for each available memory region, and returns "break" when a
286   suitable one has been found.  */
287
288static hsa_status_t
289get_memory_region (hsa_region_t region, hsa_region_t *retval,
290		   hsa_region_global_flag_t kind)
291{
292  /* Reject non-global regions.  */
293  hsa_region_segment_t segment;
294  hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &segment);
295  if (HSA_REGION_SEGMENT_GLOBAL != segment)
296    return HSA_STATUS_SUCCESS;
297
298  /* Find a region with the KERNARG flag set.  */
299  hsa_region_global_flag_t flags;
300  hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
301				  &flags);
302  if (flags & kind)
303    {
304      *retval = region;
305      return HSA_STATUS_INFO_BREAK;
306    }
307
308  /* The region was not suitable.  */
309  return HSA_STATUS_SUCCESS;
310}
311
312static hsa_status_t
313get_kernarg_region (hsa_region_t region, void *data __attribute__((unused)))
314{
315  return get_memory_region (region, &kernargs_region,
316			    HSA_REGION_GLOBAL_FLAG_KERNARG);
317}
318
319static hsa_status_t
320get_heap_region (hsa_region_t region, void *data __attribute__((unused)))
321{
322  return get_memory_region (region, &heap_region,
323			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
324}
325
326/* Initialize the HSA Runtime library and GPU device.  */
327
328static void
329init_device ()
330{
331  /* Load the shared library and find the API functions.  */
332  init_hsa_runtime_functions ();
333
334  /* Initialize the HSA Runtime.  */
335  XHSA (hsa_fns.hsa_init_fn (),
336	"Initialize run-time");
337
338  /* Select a suitable device.
339     The call-back function, get_gpu_agent, does the selection.  */
340  XHSA_CMP (hsa_fns.hsa_iterate_agents_fn (get_gpu_agent, NULL),
341	    status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
342	    "Find a device");
343
344  /* Initialize the queue used for launching kernels.  */
345  uint32_t queue_size = 0;
346  XHSA (hsa_fns.hsa_agent_get_info_fn (device, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
347				       &queue_size),
348	"Find max queue size");
349  XHSA (hsa_fns.hsa_queue_create_fn (device, queue_size,
350				     HSA_QUEUE_TYPE_SINGLE, NULL,
351				     NULL, UINT32_MAX, UINT32_MAX, &queue),
352	"Set up a device queue");
353
354  /* Select a memory region for the kernel arguments.
355     The call-back function, get_kernarg_region, does the selection.  */
356  XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_kernarg_region,
357						  NULL),
358	    status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
359	    "Locate kernargs memory");
360
361  /* Select a memory region for the kernel heap.
362     The call-back function, get_heap_region, does the selection.  */
363  XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_heap_region,
364						  NULL),
365	    status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
366	    "Locate device memory");
367}
368
369
370/* Read a whole input file.
371   Code copied from mkoffload. */
372
373static char *
374read_file (const char *filename, size_t *plen)
375{
376  size_t alloc = 16384;
377  size_t base = 0;
378  char *buffer;
379
380  FILE *stream = fopen (filename, "rb");
381  if (!stream)
382    {
383      perror (filename);
384      exit (1);
385    }
386
387  if (!fseek (stream, 0, SEEK_END))
388    {
389      /* Get the file size.  */
390      long s = ftell (stream);
391      if (s >= 0)
392	alloc = s + 100;
393      fseek (stream, 0, SEEK_SET);
394    }
395  buffer = malloc (alloc);
396
397  for (;;)
398    {
399      size_t n = fread (buffer + base, 1, alloc - base - 1, stream);
400
401      if (!n)
402	break;
403      base += n;
404      if (base + 1 == alloc)
405	{
406	  alloc *= 2;
407	  buffer = realloc (buffer, alloc);
408	}
409    }
410  buffer[base] = 0;
411  *plen = base;
412
413  fclose (stream);
414
415  return buffer;
416}
417
418/* Read a HSA Code Object (HSACO) from file, and load it into the device.  */
419
420static void
421load_image (const char *filename)
422{
423  size_t image_size;
424  Elf64_Ehdr *image = (void *) read_file (filename, &image_size);
425
426  /* An "executable" consists of one or more code objects.  */
427  XHSA (hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
428					  HSA_EXECUTABLE_STATE_UNFROZEN, "",
429					  &executable),
430	"Initialize GCN executable");
431
432  /* Hide relocations from the HSA runtime loader.
433     Keep a copy of the unmodified section headers to use later.  */
434  Elf64_Shdr *image_sections =
435    (Elf64_Shdr *) ((char *) image + image->e_shoff);
436  Elf64_Shdr *sections = malloc (sizeof (Elf64_Shdr) * image->e_shnum);
437  memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum);
438  for (int i = image->e_shnum - 1; i >= 0; i--)
439    {
440      if (image_sections[i].sh_type == SHT_RELA
441	  || image_sections[i].sh_type == SHT_REL)
442	/* Change section type to something harmless.  */
443	image_sections[i].sh_type = SHT_NOTE;
444    }
445
446  /* Add the HSACO to the executable.  */
447  hsa_code_object_t co = { 0 };
448  XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co),
449	"Deserialize GCN code object");
450  XHSA (hsa_fns.hsa_executable_load_code_object_fn (executable, device, co,
451						    ""),
452	"Load GCN code object");
453
454  /* We're done modifying he executable.  */
455  XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""),
456	"Freeze GCN executable");
457
458  /* Locate the "_init_array" function, and read the kernel's properties.  */
459  hsa_executable_symbol_t symbol;
460  XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_init_array",
461					      device, 0, &symbol),
462	"Find '_init_array' function");
463  XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
464	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &init_array_kernel),
465	"Extract '_init_array' kernel object kernel object");
466
467  /* Locate the "_fini_array" function, and read the kernel's properties.  */
468  XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_fini_array",
469					      device, 0, &symbol),
470	"Find '_fini_array' function");
471  XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
472	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &fini_array_kernel),
473	"Extract '_fini_array' kernel object kernel object");
474
475  /* Locate the "main" function, and read the kernel's properties.  */
476  XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main",
477					      device, 0, &symbol),
478	"Find 'main' function");
479  XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
480	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &main_kernel),
481	"Extract 'main' kernel object");
482  XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
483	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
484	     &kernarg_segment_size),
485	"Extract kernarg segment size");
486  XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
487	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
488	     &group_segment_size),
489	"Extract group segment size");
490  XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
491	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
492	     &private_segment_size),
493	"Extract private segment size");
494
495  /* Find main function in ELF, and calculate actual load offset.  */
496  Elf64_Addr load_offset;
497  XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
498	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
499	     &load_offset),
500	"Extract 'main' symbol address");
501  for (int i = 0; i < image->e_shnum; i++)
502    if (sections[i].sh_type == SHT_SYMTAB)
503      {
504	Elf64_Shdr *strtab = &sections[sections[i].sh_link];
505	char *strings = (char *) image + strtab->sh_offset;
506
507	for (size_t offset = 0;
508	     offset < sections[i].sh_size;
509	     offset += sections[i].sh_entsize)
510	  {
511	    Elf64_Sym *sym = (Elf64_Sym *) ((char *) image
512					    + sections[i].sh_offset + offset);
513	    if (strcmp ("main", strings + sym->st_name) == 0)
514	      {
515		load_offset -= sym->st_value;
516		goto found_main;
517	      }
518	  }
519      }
520  /* We only get here when main was not found.
521     This should never happen.  */
522  fprintf (stderr, "Error: main function not found.\n");
523  abort ();
524found_main:;
525
526  /* Find dynamic symbol table.  */
527  Elf64_Shdr *dynsym = NULL;
528  for (int i = 0; i < image->e_shnum; i++)
529    if (sections[i].sh_type == SHT_DYNSYM)
530      {
531	dynsym = &sections[i];
532	break;
533      }
534
535  /* Fix up relocations.  */
536  for (int i = 0; i < image->e_shnum; i++)
537    {
538      if (sections[i].sh_type == SHT_RELA)
539	for (size_t offset = 0;
540	     offset < sections[i].sh_size;
541	     offset += sections[i].sh_entsize)
542	  {
543	    Elf64_Rela *reloc = (Elf64_Rela *) ((char *) image
544						+ sections[i].sh_offset
545						+ offset);
546	    Elf64_Sym *sym =
547	      (dynsym
548	       ? (Elf64_Sym *) ((char *) image
549				+ dynsym->sh_offset
550				+ (dynsym->sh_entsize
551				   * ELF64_R_SYM (reloc->r_info))) : NULL);
552
553	    int64_t S = (sym ? sym->st_value : 0);
554	    int64_t P = reloc->r_offset + load_offset;
555	    int64_t A = reloc->r_addend;
556	    int64_t B = load_offset;
557	    int64_t V, size;
558	    switch (ELF64_R_TYPE (reloc->r_info))
559	      {
560	      case R_AMDGPU_ABS32_LO:
561		V = (S + A) & 0xFFFFFFFF;
562		size = 4;
563		break;
564	      case R_AMDGPU_ABS32_HI:
565		V = (S + A) >> 32;
566		size = 4;
567		break;
568	      case R_AMDGPU_ABS64:
569		V = S + A;
570		size = 8;
571		break;
572	      case R_AMDGPU_REL32:
573		V = S + A - P;
574		size = 4;
575		break;
576	      case R_AMDGPU_REL64:
577		/* FIXME
578		   LLD seems to emit REL64 where the assembler has ABS64.
579		   This is clearly wrong because it's not what the compiler
580		   is expecting.  Let's assume, for now, that it's a bug.
581		   In any case, GCN kernels are always self contained and
582		   therefore relative relocations will have been resolved
583		   already, so this should be a safe workaround.  */
584		V = S + A /* - P */ ;
585		size = 8;
586		break;
587	      case R_AMDGPU_ABS32:
588		V = S + A;
589		size = 4;
590		break;
591	      /* TODO R_AMDGPU_GOTPCREL */
592	      /* TODO R_AMDGPU_GOTPCREL32_LO */
593	      /* TODO R_AMDGPU_GOTPCREL32_HI */
594	      case R_AMDGPU_REL32_LO:
595		V = (S + A - P) & 0xFFFFFFFF;
596		size = 4;
597		break;
598	      case R_AMDGPU_REL32_HI:
599		V = (S + A - P) >> 32;
600		size = 4;
601		break;
602	      case R_AMDGPU_RELATIVE64:
603		V = B + A;
604		size = 8;
605		break;
606	      default:
607		fprintf (stderr, "Error: unsupported relocation type.\n");
608		exit (1);
609	      }
610	    XHSA (hsa_fns.hsa_memory_copy_fn ((void *) P, &V, size),
611		  "Fix up relocation");
612	  }
613    }
614}
615
616/* Allocate some device memory from the kernargs region.
617   The returned address will be 32-bit (with excess zeroed on 64-bit host),
618   and accessible via the same address on both host and target (via
619   __flat_scalar GCN address space).  */
620
621static void *
622device_malloc (size_t size, hsa_region_t region)
623{
624  void *result;
625  XHSA (hsa_fns.hsa_memory_allocate_fn (region, size, &result),
626	"Allocate device memory");
627  return result;
628}
629
630/* These are the device pointers that will be transferred to the target.
631   The HSA Runtime points the kernargs register here.
632   They correspond to function signature:
633       int main (int argc, char *argv[], int *return_value)
634   The compiler expects this, for kernel functions, and will
635   automatically assign the exit value to *return_value.  */
636struct kernargs
637{
638  /* Kernargs.  */
639  int32_t argc;
640  int64_t argv;
641  int64_t out_ptr;
642  int64_t heap_ptr;
643
644  /* Output data.  */
645  struct output
646  {
647    int return_value;
648    unsigned int next_output;
649    struct printf_data
650    {
651      int written;
652      char msg[128];
653      int type;
654      union
655      {
656	int64_t ivalue;
657	double dvalue;
658	char text[128];
659      };
660    } queue[1024];
661    unsigned int consumed;
662  } output_data;
663};
664
665struct heap
666{
667  int64_t size;
668  char data[0];
669} heap;
670
671/* Print any console output from the kernel.
672   We print all entries from "consumed" to the next entry without a "written"
673   flag, or "next_output" is reached.  The buffer is circular, but the
674   indices are absolute.  It is assumed the kernel will stop writing data
675   if "next_output" wraps (becomes smaller than "consumed").  */
676void
677gomp_print_output (struct kernargs *kernargs, bool final)
678{
679  unsigned int limit = (sizeof (kernargs->output_data.queue)
680			/ sizeof (kernargs->output_data.queue[0]));
681
682  unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
683				       __ATOMIC_ACQUIRE);
684  unsigned int to = kernargs->output_data.next_output;
685
686  if (from > to)
687    {
688      /* Overflow.  */
689      if (final)
690	printf ("GCN print buffer overflowed.\n");
691      return;
692    }
693
694  unsigned int i;
695  for (i = from; i < to; i++)
696    {
697      struct printf_data *data = &kernargs->output_data.queue[i%limit];
698
699      if (!data->written && !final)
700	break;
701
702      switch (data->type)
703	{
704	case 0:
705	  printf ("%.128s%ld\n", data->msg, data->ivalue);
706	  break;
707	case 1:
708	  printf ("%.128s%f\n", data->msg, data->dvalue);
709	  break;
710	case 2:
711	  printf ("%.128s%.128s\n", data->msg, data->text);
712	  break;
713	case 3:
714	  printf ("%.128s%.128s", data->msg, data->text);
715	  break;
716	default:
717	  printf ("GCN print buffer error!\n");
718	  break;
719	}
720
721      data->written = 0;
722      __atomic_store_n (&kernargs->output_data.consumed, i+1,
723			__ATOMIC_RELEASE);
724    }
725  fflush (stdout);
726}
727
728/* Execute an already-loaded kernel on the device.  */
729
730static void
731run (uint64_t kernel, void *kernargs)
732{
733  /* A "signal" is used to launch and monitor the kernel.  */
734  hsa_signal_t signal;
735  XHSA (hsa_fns.hsa_signal_create_fn (1, 0, NULL, &signal),
736	"Create signal");
737
738  /* Configure for a single-worker kernel.  */
739  uint64_t index = hsa_fns.hsa_queue_load_write_index_relaxed_fn (queue);
740  const uint32_t queueMask = queue->size - 1;
741  hsa_kernel_dispatch_packet_t *dispatch_packet =
742    &(((hsa_kernel_dispatch_packet_t *) (queue->base_address))[index &
743							       queueMask]);
744  dispatch_packet->setup |= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
745  dispatch_packet->workgroup_size_x = (uint16_t) 1;
746  dispatch_packet->workgroup_size_y = (uint16_t) 64;
747  dispatch_packet->workgroup_size_z = (uint16_t) 1;
748  dispatch_packet->grid_size_x = 1;
749  dispatch_packet->grid_size_y = 64;
750  dispatch_packet->grid_size_z = 1;
751  dispatch_packet->completion_signal = signal;
752  dispatch_packet->kernel_object = kernel;
753  dispatch_packet->kernarg_address = (void *) kernargs;
754  dispatch_packet->private_segment_size = private_segment_size;
755  dispatch_packet->group_segment_size = group_segment_size;
756
757  uint16_t header = 0;
758  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
759  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
760  header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
761
762  __atomic_store_n ((uint32_t *) dispatch_packet,
763		    header | (dispatch_packet->setup << 16),
764		    __ATOMIC_RELEASE);
765
766  if (debug)
767    fprintf (stderr, "Launch kernel\n");
768
769  hsa_fns.hsa_queue_store_write_index_relaxed_fn (queue, index + 1);
770  hsa_fns.hsa_signal_store_relaxed_fn (queue->doorbell_signal, index);
771  /* Kernel running ......  */
772  while (hsa_fns.hsa_signal_wait_relaxed_fn (signal, HSA_SIGNAL_CONDITION_LT,
773					     1, 1000000,
774					     HSA_WAIT_STATE_ACTIVE) != 0)
775    {
776      usleep (10000);
777      gomp_print_output (kernargs, false);
778    }
779
780  gomp_print_output (kernargs, true);
781
782  if (debug)
783    fprintf (stderr, "Kernel exited\n");
784
785  XHSA (hsa_fns.hsa_signal_destroy_fn (signal),
786	"Clean up signal");
787}
788
789int
790main (int argc, char *argv[])
791{
792  int kernel_arg = 0;
793  for (int i = 1; i < argc; i++)
794    {
795      if (!strcmp (argv[i], "--help"))
796	{
797	  usage (argv[0]);
798	  return 0;
799	}
800      else if (!strcmp (argv[i], "--version"))
801	{
802	  version (argv[0]);
803	  return 0;
804	}
805      else if (!strcmp (argv[i], "--debug"))
806	debug = true;
807      else if (argv[i][0] == '-')
808	{
809	  usage (argv[0]);
810	  return 1;
811	}
812      else
813	{
814	  kernel_arg = i;
815	  break;
816	}
817    }
818
819  if (!kernel_arg)
820    {
821      /* No kernel arguments were found.  */
822      usage (argv[0]);
823      return 1;
824    }
825
826  /* The remaining arguments are for the GCN kernel.  */
827  int kernel_argc = argc - kernel_arg;
828  char **kernel_argv = &argv[kernel_arg];
829
830  init_device ();
831  load_image (kernel_argv[0]);
832
833  /* Calculate size of function parameters + argv data.  */
834  size_t args_size = 0;
835  for (int i = 0; i < kernel_argc; i++)
836    args_size += strlen (kernel_argv[i]) + 1;
837
838  /* Allocate device memory for both function parameters and the argv
839     data.  */
840  struct kernargs *kernargs = device_malloc (sizeof (*kernargs),
841					     kernargs_region);
842  struct argdata
843  {
844    int64_t argv_data[kernel_argc];
845    char strings[args_size];
846  } *args = device_malloc (sizeof (struct argdata), kernargs_region);
847
848  size_t heap_size = 10 * 1024 * 1024;	/* 10MB.  */
849  struct heap *heap = device_malloc (heap_size, heap_region);
850  XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device,
851					    HSA_ACCESS_PERMISSION_RW),
852	"Assign heap to device agent");
853
854  /* Write the data to the target.  */
855  kernargs->argc = kernel_argc;
856  kernargs->argv = (int64_t) args->argv_data;
857  kernargs->out_ptr = (int64_t) &kernargs->output_data;
858  kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */
859  kernargs->output_data.next_output = 0;
860  for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue)
861			    / sizeof (kernargs->output_data.queue[0])); i++)
862    kernargs->output_data.queue[i].written = 0;
863  kernargs->output_data.consumed = 0;
864  int offset = 0;
865  for (int i = 0; i < kernel_argc; i++)
866    {
867      size_t arg_len = strlen (kernel_argv[i]) + 1;
868      args->argv_data[i] = (int64_t) &args->strings[offset];
869      memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1);
870      offset += arg_len;
871    }
872  kernargs->heap_ptr = (int64_t) heap;
873  hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size));
874
875  /* Run constructors on the GPU.  */
876  run (init_array_kernel, kernargs);
877
878  /* Run the kernel on the GPU.  */
879  run (main_kernel, kernargs);
880  unsigned int return_value =
881    (unsigned int) kernargs->output_data.return_value;
882
883  /* Run destructors on the GPU.  */
884  run (fini_array_kernel, kernargs);
885
886  unsigned int upper = (return_value & ~0xffff) >> 16;
887  if (upper == 0xcafe)
888    {
889      printf ("Kernel exit value was never set\n");
890      return_value = 0xff;
891    }
892  else if (upper == 0xffff)
893    ; /* Set by exit.  */
894  else if (upper == 0)
895    ; /* Set by return from main.  */
896  else
897    printf ("Possible kernel exit value corruption, 2 most significant bytes "
898	    "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value);
899
900  if (upper == 0xffff)
901    {
902      unsigned int signal = (return_value >> 8) & 0xff;
903      if (signal == SIGABRT)
904	printf ("Kernel aborted\n");
905      else if (signal != 0)
906	printf ("Kernel received unkown signal\n");
907    }
908
909  if (debug)
910    printf ("Kernel exit value: %d\n", return_value & 0xff);
911
912  /* Clean shut down.  */
913  XHSA (hsa_fns.hsa_memory_free_fn (kernargs),
914	"Clean up device memory");
915  XHSA (hsa_fns.hsa_executable_destroy_fn (executable),
916	"Clean up GCN executable");
917  XHSA (hsa_fns.hsa_queue_destroy_fn (queue),
918	"Clean up device queue");
919  XHSA (hsa_fns.hsa_shut_down_fn (),
920	"Shut down run-time");
921
922  return return_value & 0xff;
923}
924