1/* Copyright (C) 2016-2022 Free Software Foundation, Inc.
2
3   This file is free software; you can redistribute it and/or modify it under
4   the terms of the GNU General Public License as published by the Free
5   Software Foundation; either version 3 of the License, or (at your option)
6   any later version.
7
8   This file is distributed in the hope that it will be useful, but WITHOUT
9   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
11   for more details.
12
13   You should have received a copy of the GNU General Public License
14   along with GCC; see the file COPYING3.  If not see
15   <http://www.gnu.org/licenses/>.  */
16
17/* {{{ Includes.  */
18
19/* We want GET_MODE_SIZE et al to return integers, please.  */
20#define IN_TARGET_CODE 1
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "backend.h"
26#include "target.h"
27#include "memmodel.h"
28#include "rtl.h"
29#include "tree.h"
30#include "df.h"
31#include "tm_p.h"
32#include "stringpool.h"
33#include "optabs.h"
34#include "regs.h"
35#include "emit-rtl.h"
36#include "recog.h"
37#include "diagnostic-core.h"
38#include "insn-attr.h"
39#include "fold-const.h"
40#include "calls.h"
41#include "explow.h"
42#include "expr.h"
43#include "output.h"
44#include "cfgrtl.h"
45#include "langhooks.h"
46#include "builtins.h"
47#include "omp-general.h"
48#include "print-rtl.h"
49#include "attribs.h"
50#include "varasm.h"
51#include "intl.h"
52#include "rtl-iter.h"
53#include "dwarf2.h"
54#include "gimple.h"
55
56/* This file should be included last.  */
57#include "target-def.h"
58
59/* }}}  */
60/* {{{ Global variables.  */
61
62/* Constants used by FP instructions.  */
63
64static REAL_VALUE_TYPE dconst4, dconst1over2pi;
65static bool ext_gcn_constants_init = 0;
66
67/* Holds the ISA variant, derived from the command line parameters.  */
68
69int gcn_isa = 3;		/* Default to GCN3.  */
70
71/* Reserve this much space for LDS (for propagating variables from
72   worker-single mode to worker-partitioned mode), per workgroup.  Global
73   analysis could calculate an exact bound, but we don't do that yet.
74
75   We want to permit full occupancy, so size accordingly.  */
76
77/* Use this as a default, but allow it to grow if the user requests a large
78   amount of gang-private shared-memory space.  */
79static int acc_lds_size = 0x600;
80
81#define OMP_LDS_SIZE 0x600    /* 0x600 is 1/40 total, rounded down.  */
82#define ACC_LDS_SIZE acc_lds_size
83#define OTHER_LDS_SIZE 65536  /* If in doubt, reserve all of it.  */
84
85#define LDS_SIZE (flag_openacc ? ACC_LDS_SIZE \
86		  : flag_openmp ? OMP_LDS_SIZE \
87		  : OTHER_LDS_SIZE)
88
89static int gang_private_hwm = 32;
90static hash_map<tree, int> lds_allocs;
91
92/* The number of registers usable by normal non-kernel functions.
93   The SGPR count includes any special extra registers such as VCC.  */
94
95#define MAX_NORMAL_SGPR_COUNT	62  // i.e. 64 with VCC
96#define MAX_NORMAL_VGPR_COUNT	24
97
98/* }}}  */
99/* {{{ Initialization and options.  */
100
101/* Initialize machine_function.  */
102
103static struct machine_function *
104gcn_init_machine_status (void)
105{
106  struct machine_function *f;
107
108  f = ggc_cleared_alloc<machine_function> ();
109
110  if (TARGET_GCN3)
111    f->use_flat_addressing = true;
112
113  return f;
114}
115
116/* Implement TARGET_OPTION_OVERRIDE.
117
118   Override option settings where defaults are variable, or we have specific
119   needs to consider.  */
120
121static void
122gcn_option_override (void)
123{
124  init_machine_status = gcn_init_machine_status;
125
126  /* The HSA runtime does not respect ELF load addresses, so force PIE.  */
127  if (!flag_pie)
128    flag_pie = 2;
129  if (!flag_pic)
130    flag_pic = flag_pie;
131
132  gcn_isa = gcn_arch == PROCESSOR_FIJI ? 3 : 5;
133
134  /* The default stack size needs to be small for offload kernels because
135     there may be many, many threads.  Also, a smaller stack gives a
136     measureable performance boost.  But, a small stack is insufficient
137     for running the testsuite, so we use a larger default for the stand
138     alone case.  */
139  if (stack_size_opt == -1)
140    {
141      if (flag_openacc || flag_openmp)
142	/* 512 bytes per work item = 32kB total.  */
143	stack_size_opt = 512 * 64;
144      else
145	/* 1MB total.  */
146	stack_size_opt = 1048576;
147    }
148
149  /* Reserve 1Kb (somewhat arbitrarily) of LDS space for reduction results and
150     worker broadcasts.  */
151  if (gang_private_size_opt == -1)
152    gang_private_size_opt = 512;
153  else if (gang_private_size_opt < gang_private_hwm)
154    gang_private_size_opt = gang_private_hwm;
155  else if (gang_private_size_opt >= acc_lds_size - 1024)
156    {
157      /* We need some space for reductions and worker broadcasting.  If the
158	 user requests a large amount of gang-private LDS space, we might not
159	 have enough left for the former.  Increase the LDS allocation in that
160	 case, although this may reduce the maximum occupancy on the
161	 hardware.  */
162      acc_lds_size = gang_private_size_opt + 1024;
163      if (acc_lds_size > 32768)
164	acc_lds_size = 32768;
165    }
166
167  /* The xnack option is a placeholder, for now.  */
168  if (flag_xnack)
169    sorry ("XNACK support");
170}
171
172/* }}}  */
173/* {{{ Attributes.  */
174
175/* This table defines the arguments that are permitted in
176   __attribute__ ((amdgpu_hsa_kernel (...))).
177
178   The names and values correspond to the HSA metadata that is encoded
179   into the assembler file and binary.  */
180
181static const struct gcn_kernel_arg_type
182{
183  const char *name;
184  const char *header_pseudo;
185  machine_mode mode;
186
187  /* This should be set to -1 or -2 for a dynamically allocated register
188     number.  Use -1 if this argument contributes to the user_sgpr_count,
189     -2 otherwise.  */
190  int fixed_regno;
191} gcn_kernel_arg_types[] = {
192  {"exec", NULL, DImode, EXEC_REG},
193#define PRIVATE_SEGMENT_BUFFER_ARG 1
194  {"private_segment_buffer",
195    ".amdhsa_user_sgpr_private_segment_buffer", TImode, -1},
196#define DISPATCH_PTR_ARG 2
197  {"dispatch_ptr", ".amdhsa_user_sgpr_dispatch_ptr", DImode, -1},
198#define QUEUE_PTR_ARG 3
199  {"queue_ptr", ".amdhsa_user_sgpr_queue_ptr", DImode, -1},
200#define KERNARG_SEGMENT_PTR_ARG 4
201  {"kernarg_segment_ptr", ".amdhsa_user_sgpr_kernarg_segment_ptr", DImode, -1},
202  {"dispatch_id", ".amdhsa_user_sgpr_dispatch_id", DImode, -1},
203#define FLAT_SCRATCH_INIT_ARG 6
204  {"flat_scratch_init", ".amdhsa_user_sgpr_flat_scratch_init", DImode, -1},
205#define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
206  {"private_segment_size", ".amdhsa_user_sgpr_private_segment_size", SImode, -1},
207#define WORKGROUP_ID_X_ARG 8
208  {"workgroup_id_X", ".amdhsa_system_sgpr_workgroup_id_x", SImode, -2},
209  {"workgroup_id_Y", ".amdhsa_system_sgpr_workgroup_id_y", SImode, -2},
210  {"workgroup_id_Z", ".amdhsa_system_sgpr_workgroup_id_z", SImode, -2},
211  {"workgroup_info", ".amdhsa_system_sgpr_workgroup_info", SImode, -1},
212#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 12
213  {"private_segment_wave_offset",
214    ".amdhsa_system_sgpr_private_segment_wavefront_offset", SImode, -2},
215#define WORK_ITEM_ID_X_ARG 13
216  {"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
217#define WORK_ITEM_ID_Y_ARG 14
218  {"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
219#define WORK_ITEM_ID_Z_ARG 15
220  {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
221};
222
223static const long default_requested_args
224	= (1 << PRIVATE_SEGMENT_BUFFER_ARG)
225	  | (1 << DISPATCH_PTR_ARG)
226	  | (1 << QUEUE_PTR_ARG)
227	  | (1 << KERNARG_SEGMENT_PTR_ARG)
228	  | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG)
229	  | (1 << WORKGROUP_ID_X_ARG)
230	  | (1 << WORK_ITEM_ID_X_ARG)
231	  | (1 << WORK_ITEM_ID_Y_ARG)
232	  | (1 << WORK_ITEM_ID_Z_ARG);
233
234/* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())).
235   This function also sets the default values for some arguments.
236
237   Return true on success, with ARGS populated.  */
238
239static bool
240gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args,
241				       tree list)
242{
243  bool err = false;
244  args->requested = default_requested_args;
245  args->nargs = 0;
246
247  for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
248    args->reg[a] = -1;
249
250  for (; list; list = TREE_CHAIN (list))
251    {
252      const char *str;
253      if (TREE_CODE (TREE_VALUE (list)) != STRING_CST)
254	{
255	  error ("%<amdgpu_hsa_kernel%> attribute requires string constant "
256		 "arguments");
257	  break;
258	}
259      str = TREE_STRING_POINTER (TREE_VALUE (list));
260      int a;
261      for (a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
262	{
263	  if (!strcmp (str, gcn_kernel_arg_types[a].name))
264	    break;
265	}
266      if (a == GCN_KERNEL_ARG_TYPES)
267	{
268	  error ("unknown specifier %qs in %<amdgpu_hsa_kernel%> attribute",
269		 str);
270	  err = true;
271	  break;
272	}
273      if (args->requested & (1 << a))
274	{
275	  error ("duplicated parameter specifier %qs in %<amdgpu_hsa_kernel%> "
276		 "attribute", str);
277	  err = true;
278	  break;
279	}
280      args->requested |= (1 << a);
281      args->order[args->nargs++] = a;
282    }
283
284  /* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and
285     WORK_ITEM_ID_Y_ARG.  Similarly, requesting WORK_ITEM_ID_Y_ARG implies
286     requesting WORK_ITEM_ID_X_ARG.  */
287  if (args->requested & (1 << WORK_ITEM_ID_Z_ARG))
288    args->requested |= (1 << WORK_ITEM_ID_Y_ARG);
289  if (args->requested & (1 << WORK_ITEM_ID_Y_ARG))
290    args->requested |= (1 << WORK_ITEM_ID_X_ARG);
291
292  int sgpr_regno = FIRST_SGPR_REG;
293  args->nsgprs = 0;
294  for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
295    {
296      if (!(args->requested & (1 << a)))
297	continue;
298
299      if (gcn_kernel_arg_types[a].fixed_regno >= 0)
300	args->reg[a] = gcn_kernel_arg_types[a].fixed_regno;
301      else
302	{
303	  int reg_count;
304
305	  switch (gcn_kernel_arg_types[a].mode)
306	    {
307	    case E_SImode:
308	      reg_count = 1;
309	      break;
310	    case E_DImode:
311	      reg_count = 2;
312	      break;
313	    case E_TImode:
314	      reg_count = 4;
315	      break;
316	    default:
317	      gcc_unreachable ();
318	    }
319	  args->reg[a] = sgpr_regno;
320	  sgpr_regno += reg_count;
321	  if (gcn_kernel_arg_types[a].fixed_regno == -1)
322	    args->nsgprs += reg_count;
323	}
324    }
325  if (sgpr_regno > FIRST_SGPR_REG + 16)
326    {
327      error ("too many arguments passed in sgpr registers");
328    }
329  return err;
330}
331
332/* Referenced by TARGET_ATTRIBUTE_TABLE.
333
334   Validates target specific attributes.  */
335
336static tree
337gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name,
338					tree args, int, bool *no_add_attrs)
339{
340  if (!FUNC_OR_METHOD_TYPE_P (*node))
341    {
342      warning (OPT_Wattributes, "%qE attribute only applies to functions",
343	       name);
344      *no_add_attrs = true;
345      return NULL_TREE;
346    }
347
348  /* Can combine regparm with all attributes but fastcall, and thiscall.  */
349  if (is_attribute_p ("gcnhsa_kernel", name))
350    {
351      struct gcn_kernel_args kernelarg;
352
353      if (gcn_parse_amdgpu_hsa_kernel_attribute (&kernelarg, args))
354	*no_add_attrs = true;
355
356      return NULL_TREE;
357    }
358
359  return NULL_TREE;
360}
361
362/* Implement TARGET_ATTRIBUTE_TABLE.
363
364   Create target-specific __attribute__ types.  */
365
366static const struct attribute_spec gcn_attribute_table[] = {
367  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
368     affects_type_identity } */
369  {"amdgpu_hsa_kernel", 0, GCN_KERNEL_ARG_TYPES, false, true,
370   true, true, gcn_handle_amdgpu_hsa_kernel_attribute, NULL},
371  /* End element.  */
372  {NULL, 0, 0, false, false, false, false, NULL, NULL}
373};
374
375/* }}}  */
376/* {{{ Registers and modes.  */
377
378/* Implement TARGET_SCALAR_MODE_SUPPORTED_P.  */
379
380bool
381gcn_scalar_mode_supported_p (scalar_mode mode)
382{
383  return (mode == BImode
384	  || mode == QImode
385	  || mode == HImode /* || mode == HFmode  */
386	  || mode == SImode || mode == SFmode
387	  || mode == DImode || mode == DFmode
388	  || mode == TImode);
389}
390
391/* Implement TARGET_CLASS_MAX_NREGS.
392
393   Return the number of hard registers needed to hold a value of MODE in
394   a register of class RCLASS.  */
395
396static unsigned char
397gcn_class_max_nregs (reg_class_t rclass, machine_mode mode)
398{
399  /* Scalar registers are 32bit, vector registers are in fact tuples of
400     64 lanes.  */
401  if (rclass == VGPR_REGS)
402    {
403      if (vgpr_1reg_mode_p (mode))
404	return 1;
405      if (vgpr_2reg_mode_p (mode))
406	return 2;
407      /* TImode is used by DImode compare_and_swap.  */
408      if (mode == TImode)
409	return 4;
410    }
411  else if (rclass == VCC_CONDITIONAL_REG && mode == BImode)
412    return 2;
413  return CEIL (GET_MODE_SIZE (mode), 4);
414}
415
416/* Implement TARGET_HARD_REGNO_NREGS.
417
418   Return the number of hard registers needed to hold a value of MODE in
419   REGNO.  */
420
421unsigned int
422gcn_hard_regno_nregs (unsigned int regno, machine_mode mode)
423{
424  return gcn_class_max_nregs (REGNO_REG_CLASS (regno), mode);
425}
426
427/* Implement TARGET_HARD_REGNO_MODE_OK.
428
429   Return true if REGNO can hold value in MODE.  */
430
431bool
432gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
433{
434  /* Treat a complex mode as if it were a scalar mode of the same overall
435     size for the purposes of allocating hard registers.  */
436  if (COMPLEX_MODE_P (mode))
437    switch (mode)
438      {
439      case E_CQImode:
440      case E_CHImode:
441	mode = SImode;
442	break;
443      case E_CSImode:
444	mode = DImode;
445	break;
446      case E_CDImode:
447	mode = TImode;
448	break;
449      case E_HCmode:
450	mode = SFmode;
451	break;
452      case E_SCmode:
453	mode = DFmode;
454	break;
455      default:
456	/* Not supported.  */
457	return false;
458      }
459
460  switch (regno)
461    {
462    case FLAT_SCRATCH_LO_REG:
463    case XNACK_MASK_LO_REG:
464    case TBA_LO_REG:
465    case TMA_LO_REG:
466      return (mode == SImode || mode == DImode);
467    case VCC_LO_REG:
468    case EXEC_LO_REG:
469      return (mode == BImode || mode == SImode || mode == DImode);
470    case M0_REG:
471    case FLAT_SCRATCH_HI_REG:
472    case XNACK_MASK_HI_REG:
473    case TBA_HI_REG:
474    case TMA_HI_REG:
475      return mode == SImode;
476    case VCC_HI_REG:
477      return false;
478    case EXEC_HI_REG:
479      return mode == SImode /*|| mode == V32BImode */ ;
480    case SCC_REG:
481    case VCCZ_REG:
482    case EXECZ_REG:
483      return mode == BImode;
484    }
485  if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
486    return true;
487  if (SGPR_REGNO_P (regno))
488    /* We restrict double register values to aligned registers.  */
489    return (sgpr_1reg_mode_p (mode)
490	    || (!((regno - FIRST_SGPR_REG) & 1) && sgpr_2reg_mode_p (mode))
491	    || (((regno - FIRST_SGPR_REG) & 3) == 0 && mode == TImode));
492  if (VGPR_REGNO_P (regno))
493    /* Vector instructions do not care about the alignment of register
494       pairs, but where there is no 64-bit instruction, many of the
495       define_split do not work if the input and output registers partially
496       overlap.  We tried to fix this with early clobber and match
497       constraints, but it was bug prone, added complexity, and conflicts
498       with the 'U0' constraints on vec_merge.
499       Therefore, we restrict ourselved to aligned registers.  */
500    return (vgpr_1reg_mode_p (mode)
501	    || (!((regno - FIRST_VGPR_REG) & 1) && vgpr_2reg_mode_p (mode))
502	    /* TImode is used by DImode compare_and_swap.  */
503	    || (mode == TImode
504		&& !((regno - FIRST_VGPR_REG) & 3)));
505  return false;
506}
507
508/* Implement REGNO_REG_CLASS via gcn.h.
509
510   Return smallest class containing REGNO.  */
511
512enum reg_class
513gcn_regno_reg_class (int regno)
514{
515  switch (regno)
516    {
517    case SCC_REG:
518      return SCC_CONDITIONAL_REG;
519    case VCC_LO_REG:
520    case VCC_HI_REG:
521      return VCC_CONDITIONAL_REG;
522    case VCCZ_REG:
523      return VCCZ_CONDITIONAL_REG;
524    case EXECZ_REG:
525      return EXECZ_CONDITIONAL_REG;
526    case EXEC_LO_REG:
527    case EXEC_HI_REG:
528      return EXEC_MASK_REG;
529    }
530  if (VGPR_REGNO_P (regno))
531    return VGPR_REGS;
532  if (SGPR_REGNO_P (regno))
533    return SGPR_REGS;
534  if (regno < FIRST_VGPR_REG)
535    return GENERAL_REGS;
536  if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
537    return AFP_REGS;
538  return ALL_REGS;
539}
540
541/* Implement TARGET_CAN_CHANGE_MODE_CLASS.
542
543   GCC assumes that lowpart contains first part of value as stored in memory.
544   This is not the case for vector registers.  */
545
546bool
547gcn_can_change_mode_class (machine_mode from, machine_mode to,
548			   reg_class_t regclass)
549{
550  if (!vgpr_vector_mode_p (from) && !vgpr_vector_mode_p (to))
551    return true;
552  return (gcn_class_max_nregs (regclass, from)
553	  == gcn_class_max_nregs (regclass, to));
554}
555
556/* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P.
557
558   When this hook returns true for MODE, the compiler allows
559   registers explicitly used in the rtl to be used as spill registers
560   but prevents the compiler from extending the lifetime of these
561   registers.  */
562
563bool
564gcn_small_register_classes_for_mode_p (machine_mode mode)
565{
566  /* We allocate into exec and vcc regs.  Those make small register class.  */
567  return mode == DImode || mode == SImode;
568}
569
570/* Implement TARGET_CLASS_LIKELY_SPILLED_P.
571
572   Returns true if pseudos that have been assigned to registers of class RCLASS
573   would likely be spilled because registers of RCLASS are needed for spill
574   registers.  */
575
576static bool
577gcn_class_likely_spilled_p (reg_class_t rclass)
578{
579  return (rclass == EXEC_MASK_REG
580	  || reg_classes_intersect_p (ALL_CONDITIONAL_REGS, rclass));
581}
582
583/* Implement TARGET_MODES_TIEABLE_P.
584
585   Returns true if a value of MODE1 is accessible in MODE2 without
586   copying.  */
587
588bool
589gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2)
590{
591  return (GET_MODE_BITSIZE (mode1) <= MAX_FIXED_MODE_SIZE
592	  && GET_MODE_BITSIZE (mode2) <= MAX_FIXED_MODE_SIZE);
593}
594
595/* Implement TARGET_TRULY_NOOP_TRUNCATION.
596
597   Returns true if it is safe to ���convert��� a value of INPREC bits to one of
598   OUTPREC bits (where OUTPREC is smaller than INPREC) by merely operating on
599   it as if it had only OUTPREC bits.  */
600
601bool
602gcn_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec)
603{
604  return ((inprec <= 32) && (outprec <= inprec));
605}
606
607/* Return N-th part of value occupying multiple registers.  */
608
609rtx
610gcn_operand_part (machine_mode mode, rtx op, int n)
611{
612  if (GET_MODE_SIZE (mode) >= 256)
613    {
614      /*gcc_assert (GET_MODE_SIZE (mode) == 256 || n == 0);  */
615
616      if (REG_P (op))
617	{
618	  gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
619	  return gen_rtx_REG (V64SImode, REGNO (op) + n);
620	}
621      if (GET_CODE (op) == CONST_VECTOR)
622	{
623	  int units = GET_MODE_NUNITS (mode);
624	  rtvec v = rtvec_alloc (units);
625
626	  for (int i = 0; i < units; ++i)
627	    RTVEC_ELT (v, i) = gcn_operand_part (GET_MODE_INNER (mode),
628						 CONST_VECTOR_ELT (op, i), n);
629
630	  return gen_rtx_CONST_VECTOR (V64SImode, v);
631	}
632      if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
633	return gcn_gen_undef (V64SImode);
634      gcc_unreachable ();
635    }
636  else if (GET_MODE_SIZE (mode) == 8 && REG_P (op))
637    {
638      gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
639      return gen_rtx_REG (SImode, REGNO (op) + n);
640    }
641  else
642    {
643      if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
644	return gcn_gen_undef (SImode);
645
646      /* If it's a constant then let's assume it is of the largest mode
647	 available, otherwise simplify_gen_subreg will fail.  */
648      if (mode == VOIDmode && CONST_INT_P (op))
649	mode = DImode;
650      return simplify_gen_subreg (SImode, op, mode, n * 4);
651    }
652}
653
654/* Return N-th part of value occupying multiple registers.  */
655
656rtx
657gcn_operand_doublepart (machine_mode mode, rtx op, int n)
658{
659  return simplify_gen_subreg (DImode, op, mode, n * 8);
660}
661
662/* Return true if OP can be split into subregs or high/low parts.
663   This is always true for scalars, but not normally true for vectors.
664   However, for vectors in hardregs we can use the low and high registers.  */
665
666bool
667gcn_can_split_p (machine_mode, rtx op)
668{
669  if (vgpr_vector_mode_p (GET_MODE (op)))
670    {
671      if (GET_CODE (op) == SUBREG)
672	op = SUBREG_REG (op);
673      if (!REG_P (op))
674	return true;
675      return REGNO (op) <= FIRST_PSEUDO_REGISTER;
676    }
677  return true;
678}
679
680/* Implement TARGET_SPILL_CLASS.
681
682   Return class of registers which could be used for pseudo of MODE
683   and of class RCLASS for spilling instead of memory.  Return NO_REGS
684   if it is not possible or non-profitable.  */
685
686static reg_class_t
687gcn_spill_class (reg_class_t c, machine_mode /*mode */ )
688{
689  if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c)
690      || c == VCC_CONDITIONAL_REG)
691    return SGPR_REGS;
692  else
693    return NO_REGS;
694}
695
696/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
697
698   Change allocno class for given pseudo from allocno and best class
699   calculated by IRA.  */
700
701static reg_class_t
702gcn_ira_change_pseudo_allocno_class (int regno, reg_class_t cl,
703				     reg_class_t best_cl)
704{
705  /* Avoid returning classes that contain both vgpr and sgpr registers.  */
706  if (cl != ALL_REGS && cl != SRCDST_REGS && cl != ALL_GPR_REGS)
707    return cl;
708  if (best_cl != ALL_REGS && best_cl != SRCDST_REGS
709      && best_cl != ALL_GPR_REGS)
710    return best_cl;
711
712  machine_mode mode = PSEUDO_REGNO_MODE (regno);
713  if (vgpr_vector_mode_p (mode))
714    return VGPR_REGS;
715
716  return GENERAL_REGS;
717}
718
719/* Create a new DImode pseudo reg and emit an instruction to initialize
720   it to VAL.  */
721
722static rtx
723get_exec (int64_t val)
724{
725  rtx reg = gen_reg_rtx (DImode);
726  emit_insn (gen_rtx_SET (reg, gen_int_mode (val, DImode)));
727  return reg;
728}
729
730/* Return value of scalar exec register.  */
731
732rtx
733gcn_scalar_exec ()
734{
735  return const1_rtx;
736}
737
738/* Return pseudo holding scalar exec register.  */
739
740rtx
741gcn_scalar_exec_reg ()
742{
743  return get_exec (1);
744}
745
746/* Return value of full exec register.  */
747
748rtx
749gcn_full_exec ()
750{
751  return constm1_rtx;
752}
753
754/* Return pseudo holding full exec register.  */
755
756rtx
757gcn_full_exec_reg ()
758{
759  return get_exec (-1);
760}
761
762/* }}}  */
763/* {{{ Immediate constants.  */
764
765/* Initialize shared numeric constants.  */
766
767static void
768init_ext_gcn_constants (void)
769{
770  real_from_integer (&dconst4, DFmode, 4, SIGNED);
771
772  /* FIXME: this constant probably does not match what hardware really loads.
773     Reality check it eventually.  */
774  real_from_string (&dconst1over2pi,
775		    "0.1591549430918953357663423455968866839");
776  real_convert (&dconst1over2pi, SFmode, &dconst1over2pi);
777
778  ext_gcn_constants_init = 1;
779}
780
781/* Return non-zero if X is a constant that can appear as an inline operand.
782   This is 0, 0.5, -0.5, 1, -1, 2, -2, 4,-4, 1/(2*pi)
783   Or a vector of those.
784   The value returned should be the encoding of this constant.  */
785
786int
787gcn_inline_fp_constant_p (rtx x, bool allow_vector)
788{
789  machine_mode mode = GET_MODE (x);
790
791  if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
792      && allow_vector)
793    {
794      int n;
795      if (GET_CODE (x) != CONST_VECTOR)
796	return 0;
797      n = gcn_inline_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
798      if (!n)
799	return 0;
800      for (int i = 1; i < 64; i++)
801	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
802	  return 0;
803      return 1;
804    }
805
806  if (mode != HFmode && mode != SFmode && mode != DFmode)
807    return 0;
808
809  const REAL_VALUE_TYPE *r;
810
811  if (x == CONST0_RTX (mode))
812    return 128;
813  if (x == CONST1_RTX (mode))
814    return 242;
815
816  r = CONST_DOUBLE_REAL_VALUE (x);
817
818  if (real_identical (r, &dconstm1))
819    return 243;
820
821  if (real_identical (r, &dconsthalf))
822    return 240;
823  if (real_identical (r, &dconstm1))
824    return 243;
825  if (real_identical (r, &dconst2))
826    return 244;
827  if (real_identical (r, &dconst4))
828    return 246;
829  if (real_identical (r, &dconst1over2pi))
830    return 248;
831  if (!ext_gcn_constants_init)
832    init_ext_gcn_constants ();
833  real_value_negate (r);
834  if (real_identical (r, &dconsthalf))
835    return 241;
836  if (real_identical (r, &dconst2))
837    return 245;
838  if (real_identical (r, &dconst4))
839    return 247;
840
841  /* FIXME: add 4, -4 and 1/(2*PI).  */
842
843  return 0;
844}
845
846/* Return non-zero if X is a constant that can appear as an immediate operand.
847   This is 0, 0.5, -0.5, 1, -1, 2, -2, 4,-4, 1/(2*pi)
848   Or a vector of those.
849   The value returned should be the encoding of this constant.  */
850
851bool
852gcn_fp_constant_p (rtx x, bool allow_vector)
853{
854  machine_mode mode = GET_MODE (x);
855
856  if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
857      && allow_vector)
858    {
859      int n;
860      if (GET_CODE (x) != CONST_VECTOR)
861	return false;
862      n = gcn_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
863      if (!n)
864	return false;
865      for (int i = 1; i < 64; i++)
866	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
867	  return false;
868      return true;
869    }
870  if (mode != HFmode && mode != SFmode && mode != DFmode)
871    return false;
872
873  if (gcn_inline_fp_constant_p (x, false))
874    return true;
875  /* FIXME: It is not clear how 32bit immediates are interpreted here.  */
876  return (mode != DFmode);
877}
878
879/* Return true if X is a constant representable as an inline immediate
880   constant in a 32-bit instruction encoding.  */
881
882bool
883gcn_inline_constant_p (rtx x)
884{
885  if (GET_CODE (x) == CONST_INT)
886    return INTVAL (x) >= -16 && INTVAL (x) <= 64;
887  if (GET_CODE (x) == CONST_DOUBLE)
888    return gcn_inline_fp_constant_p (x, false);
889  if (GET_CODE (x) == CONST_VECTOR)
890    {
891      int n;
892      if (!vgpr_vector_mode_p (GET_MODE (x)))
893	return false;
894      n = gcn_inline_constant_p (CONST_VECTOR_ELT (x, 0));
895      if (!n)
896	return false;
897      for (int i = 1; i < 64; i++)
898	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
899	  return false;
900      return 1;
901    }
902  return false;
903}
904
905/* Return true if X is a constant representable as an immediate constant
906   in a 32 or 64-bit instruction encoding.  */
907
908bool
909gcn_constant_p (rtx x)
910{
911  switch (GET_CODE (x))
912    {
913    case CONST_INT:
914      return true;
915
916    case CONST_DOUBLE:
917      return gcn_fp_constant_p (x, false);
918
919    case CONST_VECTOR:
920      {
921	int n;
922	if (!vgpr_vector_mode_p (GET_MODE (x)))
923	  return false;
924	n = gcn_constant_p (CONST_VECTOR_ELT (x, 0));
925	if (!n)
926	  return false;
927	for (int i = 1; i < 64; i++)
928	  if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
929	    return false;
930	return true;
931      }
932
933    case SYMBOL_REF:
934    case LABEL_REF:
935      return true;
936
937    default:
938      ;
939    }
940
941  return false;
942}
943
944/* Return true if X is a constant representable as two inline immediate
945   constants in a 64-bit instruction that is split into two 32-bit
946   instructions.
947   When MIXED is set, the low-part is permitted to use the full 32-bits.  */
948
949bool
950gcn_inline_constant64_p (rtx x, bool mixed)
951{
952  if (GET_CODE (x) == CONST_VECTOR)
953    {
954      if (!vgpr_vector_mode_p (GET_MODE (x)))
955	return false;
956      if (!gcn_inline_constant64_p (CONST_VECTOR_ELT (x, 0), mixed))
957	return false;
958      for (int i = 1; i < 64; i++)
959	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
960	  return false;
961
962      return true;
963    }
964
965  if (GET_CODE (x) != CONST_INT)
966    return false;
967
968  rtx val_lo = gcn_operand_part (DImode, x, 0);
969  rtx val_hi = gcn_operand_part (DImode, x, 1);
970  return ((mixed || gcn_inline_constant_p (val_lo))
971	  && gcn_inline_constant_p (val_hi));
972}
973
974/* Return true if X is a constant representable as an immediate constant
975   in a 32 or 64-bit instruction encoding where the hardware will
976   extend the immediate to 64-bits.  */
977
978bool
979gcn_constant64_p (rtx x)
980{
981  if (!gcn_constant_p (x))
982    return false;
983
984  if (GET_CODE (x) != CONST_INT)
985    return true;
986
987  /* Negative numbers are only allowed if they can be encoded within src0,
988     because the 32-bit immediates do not get sign-extended.
989     Unsigned numbers must not be encodable as 32-bit -1..-16, because the
990     assembler will use a src0 inline immediate and that will get
991     sign-extended.  */
992  HOST_WIDE_INT val = INTVAL (x);
993  return (((val & 0xffffffff) == val	/* Positive 32-bit.  */
994	   && (val & 0xfffffff0) != 0xfffffff0)	/* Not -1..-16.  */
995	  || gcn_inline_constant_p (x));	/* Src0.  */
996}
997
998/* Implement TARGET_LEGITIMATE_CONSTANT_P.
999
1000   Returns true if X is a legitimate constant for a MODE immediate operand.  */
1001
1002bool
1003gcn_legitimate_constant_p (machine_mode, rtx x)
1004{
1005  return gcn_constant_p (x);
1006}
1007
1008/* Return true if X is a CONST_VECTOR of single constant.  */
1009
1010static bool
1011single_cst_vector_p (rtx x)
1012{
1013  if (GET_CODE (x) != CONST_VECTOR)
1014    return false;
1015  for (int i = 1; i < 64; i++)
1016    if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
1017      return false;
1018  return true;
1019}
1020
1021/* Create a CONST_VECTOR of duplicated value A.  */
1022
1023rtx
1024gcn_vec_constant (machine_mode mode, int a)
1025{
1026  /*if (!a)
1027    return CONST0_RTX (mode);
1028  if (a == -1)
1029    return CONSTM1_RTX (mode);
1030  if (a == 1)
1031    return CONST1_RTX (mode);
1032  if (a == 2)
1033    return CONST2_RTX (mode);*/
1034
1035  int units = GET_MODE_NUNITS (mode);
1036  machine_mode innermode = GET_MODE_INNER (mode);
1037
1038  rtx tem;
1039  if (FLOAT_MODE_P (innermode))
1040    {
1041      REAL_VALUE_TYPE rv;
1042      real_from_integer (&rv, NULL, a, SIGNED);
1043      tem = const_double_from_real_value (rv, innermode);
1044    }
1045  else
1046    tem = gen_int_mode (a, innermode);
1047
1048  rtvec v = rtvec_alloc (units);
1049  for (int i = 0; i < units; ++i)
1050    RTVEC_ELT (v, i) = tem;
1051
1052  return gen_rtx_CONST_VECTOR (mode, v);
1053}
1054
1055/* Create a CONST_VECTOR of duplicated value A.  */
1056
1057rtx
1058gcn_vec_constant (machine_mode mode, rtx a)
1059{
1060  int units = GET_MODE_NUNITS (mode);
1061  rtvec v = rtvec_alloc (units);
1062
1063  for (int i = 0; i < units; ++i)
1064    RTVEC_ELT (v, i) = a;
1065
1066  return gen_rtx_CONST_VECTOR (mode, v);
1067}
1068
1069/* Create an undefined vector value, used where an insn operand is
1070   optional.  */
1071
1072rtx
1073gcn_gen_undef (machine_mode mode)
1074{
1075  return gen_rtx_UNSPEC (mode, gen_rtvec (1, const0_rtx), UNSPEC_VECTOR);
1076}
1077
1078/* }}}  */
1079/* {{{ Addresses, pointers and moves.  */
1080
1081/* Return true is REG is a valid place to store a pointer,
1082   for instructions that require an SGPR.
1083   FIXME rename. */
1084
1085static bool
1086gcn_address_register_p (rtx reg, machine_mode mode, bool strict)
1087{
1088  if (GET_CODE (reg) == SUBREG)
1089    reg = SUBREG_REG (reg);
1090
1091  if (!REG_P (reg))
1092    return false;
1093
1094  if (GET_MODE (reg) != mode)
1095    return false;
1096
1097  int regno = REGNO (reg);
1098
1099  if (regno >= FIRST_PSEUDO_REGISTER)
1100    {
1101      if (!strict)
1102	return true;
1103
1104      if (!reg_renumber)
1105	return false;
1106
1107      regno = reg_renumber[regno];
1108    }
1109
1110  return (SGPR_REGNO_P (regno) || regno == M0_REG
1111	  || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1112}
1113
1114/* Return true is REG is a valid place to store a pointer,
1115   for instructions that require a VGPR.  */
1116
1117static bool
1118gcn_vec_address_register_p (rtx reg, machine_mode mode, bool strict)
1119{
1120  if (GET_CODE (reg) == SUBREG)
1121    reg = SUBREG_REG (reg);
1122
1123  if (!REG_P (reg))
1124    return false;
1125
1126  if (GET_MODE (reg) != mode)
1127    return false;
1128
1129  int regno = REGNO (reg);
1130
1131  if (regno >= FIRST_PSEUDO_REGISTER)
1132    {
1133      if (!strict)
1134	return true;
1135
1136      if (!reg_renumber)
1137	return false;
1138
1139      regno = reg_renumber[regno];
1140    }
1141
1142  return VGPR_REGNO_P (regno);
1143}
1144
1145/* Return true if X would be valid inside a MEM using the Flat address
1146   space.  */
1147
1148bool
1149gcn_flat_address_p (rtx x, machine_mode mode)
1150{
1151  bool vec_mode = (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1152		   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);
1153
1154  if (vec_mode && gcn_address_register_p (x, DImode, false))
1155    return true;
1156
1157  if (!vec_mode && gcn_vec_address_register_p (x, DImode, false))
1158    return true;
1159
1160  if (TARGET_GCN5_PLUS
1161      && GET_CODE (x) == PLUS
1162      && gcn_vec_address_register_p (XEXP (x, 0), DImode, false)
1163      && CONST_INT_P (XEXP (x, 1)))
1164    return true;
1165
1166  return false;
1167}
1168
1169/* Return true if X would be valid inside a MEM using the Scalar Flat
1170   address space.  */
1171
1172bool
1173gcn_scalar_flat_address_p (rtx x)
1174{
1175  if (gcn_address_register_p (x, DImode, false))
1176    return true;
1177
1178  if (GET_CODE (x) == PLUS
1179      && gcn_address_register_p (XEXP (x, 0), DImode, false)
1180      && CONST_INT_P (XEXP (x, 1)))
1181    return true;
1182
1183  return false;
1184}
1185
1186/* Return true if MEM X would be valid for the Scalar Flat address space.  */
1187
1188bool
1189gcn_scalar_flat_mem_p (rtx x)
1190{
1191  if (!MEM_P (x))
1192    return false;
1193
1194  if (GET_MODE_SIZE (GET_MODE (x)) < 4)
1195    return false;
1196
1197  return gcn_scalar_flat_address_p (XEXP (x, 0));
1198}
1199
1200/* Return true if X would be valid inside a MEM using the LDS or GDS
1201   address spaces.  */
1202
1203bool
1204gcn_ds_address_p (rtx x)
1205{
1206  if (gcn_vec_address_register_p (x, SImode, false))
1207    return true;
1208
1209  if (GET_CODE (x) == PLUS
1210      && gcn_vec_address_register_p (XEXP (x, 0), SImode, false)
1211      && CONST_INT_P (XEXP (x, 1)))
1212    return true;
1213
1214  return false;
1215}
1216
1217/* Return true if ADDR would be valid inside a MEM using the Global
1218   address space.  */
1219
1220bool
1221gcn_global_address_p (rtx addr)
1222{
1223  if (gcn_address_register_p (addr, DImode, false)
1224      || gcn_vec_address_register_p (addr, DImode, false))
1225    return true;
1226
1227  if (GET_CODE (addr) == PLUS)
1228    {
1229      rtx base = XEXP (addr, 0);
1230      rtx offset = XEXP (addr, 1);
1231      bool immediate_p = (CONST_INT_P (offset)
1232			  && INTVAL (offset) >= -(1 << 12)
1233			  && INTVAL (offset) < (1 << 12));
1234
1235      if ((gcn_address_register_p (base, DImode, false)
1236	   || gcn_vec_address_register_p (base, DImode, false))
1237	  && immediate_p)
1238	/* SGPR + CONST or VGPR + CONST  */
1239	return true;
1240
1241      if (gcn_address_register_p (base, DImode, false)
1242	  && gcn_vgpr_register_operand (offset, SImode))
1243	/* SPGR + VGPR  */
1244	return true;
1245
1246      if (GET_CODE (base) == PLUS
1247	  && gcn_address_register_p (XEXP (base, 0), DImode, false)
1248	  && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
1249	  && immediate_p)
1250	/* (SGPR + VGPR) + CONST  */
1251	return true;
1252    }
1253
1254  return false;
1255}
1256
1257/* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P.
1258
1259   Recognizes RTL expressions that are valid memory addresses for an
1260   instruction.  The MODE argument is the machine mode for the MEM
1261   expression that wants to use this address.
1262
1263   It only recognizes address in canonical form.  LEGITIMIZE_ADDRESS should
1264   convert common non-canonical forms to canonical form so that they will
1265   be recognized.  */
1266
1267static bool
1268gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict,
1269				     addr_space_t as)
1270{
1271  /* All vector instructions need to work on addresses in registers.  */
1272  if (!TARGET_GCN5_PLUS && (vgpr_vector_mode_p (mode) && !REG_P (x)))
1273    return false;
1274
1275  if (AS_SCALAR_FLAT_P (as))
1276    {
1277      if (mode == QImode || mode == HImode)
1278	return 0;
1279
1280      switch (GET_CODE (x))
1281	{
1282	case REG:
1283	  return gcn_address_register_p (x, DImode, strict);
1284	/* Addresses are in the form BASE+OFFSET
1285	   OFFSET is either 20bit unsigned immediate, SGPR or M0.
1286	   Writes and atomics do not accept SGPR.  */
1287	case PLUS:
1288	  {
1289	    rtx x0 = XEXP (x, 0);
1290	    rtx x1 = XEXP (x, 1);
1291	    if (!gcn_address_register_p (x0, DImode, strict))
1292	      return false;
1293	    /* FIXME: This is disabled because of the mode mismatch between
1294	       SImode (for the address or m0 register) and the DImode PLUS.
1295	       We'll need a zero_extend or similar.
1296
1297	    if (gcn_m0_register_p (x1, SImode, strict)
1298		|| gcn_address_register_p (x1, SImode, strict))
1299	      return true;
1300	    else*/
1301	    if (GET_CODE (x1) == CONST_INT)
1302	      {
1303		if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20)
1304		    /* The low bits of the offset are ignored, even when
1305		       they're meant to realign the pointer.  */
1306		    && !(INTVAL (x1) & 0x3))
1307		  return true;
1308	      }
1309	    return false;
1310	  }
1311
1312	default:
1313	  break;
1314	}
1315    }
1316  else if (AS_SCRATCH_P (as))
1317    return gcn_address_register_p (x, SImode, strict);
1318  else if (AS_FLAT_P (as) || AS_FLAT_SCRATCH_P (as))
1319    {
1320      if (TARGET_GCN3 || GET_CODE (x) == REG)
1321       return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1322		|| GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1323	       ? gcn_address_register_p (x, DImode, strict)
1324	       : gcn_vec_address_register_p (x, DImode, strict));
1325      else
1326	{
1327	  gcc_assert (TARGET_GCN5_PLUS);
1328
1329	  if (GET_CODE (x) == PLUS)
1330	    {
1331	      rtx x1 = XEXP (x, 1);
1332
1333	      if (VECTOR_MODE_P (mode)
1334		  ? !gcn_address_register_p (x, DImode, strict)
1335		  : !gcn_vec_address_register_p (x, DImode, strict))
1336		return false;
1337
1338	      if (GET_CODE (x1) == CONST_INT)
1339		{
1340		  if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 12)
1341		      /* The low bits of the offset are ignored, even when
1342		         they're meant to realign the pointer.  */
1343		      && !(INTVAL (x1) & 0x3))
1344		    return true;
1345		}
1346	    }
1347	  return false;
1348	}
1349    }
1350  else if (AS_GLOBAL_P (as))
1351    {
1352      gcc_assert (TARGET_GCN5_PLUS);
1353
1354      if (GET_CODE (x) == REG)
1355       return (gcn_address_register_p (x, DImode, strict)
1356	       || (!VECTOR_MODE_P (mode)
1357		   && gcn_vec_address_register_p (x, DImode, strict)));
1358      else if (GET_CODE (x) == PLUS)
1359	{
1360	  rtx base = XEXP (x, 0);
1361	  rtx offset = XEXP (x, 1);
1362
1363	  bool immediate_p = (GET_CODE (offset) == CONST_INT
1364			      /* Signed 13-bit immediate.  */
1365			      && INTVAL (offset) >= -(1 << 12)
1366			      && INTVAL (offset) < (1 << 12)
1367			      /* The low bits of the offset are ignored, even
1368			         when they're meant to realign the pointer.  */
1369			      && !(INTVAL (offset) & 0x3));
1370
1371	  if (!VECTOR_MODE_P (mode))
1372	    {
1373	      if ((gcn_address_register_p (base, DImode, strict)
1374		   || gcn_vec_address_register_p (base, DImode, strict))
1375		  && immediate_p)
1376		/* SGPR + CONST or VGPR + CONST  */
1377		return true;
1378
1379	      if (gcn_address_register_p (base, DImode, strict)
1380		  && gcn_vgpr_register_operand (offset, SImode))
1381		/* SGPR + VGPR  */
1382		return true;
1383
1384	      if (GET_CODE (base) == PLUS
1385		  && gcn_address_register_p (XEXP (base, 0), DImode, strict)
1386		  && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
1387		  && immediate_p)
1388		/* (SGPR + VGPR) + CONST  */
1389		return true;
1390	    }
1391	  else
1392	    {
1393	      if (gcn_address_register_p (base, DImode, strict)
1394		  && immediate_p)
1395		/* SGPR + CONST  */
1396		return true;
1397	    }
1398	}
1399      else
1400	return false;
1401    }
1402  else if (AS_ANY_DS_P (as))
1403    switch (GET_CODE (x))
1404      {
1405      case REG:
1406	return (VECTOR_MODE_P (mode)
1407		? gcn_address_register_p (x, SImode, strict)
1408		: gcn_vec_address_register_p (x, SImode, strict));
1409      /* Addresses are in the form BASE+OFFSET
1410	 OFFSET is either 20bit unsigned immediate, SGPR or M0.
1411	 Writes and atomics do not accept SGPR.  */
1412      case PLUS:
1413	{
1414	  rtx x0 = XEXP (x, 0);
1415	  rtx x1 = XEXP (x, 1);
1416	  if (!gcn_vec_address_register_p (x0, DImode, strict))
1417	    return false;
1418	  if (GET_CODE (x1) == REG)
1419	    {
1420	      if (GET_CODE (x1) != REG
1421		  || (REGNO (x1) <= FIRST_PSEUDO_REGISTER
1422		      && !gcn_ssrc_register_operand (x1, DImode)))
1423		return false;
1424	    }
1425	  else if (GET_CODE (x1) == CONST_VECTOR
1426		   && GET_CODE (CONST_VECTOR_ELT (x1, 0)) == CONST_INT
1427		   && single_cst_vector_p (x1))
1428	    {
1429	      x1 = CONST_VECTOR_ELT (x1, 0);
1430	      if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20))
1431		return true;
1432	    }
1433	  return false;
1434	}
1435
1436      default:
1437	break;
1438      }
1439  else
1440    gcc_unreachable ();
1441  return false;
1442}
1443
1444/* Implement TARGET_ADDR_SPACE_POINTER_MODE.
1445
1446   Return the appropriate mode for a named address pointer.  */
1447
1448static scalar_int_mode
1449gcn_addr_space_pointer_mode (addr_space_t addrspace)
1450{
1451  switch (addrspace)
1452    {
1453    case ADDR_SPACE_SCRATCH:
1454    case ADDR_SPACE_LDS:
1455    case ADDR_SPACE_GDS:
1456      return SImode;
1457    case ADDR_SPACE_DEFAULT:
1458    case ADDR_SPACE_FLAT:
1459    case ADDR_SPACE_FLAT_SCRATCH:
1460    case ADDR_SPACE_SCALAR_FLAT:
1461      return DImode;
1462    default:
1463      gcc_unreachable ();
1464    }
1465}
1466
1467/* Implement TARGET_ADDR_SPACE_ADDRESS_MODE.
1468
1469   Return the appropriate mode for a named address space address.  */
1470
1471static scalar_int_mode
1472gcn_addr_space_address_mode (addr_space_t addrspace)
1473{
1474  return gcn_addr_space_pointer_mode (addrspace);
1475}
1476
1477/* Implement TARGET_ADDR_SPACE_SUBSET_P.
1478
1479   Determine if one named address space is a subset of another.  */
1480
1481static bool
1482gcn_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
1483{
1484  if (subset == superset)
1485    return true;
1486  /* FIXME is this true?  */
1487  if (AS_FLAT_P (superset) || AS_SCALAR_FLAT_P (superset))
1488    return true;
1489  return false;
1490}
1491
1492/* Convert from one address space to another.  */
1493
1494static rtx
1495gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
1496{
1497  gcc_assert (POINTER_TYPE_P (from_type));
1498  gcc_assert (POINTER_TYPE_P (to_type));
1499
1500  addr_space_t as_from = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
1501  addr_space_t as_to = TYPE_ADDR_SPACE (TREE_TYPE (to_type));
1502
1503  if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
1504    {
1505      rtx queue = gen_rtx_REG (DImode,
1506			       cfun->machine->args.reg[QUEUE_PTR_ARG]);
1507      rtx group_seg_aperture_hi = gen_rtx_MEM (SImode,
1508				     gen_rtx_PLUS (DImode, queue,
1509						   gen_int_mode (64, SImode)));
1510      rtx tmp = gen_reg_rtx (DImode);
1511
1512      emit_move_insn (gen_lowpart (SImode, tmp), op);
1513      emit_move_insn (gen_highpart_mode (SImode, DImode, tmp),
1514		      group_seg_aperture_hi);
1515
1516      return tmp;
1517    }
1518  else if (as_from == as_to)
1519    return op;
1520  else
1521    gcc_unreachable ();
1522}
1523
1524/* Implement TARGET_ADDR_SPACE_DEBUG.
1525
1526   Return the dwarf address space class for each hardware address space.  */
1527
1528static int
1529gcn_addr_space_debug (addr_space_t as)
1530{
1531  switch (as)
1532    {
1533      case ADDR_SPACE_DEFAULT:
1534      case ADDR_SPACE_FLAT:
1535      case ADDR_SPACE_SCALAR_FLAT:
1536      case ADDR_SPACE_FLAT_SCRATCH:
1537	return DW_ADDR_none;
1538      case ADDR_SPACE_GLOBAL:
1539	return 1;      // DW_ADDR_LLVM_global
1540      case ADDR_SPACE_LDS:
1541	return 3;      // DW_ADDR_LLVM_group
1542      case ADDR_SPACE_SCRATCH:
1543	return 4;      // DW_ADDR_LLVM_private
1544      case ADDR_SPACE_GDS:
1545	return 0x8000; // DW_ADDR_AMDGPU_region
1546    }
1547  gcc_unreachable ();
1548}
1549
1550
1551/* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h
1552
1553   Retun true if REGNO is OK for memory adressing.  */
1554
1555bool
1556gcn_regno_mode_code_ok_for_base_p (int regno,
1557				   machine_mode, addr_space_t as, int, int)
1558{
1559  if (regno >= FIRST_PSEUDO_REGISTER)
1560    {
1561      if (reg_renumber)
1562	regno = reg_renumber[regno];
1563      else
1564	return true;
1565    }
1566  if (AS_FLAT_P (as))
1567    return (VGPR_REGNO_P (regno)
1568	    || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1569  else if (AS_SCALAR_FLAT_P (as))
1570    return (SGPR_REGNO_P (regno)
1571	    || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1572  else if (AS_GLOBAL_P (as))
1573    {
1574      return (SGPR_REGNO_P (regno)
1575	      || VGPR_REGNO_P (regno)
1576	      || regno == ARG_POINTER_REGNUM
1577	      || regno == FRAME_POINTER_REGNUM);
1578    }
1579  else
1580    /* For now.  */
1581    return false;
1582}
1583
1584/* Implement MODE_CODE_BASE_REG_CLASS via gcn.h.
1585
1586   Return a suitable register class for memory addressing.  */
1587
1588reg_class
1589gcn_mode_code_base_reg_class (machine_mode mode, addr_space_t as, int oc,
1590			      int ic)
1591{
1592  switch (as)
1593    {
1594    case ADDR_SPACE_DEFAULT:
1595      return gcn_mode_code_base_reg_class (mode, DEFAULT_ADDR_SPACE, oc, ic);
1596    case ADDR_SPACE_SCALAR_FLAT:
1597    case ADDR_SPACE_SCRATCH:
1598      return SGPR_REGS;
1599      break;
1600    case ADDR_SPACE_FLAT:
1601    case ADDR_SPACE_FLAT_SCRATCH:
1602    case ADDR_SPACE_LDS:
1603    case ADDR_SPACE_GDS:
1604      return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1605	       || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1606	      ? SGPR_REGS : VGPR_REGS);
1607    case ADDR_SPACE_GLOBAL:
1608      return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1609	       || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1610	      ? SGPR_REGS : ALL_GPR_REGS);
1611    }
1612  gcc_unreachable ();
1613}
1614
1615/* Implement REGNO_OK_FOR_INDEX_P via gcn.h.
1616
1617   Return true if REGNO is OK for index of memory addressing.  */
1618
1619bool
1620regno_ok_for_index_p (int regno)
1621{
1622  if (regno >= FIRST_PSEUDO_REGISTER)
1623    {
1624      if (reg_renumber)
1625	regno = reg_renumber[regno];
1626      else
1627	return true;
1628    }
1629  return regno == M0_REG || VGPR_REGNO_P (regno);
1630}
1631
1632/* Generate move which uses the exec flags.  If EXEC is NULL, then it is
1633   assumed that all lanes normally relevant to the mode of the move are
1634   affected.  If PREV is NULL, then a sensible default is supplied for
1635   the inactive lanes.  */
1636
1637static rtx
1638gen_mov_with_exec (rtx op0, rtx op1, rtx exec = NULL, rtx prev = NULL)
1639{
1640  machine_mode mode = GET_MODE (op0);
1641
1642  if (vgpr_vector_mode_p (mode))
1643    {
1644      if (exec && exec != CONSTM1_RTX (DImode))
1645	{
1646	  if (!prev)
1647	    prev = op0;
1648	}
1649      else
1650	{
1651	  if (!prev)
1652	    prev = gcn_gen_undef (mode);
1653	  exec = gcn_full_exec_reg ();
1654	}
1655
1656      rtx set = gen_rtx_SET (op0, gen_rtx_VEC_MERGE (mode, op1, prev, exec));
1657
1658      return gen_rtx_PARALLEL (VOIDmode,
1659	       gen_rtvec (2, set,
1660			 gen_rtx_CLOBBER (VOIDmode,
1661					  gen_rtx_SCRATCH (V64DImode))));
1662    }
1663
1664  return (gen_rtx_PARALLEL
1665	  (VOIDmode,
1666	   gen_rtvec (2, gen_rtx_SET (op0, op1),
1667		      gen_rtx_USE (VOIDmode,
1668				   exec ? exec : gcn_scalar_exec ()))));
1669}
1670
1671/* Generate masked move.  */
1672
1673static rtx
1674gen_duplicate_load (rtx op0, rtx op1, rtx op2 = NULL, rtx exec = NULL)
1675{
1676  if (exec)
1677    return (gen_rtx_SET (op0,
1678			 gen_rtx_VEC_MERGE (GET_MODE (op0),
1679					    gen_rtx_VEC_DUPLICATE (GET_MODE
1680								   (op0), op1),
1681					    op2, exec)));
1682  else
1683    return (gen_rtx_SET (op0, gen_rtx_VEC_DUPLICATE (GET_MODE (op0), op1)));
1684}
1685
1686/* Expand vector init of OP0 by VEC.
1687   Implements vec_init instruction pattern.  */
1688
1689void
1690gcn_expand_vector_init (rtx op0, rtx vec)
1691{
1692  int64_t initialized_mask = 0;
1693  int64_t curr_mask = 1;
1694  machine_mode mode = GET_MODE (op0);
1695
1696  rtx val = XVECEXP (vec, 0, 0);
1697
1698  for (int i = 1; i < 64; i++)
1699    if (rtx_equal_p (val, XVECEXP (vec, 0, i)))
1700      curr_mask |= (int64_t) 1 << i;
1701
1702  if (gcn_constant_p (val))
1703    emit_move_insn (op0, gcn_vec_constant (mode, val));
1704  else
1705    {
1706      val = force_reg (GET_MODE_INNER (mode), val);
1707      emit_insn (gen_duplicate_load (op0, val));
1708    }
1709  initialized_mask |= curr_mask;
1710  for (int i = 1; i < 64; i++)
1711    if (!(initialized_mask & ((int64_t) 1 << i)))
1712      {
1713	curr_mask = (int64_t) 1 << i;
1714	rtx val = XVECEXP (vec, 0, i);
1715
1716	for (int j = i + 1; j < 64; j++)
1717	  if (rtx_equal_p (val, XVECEXP (vec, 0, j)))
1718	    curr_mask |= (int64_t) 1 << j;
1719	if (gcn_constant_p (val))
1720	  emit_insn (gen_mov_with_exec (op0, gcn_vec_constant (mode, val),
1721					get_exec (curr_mask)));
1722	else
1723	  {
1724	    val = force_reg (GET_MODE_INNER (mode), val);
1725	    emit_insn (gen_duplicate_load (op0, val, op0,
1726					   get_exec (curr_mask)));
1727	  }
1728	initialized_mask |= curr_mask;
1729      }
1730}
1731
1732/* Load vector constant where n-th lane contains BASE+n*VAL.  */
1733
1734static rtx
1735strided_constant (machine_mode mode, int base, int val)
1736{
1737  rtx x = gen_reg_rtx (mode);
1738  emit_move_insn (x, gcn_vec_constant (mode, base));
1739  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 32),
1740				 x, get_exec (0xffffffff00000000)));
1741  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 16),
1742				 x, get_exec (0xffff0000ffff0000)));
1743  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 8),
1744				 x, get_exec (0xff00ff00ff00ff00)));
1745  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 4),
1746				 x, get_exec (0xf0f0f0f0f0f0f0f0)));
1747  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 2),
1748				 x, get_exec (0xcccccccccccccccc)));
1749  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 1),
1750				 x, get_exec (0xaaaaaaaaaaaaaaaa)));
1751  return x;
1752}
1753
1754/* Implement TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS.  */
1755
1756static rtx
1757gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode,
1758				   addr_space_t as)
1759{
1760  switch (as)
1761    {
1762    case ADDR_SPACE_DEFAULT:
1763      return gcn_addr_space_legitimize_address (x, old, mode,
1764						DEFAULT_ADDR_SPACE);
1765    case ADDR_SPACE_SCALAR_FLAT:
1766    case ADDR_SPACE_SCRATCH:
1767      /* Instructions working on vectors need the address to be in
1768         a register.  */
1769      if (vgpr_vector_mode_p (mode))
1770	return force_reg (GET_MODE (x), x);
1771
1772      return x;
1773    case ADDR_SPACE_FLAT:
1774    case ADDR_SPACE_FLAT_SCRATCH:
1775    case ADDR_SPACE_GLOBAL:
1776      return TARGET_GCN3 ? force_reg (DImode, x) : x;
1777    case ADDR_SPACE_LDS:
1778    case ADDR_SPACE_GDS:
1779      /* FIXME: LDS support offsets, handle them!.  */
1780      if (vgpr_vector_mode_p (mode) && GET_MODE (x) != V64SImode)
1781	{
1782	  rtx addrs = gen_reg_rtx (V64SImode);
1783	  rtx base = force_reg (SImode, x);
1784	  rtx offsets = strided_constant (V64SImode, 0,
1785					  GET_MODE_UNIT_SIZE (mode));
1786
1787	  emit_insn (gen_vec_duplicatev64si (addrs, base));
1788	  emit_insn (gen_addv64si3 (addrs, offsets, addrs));
1789	  return addrs;
1790	}
1791      return x;
1792    }
1793  gcc_unreachable ();
1794}
1795
1796/* Convert a (mem:<MODE> (reg:DI)) to (mem:<MODE> (reg:V64DI)) with the
1797   proper vector of stepped addresses.
1798
1799   MEM will be a DImode address of a vector in an SGPR.
1800   TMP will be a V64DImode VGPR pair or (scratch:V64DI).  */
1801
1802rtx
1803gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
1804				     rtx tmp)
1805{
1806  gcc_assert (MEM_P (mem));
1807  rtx mem_base = XEXP (mem, 0);
1808  rtx mem_index = NULL_RTX;
1809
1810  if (!TARGET_GCN5_PLUS)
1811    {
1812      /* gcn_addr_space_legitimize_address should have put the address in a
1813         register.  If not, it is too late to do anything about it.  */
1814      gcc_assert (REG_P (mem_base));
1815    }
1816
1817  if (GET_CODE (mem_base) == PLUS)
1818    {
1819      mem_index = XEXP (mem_base, 1);
1820      mem_base = XEXP (mem_base, 0);
1821    }
1822
1823  /* RF and RM base registers for vector modes should be always an SGPR.  */
1824  gcc_assert (SGPR_REGNO_P (REGNO (mem_base))
1825	      || REGNO (mem_base) >= FIRST_PSEUDO_REGISTER);
1826
1827  machine_mode inner = GET_MODE_INNER (mode);
1828  int shift = exact_log2 (GET_MODE_SIZE (inner));
1829  rtx ramp = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
1830  rtx undef_v64si = gcn_gen_undef (V64SImode);
1831  rtx new_base = NULL_RTX;
1832  addr_space_t as = MEM_ADDR_SPACE (mem);
1833
1834  rtx tmplo = (REG_P (tmp)
1835	       ? gcn_operand_part (V64DImode, tmp, 0)
1836	       : gen_reg_rtx (V64SImode));
1837
1838  /* tmplo[:] = ramp[:] << shift  */
1839  if (exec)
1840    emit_insn (gen_ashlv64si3_exec (tmplo, ramp,
1841				    gen_int_mode (shift, SImode),
1842				    undef_v64si, exec));
1843  else
1844    emit_insn (gen_ashlv64si3 (tmplo, ramp, gen_int_mode (shift, SImode)));
1845
1846  if (AS_FLAT_P (as))
1847    {
1848      rtx vcc = gen_rtx_REG (DImode, CC_SAVE_REG);
1849
1850      if (REG_P (tmp))
1851	{
1852	  rtx mem_base_lo = gcn_operand_part (DImode, mem_base, 0);
1853	  rtx mem_base_hi = gcn_operand_part (DImode, mem_base, 1);
1854	  rtx tmphi = gcn_operand_part (V64DImode, tmp, 1);
1855
1856	  /* tmphi[:] = mem_base_hi  */
1857	  if (exec)
1858	    emit_insn (gen_vec_duplicatev64si_exec (tmphi, mem_base_hi,
1859						    undef_v64si, exec));
1860	  else
1861	    emit_insn (gen_vec_duplicatev64si (tmphi, mem_base_hi));
1862
1863	  /* tmp[:] += zext (mem_base)  */
1864	  if (exec)
1865	    {
1866	      emit_insn (gen_addv64si3_vcc_dup_exec (tmplo, mem_base_lo, tmplo,
1867						     vcc, undef_v64si, exec));
1868	      emit_insn (gen_addcv64si3_exec (tmphi, tmphi, const0_rtx,
1869					      vcc, vcc, undef_v64si, exec));
1870	    }
1871	  else
1872	    emit_insn (gen_addv64di3_vcc_zext_dup (tmp, mem_base_lo, tmp, vcc));
1873	}
1874      else
1875	{
1876	  tmp = gen_reg_rtx (V64DImode);
1877	  if (exec)
1878	    emit_insn (gen_addv64di3_vcc_zext_dup2_exec
1879		       (tmp, tmplo, mem_base, vcc, gcn_gen_undef (V64DImode),
1880			exec));
1881	  else
1882	    emit_insn (gen_addv64di3_vcc_zext_dup2 (tmp, tmplo, mem_base, vcc));
1883	}
1884
1885      new_base = tmp;
1886    }
1887  else if (AS_ANY_DS_P (as))
1888    {
1889      if (!exec)
1890	emit_insn (gen_addv64si3_dup (tmplo, tmplo, mem_base));
1891      else
1892        emit_insn (gen_addv64si3_dup_exec (tmplo, tmplo, mem_base,
1893					   gcn_gen_undef (V64SImode), exec));
1894      new_base = tmplo;
1895    }
1896  else
1897    {
1898      mem_base = gen_rtx_VEC_DUPLICATE (V64DImode, mem_base);
1899      new_base = gen_rtx_PLUS (V64DImode, mem_base,
1900			       gen_rtx_SIGN_EXTEND (V64DImode, tmplo));
1901    }
1902
1903  return gen_rtx_PLUS (GET_MODE (new_base), new_base,
1904		       gen_rtx_VEC_DUPLICATE (GET_MODE (new_base),
1905					      (mem_index ? mem_index
1906					       : const0_rtx)));
1907}
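
/* An illustrative sketch of the address computed above (comment only, not
   compiled): for a FLAT access to 4-byte elements the result is roughly

     addr[i] = (mem_base + (ramp[i] << 2)) + mem_index

   where RAMP (v1) is assumed to hold the lane numbers 0..63, so each lane
   addresses one consecutive element, and the scalar MEM_INDEX (or zero) is
   broadcast by the final VEC_DUPLICATE.  */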
1908
1909/* Convert a BASE address, a vector of OFFSETS, and a SCALE, to addresses
1910	   suitable for the given address space.  This is intended for use in
1911   gather/scatter patterns.
1912
1913   The offsets may be signed or unsigned, according to UNSIGNED_P.
1914   If EXEC is set then _exec patterns will be used, otherwise plain.
1915
1916	   Return values:
1917     ADDR_SPACE_FLAT   - return V64DImode vector of absolute addresses.
1918     ADDR_SPACE_GLOBAL - return V64SImode vector of offsets.  */
1919
1920rtx
1921gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
1922			   bool unsigned_p, rtx exec)
1923{
1924  rtx tmpsi = gen_reg_rtx (V64SImode);
1925  rtx tmpdi = gen_reg_rtx (V64DImode);
1926  rtx undefsi = exec ? gcn_gen_undef (V64SImode) : NULL;
1927  rtx undefdi = exec ? gcn_gen_undef (V64DImode) : NULL;
1928
1929  if (CONST_INT_P (scale)
1930      && INTVAL (scale) > 0
1931      && exact_log2 (INTVAL (scale)) >= 0)
1932    emit_insn (gen_ashlv64si3 (tmpsi, offsets,
1933			       GEN_INT (exact_log2 (INTVAL (scale)))));
1934  else
1935    (exec
1936     ? emit_insn (gen_mulv64si3_dup_exec (tmpsi, offsets, scale, undefsi,
1937					  exec))
1938     : emit_insn (gen_mulv64si3_dup (tmpsi, offsets, scale)));
1939
1940  /* "Global" instructions do not support negative register offsets.  */
1941  if (as == ADDR_SPACE_FLAT || !unsigned_p)
1942    {
1943      if (unsigned_p)
1944	(exec
1945	 ?  emit_insn (gen_addv64di3_zext_dup2_exec (tmpdi, tmpsi, base,
1946						    undefdi, exec))
1947	 :  emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base)));
1948      else
1949	(exec
1950	 ?  emit_insn (gen_addv64di3_sext_dup2_exec (tmpdi, tmpsi, base,
1951						     undefdi, exec))
1952	 :  emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base)));
1953      return tmpdi;
1954    }
1955  else if (as == ADDR_SPACE_GLOBAL)
1956    return tmpsi;
1957
1958  gcc_unreachable ();
1959}
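
/* An illustrative sketch of gcn_expand_scaled_offsets (comment only):

     tmpsi[i] = offsets[i] << log2 (scale)   (or offsets[i] * scale)
     tmpdi[i] = base + zext/sext (tmpsi[i])  (FLAT, or signed offsets)

   so a gather of 4-byte elements with offsets {0, 1, 2, ...} produces the
   per-lane addresses base+0, base+4, base+8, ...; for ADDR_SPACE_GLOBAL
   with unsigned offsets only the 32-bit offset vector is returned.  */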
1960
1961/* Return true if move from OP0 to OP1 is known to be executed in vector
1962   unit.  */
1963
1964bool
1965gcn_vgpr_move_p (rtx op0, rtx op1)
1966{
1967  if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1968    return true;
1969  if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1970    return true;
1971  return ((REG_P (op0) && VGPR_REGNO_P (REGNO (op0)))
1972	  || (REG_P (op1) && VGPR_REGNO_P (REGNO (op1)))
1973	  || vgpr_vector_mode_p (GET_MODE (op0)));
1974}
1975
1976/* Return true if move from OP0 to OP1 is known to be executed in scalar
1977   unit.  Used in the machine description.  */
1978
1979bool
1980gcn_sgpr_move_p (rtx op0, rtx op1)
1981{
1982  if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1983    return true;
1984  if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1985    return true;
1986  if (!REG_P (op0) || REGNO (op0) >= FIRST_PSEUDO_REGISTER
1987      || VGPR_REGNO_P (REGNO (op0)))
1988    return false;
1989  if (REG_P (op1)
1990      && REGNO (op1) < FIRST_PSEUDO_REGISTER
1991      && !VGPR_REGNO_P (REGNO (op1)))
1992    return true;
1993  return immediate_operand (op1, VOIDmode) || memory_operand (op1, VOIDmode);
1994}
1995
1996/* Implement TARGET_SECONDARY_RELOAD.
1997
1998   The address space determines which registers can be used for loads and
1999   stores.  */
2000
2001static reg_class_t
2002gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
2003		      machine_mode reload_mode, secondary_reload_info *sri)
2004{
2005  reg_class_t result = NO_REGS;
2006  bool spilled_pseudo =
2007    (REG_P (x) || GET_CODE (x) == SUBREG) && true_regnum (x) == -1;
2008
2009  if (dump_file && (dump_flags & TDF_DETAILS))
2010    {
2011      fprintf (dump_file, "gcn_secondary_reload: ");
2012      dump_value_slim (dump_file, x, 1);
2013      fprintf (dump_file, " %s %s:%s", (in_p ? "->" : "<-"),
2014	       reg_class_names[rclass], GET_MODE_NAME (reload_mode));
2015      if (REG_P (x) || GET_CODE (x) == SUBREG)
2016	fprintf (dump_file, " (true regnum: %d \"%s\")", true_regnum (x),
2017		 (true_regnum (x) >= 0
2018		  && true_regnum (x) < FIRST_PSEUDO_REGISTER
2019		  ? reg_names[true_regnum (x)]
2020		  : (spilled_pseudo ? "stack spill" : "??")));
2021      fprintf (dump_file, "\n");
2022    }
2023
2024  /* Some callers don't use or initialize icode.  */
2025  sri->icode = CODE_FOR_nothing;
2026
2027  if (MEM_P (x) || spilled_pseudo)
2028    {
2029      addr_space_t as = DEFAULT_ADDR_SPACE;
2030
2031      /* If we have a spilled pseudo, we can't find the address space
2032	 directly, but we know it's in ADDR_SPACE_FLAT space for GCN3 or
2033	 ADDR_SPACE_GLOBAL for GCN5.  */
2034      if (MEM_P (x))
2035	as = MEM_ADDR_SPACE (x);
2036
2037      if (as == ADDR_SPACE_DEFAULT)
2038	as = DEFAULT_ADDR_SPACE;
2039
2040      switch (as)
2041	{
2042	case ADDR_SPACE_SCALAR_FLAT:
2043	  result =
2044	    ((!MEM_P (x) || rclass == SGPR_REGS) ? NO_REGS : SGPR_REGS);
2045	  break;
2046	case ADDR_SPACE_FLAT:
2047	case ADDR_SPACE_FLAT_SCRATCH:
2048	case ADDR_SPACE_GLOBAL:
2049	  if (GET_MODE_CLASS (reload_mode) == MODE_VECTOR_INT
2050	      || GET_MODE_CLASS (reload_mode) == MODE_VECTOR_FLOAT)
2051	    {
2052	      if (in_p)
2053		switch (reload_mode)
2054		  {
2055		  case E_V64SImode:
2056		    sri->icode = CODE_FOR_reload_inv64si;
2057		    break;
2058		  case E_V64SFmode:
2059		    sri->icode = CODE_FOR_reload_inv64sf;
2060		    break;
2061		  case E_V64HImode:
2062		    sri->icode = CODE_FOR_reload_inv64hi;
2063		    break;
2064		  case E_V64HFmode:
2065		    sri->icode = CODE_FOR_reload_inv64hf;
2066		    break;
2067		  case E_V64QImode:
2068		    sri->icode = CODE_FOR_reload_inv64qi;
2069		    break;
2070		  case E_V64DImode:
2071		    sri->icode = CODE_FOR_reload_inv64di;
2072		    break;
2073		  case E_V64DFmode:
2074		    sri->icode = CODE_FOR_reload_inv64df;
2075		    break;
2076		  default:
2077		    gcc_unreachable ();
2078		  }
2079	      else
2080		switch (reload_mode)
2081		  {
2082		  case E_V64SImode:
2083		    sri->icode = CODE_FOR_reload_outv64si;
2084		    break;
2085		  case E_V64SFmode:
2086		    sri->icode = CODE_FOR_reload_outv64sf;
2087		    break;
2088		  case E_V64HImode:
2089		    sri->icode = CODE_FOR_reload_outv64hi;
2090		    break;
2091		  case E_V64HFmode:
2092		    sri->icode = CODE_FOR_reload_outv64hf;
2093		    break;
2094		  case E_V64QImode:
2095		    sri->icode = CODE_FOR_reload_outv64qi;
2096		    break;
2097		  case E_V64DImode:
2098		    sri->icode = CODE_FOR_reload_outv64di;
2099		    break;
2100		  case E_V64DFmode:
2101		    sri->icode = CODE_FOR_reload_outv64df;
2102		    break;
2103		  default:
2104		    gcc_unreachable ();
2105		  }
2106	      break;
2107	    }
2108	  /* Fallthrough.  */
2109	case ADDR_SPACE_LDS:
2110	case ADDR_SPACE_GDS:
2111	case ADDR_SPACE_SCRATCH:
2112	  result = (rclass == VGPR_REGS ? NO_REGS : VGPR_REGS);
2113	  break;
2114	}
2115    }
2116
2117  if (dump_file && (dump_flags & TDF_DETAILS))
2118    fprintf (dump_file, "   <= %s (icode: %s)\n", reg_class_names[result],
2119	     get_insn_name (sri->icode));
2120
2121  return result;
2122}
2123
2124/* Update register usage after having seen the compiler flags and kernel
2125   attributes.  We typically want to fix registers that contain values
2126   set by the HSA runtime.  */
2127
2128static void
2129gcn_conditional_register_usage (void)
2130{
2131  if (!cfun || !cfun->machine)
2132    return;
2133
2134  if (cfun->machine->normal_function)
2135    {
2136      /* Restrict the set of SGPRs and VGPRs used by non-kernel functions.  */
2137      for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT);
2138	   i <= LAST_SGPR_REG; i++)
2139	fixed_regs[i] = 1, call_used_regs[i] = 1;
2140
2141      for (int i = VGPR_REGNO (MAX_NORMAL_VGPR_COUNT);
2142	   i <= LAST_VGPR_REG; i++)
2143	fixed_regs[i] = 1, call_used_regs[i] = 1;
2144
2145      return;
2146    }
2147
2148  /* If the set of requested args is the default set, nothing more needs to
2149     be done.  */
2150  if (cfun->machine->args.requested == default_requested_args)
2151    return;
2152
2153  /* Requesting a set of args different from the default violates the ABI.  */
2154  if (!leaf_function_p ())
2155    warning (0, "A non-default set of initial values has been requested, "
2156		"which violates the ABI");
2157
2158  for (int i = SGPR_REGNO (0); i < SGPR_REGNO (14); i++)
2159    fixed_regs[i] = 0;
2160
2161	  /* Fix the runtime argument registers containing values that may be
2162	     needed later.  FLAT_SCRATCH_* is not needed after the prologue,
2163	     so there's no need to fix it.  */
2164  if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0)
2165    fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1;
2166  if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
2167    {
2168      /* The upper 32-bits of the 64-bit descriptor are not used, so allow
2169	the containing registers to be used for other purposes.  */
2170      fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1;
2171      fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 1] = 1;
2172    }
2173  if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
2174    {
2175      fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG]] = 1;
2176      fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] + 1] = 1;
2177    }
2178  if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0)
2179    {
2180      fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1;
2181      fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1;
2182    }
2183  if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0)
2184    fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1;
2185  if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0)
2186    fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_X_ARG]] = 1;
2187  if (cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG] >= 0)
2188    fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG]] = 1;
2189  if (cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG] >= 0)
2190    fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG]] = 1;
2191}
2192
2193/* Determine if a load or store is valid, according to the register classes
2194   and address space.  Used primarily by the machine description to decide
2195   when to split a move into two steps.  */
2196
2197bool
2198gcn_valid_move_p (machine_mode mode, rtx dest, rtx src)
2199{
2200  if (!MEM_P (dest) && !MEM_P (src))
2201    return true;
2202
2203  if (MEM_P (dest)
2204      && AS_FLAT_P (MEM_ADDR_SPACE (dest))
2205      && (gcn_flat_address_p (XEXP (dest, 0), mode)
2206	  || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2207	  || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2208      && gcn_vgpr_register_operand (src, mode))
2209    return true;
2210  else if (MEM_P (src)
2211	   && AS_FLAT_P (MEM_ADDR_SPACE (src))
2212	   && (gcn_flat_address_p (XEXP (src, 0), mode)
2213	       || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2214	       || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2215	   && gcn_vgpr_register_operand (dest, mode))
2216    return true;
2217
2218  if (MEM_P (dest)
2219      && AS_GLOBAL_P (MEM_ADDR_SPACE (dest))
2220      && (gcn_global_address_p (XEXP (dest, 0))
2221	  || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2222	  || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2223      && gcn_vgpr_register_operand (src, mode))
2224    return true;
2225  else if (MEM_P (src)
2226	   && AS_GLOBAL_P (MEM_ADDR_SPACE (src))
2227	   && (gcn_global_address_p (XEXP (src, 0))
2228	       || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2229	       || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2230	   && gcn_vgpr_register_operand (dest, mode))
2231    return true;
2232
2233  if (MEM_P (dest)
2234      && MEM_ADDR_SPACE (dest) == ADDR_SPACE_SCALAR_FLAT
2235      && (gcn_scalar_flat_address_p (XEXP (dest, 0))
2236	  || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2237	  || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2238      && gcn_ssrc_register_operand (src, mode))
2239    return true;
2240  else if (MEM_P (src)
2241	   && MEM_ADDR_SPACE (src) == ADDR_SPACE_SCALAR_FLAT
2242	   && (gcn_scalar_flat_address_p (XEXP (src, 0))
2243	       || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2244	       || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2245	   && gcn_sdst_register_operand (dest, mode))
2246    return true;
2247
2248  if (MEM_P (dest)
2249      && AS_ANY_DS_P (MEM_ADDR_SPACE (dest))
2250      && gcn_ds_address_p (XEXP (dest, 0))
2251      && gcn_vgpr_register_operand (src, mode))
2252    return true;
2253  else if (MEM_P (src)
2254	   && AS_ANY_DS_P (MEM_ADDR_SPACE (src))
2255	   && gcn_ds_address_p (XEXP (src, 0))
2256	   && gcn_vgpr_register_operand (dest, mode))
2257    return true;
2258
2259  return false;
2260}
2261
2262/* }}}  */
2263/* {{{ Functions and ABI.  */
2264
2265/* Implement TARGET_FUNCTION_VALUE.
2266
2267   Define how to find the value returned by a function.
2268   The register location is always the same, but the mode depends on
2269   VALTYPE.  */
2270
2271static rtx
2272gcn_function_value (const_tree valtype, const_tree, bool)
2273{
2274  machine_mode mode = TYPE_MODE (valtype);
2275
2276  if (INTEGRAL_TYPE_P (valtype)
2277      && GET_MODE_CLASS (mode) == MODE_INT
2278      && GET_MODE_SIZE (mode) < 4)
2279    mode = SImode;
2280
2281  return gen_rtx_REG (mode, SGPR_REGNO (RETURN_VALUE_REG));
2282}
2283
2284/* Implement TARGET_FUNCTION_VALUE_REGNO_P.
2285
2286   Return true if N is a possible register number for the function return
2287   value.  */
2288
2289static bool
2290gcn_function_value_regno_p (const unsigned int n)
2291{
2292  return n == RETURN_VALUE_REG;
2293}
2294
2295/* Calculate the number of registers required to hold function argument
2296   ARG.  */
2297
2298static int
2299num_arg_regs (const function_arg_info &arg)
2300{
2301  if (targetm.calls.must_pass_in_stack (arg))
2302    return 0;
2303
2304  int size = arg.promoted_size_in_bytes ();
2305  return (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
2306}
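
/* For example, given 32-bit registers, a 12-byte argument passed by value
   needs CEIL (12, UNITS_PER_WORD) == 3 consecutive registers, while an
   argument that must be passed on the stack needs none.  */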
2307
2308/* Implement TARGET_STRICT_ARGUMENT_NAMING.
2309
2310   Return true if the location where a function argument is passed
2311	   depends on whether or not it is a named argument.
2312
2313   For gcn, we know how to handle functions declared as stdarg: by
2314   passing an extra pointer to the unnamed arguments.  However, the
2315   Fortran frontend can produce a different situation, where a
2316   function pointer is declared with no arguments, but the actual
2317   function and calls to it take more arguments.  In that case, we
2318   want to ensure the call matches the definition of the function.  */
2319
2320static bool
2321gcn_strict_argument_naming (cumulative_args_t cum_v)
2322{
2323  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2324
2325  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
2326}
2327
2328/* Implement TARGET_PRETEND_OUTGOING_VARARGS_NAMED.
2329
2330   See comment on gcn_strict_argument_naming.  */
2331
2332static bool
2333gcn_pretend_outgoing_varargs_named (cumulative_args_t cum_v)
2334{
2335  return !gcn_strict_argument_naming (cum_v);
2336}
2337
2338/* Implement TARGET_FUNCTION_ARG.
2339
2340   Return an RTX indicating whether a function argument is passed in a register
2341   and if so, which register.  */
2342
2343static rtx
2344gcn_function_arg (cumulative_args_t cum_v, const function_arg_info &arg)
2345{
2346  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2347  if (cum->normal_function)
2348    {
2349      if (!arg.named || arg.end_marker_p ())
2350	return 0;
2351
2352      if (targetm.calls.must_pass_in_stack (arg))
2353	return 0;
2354
2355      /* Vector parameters are not supported yet.  */
2356      if (VECTOR_MODE_P (arg.mode))
2357	return 0;
2358
2359      int reg_num = FIRST_PARM_REG + cum->num;
2360      int num_regs = num_arg_regs (arg);
2361      if (num_regs > 0)
2362	while (reg_num % num_regs != 0)
2363	  reg_num++;
2364      if (reg_num + num_regs <= FIRST_PARM_REG + NUM_PARM_REGS)
2365	return gen_rtx_REG (arg.mode, reg_num);
2366    }
2367  else
2368    {
2369      if (cum->num >= cum->args.nargs)
2370	{
2371	  cum->offset = (cum->offset + TYPE_ALIGN (arg.type) / 8 - 1)
2372	    & -(TYPE_ALIGN (arg.type) / 8);
2373	  cfun->machine->kernarg_segment_alignment
2374	    = MAX ((unsigned) cfun->machine->kernarg_segment_alignment,
2375		   TYPE_ALIGN (arg.type) / 8);
2376	  rtx addr = gen_rtx_REG (DImode,
2377				  cum->args.reg[KERNARG_SEGMENT_PTR_ARG]);
2378	  if (cum->offset)
2379	    addr = gen_rtx_PLUS (DImode, addr,
2380				 gen_int_mode (cum->offset, DImode));
2381	  rtx mem = gen_rtx_MEM (arg.mode, addr);
2382	  set_mem_attributes (mem, arg.type, 1);
2383	  set_mem_addr_space (mem, ADDR_SPACE_SCALAR_FLAT);
2384	  MEM_READONLY_P (mem) = 1;
2385	  return mem;
2386	}
2387
2388      int a = cum->args.order[cum->num];
2389      if (arg.mode != gcn_kernel_arg_types[a].mode)
2390	{
2391	  error ("wrong type of argument %s", gcn_kernel_arg_types[a].name);
2392	  return 0;
2393	}
2394      return gen_rtx_REG ((machine_mode) gcn_kernel_arg_types[a].mode,
2395			  cum->args.reg[a]);
2396    }
2397  return 0;
2398}
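
/* Note on the alignment loop above: a multi-register argument is placed so
   that its first hard register number is a multiple of the number of
   registers it occupies; e.g. (illustrative) a two-register argument never
   starts in an odd-numbered register, which may leave a one-register gap.  */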
2399
2400/* Implement TARGET_FUNCTION_ARG_ADVANCE.
2401
2402   Updates the summarizer variable pointed to by CUM_V to advance past an
2403   argument in the argument list.  */
2404
2405static void
2406gcn_function_arg_advance (cumulative_args_t cum_v,
2407			  const function_arg_info &arg)
2408{
2409  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2410
2411  if (cum->normal_function)
2412    {
2413      if (!arg.named)
2414	return;
2415
2416      int num_regs = num_arg_regs (arg);
2417      if (num_regs > 0)
2418	while ((FIRST_PARM_REG + cum->num) % num_regs != 0)
2419	  cum->num++;
2420      cum->num += num_regs;
2421    }
2422  else
2423    {
2424      if (cum->num < cum->args.nargs)
2425	cum->num++;
2426      else
2427	{
2428	  cum->offset += tree_to_uhwi (TYPE_SIZE_UNIT (arg.type));
2429	  cfun->machine->kernarg_segment_byte_size = cum->offset;
2430	}
2431    }
2432}
2433
2434/* Implement TARGET_ARG_PARTIAL_BYTES.
2435
2436   Returns the number of bytes at the beginning of an argument that must be put
2437   in registers.  The value must be zero for arguments that are passed entirely
2438   in registers or that are entirely pushed on the stack.  */
2439
2440static int
2441gcn_arg_partial_bytes (cumulative_args_t cum_v, const function_arg_info &arg)
2442{
2443  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2444
2445  if (!arg.named)
2446    return 0;
2447
2448  if (targetm.calls.must_pass_in_stack (arg))
2449    return 0;
2450
2451  if (cum->num >= NUM_PARM_REGS)
2452    return 0;
2453
2454  /* If the argument fits entirely in registers, return 0.  */
2455  if (cum->num + num_arg_regs (arg) <= NUM_PARM_REGS)
2456    return 0;
2457
2458  return (NUM_PARM_REGS - cum->num) * UNITS_PER_WORD;
2459}
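
/* For example, if only two parameter registers remain free and the argument
   needs four, the first 2 * UNITS_PER_WORD bytes are passed in registers and
   the remainder goes on the stack; an argument that fits entirely in either
   registers or the stack returns zero above.  */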
2460
2461/* A normal function which takes a pointer argument may be passed a pointer to
2462   LDS space (via a high-bits-set aperture), and that only works with FLAT
2463   addressing, not GLOBAL.  Force FLAT addressing if the function has an
2464   incoming pointer parameter.  NOTE: This is a heuristic that works in the
2465   offloading case, but in general, a function might read global pointer
2466   variables, etc. that may refer to LDS space or other special memory areas
2467   not supported by GLOBAL instructions, and then this argument check would not
2468   suffice.  */
2469
2470static void
2471gcn_detect_incoming_pointer_arg (tree fndecl)
2472{
2473  gcc_assert (cfun && cfun->machine);
2474
2475  for (tree arg = TYPE_ARG_TYPES (TREE_TYPE (fndecl));
2476       arg;
2477       arg = TREE_CHAIN (arg))
2478    if (POINTER_TYPE_P (TREE_VALUE (arg)))
2479      cfun->machine->use_flat_addressing = true;
2480}
2481
2482/* Implement INIT_CUMULATIVE_ARGS, via gcn.h.
2483
2484   Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function
2485   whose data type is FNTYPE.  For a library call, FNTYPE is 0.  */
2486
2487void
2488gcn_init_cumulative_args (CUMULATIVE_ARGS *cum /* Argument info to init */ ,
2489			  tree fntype /* tree ptr for function decl */ ,
2490			  rtx libname /* SYMBOL_REF of library name or 0 */ ,
2491			  tree fndecl, int caller)
2492{
2493  memset (cum, 0, sizeof (*cum));
2494  cum->fntype = fntype;
2495  if (libname)
2496    {
2497      gcc_assert (cfun && cfun->machine);
2498      cum->normal_function = true;
2499      if (!caller)
2500	{
2501	  cfun->machine->normal_function = true;
2502	  gcn_detect_incoming_pointer_arg (fndecl);
2503	}
2504      return;
2505    }
2506  tree attr = NULL;
2507  if (fndecl)
2508    attr = lookup_attribute ("amdgpu_hsa_kernel", DECL_ATTRIBUTES (fndecl));
2509  if (fndecl && !attr)
2510    attr = lookup_attribute ("amdgpu_hsa_kernel",
2511			     TYPE_ATTRIBUTES (TREE_TYPE (fndecl)));
2512  if (!attr && fntype)
2513    attr = lookup_attribute ("amdgpu_hsa_kernel", TYPE_ATTRIBUTES (fntype));
2514	  /* Handle main () as a kernel, so we can run the testsuite.
2515	     Handle OpenACC kernels similarly to main.  */
2516  if (!attr && !caller && fndecl
2517      && (MAIN_NAME_P (DECL_NAME (fndecl))
2518	  || lookup_attribute ("omp target entrypoint",
2519			       DECL_ATTRIBUTES (fndecl)) != NULL_TREE))
2520    gcn_parse_amdgpu_hsa_kernel_attribute (&cum->args, NULL_TREE);
2521  else
2522    {
2523      if (!attr || caller)
2524	{
2525	  gcc_assert (cfun && cfun->machine);
2526	  cum->normal_function = true;
2527	  if (!caller)
2528	    cfun->machine->normal_function = true;
2529	}
2530      gcn_parse_amdgpu_hsa_kernel_attribute
2531	(&cum->args, attr ? TREE_VALUE (attr) : NULL_TREE);
2532    }
2533  cfun->machine->args = cum->args;
2534  if (!caller && cfun->machine->normal_function)
2535    gcn_detect_incoming_pointer_arg (fndecl);
2536
2537  reinit_regs ();
2538}
2539
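/* Implement TARGET_RETURN_IN_MEMORY.

   Return true if TYPE should be returned in memory rather than in the
   return value register: aggregates, vectors, BLKmode values, and anything
   wider than two words.  */
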
2540static bool
2541gcn_return_in_memory (const_tree type, const_tree ARG_UNUSED (fntype))
2542{
2543  machine_mode mode = TYPE_MODE (type);
2544  HOST_WIDE_INT size = int_size_in_bytes (type);
2545
2546  if (AGGREGATE_TYPE_P (type))
2547    return true;
2548
2549  /* Vector return values are not supported yet.  */
2550  if (VECTOR_TYPE_P (type))
2551    return true;
2552
2553  if (mode == BLKmode)
2554    return true;
2555
2556  if (size > 2 * UNITS_PER_WORD)
2557    return true;
2558
2559  return false;
2560}
2561
2562/* Implement TARGET_PROMOTE_FUNCTION_MODE.
2563
2564   Return the mode to use for outgoing function arguments.  */
2565
2566machine_mode
2567gcn_promote_function_mode (const_tree ARG_UNUSED (type), machine_mode mode,
2568			   int *ARG_UNUSED (punsignedp),
2569			   const_tree ARG_UNUSED (funtype),
2570			   int ARG_UNUSED (for_return))
2571{
2572  if (GET_MODE_CLASS (mode) == MODE_INT && GET_MODE_SIZE (mode) < 4)
2573    return SImode;
2574
2575  return mode;
2576}
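
/* For example, QImode and HImode scalar arguments and return values are
   widened to SImode here; 32-bit and wider modes are left unchanged.  */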
2577
2578/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.
2579
2580   Derived from hppa_gimplify_va_arg_expr.  The generic routine doesn't handle
2581   ARGS_GROW_DOWNWARDS.  */
2582
2583static tree
2584gcn_gimplify_va_arg_expr (tree valist, tree type,
2585			  gimple_seq *ARG_UNUSED (pre_p),
2586			  gimple_seq *ARG_UNUSED (post_p))
2587{
2588  tree ptr = build_pointer_type (type);
2589  tree valist_type;
2590  tree t, u;
2591  bool indirect;
2592
2593  indirect = pass_va_arg_by_reference (type);
2594  if (indirect)
2595    {
2596      type = ptr;
2597      ptr = build_pointer_type (type);
2598    }
2599  valist_type = TREE_TYPE (valist);
2600
2601  /* Args grow down.  Not handled by generic routines.  */
2602
2603  u = fold_convert (sizetype, size_in_bytes (type));
2604  u = fold_build1 (NEGATE_EXPR, sizetype, u);
2605  t = fold_build_pointer_plus (valist, u);
2606
2607  /* Align to 8 byte boundary.  */
2608
2609  u = build_int_cst (TREE_TYPE (t), -8);
2610  t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, u);
2611  t = fold_convert (valist_type, t);
2612
2613  t = build2 (MODIFY_EXPR, valist_type, valist, t);
2614
2615  t = fold_convert (ptr, t);
2616  t = build_va_arg_indirect_ref (t);
2617
2618  if (indirect)
2619    t = build_va_arg_indirect_ref (t);
2620
2621  return t;
2622}
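
/* In outline (illustrative): the va_list pointer is first decremented by the
   size of TYPE, then rounded down to an 8-byte boundary, and the argument is
   read from the resulting address -- indirectly when the argument is passed
   by reference.  */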
2623
2624/* Return 1 if TRAIT NAME is present in the OpenMP context's
2625   device trait set, return 0 if not present in any OpenMP context in the
2626   whole translation unit, or -1 if not present in the current OpenMP context
2627   but might be present in another OpenMP context in the same TU.  */
2628
2629int
2630gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
2631			      const char *name)
2632{
2633  switch (trait)
2634    {
2635    case omp_device_kind:
2636      return strcmp (name, "gpu") == 0;
2637    case omp_device_arch:
2638      return strcmp (name, "gcn") == 0;
2639    case omp_device_isa:
2640      if (strcmp (name, "fiji") == 0)
2641	return gcn_arch == PROCESSOR_FIJI;
2642      if (strcmp (name, "gfx900") == 0)
2643	return gcn_arch == PROCESSOR_VEGA10;
2644      if (strcmp (name, "gfx906") == 0)
2645	return gcn_arch == PROCESSOR_VEGA20;
2646      if (strcmp (name, "gfx908") == 0)
2647	return gcn_arch == PROCESSOR_GFX908;
2648      return 0;
2649    default:
2650      gcc_unreachable ();
2651    }
2652}
2653
2654/* Calculate stack offsets needed to create prologues and epilogues.  */
2655
2656static struct machine_function *
2657gcn_compute_frame_offsets (void)
2658{
2659  machine_function *offsets = cfun->machine;
2660
2661  if (reload_completed)
2662    return offsets;
2663
2664  offsets->need_frame_pointer = frame_pointer_needed;
2665
2666  offsets->outgoing_args_size = crtl->outgoing_args_size;
2667  offsets->pretend_size = crtl->args.pretend_args_size;
2668
2669  offsets->local_vars = get_frame_size ();
2670
2671  offsets->lr_needs_saving = (!leaf_function_p ()
2672			      || df_regs_ever_live_p (LR_REGNUM)
2673			      || df_regs_ever_live_p (LR_REGNUM + 1));
2674
2675  offsets->callee_saves = offsets->lr_needs_saving ? 8 : 0;
2676
2677  for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
2678    if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2679	|| ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2680	    && frame_pointer_needed))
2681      offsets->callee_saves += (VGPR_REGNO_P (regno) ? 256 : 4);
2682
2683  /* Round up to 64-bit boundary to maintain stack alignment.  */
2684  offsets->callee_saves = (offsets->callee_saves + 7) & ~7;
2685
2686  return offsets;
2687}
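
/* Note: callee_saves above accounts 4 bytes per saved scalar register and
   256 bytes (64 lanes x 4 bytes) per saved vector register, then rounds the
   total up to an 8-byte boundary to keep the stack aligned.  */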
2688
2689/* Insert code into the prologue or epilogue to store or load any
2690   callee-save register to/from the stack.
2691
2692   Helper function for gcn_expand_prologue and gcn_expand_epilogue.  */
2693
2694static void
2695move_callee_saved_registers (rtx sp, machine_function *offsets,
2696			     bool prologue)
2697{
2698  int regno, offset, saved_scalars;
2699  rtx exec = gen_rtx_REG (DImode, EXEC_REG);
2700  rtx vcc = gen_rtx_REG (DImode, VCC_LO_REG);
2701  rtx offreg = gen_rtx_REG (SImode, SGPR_REGNO (22));
2702  rtx as = gen_rtx_CONST_INT (VOIDmode, STACK_ADDR_SPACE);
2703  HOST_WIDE_INT exec_set = 0;
2704  int offreg_set = 0;
2705  auto_vec<int> saved_sgprs;
2706
2707  start_sequence ();
2708
2709  /* Move scalars into two vector registers.  */
2710  for (regno = 0, saved_scalars = 0; regno < FIRST_VGPR_REG; regno++)
2711    if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2712	|| ((regno & ~1) == LINK_REGNUM && offsets->lr_needs_saving)
2713	|| ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2714	    && offsets->need_frame_pointer))
2715      {
2716	rtx reg = gen_rtx_REG (SImode, regno);
2717	rtx vreg = gen_rtx_REG (V64SImode,
2718				VGPR_REGNO (6 + (saved_scalars / 64)));
2719	int lane = saved_scalars % 64;
2720
2721	if (prologue)
2722	  {
2723	    emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane)));
2724	    saved_sgprs.safe_push (regno);
2725	  }
2726	else
2727	  emit_insn (gen_vec_extractv64sisi (reg, vreg, GEN_INT (lane)));
2728
2729	saved_scalars++;
2730      }
2731
2732  rtx move_scalars = get_insns ();
2733  end_sequence ();
2734  start_sequence ();
2735
2736  /* Ensure that all vector lanes are moved.  */
2737  exec_set = -1;
2738  emit_move_insn (exec, GEN_INT (exec_set));
2739
2740  /* Set up a vector stack pointer.  */
2741  rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
2742  rtx _0_4_8_12 = gen_rtx_REG (V64SImode, VGPR_REGNO (3));
2743  emit_insn (gen_ashlv64si3_exec (_0_4_8_12, _0_1_2_3, GEN_INT (2),
2744				  gcn_gen_undef (V64SImode), exec));
2745  rtx vsp = gen_rtx_REG (V64DImode, VGPR_REGNO (4));
2746  emit_insn (gen_vec_duplicatev64di_exec (vsp, sp, gcn_gen_undef (V64DImode),
2747					  exec));
2748  emit_insn (gen_addv64si3_vcc_exec (gcn_operand_part (V64SImode, vsp, 0),
2749				     gcn_operand_part (V64SImode, vsp, 0),
2750				     _0_4_8_12, vcc, gcn_gen_undef (V64SImode),
2751				     exec));
2752  emit_insn (gen_addcv64si3_exec (gcn_operand_part (V64SImode, vsp, 1),
2753				  gcn_operand_part (V64SImode, vsp, 1),
2754				  const0_rtx, vcc, vcc,
2755				  gcn_gen_undef (V64SImode), exec));
2756
2757  /* Move vectors.  */
2758  for (regno = FIRST_VGPR_REG, offset = 0;
2759       regno < FIRST_PSEUDO_REGISTER; regno++)
2760    if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2761	|| (regno == VGPR_REGNO (6) && saved_scalars > 0)
2762	|| (regno == VGPR_REGNO (7) && saved_scalars > 63))
2763      {
2764	rtx reg = gen_rtx_REG (V64SImode, regno);
2765	int size = 256;
2766
2767	if (regno == VGPR_REGNO (6) && saved_scalars < 64)
2768	  size = saved_scalars * 4;
2769	else if (regno == VGPR_REGNO (7) && saved_scalars < 128)
2770	  size = (saved_scalars - 64) * 4;
2771
2772	if (size != 256 || exec_set != -1)
2773	  {
2774	    exec_set = ((unsigned HOST_WIDE_INT) 1 << (size / 4)) - 1;
2775	    emit_move_insn (exec, gen_int_mode (exec_set, DImode));
2776	  }
2777
2778	if (prologue)
2779	  {
2780	    rtx insn = emit_insn (gen_scatterv64si_insn_1offset_exec
2781				  (vsp, const0_rtx, reg, as, const0_rtx,
2782				   exec));
2783
2784	    /* Add CFI metadata.  */
2785	    rtx note;
2786	    if (regno == VGPR_REGNO (6) || regno == VGPR_REGNO (7))
2787	      {
2788		int start = (regno == VGPR_REGNO (7) ? 64 : 0);
2789		int count = MIN (saved_scalars - start, 64);
2790		int add_lr = (regno == VGPR_REGNO (6)
2791			      && offsets->lr_needs_saving);
2792		int lrdest = -1;
2793		rtvec seq = rtvec_alloc (count + add_lr);
2794
2795		/* Add a REG_FRAME_RELATED_EXPR entry for each scalar
2796		   register that was saved in this batch.  */
2797		for (int idx = 0; idx < count; idx++)
2798		  {
2799		    int stackaddr = offset + idx * 4;
2800		    rtx dest = gen_rtx_MEM (SImode,
2801					    gen_rtx_PLUS
2802					    (DImode, sp,
2803					     GEN_INT (stackaddr)));
2804		    rtx src = gen_rtx_REG (SImode, saved_sgprs[start + idx]);
2805		    rtx set = gen_rtx_SET (dest, src);
2806		    RTX_FRAME_RELATED_P (set) = 1;
2807		    RTVEC_ELT (seq, idx) = set;
2808
2809		    if (saved_sgprs[start + idx] == LINK_REGNUM)
2810		      lrdest = stackaddr;
2811		  }
2812
2813		/* Add an additional expression for DWARF_LINK_REGISTER if
2814		   LINK_REGNUM was saved.  */
2815		if (lrdest != -1)
2816		  {
2817		    rtx dest = gen_rtx_MEM (DImode,
2818					    gen_rtx_PLUS
2819					    (DImode, sp,
2820					     GEN_INT (lrdest)));
2821		    rtx src = gen_rtx_REG (DImode, DWARF_LINK_REGISTER);
2822		    rtx set = gen_rtx_SET (dest, src);
2823		    RTX_FRAME_RELATED_P (set) = 1;
2824		    RTVEC_ELT (seq, count) = set;
2825		  }
2826
2827		note = gen_rtx_SEQUENCE (VOIDmode, seq);
2828	      }
2829	    else
2830	      {
2831		rtx dest = gen_rtx_MEM (V64SImode,
2832					gen_rtx_PLUS (DImode, sp,
2833						      GEN_INT (offset)));
2834		rtx src = gen_rtx_REG (V64SImode, regno);
2835		note = gen_rtx_SET (dest, src);
2836	      }
2837	    RTX_FRAME_RELATED_P (insn) = 1;
2838	    add_reg_note (insn, REG_FRAME_RELATED_EXPR, note);
2839	  }
2840	else
2841	  emit_insn (gen_gatherv64si_insn_1offset_exec
2842		     (reg, vsp, const0_rtx, as, const0_rtx,
2843		      gcn_gen_undef (V64SImode), exec));
2844
2845	/* Move our VSP to the next stack entry.  */
2846	if (offreg_set != size)
2847	  {
2848	    offreg_set = size;
2849	    emit_move_insn (offreg, GEN_INT (size));
2850	  }
2851	if (exec_set != -1)
2852	  {
2853	    exec_set = -1;
2854	    emit_move_insn (exec, GEN_INT (exec_set));
2855	  }
2856	emit_insn (gen_addv64si3_vcc_dup_exec
2857		   (gcn_operand_part (V64SImode, vsp, 0),
2858		    offreg, gcn_operand_part (V64SImode, vsp, 0),
2859		    vcc, gcn_gen_undef (V64SImode), exec));
2860	emit_insn (gen_addcv64si3_exec
2861		   (gcn_operand_part (V64SImode, vsp, 1),
2862		    gcn_operand_part (V64SImode, vsp, 1),
2863		    const0_rtx, vcc, vcc, gcn_gen_undef (V64SImode), exec));
2864
2865	offset += size;
2866      }
2867
2868  rtx move_vectors = get_insns ();
2869  end_sequence ();
2870
2871  if (prologue)
2872    {
2873      emit_insn (move_scalars);
2874      emit_insn (move_vectors);
2875    }
2876  else
2877    {
2878      emit_insn (move_vectors);
2879      emit_insn (move_scalars);
2880    }
2881}
2882
2883/* Generate prologue.  Called from gen_prologue during pro_and_epilogue pass.
2884
2885   For a non-kernel function, the stack layout looks like this (interim),
2886   growing *upwards*:
2887
2888 hi | + ...
2889    |__________________| <-- current SP
2890    | outgoing args    |
2891    |__________________|
2892    | (alloca space)   |
2893    |__________________|
2894    | local vars       |
2895    |__________________| <-- FP/hard FP
2896    | callee-save regs |
2897    |__________________| <-- soft arg pointer
2898    | pretend args     |
2899    |__________________| <-- incoming SP
2900    | incoming args    |
2901 lo |..................|
2902
2903   This implies arguments (beyond the first N in registers) must grow
2904   downwards (as, apparently, PA has them do).
2905
2906   For a kernel function we have the simpler:
2907
2908 hi | + ...
2909    |__________________| <-- current SP
2910    | outgoing args    |
2911    |__________________|
2912    | (alloca space)   |
2913    |__________________|
2914    | local vars       |
2915 lo |__________________| <-- FP/hard FP
2916
2917*/
2918
2919void
2920gcn_expand_prologue ()
2921{
2922  machine_function *offsets = gcn_compute_frame_offsets ();
2923
2924  if (!cfun || !cfun->machine || cfun->machine->normal_function)
2925    {
2926      rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
2927      rtx sp_hi = gcn_operand_part (Pmode, sp, 1);
2928      rtx sp_lo = gcn_operand_part (Pmode, sp, 0);
2929      rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
2930      rtx fp_hi = gcn_operand_part (Pmode, fp, 1);
2931      rtx fp_lo = gcn_operand_part (Pmode, fp, 0);
2932
2933      start_sequence ();
2934
2935      if (offsets->pretend_size > 0)
2936	{
2937	  /* FIXME: Do the actual saving of register pretend args to the stack.
2938	     Register order needs consideration.  */
2939	}
2940
2941      /* Save callee-save regs.  */
2942      move_callee_saved_registers (sp, offsets, true);
2943
2944      HOST_WIDE_INT sp_adjust = offsets->pretend_size
2945	+ offsets->callee_saves
2946	+ offsets->local_vars + offsets->outgoing_args_size;
2947      if (sp_adjust > 0)
2948	{
2949	  /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so
2950	     we use a split add explicitly, and specify the DImode add in
2951	     the note.  */
2952	  rtx scc = gen_rtx_REG (BImode, SCC_REG);
2953	  rtx adjustment = gen_int_mode (sp_adjust, SImode);
2954	  rtx insn = emit_insn (gen_addsi3_scalar_carry (sp_lo, sp_lo,
2955							 adjustment, scc));
2956	  if (!offsets->need_frame_pointer)
2957	    {
2958	      RTX_FRAME_RELATED_P (insn) = 1;
2959	      add_reg_note (insn, REG_FRAME_RELATED_EXPR,
2960			    gen_rtx_SET (sp,
2961					 gen_rtx_PLUS (DImode, sp,
2962						       adjustment)));
2963	    }
2964	  emit_insn (gen_addcsi3_scalar_zero (sp_hi, sp_hi, scc));
2965	}
2966
2967      if (offsets->need_frame_pointer)
2968	{
2969	  /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so
2970	     we use a split add explicitly, and specify the DImode add in
2971	     the note.  */
2972	  rtx scc = gen_rtx_REG (BImode, SCC_REG);
2973	  int fp_adjust = -(offsets->local_vars + offsets->outgoing_args_size);
2974	  rtx adjustment = gen_int_mode (fp_adjust, SImode);
2975	  rtx insn = emit_insn (gen_addsi3_scalar_carry (fp_lo, sp_lo,
2976							 adjustment, scc));
2977	  emit_insn (gen_addcsi3_scalar (fp_hi, sp_hi,
2978					 (fp_adjust < 0 ? GEN_INT (-1)
2979					  : const0_rtx),
2980					 scc, scc));
2981
2982	  /* Set the CFA to the entry stack address, as an offset from the
2983	     frame pointer.  This is preferred because the frame pointer is
2984	     saved in each frame, whereas the stack pointer is not.  */
2985	  RTX_FRAME_RELATED_P (insn) = 1;
2986	  add_reg_note (insn, REG_CFA_DEF_CFA,
2987			gen_rtx_PLUS (DImode, fp,
2988				      GEN_INT (-(offsets->pretend_size
2989						 + offsets->callee_saves))));
2990	}
2991
2992      rtx_insn *seq = get_insns ();
2993      end_sequence ();
2994
2995      emit_insn (seq);
2996    }
2997  else
2998    {
2999      rtx wave_offset = gen_rtx_REG (SImode,
3000				     cfun->machine->args.
3001				     reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
3002
3003      if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG))
3004	{
3005	  rtx fs_init_lo =
3006	    gen_rtx_REG (SImode,
3007			 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG]);
3008	  rtx fs_init_hi =
3009	    gen_rtx_REG (SImode,
3010			 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG] + 1);
3011	  rtx fs_reg_lo = gen_rtx_REG (SImode, FLAT_SCRATCH_REG);
3012	  rtx fs_reg_hi = gen_rtx_REG (SImode, FLAT_SCRATCH_REG + 1);
3013
3014	  /*rtx queue = gen_rtx_REG(DImode,
3015				  cfun->machine->args.reg[QUEUE_PTR_ARG]);
3016	  rtx aperture = gen_rtx_MEM (SImode,
3017				      gen_rtx_PLUS (DImode, queue,
3018						    gen_int_mode (68, SImode)));
3019	  set_mem_addr_space (aperture, ADDR_SPACE_SCALAR_FLAT);*/
3020
3021	  /* Set up flat_scratch.  */
3022	  emit_insn (gen_addsi3_scc (fs_reg_hi, fs_init_lo, wave_offset));
3023	  emit_insn (gen_lshrsi3_scc (fs_reg_hi, fs_reg_hi,
3024				      gen_int_mode (8, SImode)));
3025	  emit_move_insn (fs_reg_lo, fs_init_hi);
3026	}
3027
3028      /* Set up frame pointer and stack pointer.  */
3029      rtx sp = gen_rtx_REG (DImode, STACK_POINTER_REGNUM);
3030      rtx sp_hi = simplify_gen_subreg (SImode, sp, DImode, 4);
3031      rtx sp_lo = simplify_gen_subreg (SImode, sp, DImode, 0);
3032      rtx fp = gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM);
3033      rtx fp_hi = simplify_gen_subreg (SImode, fp, DImode, 4);
3034      rtx fp_lo = simplify_gen_subreg (SImode, fp, DImode, 0);
3035
3036      HOST_WIDE_INT sp_adjust = (offsets->local_vars
3037				 + offsets->outgoing_args_size);
3038
3039      /* Initialise FP and SP from the buffer descriptor in s[0:3].  */
3040      emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0));
3041      emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1),
3042				 gen_int_mode (0xffff, SImode)));
3043      rtx scc = gen_rtx_REG (BImode, SCC_REG);
3044      emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc));
3045      emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc));
3046
3047      /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so we use
3048	 a split add explicitly, and specify the DImode add in the note.
3049         The DWARF info expects that the callee-save data is in the frame,
3050         even though it isn't (because this is the entry point), so we
3051         make a notional adjustment to the DWARF frame offset here.  */
3052      rtx dbg_adjustment = gen_int_mode (sp_adjust + offsets->callee_saves,
3053					 DImode);
3054      rtx insn;
3055      if (sp_adjust > 0)
3056	{
3057	  rtx scc = gen_rtx_REG (BImode, SCC_REG);
3058	  rtx adjustment = gen_int_mode (sp_adjust, DImode);
3059	  insn = emit_insn (gen_addsi3_scalar_carry (sp_lo, fp_lo, adjustment,
3060						     scc));
3061	  emit_insn (gen_addcsi3_scalar_zero (sp_hi, fp_hi, scc));
3062	}
3063      else
3064	insn = emit_move_insn (sp, fp);
3065      RTX_FRAME_RELATED_P (insn) = 1;
3066      add_reg_note (insn, REG_FRAME_RELATED_EXPR,
3067		    gen_rtx_SET (sp, gen_rtx_PLUS (DImode, sp,
3068						   dbg_adjustment)));
3069
3070      if (offsets->need_frame_pointer)
3071	{
3072	  /* Set the CFA to the entry stack address, as an offset from the
3073	     frame pointer.  This is necessary when alloca is used, and
3074	     harmless otherwise.  */
3075	  rtx neg_adjust = gen_int_mode (-offsets->callee_saves, DImode);
3076	  add_reg_note (insn, REG_CFA_DEF_CFA,
3077			gen_rtx_PLUS (DImode, fp, neg_adjust));
3078	}
3079
3080      /* Make sure the flat scratch reg doesn't get optimised away.  */
3081      emit_insn (gen_prologue_use (gen_rtx_REG (DImode, FLAT_SCRATCH_REG)));
3082    }
3083
3084  /* Ensure that the scheduler doesn't do anything unexpected.  */
3085  emit_insn (gen_blockage ());
3086
3087  /* m0 is initialized for the usual LDS DS and FLAT memory case.
3088     The low-part is the address of the topmost addressable byte, which is
3089     size-1.  The high-part is an offset and should be zero.  */
3090  emit_move_insn (gen_rtx_REG (SImode, M0_REG),
3091		  gen_int_mode (LDS_SIZE, SImode));
3092
3093  emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
3094
3095  if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
3096    {
3097      /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel.  */
3098      rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
3099      emit_move_insn (fn_reg, gen_rtx_SYMBOL_REF (Pmode,
3100						  "gomp_gcn_enter_kernel"));
3101      emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
3102    }
3103}
3104
3105/* Generate epilogue.  Called from gen_epilogue during pro_and_epilogue pass.
3106
3107   See gcn_expand_prologue for stack details.  */
3108
3109void
3110gcn_expand_epilogue (void)
3111{
3112  /* Ensure that the scheduler doesn't do anything unexpected.  */
3113  emit_insn (gen_blockage ());
3114
3115  if (!cfun || !cfun->machine || cfun->machine->normal_function)
3116    {
3117      machine_function *offsets = gcn_compute_frame_offsets ();
3118      rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
3119      rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
3120
3121      HOST_WIDE_INT sp_adjust = offsets->callee_saves + offsets->pretend_size;
3122
3123      if (offsets->need_frame_pointer)
3124	{
3125	  /* Restore old SP from the frame pointer.  */
3126	  if (sp_adjust > 0)
3127	    emit_insn (gen_subdi3 (sp, fp, gen_int_mode (sp_adjust, DImode)));
3128	  else
3129	    emit_move_insn (sp, fp);
3130	}
3131      else
3132	{
3133	  /* Restore old SP from current SP.  */
3134	  sp_adjust += offsets->outgoing_args_size + offsets->local_vars;
3135
3136	  if (sp_adjust > 0)
3137	    emit_insn (gen_subdi3 (sp, sp, gen_int_mode (sp_adjust, DImode)));
3138	}
3139
3140      move_callee_saved_registers (sp, offsets, false);
3141
3142      /* There's no explicit use of the link register on the return insn.  Emit
3143         one here instead.  */
3144      if (offsets->lr_needs_saving)
3145	emit_use (gen_rtx_REG (DImode, LINK_REGNUM));
3146
3147      /* Similar for frame pointer.  */
3148      if (offsets->need_frame_pointer)
3149	emit_use (gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM));
3150    }
3151  else if (flag_openmp)
3152    {
3153      /* OpenMP kernels have an implicit call to gomp_gcn_exit_kernel.  */
3154      rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
3155      emit_move_insn (fn_reg,
3156		      gen_rtx_SYMBOL_REF (Pmode, "gomp_gcn_exit_kernel"));
3157      emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
3158    }
3159  else if (TREE_CODE (TREE_TYPE (DECL_RESULT (cfun->decl))) != VOID_TYPE)
3160    {
3161      /* Assume that an exit value compatible with gcn-run is expected.
3162         That is, the third input parameter is an int*.
3163
3164         We can't allocate any new registers, but the kernarg_reg is
3165         dead after this, so we'll use that.  */
3166      rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
3167				     [KERNARG_SEGMENT_PTR_ARG]);
3168      rtx retptr_mem = gen_rtx_MEM (DImode,
3169				    gen_rtx_PLUS (DImode, kernarg_reg,
3170						  GEN_INT (16)));
3171      set_mem_addr_space (retptr_mem, ADDR_SPACE_SCALAR_FLAT);
3172      emit_move_insn (kernarg_reg, retptr_mem);
3173
3174      rtx retval_mem = gen_rtx_MEM (SImode, kernarg_reg);
3175      set_mem_addr_space (retval_mem, ADDR_SPACE_SCALAR_FLAT);
3176      emit_move_insn (retval_mem,
3177		      gen_rtx_REG (SImode, SGPR_REGNO (RETURN_VALUE_REG)));
3178    }
3179
3180  emit_jump_insn (gen_gcn_return ());
3181}
3182
3183/* Implement TARGET_FRAME_POINTER_REQUIRED.
3184
3185   Return true if the frame pointer should not be eliminated.  */
3186
3187bool
3188gcn_frame_pointer_rqd (void)
3189{
3190  /* GDB needs the frame pointer in order to unwind properly,
3191     but that's not important for the entry point, unless alloca is used.
3192	     It's not important for code execution, so we should respect the
3193     -fomit-frame-pointer flag.  */
3194  return (!flag_omit_frame_pointer
3195	  && cfun
3196	  && (cfun->calls_alloca
3197	      || (cfun->machine && cfun->machine->normal_function)));
3198}
3199
3200/* Implement TARGET_CAN_ELIMINATE.
3201
3202   Return true if the compiler is allowed to try to replace register number
3203   FROM_REG with register number TO_REG.
3204
3205   FIXME: is the default "true" not enough? Should this be a negative set?  */
3206
3207bool
3208gcn_can_eliminate_p (int /*from_reg */ , int to_reg)
3209{
3210  return (to_reg == HARD_FRAME_POINTER_REGNUM
3211	  || to_reg == STACK_POINTER_REGNUM);
3212}
3213
3214/* Implement INITIAL_ELIMINATION_OFFSET.
3215
3216   Returns the initial difference between the specified pair of registers, in
3217   terms of stack position.  */
3218
3219HOST_WIDE_INT
3220gcn_initial_elimination_offset (int from, int to)
3221{
3222  machine_function *offsets = gcn_compute_frame_offsets ();
3223
3224  switch (from)
3225    {
3226    case ARG_POINTER_REGNUM:
3227      if (to == STACK_POINTER_REGNUM)
3228	return -(offsets->callee_saves + offsets->local_vars
3229		 + offsets->outgoing_args_size);
3230      else if (to == FRAME_POINTER_REGNUM || to == HARD_FRAME_POINTER_REGNUM)
3231	return -offsets->callee_saves;
3232      else
3233	gcc_unreachable ();
3234      break;
3235
3236    case FRAME_POINTER_REGNUM:
3237      if (to == STACK_POINTER_REGNUM)
3238	return -(offsets->local_vars + offsets->outgoing_args_size);
3239      else if (to == HARD_FRAME_POINTER_REGNUM)
3240	return 0;
3241      else
3242	gcc_unreachable ();
3243      break;
3244
3245    default:
3246      gcc_unreachable ();
3247    }
3248}
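
/* Worked example (illustrative): with 16 bytes of callee saves, 32 bytes of
   local variables and 8 bytes of outgoing arguments, eliminating the
   argument pointer to the stack pointer yields -(16 + 32 + 8) == -56,
   consistent with the upward-growing stack layout pictured before
   gcn_expand_prologue.  */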
3249
3250/* Implement HARD_REGNO_RENAME_OK.
3251
3252   Return true if it is permissible to rename a hard register from
3253   FROM_REG to TO_REG.  */
3254
3255bool
3256gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg)
3257{
3258  if (from_reg == SCC_REG
3259      || from_reg == VCC_LO_REG || from_reg == VCC_HI_REG
3260      || from_reg == EXEC_LO_REG || from_reg == EXEC_HI_REG
3261      || to_reg == SCC_REG
3262      || to_reg == VCC_LO_REG || to_reg == VCC_HI_REG
3263      || to_reg == EXEC_LO_REG || to_reg == EXEC_HI_REG)
3264    return false;
3265
3266  /* Allow the link register to be used if it was saved.  */
3267  if ((to_reg & ~1) == LINK_REGNUM)
3268    return !cfun || cfun->machine->lr_needs_saving;
3269
3270  /* Allow the registers used for the static chain to be used if the chain is
3271     not in active use.  */
3272  if ((to_reg & ~1) == STATIC_CHAIN_REGNUM)
3273    return !cfun
3274	|| !(cfun->static_chain_decl
3275	     && df_regs_ever_live_p (STATIC_CHAIN_REGNUM)
3276	     && df_regs_ever_live_p (STATIC_CHAIN_REGNUM + 1));
3277
3278  return true;
3279}
3280
3281/* Implement HARD_REGNO_CALLER_SAVE_MODE.
3282
3283   Which mode is required for saving NREGS of a pseudo-register in
3284   call-clobbered hard register REGNO.  */
3285
3286machine_mode
3287gcn_hard_regno_caller_save_mode (unsigned int regno, unsigned int nregs,
3288				 machine_mode regmode)
3289{
3290  machine_mode result = choose_hard_reg_mode (regno, nregs, NULL);
3291
3292  if (VECTOR_MODE_P (result) && !VECTOR_MODE_P (regmode))
3293    result = (nregs == 1 ? SImode : DImode);
3294
3295  return result;
3296}
3297
3298/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.
3299
3300   Output assembler code for a block containing the constant parts
3301   of a trampoline, leaving space for the variable parts.  */
3302
3303static void
3304gcn_asm_trampoline_template (FILE *f)
3305{
3306  /* The source operand of the move instructions must be a 32-bit
3307     constant following the opcode.  */
3308  asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM);
3309  asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM + 1);
3310  asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG);
3311  asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG + 1);
3312  asm_fprintf (f, "\ts_setpc_b64\ts[%i:%i]\n", CC_SAVE_REG, CC_SAVE_REG + 1);
3313}
3314
3315/* Implement TARGET_TRAMPOLINE_INIT.
3316
3317   Emit RTL insns to initialize the variable parts of a trampoline.
3318   FNDECL is the decl of the target address, M_TRAMP is a MEM for
3319   the trampoline, and CHAIN_VALUE is an RTX for the static chain
3320   to be passed to the target function.  */
3321
3322static void
3323gcn_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
3324{
3325  if (TARGET_GCN5_PLUS)
3326    sorry ("nested function trampolines not supported on GCN5 due to"
3327           " non-executable stacks");
3328
3329  emit_block_move (m_tramp, assemble_trampoline_template (),
3330		   GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL);
3331
3332  rtx fnaddr = XEXP (DECL_RTL (fndecl), 0);
3333  rtx chain_value_reg = copy_to_reg (chain_value);
3334  rtx fnaddr_reg = copy_to_reg (fnaddr);
3335
3336  for (int i = 0; i < 4; i++)
3337    {
3338      rtx mem = adjust_address (m_tramp, SImode, i * 8 + 4);
3339      rtx reg = i < 2 ? chain_value_reg : fnaddr_reg;
3340      emit_move_insn (mem, gen_rtx_SUBREG (SImode, reg, (i % 2) * 4));
3341    }
3342
3343  rtx tramp_addr = XEXP (m_tramp, 0);
3344  emit_insn (gen_clear_icache (tramp_addr,
3345			       plus_constant (ptr_mode, tramp_addr,
3346					      TRAMPOLINE_SIZE)));
3347}
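
/* Note: the loop above patches the four 32-bit literal slots of the
   template (each at byte offset i * 8 + 4, i.e. just after each s_mov_b32
   opcode): the first two receive the static chain value and the last two
   the target function address, as low/high 32-bit halves.  */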
3348
3349/* }}}  */
3350/* {{{ Miscellaneous.  */
3351
3352/* Implement TARGET_CANNOT_COPY_INSN_P.
3353
3354   Return true if INSN must not be duplicated.  */
3355
3356static bool
3357gcn_cannot_copy_insn_p (rtx_insn *insn)
3358{
3359  if (recog_memoized (insn) == CODE_FOR_gcn_wavefront_barrier)
3360    return true;
3361
3362  return false;
3363}
3364
3365/* Implement TARGET_DEBUG_UNWIND_INFO.
3366
3367   Defines the mechanism that will be used for describing frame unwind
3368   information to the debugger.  */
3369
3370static enum unwind_info_type
3371gcn_debug_unwind_info ()
3372{
3373  return UI_DWARF2;
3374}
3375
3376/* Determine if there is a suitable hardware conversion instruction.
3377   Used primarily by the machine description.  */
3378
3379bool
3380gcn_valid_cvt_p (machine_mode from, machine_mode to, enum gcn_cvt_t op)
3381{
3382  if (VECTOR_MODE_P (from) != VECTOR_MODE_P (to))
3383    return false;
3384
3385  if (VECTOR_MODE_P (from))
3386    {
3387      from = GET_MODE_INNER (from);
3388      to = GET_MODE_INNER (to);
3389    }
3390
3391  switch (op)
3392    {
3393    case fix_trunc_cvt:
3394    case fixuns_trunc_cvt:
3395      if (GET_MODE_CLASS (from) != MODE_FLOAT
3396	  || GET_MODE_CLASS (to) != MODE_INT)
3397	return false;
3398      break;
3399    case float_cvt:
3400    case floatuns_cvt:
3401      if (GET_MODE_CLASS (from) != MODE_INT
3402	  || GET_MODE_CLASS (to) != MODE_FLOAT)
3403	return false;
3404      break;
3405    case extend_cvt:
3406      if (GET_MODE_CLASS (from) != MODE_FLOAT
3407	  || GET_MODE_CLASS (to) != MODE_FLOAT
3408	  || GET_MODE_SIZE (from) >= GET_MODE_SIZE (to))
3409	return false;
3410      break;
3411    case trunc_cvt:
3412      if (GET_MODE_CLASS (from) != MODE_FLOAT
3413	  || GET_MODE_CLASS (to) != MODE_FLOAT
3414	  || GET_MODE_SIZE (from) <= GET_MODE_SIZE (to))
3415	return false;
3416      break;
3417    }
3418
3419  return ((to == HImode && from == HFmode)
3420	  || (to == SImode && (from == SFmode || from == DFmode))
3421	  || (to == HFmode && (from == HImode || from == SFmode))
3422	  || (to == SFmode && (from == SImode || from == HFmode
3423			       || from == DFmode))
3424	  || (to == DFmode && (from == SImode || from == SFmode)));
3425}
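
/* For example (illustrative), extending HFmode to SFmode is reported as a
   valid single conversion above, but HFmode to DFmode is not, so such a
   widening must be done in two steps via SFmode.  */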
3426
3427/* Implement TARGET_EMUTLS_VAR_INIT.
3428
3429   Disable emutls (gthr-gcn.h does not support it, yet).  */
3430
3431tree
3432gcn_emutls_var_init (tree, tree decl, tree)
3433{
3434  sorry_at (DECL_SOURCE_LOCATION (decl), "TLS is not implemented for GCN.");
3435  return NULL_TREE;
3436}
3437
3438/* }}}  */
3439/* {{{ Costs.  */
3440
3441/* Implement TARGET_RTX_COSTS.
3442
3443   Compute a (partial) cost for rtx X.  Return true if the complete
3444   cost has been computed, and false if subexpressions should be
3445   scanned.  In either case, *TOTAL contains the cost result.  */
3446
3447static bool
3448gcn_rtx_costs (rtx x, machine_mode, int, int, int *total, bool)
3449{
3450  enum rtx_code code = GET_CODE (x);
3451  switch (code)
3452    {
3453    case CONST:
3454    case CONST_DOUBLE:
3455    case CONST_VECTOR:
3456    case CONST_INT:
3457      if (gcn_inline_constant_p (x))
3458	*total = 0;
3459      else if (code == CONST_INT
3460	  && ((unsigned HOST_WIDE_INT) INTVAL (x) + 0x8000) < 0x10000)
3461	*total = 1;
3462      else if (gcn_constant_p (x))
3463	*total = 2;
3464      else
3465	*total = vgpr_vector_mode_p (GET_MODE (x)) ? 64 : 4;
3466      return true;
3467
3468    case DIV:
3469      *total = 100;
3470      return false;
3471
3472    default:
3473      *total = 3;
3474      return false;
3475    }
3476}
3477
3478/* Implement TARGET_MEMORY_MOVE_COST.
3479
3480   Return the cost of moving data of mode M between a
3481   register and memory.  A value of 2 is the default; this cost is
3482   relative to those in `REGISTER_MOVE_COST'.
3483
3484	   This hook is called many times during register allocation, which
3485	   builds its cost tables at startup, so keep it cheap to compute.
3486	   The IN argument selects the load cost (true) or the store cost (false).
3487
3488	   If moving between registers and memory is more expensive than
3489	   between two registers, this hook should express the
3490	   relative cost.
3491
3492	   Scalar (SGPR) and vector (VGPR) classes use different memory
3493	   instructions, so they are costed separately below.  */
3494
3495#define LOAD_COST  32
3496#define STORE_COST 32
3497static int
3498gcn_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in)
3499{
3500  int nregs = CEIL (GET_MODE_SIZE (mode), 4);
3501  switch (regclass)
3502    {
3503    case SCC_CONDITIONAL_REG:
3504    case VCCZ_CONDITIONAL_REG:
3505    case VCC_CONDITIONAL_REG:
3506    case EXECZ_CONDITIONAL_REG:
3507    case ALL_CONDITIONAL_REGS:
3508    case SGPR_REGS:
3509    case SGPR_EXEC_REGS:
3510    case EXEC_MASK_REG:
3511    case SGPR_VOP_SRC_REGS:
3512    case SGPR_MEM_SRC_REGS:
3513    case SGPR_SRC_REGS:
3514    case SGPR_DST_REGS:
3515    case GENERAL_REGS:
3516    case AFP_REGS:
3517      if (!in)
3518	return (STORE_COST + 2) * nregs;
3519      return LOAD_COST * nregs;
3520    case VGPR_REGS:
3521      if (in)
3522	return (LOAD_COST + 2) * nregs;
3523      return STORE_COST * nregs;
3524    case ALL_REGS:
3525    case ALL_GPR_REGS:
3526    case SRCDST_REGS:
3527      if (in)
3528	return (LOAD_COST + 2) * nregs;
3529      return (STORE_COST + 2) * nregs;
3530    default:
3531      gcc_unreachable ();
3532    }
3533}
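
/* For example, spilling a V64SImode value held in VGPR_REGS costs
   STORE_COST * 64 == 2048 (nregs is 64 for a 256-byte vector), which
   strongly discourages spilling vector registers to memory.  */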
3534
3535/* Implement TARGET_REGISTER_MOVE_COST.
3536
3537   Return the cost of moving data from a register in class CLASS1 to
3538   one in class CLASS2.  Base value is 2.  */
3539
3540static int
3541gcn_register_move_cost (machine_mode, reg_class_t dst, reg_class_t src)
3542{
3543  /* Increase cost of moving from and to vector registers.  While this is
3544	     fast in hardware (I think), it has a hidden cost of setting up the exec
3545     flags.  */
  if ((src < VGPR_REGS) != (dst < VGPR_REGS))
    return 4;
  return 2;
}

/* }}}  */
/* {{{ Builtins.  */

/* Type codes used by GCN built-in definitions.  */

enum gcn_builtin_type_index
{
  GCN_BTI_END_OF_PARAMS,

  GCN_BTI_VOID,
  GCN_BTI_BOOL,
  GCN_BTI_INT,
  GCN_BTI_UINT,
  GCN_BTI_SIZE_T,
  GCN_BTI_LLINT,
  GCN_BTI_LLUINT,
  GCN_BTI_EXEC,

  GCN_BTI_SF,
  GCN_BTI_V64SI,
  GCN_BTI_V64SF,
  GCN_BTI_V64PTR,
  GCN_BTI_SIPTR,
  GCN_BTI_SFPTR,
  GCN_BTI_VOIDPTR,

  GCN_BTI_LDS_VOIDPTR,

  GCN_BTI_MAX
};

static GTY(()) tree gcn_builtin_types[GCN_BTI_MAX];

#define exec_type_node (gcn_builtin_types[GCN_BTI_EXEC])
#define sf_type_node (gcn_builtin_types[GCN_BTI_SF])
#define v64si_type_node (gcn_builtin_types[GCN_BTI_V64SI])
#define v64sf_type_node (gcn_builtin_types[GCN_BTI_V64SF])
#define v64ptr_type_node (gcn_builtin_types[GCN_BTI_V64PTR])
#define siptr_type_node (gcn_builtin_types[GCN_BTI_SIPTR])
#define sfptr_type_node (gcn_builtin_types[GCN_BTI_SFPTR])
#define voidptr_type_node (gcn_builtin_types[GCN_BTI_VOIDPTR])
#define size_t_type_node (gcn_builtin_types[GCN_BTI_SIZE_T])

static rtx gcn_expand_builtin_1 (tree, rtx, rtx, machine_mode, int,
				 struct gcn_builtin_description *);
static rtx gcn_expand_builtin_binop (tree, rtx, rtx, machine_mode, int,
				     struct gcn_builtin_description *);

struct gcn_builtin_description;
typedef rtx (*gcn_builtin_expander) (tree, rtx, rtx, machine_mode, int,
				     struct gcn_builtin_description *);

enum gcn_builtin_type
{
  B_UNIMPLEMENTED,		/* Issue a "sorry, not implemented" message */
  B_INSN,			/* Emit a pattern */
  B_OVERLOAD			/* Placeholder for an overloaded function */
};

struct gcn_builtin_description
{
  int fcode;
  int icode;
  const char *name;
  enum gcn_builtin_type type;
  /* The first element of parm is always the return type.  The rest
     is a zero-terminated list of parameter types.  */
  int parm[6];
  gcn_builtin_expander expander;
};

/* Read in the GCN builtins from gcn-builtins.def.  */

extern GTY(()) struct gcn_builtin_description gcn_builtins[GCN_BUILTIN_MAX];

struct gcn_builtin_description gcn_builtins[] = {
#define DEF_BUILTIN(fcode, icode, name, type, params, expander)	\
  {GCN_BUILTIN_ ## fcode, icode, name, type, params, expander},

#define DEF_BUILTIN_BINOP_INT_FP(fcode, ic, name)			\
  {GCN_BUILTIN_ ## fcode ## _V64SI,					\
   CODE_FOR_ ## ic ##v64si3_exec, name "_v64int", B_INSN,		\
   {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI,		\
    GCN_BTI_V64SI, GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop},	\
  {GCN_BUILTIN_ ## fcode ## _V64SI_unspec,				\
   CODE_FOR_ ## ic ##v64si3_exec, name "_v64int_unspec", B_INSN,	\
   {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI,		\
    GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop},

#include "gcn-builtins.def"
#undef DEF_BUILTIN_BINOP_INT_FP
#undef DEF_BUILTIN
};
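
/* As an illustration of the macros above, a hypothetical gcn-builtins.def
   entry such as

     DEF_BUILTIN_BINOP_INT_FP (ADD, add, "add")

   would expand to two table entries, "add_v64int" (taking an EXEC mask,
   two V64SI operands and a previous-value operand) and "add_v64int_unspec"
   (the same but without the previous-value operand), both expanded via
   gcn_expand_builtin_binop using CODE_FOR_addv64si3_exec.  The entry name
   here is made up; see gcn-builtins.def for the real list.  */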

static GTY(()) tree gcn_builtin_decls[GCN_BUILTIN_MAX];

/* Implement TARGET_BUILTIN_DECL.

   Return the GCN builtin for CODE.  */

tree
gcn_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
{
  if (code >= GCN_BUILTIN_MAX)
    return error_mark_node;

  return gcn_builtin_decls[code];
}

/* Helper function for gcn_init_builtins.  */

static void
gcn_init_builtin_types (void)
{
  gcn_builtin_types[GCN_BTI_VOID] = void_type_node;
  gcn_builtin_types[GCN_BTI_BOOL] = boolean_type_node;
  gcn_builtin_types[GCN_BTI_INT] = intSI_type_node;
  gcn_builtin_types[GCN_BTI_UINT] = unsigned_type_for (intSI_type_node);
  gcn_builtin_types[GCN_BTI_SIZE_T] = size_type_node;
  gcn_builtin_types[GCN_BTI_LLINT] = intDI_type_node;
  gcn_builtin_types[GCN_BTI_LLUINT] = unsigned_type_for (intDI_type_node);

  exec_type_node = unsigned_intDI_type_node;
  sf_type_node = float32_type_node;
  v64si_type_node = build_vector_type (intSI_type_node, 64);
  v64sf_type_node = build_vector_type (float_type_node, 64);
  v64ptr_type_node = build_vector_type (unsigned_intDI_type_node
					/*build_pointer_type
					  (integer_type_node) */
					, 64);
  tree tmp = build_distinct_type_copy (intSI_type_node);
  TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
  siptr_type_node = build_pointer_type (tmp);

  tmp = build_distinct_type_copy (float_type_node);
  TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
  sfptr_type_node = build_pointer_type (tmp);

  tmp = build_distinct_type_copy (void_type_node);
  TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
  voidptr_type_node = build_pointer_type (tmp);

  tmp = build_distinct_type_copy (void_type_node);
  TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_LDS;
  gcn_builtin_types[GCN_BTI_LDS_VOIDPTR] = build_pointer_type (tmp);
}

/* Implement TARGET_INIT_BUILTINS.

   Set up all builtin functions for this target.  */

static void
gcn_init_builtins (void)
{
  gcn_init_builtin_types ();

  struct gcn_builtin_description *d;
  unsigned int i;
  for (i = 0, d = gcn_builtins; i < GCN_BUILTIN_MAX; i++, d++)
    {
      tree p;
      char name[64];		/* add_builtin_function will make a copy.  */
      int parm;

      /* FIXME: Is this necessary/useful? */
      if (d->name == 0)
	continue;

      /* Find last parm.  */
      for (parm = 1; d->parm[parm] != GCN_BTI_END_OF_PARAMS; parm++)
	;

      p = void_list_node;
      while (parm > 1)
	p = tree_cons (NULL_TREE, gcn_builtin_types[d->parm[--parm]], p);

      p = build_function_type (gcn_builtin_types[d->parm[0]], p);

      sprintf (name, "__builtin_gcn_%s", d->name);
      gcn_builtin_decls[i]
	= add_builtin_function (name, p, i, BUILT_IN_MD, NULL, NULL_TREE);

      /* These builtins don't throw.  */
      TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
    }

  /* These builtins need to take/return an LDS pointer: override the generic
     versions here.  */

  set_builtin_decl (BUILT_IN_GOACC_SINGLE_START,
		    gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_START], false);

  set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_START,
		    gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_START],
		    false);

  set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_END,
		    gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_END],
		    false);

  set_builtin_decl (BUILT_IN_GOACC_BARRIER,
		    gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
}

/* Implement TARGET_INIT_LIBFUNCS.  */

static void
gcn_init_libfuncs (void)
{
  /* BITS_PER_WORD * 2 is 64 bits, which causes
     optabs-libfuncs.cc:gen_int_libfunc to omit the TImode (i.e. 128-bit)
     libcalls that we need in order to support operations on that type.
     Initialise them here instead.  */
  set_optab_libfunc (udiv_optab, TImode, "__udivti3");
  set_optab_libfunc (umod_optab, TImode, "__umodti3");
  set_optab_libfunc (sdiv_optab, TImode, "__divti3");
  set_optab_libfunc (smod_optab, TImode, "__modti3");
  set_optab_libfunc (smul_optab, TImode, "__multi3");
  set_optab_libfunc (addv_optab, TImode, "__addvti3");
  set_optab_libfunc (subv_optab, TImode, "__subvti3");
  set_optab_libfunc (negv_optab, TImode, "__negvti2");
  set_optab_libfunc (absv_optab, TImode, "__absvti2");
  set_optab_libfunc (smulv_optab, TImode, "__mulvti3");
  set_optab_libfunc (ffs_optab, TImode, "__ffsti2");
  set_optab_libfunc (clz_optab, TImode, "__clzti2");
  set_optab_libfunc (ctz_optab, TImode, "__ctzti2");
  set_optab_libfunc (clrsb_optab, TImode, "__clrsbti2");
  set_optab_libfunc (popcount_optab, TImode, "__popcountti2");
  set_optab_libfunc (parity_optab, TImode, "__parityti2");
  set_optab_libfunc (bswap_optab, TImode, "__bswapti2");
}
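
/* With the libfuncs above registered, a TImode operation in user code such
   as

     __int128 f (__int128 a, __int128 b) { return a / b; }

   expands to a call to __divti3 rather than failing for lack of a suitable
   insn or libcall.  (A sketch only; the actual calling sequence depends on
   the ABI and on libgcc providing the routine.)  */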

/* Expand the CMP_SWAP GCN builtins.  We have our own versions that do
   not require taking the address of any object, other than the memory
   cell being operated on.

   Helper function for gcn_expand_builtin_1.  */

static rtx
gcn_expand_cmp_swap (tree exp, rtx target)
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
  addr_space_t as
    = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (CALL_EXPR_ARG (exp, 0))));
  machine_mode as_mode = gcn_addr_space_address_mode (as);

  if (!target)
    target = gen_reg_rtx (mode);

  rtx addr = expand_expr (CALL_EXPR_ARG (exp, 0),
			  NULL_RTX, as_mode, EXPAND_NORMAL);
  rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx pat;

  rtx mem = gen_rtx_MEM (mode, force_reg (as_mode, addr));
  set_mem_addr_space (mem, as);

  if (!REG_P (cmp))
    cmp = copy_to_mode_reg (mode, cmp);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  if (mode == SImode)
    pat = gen_sync_compare_and_swapsi (target, mem, cmp, src);
  else
    pat = gen_sync_compare_and_swapdi (target, mem, cmp, src);

  emit_insn (pat);

  return target;
}
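
/* The expansion above takes three arguments -- a pointer to the memory
   cell, the expected value and the replacement value -- and returns the
   value that was previously in memory.  Assuming the corresponding
   gcn-builtins.def entry is named "cmp_swap" (an assumption; the .def file
   is authoritative), a use would look like

     int old = __builtin_gcn_cmp_swap (ptr, expected, replacement);

   which emits a sync_compare_and_swapsi (or the DImode variant for the
   64-bit builtin).  */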

/* Expand many different builtins.

   Intended for use in gcn-builtins.def.  */

static rtx
gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
		      machine_mode /*mode */ , int ignore,
		      struct gcn_builtin_description *)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  switch (DECL_MD_FUNCTION_CODE (fndecl))
    {
    case GCN_BUILTIN_FLAT_LOAD_INT32:
      {
	if (ignore)
	  return target;
	/*rtx exec = */
	force_reg (DImode,
		   expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
				EXPAND_NORMAL));
	/*rtx ptr = */
	force_reg (V64DImode,
		   expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, V64DImode,
				EXPAND_NORMAL));
	/*emit_insn (gen_vector_flat_loadv64si
		     (target, gcn_gen_undef (V64SImode), ptr, exec)); */
	return target;
      }
    case GCN_BUILTIN_FLAT_LOAD_PTR_INT32:
    case GCN_BUILTIN_FLAT_LOAD_PTR_FLOAT:
      {
	if (ignore)
	  return target;
	rtx exec = force_reg (DImode,
			      expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
					   DImode,
					   EXPAND_NORMAL));
	rtx ptr = force_reg (DImode,
			     expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
					  V64DImode,
					  EXPAND_NORMAL));
	rtx offsets = force_reg (V64SImode,
				 expand_expr (CALL_EXPR_ARG (exp, 2),
					      NULL_RTX, V64DImode,
					      EXPAND_NORMAL));
	rtx addrs = gen_reg_rtx (V64DImode);
	rtx tmp = gen_reg_rtx (V64SImode);
	emit_insn (gen_ashlv64si3_exec (tmp, offsets,
					  GEN_INT (2),
					  gcn_gen_undef (V64SImode), exec));
	emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
						 gcn_gen_undef (V64DImode),
						 exec));
	rtx mem = gen_rtx_MEM (GET_MODE (target), addrs);
	/*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
	/* FIXME: set attributes.  */
	emit_insn (gen_mov_with_exec (target, mem, exec));
	return target;
      }
    case GCN_BUILTIN_FLAT_STORE_PTR_INT32:
    case GCN_BUILTIN_FLAT_STORE_PTR_FLOAT:
      {
	rtx exec = force_reg (DImode,
			      expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
					   DImode,
					   EXPAND_NORMAL));
	rtx ptr = force_reg (DImode,
			     expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
					  V64DImode,
					  EXPAND_NORMAL));
	rtx offsets = force_reg (V64SImode,
				 expand_expr (CALL_EXPR_ARG (exp, 2),
					      NULL_RTX, V64DImode,
					      EXPAND_NORMAL));
	machine_mode vmode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp,
								       3)));
	rtx val = force_reg (vmode,
			     expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
					  vmode,
					  EXPAND_NORMAL));
	rtx addrs = gen_reg_rtx (V64DImode);
	rtx tmp = gen_reg_rtx (V64SImode);
	emit_insn (gen_ashlv64si3_exec (tmp, offsets,
					  GEN_INT (2),
					  gcn_gen_undef (V64SImode), exec));
	emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
						 gcn_gen_undef (V64DImode),
						 exec));
	rtx mem = gen_rtx_MEM (vmode, addrs);
	/*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
	/* FIXME: set attributes.  */
	emit_insn (gen_mov_with_exec (mem, val, exec));
	return target;
      }
    case GCN_BUILTIN_SQRTVF:
      {
	if (ignore)
	  return target;
	rtx exec = gcn_full_exec_reg ();
	rtx arg = force_reg (V64SFmode,
			     expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
					  V64SFmode,
					  EXPAND_NORMAL));
	emit_insn (gen_sqrtv64sf2_exec
		   (target, arg, gcn_gen_undef (V64SFmode), exec));
	return target;
      }
    case GCN_BUILTIN_SQRTF:
      {
	if (ignore)
	  return target;
	rtx arg = force_reg (SFmode,
			     expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
					  SFmode,
					  EXPAND_NORMAL));
	emit_insn (gen_sqrtsf2 (target, arg));
	return target;
      }
    case GCN_BUILTIN_OMP_DIM_SIZE:
      {
	if (ignore)
	  return target;
	emit_insn (gen_oacc_dim_size (target,
				      expand_expr (CALL_EXPR_ARG (exp, 0),
						   NULL_RTX, SImode,
						   EXPAND_NORMAL)));
	return target;
      }
    case GCN_BUILTIN_OMP_DIM_POS:
      {
	if (ignore)
	  return target;
	emit_insn (gen_oacc_dim_pos (target,
				     expand_expr (CALL_EXPR_ARG (exp, 0),
						  NULL_RTX, SImode,
						  EXPAND_NORMAL)));
	return target;
      }
    case GCN_BUILTIN_CMP_SWAP:
    case GCN_BUILTIN_CMP_SWAPLL:
      return gcn_expand_cmp_swap (exp, target);

    case GCN_BUILTIN_ACC_SINGLE_START:
      {
	if (ignore)
	  return target;

	rtx wavefront = gcn_oacc_dim_pos (1);
	rtx cond = gen_rtx_EQ (VOIDmode, wavefront, const0_rtx);
	rtx cc = (target && REG_P (target)) ? target : gen_reg_rtx (BImode);
	emit_insn (gen_cstoresi4 (cc, cond, wavefront, const0_rtx));
	return cc;
      }

    case GCN_BUILTIN_ACC_SINGLE_COPY_START:
      {
	rtx blk = force_reg (SImode,
			     expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
					  SImode, EXPAND_NORMAL));
	rtx wavefront = gcn_oacc_dim_pos (1);
	rtx cond = gen_rtx_NE (VOIDmode, wavefront, const0_rtx);
	rtx not_zero = gen_label_rtx ();
	emit_insn (gen_cbranchsi4 (cond, wavefront, const0_rtx, not_zero));
	emit_move_insn (blk, const0_rtx);
	emit_label (not_zero);
	return blk;
      }

    case GCN_BUILTIN_ACC_SINGLE_COPY_END:
      return target;

    case GCN_BUILTIN_ACC_BARRIER:
      emit_insn (gen_gcn_wavefront_barrier ());
      return target;

    default:
      gcc_unreachable ();
    }
}

/* Expansion of simple arithmetic and bitwise binary operation builtins.

   Intended for use with the gcn_builtins table.  */

static rtx
gcn_expand_builtin_binop (tree exp, rtx target, rtx /*subtarget */ ,
			  machine_mode /*mode */ , int ignore,
			  struct gcn_builtin_description *d)
{
  int icode = d->icode;
  if (ignore)
    return target;

  rtx exec = force_reg (DImode,
			expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
				     EXPAND_NORMAL));

  machine_mode m1 = insn_data[icode].operand[1].mode;
  rtx arg1 = expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, m1,
			  EXPAND_NORMAL);
  if (!insn_data[icode].operand[1].predicate (arg1, m1))
    arg1 = force_reg (m1, arg1);

  machine_mode m2 = insn_data[icode].operand[2].mode;
  rtx arg2 = expand_expr (CALL_EXPR_ARG (exp, 2), NULL_RTX, m2,
			  EXPAND_NORMAL);
  if (!insn_data[icode].operand[2].predicate (arg2, m2))
    arg2 = force_reg (m2, arg2);

  rtx arg_prev;
  if (call_expr_nargs (exp) == 4)
    {
      machine_mode m_prev = insn_data[icode].operand[4].mode;
      arg_prev = force_reg (m_prev,
			    expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
					 m_prev, EXPAND_NORMAL));
    }
  else
    arg_prev = gcn_gen_undef (GET_MODE (target));

  rtx pat = GEN_FCN (icode) (target, arg1, arg2, exec, arg_prev);
  emit_insn (pat);
  return target;
}

/* Implement TARGET_EXPAND_BUILTIN.

   Expand an expression EXP that calls a built-in function, with result going
   to TARGET if that's convenient (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

rtx
gcn_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
		    int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
  struct gcn_builtin_description *d;

  gcc_assert (fcode < GCN_BUILTIN_MAX);
  d = &gcn_builtins[fcode];

  if (d->type == B_UNIMPLEMENTED)
    sorry ("Builtin not implemented");

  return d->expander (exp, target, subtarget, mode, ignore, d);
}

/* }}}  */
/* {{{ Vectorization.  */

/* Implement TARGET_VECTORIZE_GET_MASK_MODE.

   A vector mask is a value that holds one boolean result for every element in
   a vector.  */

opt_machine_mode
gcn_vectorize_get_mask_mode (machine_mode)
{
  /* GCN uses a DImode bit-mask.  */
  return DImode;
}

/* Return an RTX that references a vector with the i-th lane containing
   PERM[i]*4.

   Helper function for gcn_vectorize_vec_perm_const.  */

static rtx
gcn_make_vec_perm_address (unsigned int *perm)
{
  rtx x = gen_reg_rtx (V64SImode);
  emit_move_insn (x, gcn_vec_constant (V64SImode, 0));

  /* Permutation addresses use byte addressing.  With each vector lane being
     4 bytes wide, and with 64 lanes in total, only bits 2..7 are significant,
     so only set those.

     The permutation given to the vec_perm* patterns ranges from 0 to 2N-1 to
     select between lanes in two vectors, but as the DS_BPERMUTE* instructions
     only take one source vector, the most-significant bit can be ignored
     here.  Instead, we can use EXEC masking to select the relevant part of
     each source vector after they are permuted separately.  */
  uint64_t bit_mask = 1 << 2;
  for (int i = 2; i < 8; i++, bit_mask <<= 1)
    {
      uint64_t exec_mask = 0;
      uint64_t lane_mask = 1;
      for (int j = 0; j < 64; j++, lane_mask <<= 1)
	if ((perm[j] * 4) & bit_mask)
	  exec_mask |= lane_mask;

      if (exec_mask)
	emit_insn (gen_addv64si3_exec (x, x,
				       gcn_vec_constant (V64SImode,
							 bit_mask),
				       x, get_exec (exec_mask)));
    }

  return x;
}
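
/* For instance, a rotate-by-one permutation PERM = {1, 2, ..., 63, 0} needs
   lane I of the address vector to hold (I+1)*4, with lane 63 holding 0.
   The loop above builds this one address bit at a time: for bit_mask == 4
   it adds 4 in every lane whose target byte offset has bit 2 set, for
   bit_mask == 8 in every lane whose offset has bit 3 set, and so on, using
   EXEC masks to restrict each add to the relevant lanes.  */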

/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.

   Return true if permutation with SEL is possible.

   If DST/SRC0/SRC1 are non-null, emit the instructions to perform the
   permutations.  */

static bool
gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst,
			      rtx src0, rtx src1,
			      const vec_perm_indices & sel)
{
  unsigned int nelt = GET_MODE_NUNITS (vmode);

  gcc_assert (VECTOR_MODE_P (vmode));
  gcc_assert (nelt <= 64);
  gcc_assert (sel.length () == nelt);

  if (!dst)
    {
      /* All vector permutations are possible on this architecture,
         with varying degrees of efficiency depending on the permutation.  */
      return true;
    }

  unsigned int perm[64];
  for (unsigned int i = 0; i < nelt; ++i)
    perm[i] = sel[i] & (2 * nelt - 1);
  for (unsigned int i = nelt; i < 64; ++i)
    perm[i] = 0;

  src0 = force_reg (vmode, src0);
  src1 = force_reg (vmode, src1);

  /* Make life a bit easier by swapping operands if necessary so that
     the first element always comes from src0.  */
  if (perm[0] >= nelt)
    {
      std::swap (src0, src1);

      for (unsigned int i = 0; i < nelt; ++i)
	if (perm[i] < nelt)
	  perm[i] += nelt;
	else
	  perm[i] -= nelt;
    }

  /* TODO: There are more efficient ways to implement certain permutations
     using ds_swizzle_b32 and/or DPP.  Test for and expand them here, before
     this more inefficient generic approach is used.  */

  int64_t src1_lanes = 0;
  int64_t lane_bit = 1;

  for (unsigned int i = 0; i < nelt; ++i, lane_bit <<= 1)
    {
      /* Set the bits for lanes from src1.  */
      if (perm[i] >= nelt)
	src1_lanes |= lane_bit;
    }

  rtx addr = gcn_make_vec_perm_address (perm);
  rtx (*ds_bpermute) (rtx, rtx, rtx, rtx);

  switch (vmode)
    {
    case E_V64QImode:
      ds_bpermute = gen_ds_bpermutev64qi;
      break;
    case E_V64HImode:
      ds_bpermute = gen_ds_bpermutev64hi;
      break;
    case E_V64SImode:
      ds_bpermute = gen_ds_bpermutev64si;
      break;
    case E_V64HFmode:
      ds_bpermute = gen_ds_bpermutev64hf;
      break;
    case E_V64SFmode:
      ds_bpermute = gen_ds_bpermutev64sf;
      break;
    case E_V64DImode:
      ds_bpermute = gen_ds_bpermutev64di;
      break;
    case E_V64DFmode:
      ds_bpermute = gen_ds_bpermutev64df;
      break;
    default:
      gcc_assert (false);
    }

  /* Load elements from src0 to dst.  */
  gcc_assert (~src1_lanes);
  emit_insn (ds_bpermute (dst, addr, src0, gcn_full_exec_reg ()));

  /* Load elements from src1 to dst.  */
  if (src1_lanes)
    {
      /* Masking a lane masks both the destination and source lanes for
         DS_BPERMUTE, so we need to have all lanes enabled for the permute,
         then add an extra masked move to merge the results of permuting
         the two source vectors together.  */
      rtx tmp = gen_reg_rtx (vmode);
      emit_insn (ds_bpermute (tmp, addr, src1, gcn_full_exec_reg ()));
      emit_insn (gen_mov_with_exec (dst, tmp, get_exec (src1_lanes)));
    }

  return true;
}

/* Implement TARGET_VECTOR_MODE_SUPPORTED_P.

   Return nonzero if vector MODE is supported with at least move
   instructions.  */

static bool
gcn_vector_mode_supported_p (machine_mode mode)
{
  return (mode == V64QImode || mode == V64HImode
	  || mode == V64SImode || mode == V64DImode
	  || mode == V64SFmode || mode == V64DFmode);
}

/* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE.

   Enables autovectorization for all supported modes.  */

static machine_mode
gcn_vectorize_preferred_simd_mode (scalar_mode mode)
{
  switch (mode)
    {
    case E_QImode:
      return V64QImode;
    case E_HImode:
      return V64HImode;
    case E_SImode:
      return V64SImode;
    case E_DImode:
      return V64DImode;
    case E_SFmode:
      return V64SFmode;
    case E_DFmode:
      return V64DFmode;
    default:
      return word_mode;
    }
}

/* Implement TARGET_VECTORIZE_RELATED_MODE.

   All GCN vectors are 64-lane, so this is simpler than other architectures.
   In particular, we do *not* want to match vector bit-size.  */

static opt_machine_mode
gcn_related_vector_mode (machine_mode ARG_UNUSED (vector_mode),
			 scalar_mode element_mode, poly_uint64 nunits)
{
  if (known_ne (nunits, 0U) && known_ne (nunits, 64U))
    return VOIDmode;

  machine_mode pref_mode = gcn_vectorize_preferred_simd_mode (element_mode);
  if (!VECTOR_MODE_P (pref_mode))
    return VOIDmode;

  return pref_mode;
}

/* Implement TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.

   Return the preferred alignment in bits for accesses to vectors of type
   TYPE in vectorized code.  This might be less than or greater than the
   ABI-defined value returned by TARGET_VECTOR_ALIGNMENT.  It can be equal
   to the alignment of a single element, in which case the vectorizer will
   not try to optimize for alignment.  */

static poly_uint64
gcn_preferred_vector_alignment (const_tree type)
{
  return TYPE_ALIGN (TREE_TYPE (type));
}

/* Implement TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT.

   Return true if the target supports misaligned vector store/load of a
   specific factor denoted in the misalignment parameter.  */

static bool
gcn_vectorize_support_vector_misalignment (machine_mode ARG_UNUSED (mode),
					   const_tree type, int misalignment,
					   bool is_packed)
{
  if (is_packed)
    return false;

  /* If the misalignment is unknown, we should be able to handle the access
     so long as it is not to a member of a packed data structure.  */
  if (misalignment == -1)
    return true;

  /* Return true if the misalignment is a multiple of the natural alignment
     of the vector's element type.  This is probably always going to be
     true in practice, since we've already established that this isn't a
     packed access.  */
  return misalignment % TYPE_ALIGN_UNIT (type) == 0;
}

/* Implement TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.

   Return true if vector alignment is reachable (by peeling N iterations) for
   the given scalar type TYPE.  */

static bool
gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed)
{
  /* Vectors which aren't in packed structures will not be less aligned than
     the natural alignment of their element type, so this is safe.  */
  return !is_packed;
}

/* Generate DPP instructions used for vector reductions.

   The opcode is given by INSN.
   The first operand of the operation is shifted right by SHIFT vector lanes.
   SHIFT must be a power of 2.  If SHIFT is 16, the 15th lane of each row is
   broadcast to the next row (thereby acting like a shift of 16 for the end of
   each row).  If SHIFT is 32, lane 31 is broadcast to all the
   following lanes (thereby acting like a shift of 32 for lane 63).  */

char *
gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
			 int unspec, int shift)
{
  static char buf[128];
  const char *dpp;
  const char *vcc_in = "";
  const char *vcc_out = "";

  /* Add the vcc operand if needed.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      if (unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
	vcc_in = ", vcc";

      if (unspec == UNSPEC_PLUS_CARRY_DPP_SHR
	  || unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
	vcc_out = ", vcc";
    }

  /* Add the DPP modifiers.  */
  switch (shift)
    {
    case 1:
      dpp = "row_shr:1 bound_ctrl:0";
      break;
    case 2:
      dpp = "row_shr:2 bound_ctrl:0";
      break;
    case 4:
      dpp = "row_shr:4 bank_mask:0xe";
      break;
    case 8:
      dpp = "row_shr:8 bank_mask:0xc";
      break;
    case 16:
      dpp = "row_bcast:15 row_mask:0xa";
      break;
    case 32:
      dpp = "row_bcast:31 row_mask:0xc";
      break;
    default:
      gcc_unreachable ();
    }

  if (unspec == UNSPEC_MOV_DPP_SHR && vgpr_2reg_mode_p (mode))
    sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
	     insn, dpp, insn, dpp);
  else if (unspec == UNSPEC_MOV_DPP_SHR)
    sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
  else
    sprintf (buf, "%s\t%%0%s, %%1, %%2%s %s", insn, vcc_out, vcc_in, dpp);

  return buf;
}
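
/* As an example of the template built above: for a carry-producing vector
   add (UNSPEC_PLUS_CARRY_DPP_SHR) in a vector-int mode with SHIFT of 1, and
   an INSN mnemonic of, say, "v_add_co_u32" (the mnemonic is supplied by the
   caller, so this is illustration only), the resulting string is

     v_add_co_u32	%0, vcc, %1, %2 row_shr:1 bound_ctrl:0

   i.e. each lane is combined with the value from the lane one position
   lower.  */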

/* Generate vector reductions in terms of DPP instructions.

   The vector register SRC of mode MODE is reduced using the operation given
   by UNSPEC, and the scalar result is returned in lane 63 of a vector
   register.  */

rtx
gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
{
  machine_mode orig_mode = mode;
  bool use_moves = (((unspec == UNSPEC_SMIN_DPP_SHR
		      || unspec == UNSPEC_SMAX_DPP_SHR
		      || unspec == UNSPEC_UMIN_DPP_SHR
		      || unspec == UNSPEC_UMAX_DPP_SHR)
		     && (mode == V64DImode
			 || mode == V64DFmode))
		    || (unspec == UNSPEC_PLUS_DPP_SHR
			&& mode == V64DFmode));
  rtx_code code = (unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
		   : unspec == UNSPEC_SMAX_DPP_SHR ? SMAX
		   : unspec == UNSPEC_UMIN_DPP_SHR ? UMIN
		   : unspec == UNSPEC_UMAX_DPP_SHR ? UMAX
		   : unspec == UNSPEC_PLUS_DPP_SHR ? PLUS
		   : UNKNOWN);
  bool use_extends = ((unspec == UNSPEC_SMIN_DPP_SHR
		       || unspec == UNSPEC_SMAX_DPP_SHR
		       || unspec == UNSPEC_UMIN_DPP_SHR
		       || unspec == UNSPEC_UMAX_DPP_SHR)
		      && (mode == V64QImode
			  || mode == V64HImode));
  bool unsignedp = (unspec == UNSPEC_UMIN_DPP_SHR
		    || unspec == UNSPEC_UMAX_DPP_SHR);
  bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
			&& GET_MODE_CLASS (mode) == MODE_VECTOR_INT
			&& (TARGET_GCN3 || mode == V64DImode);

  if (use_plus_carry)
    unspec = UNSPEC_PLUS_CARRY_DPP_SHR;

  if (use_extends)
    {
      rtx tmp = gen_reg_rtx (V64SImode);
      convert_move (tmp, src, unsignedp);
      src = tmp;
      mode = V64SImode;
    }

  /* Perform reduction by first performing the reduction operation on every
     pair of lanes, then on every pair of results from the previous
     iteration (thereby effectively reducing every 4 lanes) and so on until
     all lanes are reduced.  */
  rtx in, out = force_reg (mode, src);
  for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
    {
      rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
      in = out;
      out = gen_reg_rtx (mode);

      if (use_moves)
	{
	  rtx tmp = gen_reg_rtx (mode);
	  emit_insn (gen_dpp_move (mode, tmp, in, shift_val));
	  emit_insn (gen_rtx_SET (out, gen_rtx_fmt_ee (code, mode, tmp, in)));
	}
      else
	{
	  rtx insn = gen_rtx_SET (out,
				  gen_rtx_UNSPEC (mode,
						  gen_rtvec (3, in, in,
							     shift_val),
						  unspec));

	  /* Add clobber for instructions that set the carry flags.  */
	  if (use_plus_carry)
	    {
	      rtx clobber = gen_rtx_CLOBBER (VOIDmode,
					     gen_rtx_REG (DImode, VCC_REG));
	      insn = gen_rtx_PARALLEL (VOIDmode,
				       gen_rtvec (2, insn, clobber));
	    }

	  emit_insn (insn);
	}
    }

  if (use_extends)
    {
      rtx tmp = gen_reg_rtx (orig_mode);
      convert_move (tmp, out, unsignedp);
      out = tmp;
    }

  return out;
}
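
/* To make the reduction scheme concrete: for a V64SImode PLUS reduction,
   the loop above emits six DPP steps with shifts 1, 2, 4, 8, 16 and 32.
   After the shift-1 step each lane holds the sum of a pair of lanes, after
   the shift-2 step the sum of four lanes, and so on; after the final step
   lane 63 holds the sum of all 64 lanes, which is the scalar result the
   caller extracts.  */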

/* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST.  */

int
gcn_vectorization_cost (enum vect_cost_for_stmt ARG_UNUSED (type_of_cost),
			tree ARG_UNUSED (vectype), int ARG_UNUSED (misalign))
{
  /* Always vectorize.  */
  return 1;
}

/* }}}  */
/* {{{ md_reorg pass.  */

/* Identify VMEM instructions from their "type" attribute.  */

static bool
gcn_vmem_insn_p (attr_type type)
{
  switch (type)
    {
    case TYPE_MUBUF:
    case TYPE_MTBUF:
    case TYPE_FLAT:
      return true;
    case TYPE_UNKNOWN:
    case TYPE_SOP1:
    case TYPE_SOP2:
    case TYPE_SOPK:
    case TYPE_SOPC:
    case TYPE_SOPP:
    case TYPE_SMEM:
    case TYPE_DS:
    case TYPE_VOP2:
    case TYPE_VOP1:
    case TYPE_VOPC:
    case TYPE_VOP3A:
    case TYPE_VOP3B:
    case TYPE_VOP_SDWA:
    case TYPE_VOP_DPP:
    case TYPE_MULT:
    case TYPE_VMULT:
      return false;
    }
  gcc_unreachable ();
  return false;
}

/* If INSN sets the EXEC register to a constant value, return the value,
   otherwise return zero.  */

static int64_t
gcn_insn_exec_value (rtx_insn *insn)
{
  if (!NONDEBUG_INSN_P (insn))
    return 0;

  rtx pattern = PATTERN (insn);

  if (GET_CODE (pattern) == SET)
    {
      rtx dest = XEXP (pattern, 0);
      rtx src = XEXP (pattern, 1);

      if (GET_MODE (dest) == DImode
	  && REG_P (dest) && REGNO (dest) == EXEC_REG
	  && CONST_INT_P (src))
	return INTVAL (src);
    }

  return 0;
}

/* Sets the EXEC register before INSN to the value that it had after
   LAST_EXEC_DEF.  The constant value of the EXEC register is returned if
   known, otherwise it returns zero.  */

static int64_t
gcn_restore_exec (rtx_insn *insn, rtx_insn *last_exec_def, int64_t curr_exec,
		  bool curr_exec_known, bool &last_exec_def_saved)
{
  rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
  rtx exec;

  int64_t exec_value = gcn_insn_exec_value (last_exec_def);

  if (exec_value)
    {
      /* If the EXEC value is a constant and it happens to be the same as the
         current EXEC value, the restore can be skipped.  */
      if (curr_exec_known && exec_value == curr_exec)
	return exec_value;

      exec = GEN_INT (exec_value);
    }
  else
    {
      /* If the EXEC value is not a constant, save it in a register after the
	 point of definition.  */
      rtx exec_save_reg = gen_rtx_REG (DImode, EXEC_SAVE_REG);

      if (!last_exec_def_saved)
	{
	  start_sequence ();
	  emit_move_insn (exec_save_reg, exec_reg);
	  rtx_insn *seq = get_insns ();
	  end_sequence ();

	  emit_insn_after (seq, last_exec_def);
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "Saving EXEC after insn %d.\n",
		     INSN_UID (last_exec_def));

	  last_exec_def_saved = true;
	}

      exec = exec_save_reg;
    }

  /* Restore EXEC register before the usage.  */
  start_sequence ();
  emit_move_insn (exec_reg, exec);
  rtx_insn *seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      if (exec_value)
	fprintf (dump_file, "Restoring EXEC to %ld before insn %d.\n",
		 exec_value, INSN_UID (insn));
      else
	fprintf (dump_file,
		 "Restoring EXEC from saved value before insn %d.\n",
		 INSN_UID (insn));
    }

  return exec_value;
}

/* Implement TARGET_MACHINE_DEPENDENT_REORG.

   Ensure that pipeline dependencies and lane masking are set correctly.  */

static void
gcn_md_reorg (void)
{
  basic_block bb;
  rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
  regset_head live;

  INIT_REG_SET (&live);

  compute_bb_for_insn ();

  if (!optimize)
    {
      split_all_insns ();
      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  fprintf (dump_file, "After split:\n");
	  print_rtl_with_bb (dump_file, get_insns (), dump_flags);
	}

      /* Update data-flow information for split instructions.  */
      df_insn_rescan_all ();
    }

  df_live_add_problem ();
  df_live_set_all_dirty ();
  df_analyze ();

  /* This pass ensures that the EXEC register is set correctly, according
     to the "exec" attribute.  However, care must be taken so that the
     value that reaches explicit uses of the EXEC register remains the
     same as before.  */

  FOR_EACH_BB_FN (bb, cfun)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "BB %d:\n", bb->index);

      rtx_insn *insn, *curr;
      rtx_insn *last_exec_def = BB_HEAD (bb);
      bool last_exec_def_saved = false;
      bool curr_exec_explicit = true;
      bool curr_exec_known = true;
      int64_t curr_exec = 0;	/* 0 here means 'the value is that of EXEC
				   after last_exec_def is executed'.  */

      bitmap live_in = DF_LR_IN (bb);
      bool exec_live_on_entry = false;
      if (bitmap_bit_p (live_in, EXEC_LO_REG)
	  || bitmap_bit_p (live_in, EXEC_HI_REG))
	{
	  if (dump_file)
	    fprintf (dump_file, "EXEC reg is live on entry to block %d\n",
		     (int) bb->index);
	  exec_live_on_entry = true;
	}

      FOR_BB_INSNS_SAFE (bb, insn, curr)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;

	  if (GET_CODE (PATTERN (insn)) == USE
	      || GET_CODE (PATTERN (insn)) == CLOBBER)
	    continue;

	  HARD_REG_SET defs, uses;
	  CLEAR_HARD_REG_SET (defs);
	  CLEAR_HARD_REG_SET (uses);
	  note_stores (insn, record_hard_reg_sets, &defs);
	  note_uses (&PATTERN (insn), record_hard_reg_uses, &uses);

	  bool exec_lo_def_p = TEST_HARD_REG_BIT (defs, EXEC_LO_REG);
	  bool exec_hi_def_p = TEST_HARD_REG_BIT (defs, EXEC_HI_REG);
	  bool exec_used = (hard_reg_set_intersect_p
			    (uses, reg_class_contents[(int) EXEC_MASK_REG])
			    || TEST_HARD_REG_BIT (uses, EXECZ_REG));

	  /* Check the instruction for implicit setting of EXEC via an
	     attribute.  */
	  attr_exec exec_attr = get_attr_exec (insn);
	  int64_t new_exec;

	  switch (exec_attr)
	    {
	    case EXEC_NONE:
	      new_exec = 0;
	      break;

	    case EXEC_SINGLE:
	      /* Instructions that do not involve memory accesses only require
		 bit 0 of EXEC to be set.  */
	      if (gcn_vmem_insn_p (get_attr_type (insn))
		  || get_attr_type (insn) == TYPE_DS)
		new_exec = 1;
	      else
		new_exec = curr_exec | 1;
	      break;

	    case EXEC_FULL:
	      new_exec = -1;
	      break;

	    default:  /* Auto-detect what setting is appropriate.  */
	      {
		new_exec = 0;

		/* If EXEC is referenced explicitly then we don't need to do
		   anything to set it, so we're done.  */
		if (exec_used)
		  break;

		/* Scan the insn for VGPRs defs or uses.  The mode determines
		   what kind of exec is needed.  */
		subrtx_iterator::array_type array;
		FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
		  {
		    const_rtx x = *iter;
		    if (REG_P (x) && VGPR_REGNO_P (REGNO (x)))
		      {
			if (VECTOR_MODE_P (GET_MODE (x)))
			  {
			    new_exec = -1;
			    break;
			  }
			else
			  new_exec = 1;
		      }
		  }
	      }
	      break;
	    }

	  if (new_exec && (!curr_exec_known || new_exec != curr_exec))
	    {
	      start_sequence ();
	      emit_move_insn (exec_reg, GEN_INT (new_exec));
	      rtx_insn *seq = get_insns ();
	      end_sequence ();
	      emit_insn_before (seq, insn);

	      if (dump_file && (dump_flags & TDF_DETAILS))
		fprintf (dump_file, "Setting EXEC to %ld before insn %d.\n",
			 new_exec, INSN_UID (insn));

	      curr_exec = new_exec;
	      curr_exec_explicit = false;
	      curr_exec_known = true;
	    }
	  else if (new_exec && dump_file && (dump_flags & TDF_DETAILS))
	    {
	      fprintf (dump_file, "Exec already is %ld before insn %d.\n",
		       new_exec, INSN_UID (insn));
	    }

	  /* The state of the EXEC register is unknown after a
	     function call.  */
	  if (CALL_P (insn))
	    curr_exec_known = false;

	  /* Handle explicit uses of EXEC.  If the instruction is a partial
	     explicit definition of EXEC, then treat it as an explicit use of
	     EXEC as well.  */
	  if (exec_used || exec_lo_def_p != exec_hi_def_p)
	    {
	      /* An instruction that explicitly uses EXEC should not also
		 implicitly define it.  */
	      gcc_assert (!exec_used || !new_exec);

	      if (!curr_exec_known || !curr_exec_explicit)
		{
		  /* Restore the previous explicitly defined value.  */
		  curr_exec = gcn_restore_exec (insn, last_exec_def,
						curr_exec, curr_exec_known,
						last_exec_def_saved);
		  curr_exec_explicit = true;
		  curr_exec_known = true;
		}
	    }

	  /* Handle explicit definitions of EXEC.  */
	  if (exec_lo_def_p || exec_hi_def_p)
	    {
	      last_exec_def = insn;
	      last_exec_def_saved = false;
	      curr_exec = gcn_insn_exec_value (insn);
	      curr_exec_explicit = true;
	      curr_exec_known = true;

	      if (dump_file && (dump_flags & TDF_DETAILS))
		fprintf (dump_file,
			 "Found %s definition of EXEC at insn %d.\n",
			 exec_lo_def_p == exec_hi_def_p ? "full" : "partial",
			 INSN_UID (insn));
	    }

	  exec_live_on_entry = false;
	}

      COPY_REG_SET (&live, DF_LR_OUT (bb));
      df_simulate_initialize_backwards (bb, &live);

      /* If EXEC is live after the basic block, restore the value of EXEC
	 at the end of the block.  */
      if ((REGNO_REG_SET_P (&live, EXEC_LO_REG)
	   || REGNO_REG_SET_P (&live, EXEC_HI_REG))
	  && (!curr_exec_known || !curr_exec_explicit || exec_live_on_entry))
	{
	  rtx_insn *end_insn = BB_END (bb);

	  /* If the instruction is not a jump instruction, do the restore
	     after the last instruction in the basic block.  */
	  if (NONJUMP_INSN_P (end_insn))
	    end_insn = NEXT_INSN (end_insn);

	  gcn_restore_exec (end_insn, last_exec_def, curr_exec,
			    curr_exec_known, last_exec_def_saved);
	}
    }

  CLEAR_REG_SET (&live);

  /* "Manually Inserted Wait States (NOPs)."

     GCN hardware detects most kinds of register dependencies, but there
     are some exceptions documented in the ISA manual.  This pass
     detects the missed cases, and inserts the documented number of NOPs
     required for correct execution.  */
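
  /* For example, a VALU instruction that writes an SGPR pair which the very
     next instruction uses as the address base of a FLAT load falls into the
     first case handled below: NOPs are inserted so that at least 5 wait
     states separate the two instructions.  */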

  const int max_waits = 5;
  struct ilist
  {
    rtx_insn *insn;
    attr_unit unit;
    attr_delayeduse delayeduse;
    HARD_REG_SET writes;
    HARD_REG_SET reads;
    int age;
  } back[max_waits];
  int oldest = 0;
  for (int i = 0; i < max_waits; i++)
    back[i].insn = NULL;

  rtx_insn *insn, *last_insn = NULL;
  for (insn = get_insns (); insn != 0; insn = NEXT_INSN (insn))
    {
      if (!NONDEBUG_INSN_P (insn))
	continue;

      if (GET_CODE (PATTERN (insn)) == USE
	  || GET_CODE (PATTERN (insn)) == CLOBBER)
	continue;

      attr_type itype = get_attr_type (insn);
      attr_unit iunit = get_attr_unit (insn);
      attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
      HARD_REG_SET ireads, iwrites;
      CLEAR_HARD_REG_SET (ireads);
      CLEAR_HARD_REG_SET (iwrites);
      note_stores (insn, record_hard_reg_sets, &iwrites);
      note_uses (&PATTERN (insn), record_hard_reg_uses, &ireads);

      /* Scan recent previous instructions for dependencies not handled in
         hardware.  */
      int nops_rqd = 0;
      for (int i = oldest; i < oldest + max_waits; i++)
	{
	  struct ilist *prev_insn = &back[i % max_waits];

	  if (!prev_insn->insn)
	    continue;

	  /* VALU writes SGPR followed by VMEM reading the same SGPR
	     requires 5 wait states.  */
	  if ((prev_insn->age + nops_rqd) < 5
	      && prev_insn->unit == UNIT_VECTOR
	      && gcn_vmem_insn_p (itype))
	    {
	      HARD_REG_SET regs = prev_insn->writes & ireads;
	      if (hard_reg_set_intersect_p
		  (regs, reg_class_contents[(int) SGPR_REGS]))
		nops_rqd = 5 - prev_insn->age;
	    }

	  /* VALU sets VCC/EXEC followed by VALU uses VCCZ/EXECZ
	     requires 5 wait states.  */
	  if ((prev_insn->age + nops_rqd) < 5
	      && prev_insn->unit == UNIT_VECTOR
	      && iunit == UNIT_VECTOR
	      && ((hard_reg_set_intersect_p
		   (prev_insn->writes,
		    reg_class_contents[(int) EXEC_MASK_REG])
		   && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
		  ||
		  (hard_reg_set_intersect_p
		   (prev_insn->writes,
		    reg_class_contents[(int) VCC_CONDITIONAL_REG])
		   && TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
	    nops_rqd = 5 - prev_insn->age;

	  /* VALU writes SGPR/VCC followed by v_{read,write}lane using
	     SGPR/VCC as lane select requires 4 wait states.  */
	  if ((prev_insn->age + nops_rqd) < 4
	      && prev_insn->unit == UNIT_VECTOR
	      && get_attr_laneselect (insn) == LANESELECT_YES)
	    {
	      HARD_REG_SET regs = prev_insn->writes & ireads;
	      if (hard_reg_set_intersect_p
		  (regs, reg_class_contents[(int) SGPR_REGS])
		  || hard_reg_set_intersect_p
		     (regs, reg_class_contents[(int) VCC_CONDITIONAL_REG]))
		nops_rqd = 4 - prev_insn->age;
	    }

	  /* VALU writes VGPR followed by VALU_DPP reading that VGPR
	     requires 2 wait states.  */
	  if ((prev_insn->age + nops_rqd) < 2
	      && prev_insn->unit == UNIT_VECTOR
	      && itype == TYPE_VOP_DPP)
	    {
	      HARD_REG_SET regs = prev_insn->writes & ireads;
	      if (hard_reg_set_intersect_p
		  (regs, reg_class_contents[(int) VGPR_REGS]))
		nops_rqd = 2 - prev_insn->age;
	    }

	  /* A store whose input registers must not be overwritten by the
	     following instruction requires 1 wait state.  */
	  if ((prev_insn->age + nops_rqd) < 1
	      && prev_insn->delayeduse == DELAYEDUSE_YES
	      && ((hard_reg_set_intersect_p
		   (prev_insn->reads, iwrites))))
	    nops_rqd = 1 - prev_insn->age;
	}

      /* Insert the required number of NOPs.  */
      for (int i = nops_rqd; i > 0; i--)
	emit_insn_after (gen_nop (), last_insn);

      /* Age the previous instructions.  We can also ignore writes to
         registers subsequently overwritten.  */
      HARD_REG_SET written;
      CLEAR_HARD_REG_SET (written);
      for (int i = oldest + max_waits - 1; i > oldest; i--)
	{
	  struct ilist *prev_insn = &back[i % max_waits];

	  /* Assume all instructions are equivalent to one "wait", the same
	     as s_nop.  This is probably true for SALU, but not VALU (which
	     may take longer), so this is not optimal.  However, AMD do
	     not publish the cycle times for instructions.  */
	  prev_insn->age += 1 + nops_rqd;

	  written |= iwrites;
	  prev_insn->writes &= ~written;
	}

      /* Track the current instruction as a previous instruction.  */
      back[oldest].insn = insn;
      back[oldest].unit = iunit;
      back[oldest].delayeduse = idelayeduse;
      back[oldest].writes = iwrites;
      back[oldest].reads = ireads;
      back[oldest].age = 0;
      oldest = (oldest + 1) % max_waits;

      last_insn = insn;
    }
}

/* }}}  */
/* {{{ OpenACC / OpenMP.  */

#define GCN_DEFAULT_GANGS 0	/* Choose at runtime.  */
#define GCN_DEFAULT_WORKERS 0	/* Choose at runtime.  */
#define GCN_DEFAULT_VECTORS 1	/* Use autovectorization only, for now.  */

/* Implement TARGET_GOACC_VALIDATE_DIMS.

   Check the launch dimensions provided for an OpenACC compute
   region, or routine.  */

static bool
gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
			 unsigned /*used*/)
{
  bool changed = false;
  const int max_workers = 16;

  /* The vector size must appear to be 64, to the user, unless this is a
     SEQ routine.  The real, internal value is always 1, which means use
     autovectorization, but the user should not see that.  */
  if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
      && dims[GOMP_DIM_VECTOR] >= 0)
    {
      if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0
	  && dims[GOMP_DIM_VECTOR] != 64)
	warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
		    OPT_Wopenacc_dims,
		    (dims[GOMP_DIM_VECTOR]
		     ? G_("using %<vector_length (64)%>, ignoring %d")
		     : G_("using %<vector_length (64)%>, "
			  "ignoring runtime setting")),
		    dims[GOMP_DIM_VECTOR]);
      dims[GOMP_DIM_VECTOR] = 1;
      changed = true;
    }

  /* Check the num workers is not too large.  */
  if (dims[GOMP_DIM_WORKER] > max_workers)
    {
      warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
		  OPT_Wopenacc_dims,
		  "using %<num_workers (%d)%>, ignoring %d",
		  max_workers, dims[GOMP_DIM_WORKER]);
      dims[GOMP_DIM_WORKER] = max_workers;
      changed = true;
    }

  /* Set global defaults.  */
  if (!decl)
    {
      dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS;
      if (dims[GOMP_DIM_WORKER] < 0)
	dims[GOMP_DIM_WORKER] = GCN_DEFAULT_WORKERS;
      if (dims[GOMP_DIM_GANG] < 0)
	dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS;
      changed = true;
    }

  return changed;
}
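
/* The effect on user code: a construct requesting, say, vector_length(128)
   and num_workers(32) is diagnosed with "using vector_length (64), ignoring
   128" and "using num_workers (16), ignoring 32", the worker dimension is
   clamped to 16, and the vector dimension is set to the internal value of 1
   (autovectorization).  (Illustrative values only; the exact wording comes
   from the warnings above.)  */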
5086
5087/* Helper function for oacc_dim_size instruction.
5088   Also used for OpenMP, via builtin_gcn_dim_size, and the omp_gcn pass.  */
5089
5090rtx
5091gcn_oacc_dim_size (int dim)
5092{
5093  if (dim < 0 || dim > 2)
5094    error ("offload dimension out of range (%d)", dim);
5095
5096  /* Vectors are a special case.  */
5097  if (dim == 2)
5098    return const1_rtx;		/* Think of this as 1 times 64.  */
5099
5100  static int offset[] = {
5101    /* Offsets into dispatch packet.  */
5102    12,				/* X dim = Gang / Team / Work-group.  */
5103    20,				/* Z dim = Worker / Thread / Wavefront.  */
5104    16				/* Y dim = Vector / SIMD / Work-item.  */
5105  };
5106  rtx addr = gen_rtx_PLUS (DImode,
5107			   gen_rtx_REG (DImode,
5108					cfun->machine->args.
5109					reg[DISPATCH_PTR_ARG]),
5110			   GEN_INT (offset[dim]));
5111  return gen_rtx_MEM (SImode, addr);
5112}
5113
5114/* Helper function for oacc_dim_pos instruction.
5115   Also used for OpenMP, via builtin_gcn_dim_pos, and the omp_gcn pass.  */
5116
5117rtx
5118gcn_oacc_dim_pos (int dim)
5119{
5120  if (dim < 0 || dim > 2)
5121    error ("offload dimension out of range (%d)", dim);
5122
5123  static const int reg[] = {
5124    WORKGROUP_ID_X_ARG,		/* Gang / Team / Work-group.  */
5125    WORK_ITEM_ID_Z_ARG,		/* Worker / Thread / Wavefront.  */
5126    WORK_ITEM_ID_Y_ARG		/* Vector / SIMD / Work-item.  */
5127  };
5128
5129  int reg_num = cfun->machine->args.reg[reg[dim]];
5130
5131  /* The information must have been requested by the kernel.  */
5132  gcc_assert (reg_num >= 0);
5133
5134  return gen_rtx_REG (SImode, reg_num);
5135}
5136
5137/* Implement TARGET_GOACC_FORK_JOIN.  */
5138
5139static bool
5140gcn_fork_join (gcall *call, const int dims[], bool is_fork)
5141{
5142  tree arg = gimple_call_arg (call, 2);
5143  unsigned axis = TREE_INT_CST_LOW (arg);
5144
5145  if (!is_fork && axis == GOMP_DIM_WORKER && dims[axis] != 1)
5146    return true;
5147
5148  return false;
5149}
5150
5151/* Implement ???????
5152   FIXME make this a real hook.
5153
5154   Adjust FNDECL such that options inherited from the host compiler
5155   are made appropriate for the accelerator compiler.  */
5156
5157void
5158gcn_fixup_accel_lto_options (tree fndecl)
5159{
5160  tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5161  if (!func_optimize)
5162    return;
5163
5164  tree old_optimize
5165    = build_optimization_node (&global_options, &global_options_set);
5166  tree new_optimize;
5167
5168  /* If the function changed the optimization levels as well as
5169     setting target options, start with the optimizations
5170     specified.  */
5171  if (func_optimize != old_optimize)
5172    cl_optimization_restore (&global_options, &global_options_set,
5173			     TREE_OPTIMIZATION (func_optimize));
5174
5175  gcn_option_override ();
5176
5177  /* The target attributes may also change some optimization flags,
5178     so update the optimization options if necessary.  */
5179  new_optimize = build_optimization_node (&global_options,
5180					  &global_options_set);
5181
5182  if (old_optimize != new_optimize)
5183    {
5184      DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5185      cl_optimization_restore (&global_options, &global_options_set,
5186			       TREE_OPTIMIZATION (old_optimize));
5187    }
5188}
5189
5190/* Implement TARGET_GOACC_SHARED_MEM_LAYOUT hook.  */
5191
5192static void
5193gcn_shared_mem_layout (unsigned HOST_WIDE_INT *lo,
5194		       unsigned HOST_WIDE_INT *hi,
5195		       int ARG_UNUSED (dims[GOMP_DIM_MAX]),
5196		       unsigned HOST_WIDE_INT
5197			 ARG_UNUSED (private_size[GOMP_DIM_MAX]),
5198		       unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX])
5199{
5200  *lo = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
5201  /* !!! We can maybe use dims[] to estimate the maximum number of work
5202     groups/wavefronts/etc. we will launch, and therefore tune the maximum
5203     amount of LDS we should use.  For now, use a minimal amount to try to
5204     maximise occupancy.  */
5205  *hi = acc_lds_size;
5206  machine_function *machfun = cfun->machine;
5207  machfun->reduction_base = gang_private_size_opt;
5208  machfun->reduction_limit
5209    = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
5210}
5211
5212/* }}}  */
5213/* {{{ ASM Output.  */
5214
5215/*  Implement TARGET_ASM_FILE_START.
5216
5217    Print assembler file header text.  */
5218
5219static void
5220output_file_start (void)
5221{
5222  const char *cpu;
5223  bool use_xnack_attr = true;
5224  bool use_sram_attr = true;
5225  switch (gcn_arch)
5226    {
5227    case PROCESSOR_FIJI:
5228      cpu = "gfx803";
5229#ifndef HAVE_GCN_XNACK_FIJI
5230      use_xnack_attr = false;
5231#endif
5232      use_sram_attr = false;
5233      break;
5234    case PROCESSOR_VEGA10:
5235      cpu = "gfx900";
5236#ifndef HAVE_GCN_XNACK_GFX900
5237      use_xnack_attr = false;
5238#endif
5239      use_sram_attr = false;
5240      break;
5241    case PROCESSOR_VEGA20:
5242      cpu = "gfx906";
5243#ifndef HAVE_GCN_XNACK_GFX906
5244      use_xnack_attr = false;
5245#endif
5246      use_sram_attr = false;
5247      break;
5248    case PROCESSOR_GFX908:
5249      cpu = "gfx908";
5250#ifndef HAVE_GCN_XNACK_GFX908
5251      use_xnack_attr = false;
5252#endif
5253#ifndef HAVE_GCN_SRAM_ECC_GFX908
5254      use_sram_attr = false;
5255#endif
5256      break;
5257    default: gcc_unreachable ();
5258    }
5259
5260#if HAVE_GCN_ASM_V3_SYNTAX
5261  const char *xnack = (flag_xnack ? "+xnack" : "");
5262  const char *sram_ecc = (flag_sram_ecc ? "+sram-ecc" : "");
5263#endif
5264#if HAVE_GCN_ASM_V4_SYNTAX
5265  /* In HSACOv4 no attribute setting means the binary supports "any" hardware
5266     configuration.  In GCC binaries, this is true for SRAM ECC, but not
5267     XNACK.  */
5268  const char *xnack = (flag_xnack ? ":xnack+" : ":xnack-");
5269  const char *sram_ecc = (flag_sram_ecc == SRAM_ECC_ON ? ":sramecc+"
5270			  : flag_sram_ecc == SRAM_ECC_OFF ? ":sramecc-"
5271			  : "");
5272#endif
5273  if (!use_xnack_attr)
5274    xnack = "";
5275  if (!use_sram_attr)
5276    sram_ecc = "";
5277
5278  fprintf (asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s%s%s\"\n",
5279	  cpu,
5280#if HAVE_GCN_ASM_V3_SYNTAX
5281	  xnack, sram_ecc
5282#endif
5283#ifdef HAVE_GCN_ASM_V4_SYNTAX
5284	  sram_ecc, xnack
5285#endif
5286	  );
5287}
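
/* Example of the directive emitted above (illustrative only; the exact
   string depends on the target and on which attributes the configured
   assembler accepts): a gfx908 compile with SRAM ECC enabled and XNACK
   disabled under the HSACOv4 syntax produces

     .amdgcn_target "amdgcn-unknown-amdhsa--gfx908:sramecc+:xnack-"

   whereas the older HSACOv3 syntax appends "+xnack" / "+sram-ecc"
   instead.  */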
5288
5289/* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
5290
5291   Print the initial definition of a function name.
5292
5293   For GCN kernel entry points this includes all the HSA meta-data, special
5294   alignment constraints that don't apply to regular functions, and magic
5295   comments that pass information to mkoffload.  */
5296
5297void
5298gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
5299{
5300  int sgpr, vgpr;
5301  bool xnack_enabled = false;
5302
5303  fputs ("\n\n", file);
5304
5305  if (cfun && cfun->machine && cfun->machine->normal_function)
5306    {
5307      fputs ("\t.type\t", file);
5308      assemble_name (file, name);
5309      fputs (",@function\n", file);
5310      assemble_name (file, name);
5311      fputs (":\n", file);
5312      return;
5313    }
5314
5315  /* Determine the count of sgpr/vgpr registers by looking for the last
5316     one used.  */
5317  for (sgpr = 101; sgpr >= 0; sgpr--)
5318    if (df_regs_ever_live_p (FIRST_SGPR_REG + sgpr))
5319      break;
5320  sgpr++;
5321  for (vgpr = 255; vgpr >= 0; vgpr--)
5322    if (df_regs_ever_live_p (FIRST_VGPR_REG + vgpr))
5323      break;
5324  vgpr++;
5325
5326  if (!leaf_function_p ())
5327    {
5328      /* We can't know how many registers function calls might use.  */
5329      if (vgpr < MAX_NORMAL_VGPR_COUNT)
5330	vgpr = MAX_NORMAL_VGPR_COUNT;
5331      if (sgpr < MAX_NORMAL_SGPR_COUNT)
5332	sgpr = MAX_NORMAL_SGPR_COUNT;
5333    }
5334
5335  fputs ("\t.rodata\n"
5336	 "\t.p2align\t6\n"
5337	 "\t.amdhsa_kernel\t", file);
5338  assemble_name (file, name);
5339  fputs ("\n", file);
5340  int reg = FIRST_SGPR_REG;
5341  for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
5342    {
5343      int reg_first = -1;
5344      int reg_last;
5345      if ((cfun->machine->args.requested & (1 << a))
5346	  && (gcn_kernel_arg_types[a].fixed_regno < 0))
5347	{
5348	  reg_first = reg;
5349	  reg_last = (reg_first
5350		      + (GET_MODE_SIZE (gcn_kernel_arg_types[a].mode)
5351			 / UNITS_PER_WORD) - 1);
5352	  reg = reg_last + 1;
5353	}
5354
5355      if (gcn_kernel_arg_types[a].header_pseudo)
5356	{
5357	  fprintf (file, "\t  %s%s\t%i",
5358		   (cfun->machine->args.requested & (1 << a)) != 0 ? "" : ";",
5359		   gcn_kernel_arg_types[a].header_pseudo,
5360		   (cfun->machine->args.requested & (1 << a)) != 0);
5361	  if (reg_first != -1)
5362	    {
5363	      fprintf (file, " ; (");
5364	      for (int i = reg_first; i <= reg_last; ++i)
5365		{
5366		  if (i != reg_first)
5367		    fprintf (file, ", ");
5368		  fprintf (file, "%s", reg_names[i]);
5369		}
5370	      fprintf (file, ")");
5371	    }
5372	  fprintf (file, "\n");
5373	}
5374      else if (gcn_kernel_arg_types[a].fixed_regno >= 0
5375	       && cfun->machine->args.requested & (1 << a))
5376	fprintf (file, "\t  ; %s\t%i (%s)\n",
5377		 gcn_kernel_arg_types[a].name,
5378		 (cfun->machine->args.requested & (1 << a)) != 0,
5379		 reg_names[gcn_kernel_arg_types[a].fixed_regno]);
5380    }
5381  fprintf (file, "\t  .amdhsa_system_vgpr_workitem_id\t%i\n",
5382	   (cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG))
5383	   ? 2
5384	   : cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG)
5385	   ? 1 : 0);
5386  fprintf (file,
5387	   "\t  .amdhsa_next_free_vgpr\t%i\n"
5388	   "\t  .amdhsa_next_free_sgpr\t%i\n"
5389	   "\t  .amdhsa_reserve_vcc\t1\n"
5390	   "\t  .amdhsa_reserve_flat_scratch\t0\n"
5391	   "\t  .amdhsa_reserve_xnack_mask\t%i\n"
5392	   "\t  .amdhsa_private_segment_fixed_size\t%i\n"
5393	   "\t  .amdhsa_group_segment_fixed_size\t%u\n"
5394	   "\t  .amdhsa_float_denorm_mode_32\t3\n"
5395	   "\t  .amdhsa_float_denorm_mode_16_64\t3\n",
5396	   vgpr,
5397	   sgpr,
5398	   xnack_enabled,
5399	   /* workitem_private_segment_bytes_size needs to be
5400	      one 64th the wave-front stack size.  */
5401	   stack_size_opt / 64,
5402	   LDS_SIZE);
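  /* As a worked example of the 1/64 rule above (numbers purely
     illustrative): with a hypothetical -mstack-size=65536 the wave-front
     stack is 65536 bytes, so each of the 64 lanes receives
     65536 / 64 = 1024 bytes of private segment.  */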
5403  fputs ("\t.end_amdhsa_kernel\n", file);
5404
5405#if 1
5406  /* The following is YAML embedded in assembler; tabs are not allowed.  */
5407  fputs ("        .amdgpu_metadata\n"
5408	 "        amdhsa.version:\n"
5409	 "          - 1\n"
5410	 "          - 0\n"
5411	 "        amdhsa.kernels:\n"
5412	 "          - .name: ", file);
5413  assemble_name (file, name);
5414  fputs ("\n            .symbol: ", file);
5415  assemble_name (file, name);
5416  fprintf (file,
5417	   ".kd\n"
5418	   "            .kernarg_segment_size: %i\n"
5419	   "            .kernarg_segment_align: %i\n"
5420	   "            .group_segment_fixed_size: %u\n"
5421	   "            .private_segment_fixed_size: %i\n"
5422	   "            .wavefront_size: 64\n"
5423	   "            .sgpr_count: %i\n"
5424	   "            .vgpr_count: %i\n"
5425	   "            .max_flat_workgroup_size: 1024\n",
5426	   cfun->machine->kernarg_segment_byte_size,
5427	   cfun->machine->kernarg_segment_alignment,
5428	   LDS_SIZE,
5429	   stack_size_opt / 64,
5430	   sgpr, vgpr);
5431  fputs ("        .end_amdgpu_metadata\n", file);
5432#endif
5433
5434  fputs ("\t.text\n", file);
5435  fputs ("\t.align\t256\n", file);
5436  fputs ("\t.type\t", file);
5437  assemble_name (file, name);
5438  fputs (",@function\n", file);
5439  assemble_name (file, name);
5440  fputs (":\n", file);
5441
5442  /* This comment is read by mkoffload.  */
5443  if (flag_openacc)
5444    fprintf (file, "\t;; OPENACC-DIMS: %d, %d, %d : %s\n",
5445	     oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_GANG),
5446	     oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_WORKER),
5447	     oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_VECTOR), name);
5448}
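
/* For a hypothetical OpenACC kernel compiled with dims (gang, worker,
   vector) = (32, 4, 64), the function above would therefore end with a
   line such as

     ;; OPENACC-DIMS: 32, 4, 64 : my_kernel

   which mkoffload reads back; the name and the numbers here are purely
   illustrative.  */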
5449
5450/* Implement TARGET_ASM_SELECT_SECTION.
5451
5452   Return the section into which EXP should be placed.  */
5453
5454static section *
5455gcn_asm_select_section (tree exp, int reloc, unsigned HOST_WIDE_INT align)
5456{
5457  if (TREE_TYPE (exp) != error_mark_node
5458      && TYPE_ADDR_SPACE (TREE_TYPE (exp)) == ADDR_SPACE_LDS)
5459    {
5460      if (!DECL_P (exp))
5461	return get_section (".lds_bss",
5462			    SECTION_WRITE | SECTION_BSS | SECTION_DEBUG,
5463			    NULL);
5464
5465      return get_named_section (exp, ".lds_bss", reloc);
5466    }
5467
5468  return default_elf_select_section (exp, reloc, align);
5469}
5470
5471/* Implement TARGET_ASM_FUNCTION_PROLOGUE.
5472
5473   Emits custom text into the assembler file at the head of each function.  */
5474
5475static void
5476gcn_target_asm_function_prologue (FILE *file)
5477{
5478  machine_function *offsets = gcn_compute_frame_offsets ();
5479
5480  asm_fprintf (file, "\t; using %s addressing in function\n",
5481	       offsets->use_flat_addressing ? "flat" : "global");
5482
5483  if (offsets->normal_function)
5484    {
5485      asm_fprintf (file, "\t; frame pointer needed: %s\n",
5486		   offsets->need_frame_pointer ? "true" : "false");
5487      asm_fprintf (file, "\t; lr needs saving: %s\n",
5488		   offsets->lr_needs_saving ? "true" : "false");
5489      asm_fprintf (file, "\t; outgoing args size: %wd\n",
5490		   offsets->outgoing_args_size);
5491      asm_fprintf (file, "\t; pretend size: %wd\n", offsets->pretend_size);
5492      asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5493      asm_fprintf (file, "\t; callee save size: %wd\n",
5494		   offsets->callee_saves);
5495    }
5496  else
5497    {
5498      asm_fprintf (file, "\t; HSA kernel entry point\n");
5499      asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5500      asm_fprintf (file, "\t; outgoing args size: %wd\n",
5501		   offsets->outgoing_args_size);
5502    }
5503}
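
/* For a typical named function this produces a comment block such as
   (values hypothetical):

     ; using global addressing in function
     ; frame pointer needed: false
     ; lr needs saving: true
     ; outgoing args size: 0
     ; pretend size: 0
     ; local vars size: 16
     ; callee save size: 40  */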
5504
5505/* Helper function for print_operand and print_operand_address.
5506
5507   Print a register as the assembler requires, according to mode and name.  */
5508
5509static void
5510print_reg (FILE *file, rtx x)
5511{
5512  machine_mode mode = GET_MODE (x);
5513  if (mode == BImode || mode == QImode || mode == HImode || mode == SImode
5514      || mode == HFmode || mode == SFmode
5515      || mode == V64SFmode || mode == V64SImode
5516      || mode == V64QImode || mode == V64HImode)
5517    fprintf (file, "%s", reg_names[REGNO (x)]);
5518  else if (mode == DImode || mode == V64DImode
5519	   || mode == DFmode || mode == V64DFmode)
5520    {
5521      if (SGPR_REGNO_P (REGNO (x)))
5522	fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5523		 REGNO (x) - FIRST_SGPR_REG + 1);
5524      else if (VGPR_REGNO_P (REGNO (x)))
5525	fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5526		 REGNO (x) - FIRST_VGPR_REG + 1);
5527      else if (REGNO (x) == FLAT_SCRATCH_REG)
5528	fprintf (file, "flat_scratch");
5529      else if (REGNO (x) == EXEC_REG)
5530	fprintf (file, "exec");
5531      else if (REGNO (x) == VCC_LO_REG)
5532	fprintf (file, "vcc");
5533      else
5534	fprintf (file, "[%s:%s]",
5535		 reg_names[REGNO (x)], reg_names[REGNO (x) + 1]);
5536    }
5537  else if (mode == TImode)
5538    {
5539      if (SGPR_REGNO_P (REGNO (x)))
5540	fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5541		 REGNO (x) - FIRST_SGPR_REG + 3);
5542      else if (VGPR_REGNO_P (REGNO (x)))
5543	fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5544		 REGNO (x) - FIRST_VGPR_REG + 3);
5545      else
5546	gcc_unreachable ();
5547    }
5548  else
5549    gcc_unreachable ();
5550}
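
/* Examples of the output above (register numbers are illustrative, and the
   single-register names come from reg_names[]): an SImode value in SGPR 3
   prints as "s3", a DImode value starting at VGPR 8 prints as "v[8:9]",
   and a TImode value starting at SGPR 12 prints as "s[12:15]".  */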
5551
5552/* Implement TARGET_SECTION_TYPE_FLAGS.
5553
5554   Return a set of section attributes for use by TARGET_ASM_NAMED_SECTION.  */
5555
5556static unsigned int
5557gcn_section_type_flags (tree decl, const char *name, int reloc)
5558{
5559  if (strcmp (name, ".lds_bss") == 0)
5560    return SECTION_WRITE | SECTION_BSS | SECTION_DEBUG;
5561
5562  return default_section_type_flags (decl, name, reloc);
5563}
5564
5565/* Helper function for gcn_asm_output_symbol_ref.
5566
5567   FIXME: This function is used to lay out gang-private variables in LDS
5568   on a per-CU basis.
5569   There may be cases in which gang-private variables in different compilation
5570   units could clobber each other.  In that case we should be relying on the
5571   linker to lay out gang-private LDS space, but that doesn't appear to be
5572   possible at present.  */
5573
5574static void
5575gcn_print_lds_decl (FILE *f, tree var)
5576{
5577  int *offset;
5578  if ((offset = lds_allocs.get (var)))
5579    fprintf (f, "%u", (unsigned) *offset);
5580  else
5581    {
5582      unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (var);
5583      tree type = TREE_TYPE (var);
5584      unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5585      if (size > align && size > 4 && align < 8)
5586	align = 8;
5587
5588      gang_private_hwm = ((gang_private_hwm + align - 1) & ~(align - 1));
5589
5590      lds_allocs.put (var, gang_private_hwm);
5591      fprintf (f, "%u", gang_private_hwm);
5592      gang_private_hwm += size;
5593      if (gang_private_hwm > gang_private_size_opt)
5594	error ("%d bytes of gang-private data-share memory exhausted"
5595	       " (increase with %<-mgang-private-size=%d%>, for example)",
5596	       gang_private_size_opt, gang_private_hwm);
5597    }
5598}
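
/* Sketch of the bump allocation above (offsets are hypothetical): if the
   high-water mark is currently 36 and an 8-byte variable with 4-byte
   declared alignment arrives, the size>4 rule promotes its alignment to 8,
   the mark is rounded up to 40, "40" is printed as the symbol's offset,
   and the mark advances to 48.  Exceeding -mgang-private-size triggers
   the error.  */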
5599
5600/* Implement ASM_OUTPUT_SYMBOL_REF via gcn-hsa.h.  */
5601
5602void
5603gcn_asm_output_symbol_ref (FILE *file, rtx x)
5604{
5605  tree decl;
5606  if (cfun
5607      && (decl = SYMBOL_REF_DECL (x)) != 0
5608      && TREE_CODE (decl) == VAR_DECL
5609      && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5610    {
5611      /* LDS symbols (emitted using this hook) are only used at present
5612         to propagate worker values from an active thread to neutered
5613         threads.  Use the same offset for each such block, but don't
5614         use zero because null pointers are used to identify the active
5615         thread in GOACC_single_copy_start calls.  */
5616      gcn_print_lds_decl (file, decl);
5617    }
5618  else
5619    {
5620      assemble_name (file, XSTR (x, 0));
5621      /* FIXME: See above -- this condition is unreachable.  */
5622      if (cfun
5623	  && (decl = SYMBOL_REF_DECL (x)) != 0
5624	  && TREE_CODE (decl) == VAR_DECL
5625	  && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5626	fputs ("@abs32", file);
5627    }
5628}
5629
5630/* Implement TARGET_CONSTANT_ALIGNMENT.
5631
5632   Returns the alignment in bits of a constant that is being placed in memory.
5633   CONSTANT is the constant and BASIC_ALIGN is the alignment that the object
5634   would ordinarily have.  */
5635
5636static HOST_WIDE_INT
5637gcn_constant_alignment (const_tree ARG_UNUSED (constant),
5638			HOST_WIDE_INT basic_align)
5639{
5640  return basic_align > 128 ? basic_align : 128;
5641}
5642
5643/* Implement PRINT_OPERAND_ADDRESS via gcn.h.  */
5644
5645void
5646print_operand_address (FILE *file, rtx mem)
5647{
5648  gcc_assert (MEM_P (mem));
5649
5650  rtx reg;
5651  rtx offset;
5652  addr_space_t as = MEM_ADDR_SPACE (mem);
5653  rtx addr = XEXP (mem, 0);
5654  gcc_assert (REG_P (addr) || GET_CODE (addr) == PLUS);
5655
5656  if (AS_SCRATCH_P (as))
5657    switch (GET_CODE (addr))
5658      {
5659      case REG:
5660	print_reg (file, addr);
5661	break;
5662
5663      case PLUS:
5664	reg = XEXP (addr, 0);
5665	offset = XEXP (addr, 1);
5666	print_reg (file, reg);
5667	if (GET_CODE (offset) == CONST_INT)
5668	  fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5669	else
5670	  abort ();
5671	break;
5672
5673      default:
5674	debug_rtx (addr);
5675	abort ();
5676      }
5677  else if (AS_ANY_FLAT_P (as))
5678    {
5679      if (GET_CODE (addr) == REG)
5680	print_reg (file, addr);
5681      else
5682	{
5683	  gcc_assert (TARGET_GCN5_PLUS);
5684	  print_reg (file, XEXP (addr, 0));
5685	}
5686    }
5687  else if (AS_GLOBAL_P (as))
5688    {
5689      gcc_assert (TARGET_GCN5_PLUS);
5690
5691      rtx base = addr;
5692      rtx vgpr_offset = NULL_RTX;
5693
5694      if (GET_CODE (addr) == PLUS)
5695	{
5696	  base = XEXP (addr, 0);
5697
5698	  if (GET_CODE (base) == PLUS)
5699	    {
5700	      /* (SGPR + VGPR) + CONST  */
5701	      vgpr_offset = XEXP (base, 1);
5702	      base = XEXP (base, 0);
5703	    }
5704	  else
5705	    {
5706	      rtx offset = XEXP (addr, 1);
5707
5708	      if (REG_P (offset))
5709		/* SGPR + VGPR  */
5710		vgpr_offset = offset;
5711	      else if (CONST_INT_P (offset))
5712		/* VGPR + CONST or SGPR + CONST  */
5713		;
5714	      else
5715		output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5716	    }
5717	}
5718
5719      if (REG_P (base))
5720	{
5721	  if (VGPR_REGNO_P (REGNO (base)))
5722	    print_reg (file, base);
5723	  else if (SGPR_REGNO_P (REGNO (base)))
5724	    {
5725	      /* The assembler requires a 64-bit VGPR pair here, even though
5726	         the offset should be only 32-bit.  */
5727	      if (vgpr_offset == NULL_RTX)
5728		/* In this case, the vector offset is zero, so we use the first
5729		   lane of v1, which is initialized to zero.  */
5730		{
5731		  if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED)
5732		    fprintf (file, "v1");
5733		  else
5734		    fprintf (file, "v[1:2]");
5735		}
5736	      else if (REG_P (vgpr_offset)
5737		       && VGPR_REGNO_P (REGNO (vgpr_offset)))
5738		{
5739		  if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED)
5740		    fprintf (file, "v%d",
5741			     REGNO (vgpr_offset) - FIRST_VGPR_REG);
5742		  else
5743		    fprintf (file, "v[%d:%d]",
5744			     REGNO (vgpr_offset) - FIRST_VGPR_REG,
5745			     REGNO (vgpr_offset) - FIRST_VGPR_REG + 1);
5746		}
5747	      else
5748		output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5749	    }
5750	}
5751      else
5752	output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5753    }
5754  else if (AS_ANY_DS_P (as))
5755    switch (GET_CODE (addr))
5756      {
5757      case REG:
5758	print_reg (file, addr);
5759	break;
5760
5761      case PLUS:
5762	reg = XEXP (addr, 0);
5763	print_reg (file, reg);
5764	break;
5765
5766      default:
5767	debug_rtx (addr);
5768	abort ();
5769      }
5770  else
5771    switch (GET_CODE (addr))
5772      {
5773      case REG:
5774	print_reg (file, addr);
5775	fprintf (file, ", 0");
5776	break;
5777
5778      case PLUS:
5779	reg = XEXP (addr, 0);
5780	offset = XEXP (addr, 1);
5781	print_reg (file, reg);
5782	fprintf (file, ", ");
5783	if (GET_CODE (offset) == REG)
5784	  print_reg (file, offset);
5785	else if (GET_CODE (offset) == CONST_INT)
5786	  fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5787	else
5788	  abort ();
5789	break;
5790
5791      default:
5792	debug_rtx (addr);
5793	abort ();
5794      }
5795}
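
/* As an example of the ADDR_SPACE_GLOBAL case above: an address of the
   form (plus (plus SGPR VGPR) CONST) prints only the VGPR part here,
   either "v4" or "v[4:5]" depending on whether the assembler wants a
   64-bit pair; the SGPR base and the constant offset are emitted by the
   %O handling in print_operand.  Register numbers are illustrative.  */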
5796
5797/* Implement PRINT_OPERAND via gcn.h.
5798
5799   b - print operand size as untyped operand (b8/b16/b32/b64)
5800   B - print operand size as SI/DI untyped operand (b32/b32/b32/b64)
5801   i - print operand size as untyped operand (i8/i16/i32/i64)
5802   I - print operand size as SI/DI untyped operand (i32/i32/i32/i64)
5803   u - print operand size as untyped operand (u8/u16/u32/u64)
5804   U - print operand size as SI/DI untyped operand (u32/u32/u32/u64)
5805   o - print operand size as memory access size for loads
5806       (ubyte/ushort/dword/dwordx2/dwordx3/dwordx4)
5807   s - print operand size as memory access size for stores
5808       (byte/short/dword/dwordx2/dwordx3/dwordx4)
5809   C - print conditional code for s_cbranch (_sccz/_sccnz/_vccz/_vccnz...)
5810   c - print inverse conditional code for s_cbranch
5811   D - print conditional code for s_cmp (eq_u64/lg_u64...)
5812   E - print conditional code for v_cmp (eq_u64/ne_u64...)
5813   A - print address in formatting suitable for given address space.
5814   O - print offset:n for data share operations.
5815   ^ - print "_co" suffix for GCN5 mnemonics
5816   g - print "glc", if appropriate for given MEM
5817 */
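
/* As a usage sketch (a hypothetical template written in the style of
   gcn.md, not a real pattern): "global_load%o0\t%0, %A1%O1%g1" uses %o
   for the access-size suffix, %A for the address, %O for any " gds" /
   "offset:n" text, and %g to append " glc" when the MEM is volatile.  */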
5818
5819void
5820print_operand (FILE *file, rtx x, int code)
5821{
5822  int xcode = x ? GET_CODE (x) : 0;
5823  bool invert = false;
5824  switch (code)
5825    {
5826      /* Instructions have the following suffixes.
5827         If there are two suffixes, the first is the destination type,
5828	 and the second is the source type.
5829
5830         B32 Bitfield (untyped data) 32-bit
5831         B64 Bitfield (untyped data) 64-bit
5832         F16 floating-point 16-bit
5833         F32 floating-point 32-bit (IEEE 754 single-precision float)
5834         F64 floating-point 64-bit (IEEE 754 double-precision float)
5835         I16 signed 16-bit integer
5836         I32 signed 32-bit integer
5837         I64 signed 64-bit integer
5838         U16 unsigned 16-bit integer
5839         U32 unsigned 32-bit integer
5840         U64 unsigned 64-bit integer  */
5841
5842      /* Print operand size as untyped suffix.  */
5843    case 'b':
5844      {
5845	const char *s = "";
5846	machine_mode mode = GET_MODE (x);
5847	if (VECTOR_MODE_P (mode))
5848	  mode = GET_MODE_INNER (mode);
5849	switch (GET_MODE_SIZE (mode))
5850	  {
5851	  case 1:
5852	    s = "_b8";
5853	    break;
5854	  case 2:
5855	    s = "_b16";
5856	    break;
5857	  case 4:
5858	    s = "_b32";
5859	    break;
5860	  case 8:
5861	    s = "_b64";
5862	    break;
5863	  default:
5864	    output_operand_lossage ("invalid operand %%xn code");
5865	    return;
5866	  }
5867	fputs (s, file);
5868      }
5869      return;
5870    case 'B':
5871      {
5872	const char *s = "";
5873	machine_mode mode = GET_MODE (x);
5874	if (VECTOR_MODE_P (mode))
5875	  mode = GET_MODE_INNER (mode);
5876	switch (GET_MODE_SIZE (mode))
5877	  {
5878	  case 1:
5879	  case 2:
5880	  case 4:
5881	    s = "_b32";
5882	    break;
5883	  case 8:
5884	    s = "_b64";
5885	    break;
5886	  default:
5887	    output_operand_lossage ("invalid operand %%xn code");
5888	    return;
5889	  }
5890	fputs (s, file);
5891      }
5892      return;
5893    case 'e':
5894      fputs ("sext(", file);
5895      print_operand (file, x, 0);
5896      fputs (")", file);
5897      return;
5898    case 'i':
5899    case 'I':
5900    case 'u':
5901    case 'U':
5902      {
5903	bool signed_p = code == 'i';
5904	bool min32_p = code == 'I' || code == 'U';
5905	const char *s = "";
5906	machine_mode mode = GET_MODE (x);
5907	if (VECTOR_MODE_P (mode))
5908	  mode = GET_MODE_INNER (mode);
5909	if (mode == VOIDmode)
5910	  switch (GET_CODE (x))
5911	    {
5912	    case CONST_INT:
5913	      s = signed_p ? "_i32" : "_u32";
5914	      break;
5915	    case CONST_DOUBLE:
5916	      s = "_f64";
5917	      break;
5918	    default:
5919	      output_operand_lossage ("invalid operand %%xn code");
5920	      return;
5921	    }
5922	else if (FLOAT_MODE_P (mode))
5923	  switch (GET_MODE_SIZE (mode))
5924	    {
5925	    case 2:
5926	      s = "_f16";
5927	      break;
5928	    case 4:
5929	      s = "_f32";
5930	      break;
5931	    case 8:
5932	      s = "_f64";
5933	      break;
5934	    default:
5935	      output_operand_lossage ("invalid operand %%xn code");
5936	      return;
5937	    }
5938	else if (min32_p)
5939	  switch (GET_MODE_SIZE (mode))
5940	    {
5941	    case 1:
5942	    case 2:
5943	    case 4:
5944	      s = signed_p ? "_i32" : "_u32";
5945	      break;
5946	    case 8:
5947	      s = signed_p ? "_i64" : "_u64";
5948	      break;
5949	    default:
5950	      output_operand_lossage ("invalid operand %%xn code");
5951	      return;
5952	    }
5953	else
5954	  switch (GET_MODE_SIZE (mode))
5955	    {
5956	    case 1:
5957	      s = signed_p ? "_i8" : "_u8";
5958	      break;
5959	    case 2:
5960	      s = signed_p ? "_i16" : "_u16";
5961	      break;
5962	    case 4:
5963	      s = signed_p ? "_i32" : "_u32";
5964	      break;
5965	    case 8:
5966	      s = signed_p ? "_i64" : "_u64";
5967	      break;
5968	    default:
5969	      output_operand_lossage ("invalid operand %%xn code");
5970	      return;
5971	    }
5972	fputs (s, file);
5973      }
5974      return;
5975      /* Print operand size as a memory access width suffix.  */
5976    case 'o':
5977      {
5978	const char *s = 0;
5979	switch (GET_MODE_SIZE (GET_MODE (x)))
5980	  {
5981	  case 1:
5982	    s = "_ubyte";
5983	    break;
5984	  case 2:
5985	    s = "_ushort";
5986	    break;
5987	  /* The following are full-vector variants.  */
5988	  case 64:
5989	    s = "_ubyte";
5990	    break;
5991	  case 128:
5992	    s = "_ushort";
5993	    break;
5994	  }
5995
5996	if (s)
5997	  {
5998	    fputs (s, file);
5999	    return;
6000	  }
6001
6002	/* Fall-through - the other cases for 'o' are the same as for 's'.  */
6003	gcc_fallthrough ();
6004      }
6005    case 's':
6006      {
6007	const char *s = "";
6008	switch (GET_MODE_SIZE (GET_MODE (x)))
6009	  {
6010	  case 1:
6011	    s = "_byte";
6012	    break;
6013	  case 2:
6014	    s = "_short";
6015	    break;
6016	  case 4:
6017	    s = "_dword";
6018	    break;
6019	  case 8:
6020	    s = "_dwordx2";
6021	    break;
6022	  case 12:
6023	    s = "_dwordx3";
6024	    break;
6025	  case 16:
6026	    s = "_dwordx4";
6027	    break;
6028	  case 32:
6029	    s = "_dwordx8";
6030	    break;
6031	  case 64:
6032	    s = VECTOR_MODE_P (GET_MODE (x)) ? "_byte" : "_dwordx16";
6033	    break;
6034	  /* The following are full-vector variants.  */
6035	  case 128:
6036	    s = "_short";
6037	    break;
6038	  case 256:
6039	    s = "_dword";
6040	    break;
6041	  case 512:
6042	    s = "_dwordx2";
6043	    break;
6044	  default:
6045	    output_operand_lossage ("invalid operand %%xn code");
6046	    return;
6047	  }
6048	fputs (s, file);
6049      }
6050      return;
6051    case 'A':
6052      if (xcode != MEM)
6053	{
6054	  output_operand_lossage ("invalid %%xn code");
6055	  return;
6056	}
6057      print_operand_address (file, x);
6058      return;
6059    case 'O':
6060      {
6061	if (xcode != MEM)
6062	  {
6063	    output_operand_lossage ("invalid %%xn code");
6064	    return;
6065	  }
6066	if (AS_GDS_P (MEM_ADDR_SPACE (x)))
6067	  fprintf (file, " gds");
6068
6069	rtx x0 = XEXP (x, 0);
6070	if (AS_GLOBAL_P (MEM_ADDR_SPACE (x)))
6071	  {
6072	    gcc_assert (TARGET_GCN5_PLUS);
6073
6074	    fprintf (file, ", ");
6075
6076	    rtx base = x0;
6077	    rtx const_offset = NULL_RTX;
6078
6079	    if (GET_CODE (base) == PLUS)
6080	      {
6081		rtx offset = XEXP (x0, 1);
6082		base = XEXP (x0, 0);
6083
6084		if (GET_CODE (base) == PLUS)
6085		  /* (SGPR + VGPR) + CONST  */
6086		  /* Ignore the VGPR offset for this operand.  */
6087		  base = XEXP (base, 0);
6088
6089		if (CONST_INT_P (offset))
6090		  const_offset = XEXP (x0, 1);
6091		else if (REG_P (offset))
6092		  /* SGPR + VGPR  */
6093		  /* Ignore the VGPR offset for this operand.  */
6094		  ;
6095		else
6096		  output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6097	      }
6098
6099	    if (REG_P (base))
6100	      {
6101		if (VGPR_REGNO_P (REGNO (base)))
6102		  /* The VGPR address is specified in the %A operand.  */
6103		  fprintf (file, "off");
6104		else if (SGPR_REGNO_P (REGNO (base)))
6105		  print_reg (file, base);
6106		else
6107		  output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6108	      }
6109	    else
6110	      output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6111
6112	    if (const_offset != NULL_RTX)
6113	      fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC,
6114		       INTVAL (const_offset));
6115
6116	    return;
6117	  }
6118
6119	if (GET_CODE (x0) == REG)
6120	  return;
6121	if (GET_CODE (x0) != PLUS)
6122	  {
6123	    output_operand_lossage ("invalid %%xn code");
6124	    return;
6125	  }
6126	rtx val = XEXP (x0, 1);
6127	if (GET_CODE (val) == CONST_VECTOR)
6128	  val = CONST_VECTOR_ELT (val, 0);
6129	if (GET_CODE (val) != CONST_INT)
6130	  {
6131	    output_operand_lossage ("invalid %%xn code");
6132	    return;
6133	  }
6134	fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (val));
6135
6136      }
6137      return;
6138    case 'c':
6139      invert = true;
6140      /* Fall through.  */
6141    case 'C':
6142      {
6143	const char *s;
6144	bool num = false;
6145	if ((xcode != EQ && xcode != NE) || !REG_P (XEXP (x, 0)))
6146	  {
6147	    output_operand_lossage ("invalid %%xn code");
6148	    return;
6149	  }
6150	switch (REGNO (XEXP (x, 0)))
6151	  {
6152	  case VCC_REG:
6153	  case VCCZ_REG:
6154	    s = "_vcc";
6155	    break;
6156	  case SCC_REG:
6157	    /* For some reason llvm-mc insists on scc0 instead of sccz.  */
6158	    num = true;
6159	    s = "_scc";
6160	    break;
6161	  case EXECZ_REG:
6162	    s = "_exec";
6163	    break;
6164	  default:
6165	    output_operand_lossage ("invalid %%xn code");
6166	    return;
6167	  }
6168	fputs (s, file);
6169	if (xcode == (invert ? NE : EQ))
6170	  fputc (num ? '0' : 'z', file);
6171	else
6172	  fputs (num ? "1" : "nz", file);
6173	return;
6174      }
6175    case 'D':
6176      {
6177	const char *s;
6178	bool cmp_signed = false;
6179	switch (xcode)
6180	  {
6181	  case EQ:
6182	    s = "_eq_";
6183	    break;
6184	  case NE:
6185	    s = "_lg_";
6186	    break;
6187	  case LT:
6188	    s = "_lt_";
6189	    cmp_signed = true;
6190	    break;
6191	  case LE:
6192	    s = "_le_";
6193	    cmp_signed = true;
6194	    break;
6195	  case GT:
6196	    s = "_gt_";
6197	    cmp_signed = true;
6198	    break;
6199	  case GE:
6200	    s = "_ge_";
6201	    cmp_signed = true;
6202	    break;
6203	  case LTU:
6204	    s = "_lt_";
6205	    break;
6206	  case LEU:
6207	    s = "_le_";
6208	    break;
6209	  case GTU:
6210	    s = "_gt_";
6211	    break;
6212	  case GEU:
6213	    s = "_ge_";
6214	    break;
6215	  default:
6216	    output_operand_lossage ("invalid %%xn code");
6217	    return;
6218	  }
6219	fputs (s, file);
6220	fputc (cmp_signed ? 'i' : 'u', file);
6221
6222	machine_mode mode = GET_MODE (XEXP (x, 0));
6223
6224	if (mode == VOIDmode)
6225	  mode = GET_MODE (XEXP (x, 1));
6226
6227	/* If both sides are constants, then assume the instruction is in
6228	   SImode since s_cmp can only do integer compares.  */
6229	if (mode == VOIDmode)
6230	  mode = SImode;
6231
6232	switch (GET_MODE_SIZE (mode))
6233	  {
6234	  case 4:
6235	    s = "32";
6236	    break;
6237	  case 8:
6238	    s = "64";
6239	    break;
6240	  default:
6241	    output_operand_lossage ("invalid operand %%xn code");
6242	    return;
6243	  }
6244	fputs (s, file);
6245	return;
6246      }
6247    case 'E':
6248      {
6249	const char *s;
6250	bool cmp_signed = false;
6251	machine_mode mode = GET_MODE (XEXP (x, 0));
6252
6253	if (mode == VOIDmode)
6254	  mode = GET_MODE (XEXP (x, 1));
6255
6256	/* If both sides are constants, assume the instruction is in SFmode
6257	   if either operand is floating point, otherwise assume SImode.  */
6258	if (mode == VOIDmode)
6259	  {
6260	    if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE
6261		|| GET_CODE (XEXP (x, 1)) == CONST_DOUBLE)
6262	      mode = SFmode;
6263	    else
6264	      mode = SImode;
6265	  }
6266
6267	/* Use the same format code for vector comparisons.  */
6268	if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
6269	    || GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
6270	  mode = GET_MODE_INNER (mode);
6271
6272	bool float_p = GET_MODE_CLASS (mode) == MODE_FLOAT;
6273
6274	switch (xcode)
6275	  {
6276	  case EQ:
6277	    s = "_eq_";
6278	    break;
6279	  case NE:
6280	    s = float_p ? "_neq_" : "_ne_";
6281	    break;
6282	  case LT:
6283	    s = "_lt_";
6284	    cmp_signed = true;
6285	    break;
6286	  case LE:
6287	    s = "_le_";
6288	    cmp_signed = true;
6289	    break;
6290	  case GT:
6291	    s = "_gt_";
6292	    cmp_signed = true;
6293	    break;
6294	  case GE:
6295	    s = "_ge_";
6296	    cmp_signed = true;
6297	    break;
6298	  case LTU:
6299	    s = "_lt_";
6300	    break;
6301	  case LEU:
6302	    s = "_le_";
6303	    break;
6304	  case GTU:
6305	    s = "_gt_";
6306	    break;
6307	  case GEU:
6308	    s = "_ge_";
6309	    break;
6310	  case ORDERED:
6311	    s = "_o_";
6312	    break;
6313	  case UNORDERED:
6314	    s = "_u_";
6315	    break;
6316	  case UNEQ:
6317	    s = "_nlg_";
6318	    break;
6319	  case UNGE:
6320	    s = "_nlt_";
6321	    break;
6322	  case UNGT:
6323	    s = "_nle_";
6324	    break;
6325	  case UNLE:
6326	    s = "_ngt_";
6327	    break;
6328	  case UNLT:
6329	    s = "_nge_";
6330	    break;
6331	  case LTGT:
6332	    s = "_lg_";
6333	    break;
6334	  default:
6335	    output_operand_lossage ("invalid %%xn code");
6336	    return;
6337	  }
6338	fputs (s, file);
6339	fputc (float_p ? 'f' : cmp_signed ? 'i' : 'u', file);
6340
6341	switch (GET_MODE_SIZE (mode))
6342	  {
6343	  case 1:
6344	    output_operand_lossage ("operand %%xn code invalid for QImode");
6345	    return;
6346	  case 2:
6347	    s = "16";
6348	    break;
6349	  case 4:
6350	    s = "32";
6351	    break;
6352	  case 8:
6353	    s = "64";
6354	    break;
6355	  default:
6356	    output_operand_lossage ("invalid operand %%xn code");
6357	    return;
6358	  }
6359	fputs (s, file);
6360	return;
6361      }
6362    case 'L':
6363      print_operand (file, gcn_operand_part (GET_MODE (x), x, 0), 0);
6364      return;
6365    case 'H':
6366      print_operand (file, gcn_operand_part (GET_MODE (x), x, 1), 0);
6367      return;
6368    case 'R':
6369      /* Print a scalar register number as an integer.  Temporary hack.  */
6370      gcc_assert (REG_P (x));
6371      fprintf (file, "%i", (int) REGNO (x));
6372      return;
6373    case 'V':
6374      /* Print a vector register number as an integer.  Temporary hack.  */
6375      gcc_assert (REG_P (x));
6376      fprintf (file, "%i", (int) REGNO (x) - FIRST_VGPR_REG);
6377      return;
6378    case 0:
6379      if (xcode == REG)
6380	print_reg (file, x);
6381      else if (xcode == MEM)
6382	output_address (GET_MODE (x), x);
6383      else if (xcode == CONST_INT)
6384	fprintf (file, "%i", (int) INTVAL (x));
6385      else if (xcode == CONST_VECTOR)
6386	print_operand (file, CONST_VECTOR_ELT (x, 0), code);
6387      else if (xcode == CONST_DOUBLE)
6388	{
6389	  const char *str;
6390	  switch (gcn_inline_fp_constant_p (x, false))
6391	    {
6392	    case 240:
6393	      str = "0.5";
6394	      break;
6395	    case 241:
6396	      str = "-0.5";
6397	      break;
6398	    case 242:
6399	      str = "1.0";
6400	      break;
6401	    case 243:
6402	      str = "-1.0";
6403	      break;
6404	    case 244:
6405	      str = "2.0";
6406	      break;
6407	    case 245:
6408	      str = "-2.0";
6409	      break;
6410	    case 246:
6411	      str = "4.0";
6412	      break;
6413	    case 247:
6414	      str = "-4.0";
6415	      break;
6416	    case 248:
6417	      str = "1/pi";
6418	      break;
6419	    default:
6420	      rtx ix = simplify_gen_subreg (GET_MODE (x) == DFmode
6421					    ? DImode : SImode,
6422					    x, GET_MODE (x), 0);
6423	      if (ix)
6424		print_operand (file, ix, code);
6425	      else
6426		output_operand_lossage ("invalid fp constant");
6427	      return;
6428	      break;
6429	    }
6430	  fputs (str, file);
6431	  return;
6432	}
6433      else
6434	output_addr_const (file, x);
6435      return;
6436    case '^':
6437      if (TARGET_GCN5_PLUS)
6438	fputs ("_co", file);
6439      return;
6440    case 'g':
6441      gcc_assert (xcode == MEM);
6442      if (MEM_VOLATILE_P (x))
6443	fputs (" glc", file);
6444      return;
6445    default:
6446      output_operand_lossage ("invalid %%xn code");
6447    }
6448  gcc_unreachable ();
6449}
6450
6451/* Implement DBX_REGISTER_NUMBER macro.
6452
6453   Return the DWARF register number that corresponds to the GCC internal
6454   REGNO.  */
6455
6456unsigned int
6457gcn_dwarf_register_number (unsigned int regno)
6458{
6459  /* Registers defined in DWARF.  */
6460  if (regno == EXEC_LO_REG)
6461    return 17;
6462  /* We need to use a more complex DWARF expression for this
6463  else if (regno == EXEC_HI_REG)
6464    return 17; */
6465  else if (regno == VCC_LO_REG)
6466    return 768;
6467  /* We need to use a more complex DWARF expression for this
6468  else if (regno == VCC_HI_REG)
6469    return 768;  */
6470  else if (regno == SCC_REG)
6471    return 128;
6472  else if (regno == DWARF_LINK_REGISTER)
6473    return 16;
6474  else if (SGPR_REGNO_P (regno))
6475    {
6476      if (regno - FIRST_SGPR_REG < 64)
6477	return (regno - FIRST_SGPR_REG + 32);
6478      else
6479	return (regno - FIRST_SGPR_REG + 1024);
6480    }
6481  else if (VGPR_REGNO_P (regno))
6482    return (regno - FIRST_VGPR_REG + 2560);
6483
6484  /* Otherwise, there's nothing sensible to do.  */
6485  return regno + 100000;
6486}
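
/* Illustratively, this maps EXEC_LO to DWARF register 17, the link
   register to 16, s0..s63 to 32..95, SGPRs from s64 upwards to 1024 plus
   their SGPR index, and v0 upwards to 2560 upwards; anything else lands
   in the "no sensible answer" range.  */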
6487
6488/* Implement TARGET_DWARF_REGISTER_SPAN.
6489
6490   DImode and Vector DImode require additional registers.  */
6491
6492static rtx
6493gcn_dwarf_register_span (rtx rtl)
6494{
6495  machine_mode mode = GET_MODE (rtl);
6496
6497  if (VECTOR_MODE_P (mode))
6498    mode = GET_MODE_INNER (mode);
6499
6500  if (GET_MODE_SIZE (mode) != 8)
6501    return NULL_RTX;
6502
6503  unsigned regno = REGNO (rtl);
6504
6505  if (regno == DWARF_LINK_REGISTER)
6506    return NULL_RTX;
6507
6508  rtx p = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2));
6509  XVECEXP (p, 0, 0) = gen_rtx_REG (SImode, regno);
6510  XVECEXP (p, 0, 1) = gen_rtx_REG (SImode, regno + 1);
6511
6512  return p;
6513}
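
/* For example, a DImode value (or one lane of a vector DImode value) held
   in s[10:11] is described to DWARF as a PARALLEL of the two SImode
   registers s10 and s11; modes narrower than 8 bytes, and the link
   register, keep the default single-register span.  */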
6514
6515/* }}}  */
6516/* {{{ TARGET hook overrides.  */
6517
6518#undef  TARGET_ADDR_SPACE_ADDRESS_MODE
6519#define TARGET_ADDR_SPACE_ADDRESS_MODE gcn_addr_space_address_mode
6520#undef  TARGET_ADDR_SPACE_DEBUG
6521#define TARGET_ADDR_SPACE_DEBUG gcn_addr_space_debug
6522#undef  TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
6523#define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
6524  gcn_addr_space_legitimate_address_p
6525#undef  TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS
6526#define TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS gcn_addr_space_legitimize_address
6527#undef  TARGET_ADDR_SPACE_POINTER_MODE
6528#define TARGET_ADDR_SPACE_POINTER_MODE gcn_addr_space_pointer_mode
6529#undef  TARGET_ADDR_SPACE_SUBSET_P
6530#define TARGET_ADDR_SPACE_SUBSET_P gcn_addr_space_subset_p
6531#undef  TARGET_ADDR_SPACE_CONVERT
6532#define TARGET_ADDR_SPACE_CONVERT gcn_addr_space_convert
6533#undef  TARGET_ARG_PARTIAL_BYTES
6534#define TARGET_ARG_PARTIAL_BYTES gcn_arg_partial_bytes
6535#undef  TARGET_ASM_ALIGNED_DI_OP
6536#define TARGET_ASM_ALIGNED_DI_OP "\t.8byte\t"
6537#undef  TARGET_ASM_FILE_START
6538#define TARGET_ASM_FILE_START output_file_start
6539#undef  TARGET_ASM_FUNCTION_PROLOGUE
6540#define TARGET_ASM_FUNCTION_PROLOGUE gcn_target_asm_function_prologue
6541#undef  TARGET_ASM_SELECT_SECTION
6542#define TARGET_ASM_SELECT_SECTION gcn_asm_select_section
6543#undef  TARGET_ASM_TRAMPOLINE_TEMPLATE
6544#define TARGET_ASM_TRAMPOLINE_TEMPLATE gcn_asm_trampoline_template
6545#undef  TARGET_ATTRIBUTE_TABLE
6546#define TARGET_ATTRIBUTE_TABLE gcn_attribute_table
6547#undef  TARGET_BUILTIN_DECL
6548#define TARGET_BUILTIN_DECL gcn_builtin_decl
6549#undef  TARGET_CAN_CHANGE_MODE_CLASS
6550#define TARGET_CAN_CHANGE_MODE_CLASS gcn_can_change_mode_class
6551#undef  TARGET_CAN_ELIMINATE
6552#define TARGET_CAN_ELIMINATE gcn_can_eliminate_p
6553#undef  TARGET_CANNOT_COPY_INSN_P
6554#define TARGET_CANNOT_COPY_INSN_P gcn_cannot_copy_insn_p
6555#undef  TARGET_CLASS_LIKELY_SPILLED_P
6556#define TARGET_CLASS_LIKELY_SPILLED_P gcn_class_likely_spilled_p
6557#undef  TARGET_CLASS_MAX_NREGS
6558#define TARGET_CLASS_MAX_NREGS gcn_class_max_nregs
6559#undef  TARGET_CONDITIONAL_REGISTER_USAGE
6560#define TARGET_CONDITIONAL_REGISTER_USAGE gcn_conditional_register_usage
6561#undef  TARGET_CONSTANT_ALIGNMENT
6562#define TARGET_CONSTANT_ALIGNMENT gcn_constant_alignment
6563#undef  TARGET_DEBUG_UNWIND_INFO
6564#define TARGET_DEBUG_UNWIND_INFO gcn_debug_unwind_info
6565#undef  TARGET_DWARF_REGISTER_SPAN
6566#define TARGET_DWARF_REGISTER_SPAN gcn_dwarf_register_span
6567#undef  TARGET_EMUTLS_VAR_INIT
6568#define TARGET_EMUTLS_VAR_INIT gcn_emutls_var_init
6569#undef  TARGET_EXPAND_BUILTIN
6570#define TARGET_EXPAND_BUILTIN gcn_expand_builtin
6571#undef  TARGET_FRAME_POINTER_REQUIRED
6572#define TARGET_FRAME_POINTER_REQUIRED gcn_frame_pointer_rqd
6573#undef  TARGET_FUNCTION_ARG
6574#undef  TARGET_FUNCTION_ARG_ADVANCE
6575#define TARGET_FUNCTION_ARG_ADVANCE gcn_function_arg_advance
6576#define TARGET_FUNCTION_ARG gcn_function_arg
6577#undef  TARGET_FUNCTION_VALUE
6578#define TARGET_FUNCTION_VALUE gcn_function_value
6579#undef  TARGET_FUNCTION_VALUE_REGNO_P
6580#define TARGET_FUNCTION_VALUE_REGNO_P gcn_function_value_regno_p
6581#undef  TARGET_GIMPLIFY_VA_ARG_EXPR
6582#define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
6583#undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
6584#define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa
6585#undef  TARGET_GOACC_ADJUST_PRIVATE_DECL
6586#define TARGET_GOACC_ADJUST_PRIVATE_DECL gcn_goacc_adjust_private_decl
6587#undef  TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD
6588#define TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD \
6589  gcn_goacc_create_worker_broadcast_record
6590#undef  TARGET_GOACC_FORK_JOIN
6591#define TARGET_GOACC_FORK_JOIN gcn_fork_join
6592#undef  TARGET_GOACC_REDUCTION
6593#define TARGET_GOACC_REDUCTION gcn_goacc_reduction
6594#undef  TARGET_GOACC_VALIDATE_DIMS
6595#define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
6596#undef  TARGET_GOACC_SHARED_MEM_LAYOUT
6597#define TARGET_GOACC_SHARED_MEM_LAYOUT gcn_shared_mem_layout
6598#undef  TARGET_HARD_REGNO_MODE_OK
6599#define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
6600#undef  TARGET_HARD_REGNO_NREGS
6601#define TARGET_HARD_REGNO_NREGS gcn_hard_regno_nregs
6602#undef  TARGET_HAVE_SPECULATION_SAFE_VALUE
6603#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6604#undef  TARGET_INIT_BUILTINS
6605#define TARGET_INIT_BUILTINS gcn_init_builtins
6606#undef  TARGET_INIT_LIBFUNCS
6607#define TARGET_INIT_LIBFUNCS gcn_init_libfuncs
6608#undef  TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
6609#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
6610  gcn_ira_change_pseudo_allocno_class
6611#undef  TARGET_LEGITIMATE_CONSTANT_P
6612#define TARGET_LEGITIMATE_CONSTANT_P gcn_legitimate_constant_p
6613#undef  TARGET_LRA_P
6614#define TARGET_LRA_P hook_bool_void_true
6615#undef  TARGET_MACHINE_DEPENDENT_REORG
6616#define TARGET_MACHINE_DEPENDENT_REORG gcn_md_reorg
6617#undef  TARGET_MEMORY_MOVE_COST
6618#define TARGET_MEMORY_MOVE_COST gcn_memory_move_cost
6619#undef  TARGET_MODES_TIEABLE_P
6620#define TARGET_MODES_TIEABLE_P gcn_modes_tieable_p
6621#undef  TARGET_OPTION_OVERRIDE
6622#define TARGET_OPTION_OVERRIDE gcn_option_override
6623#undef  TARGET_PRETEND_OUTGOING_VARARGS_NAMED
6624#define TARGET_PRETEND_OUTGOING_VARARGS_NAMED \
6625  gcn_pretend_outgoing_varargs_named
6626#undef  TARGET_PROMOTE_FUNCTION_MODE
6627#define TARGET_PROMOTE_FUNCTION_MODE gcn_promote_function_mode
6628#undef  TARGET_REGISTER_MOVE_COST
6629#define TARGET_REGISTER_MOVE_COST gcn_register_move_cost
6630#undef  TARGET_RETURN_IN_MEMORY
6631#define TARGET_RETURN_IN_MEMORY gcn_return_in_memory
6632#undef  TARGET_RTX_COSTS
6633#define TARGET_RTX_COSTS gcn_rtx_costs
6634#undef  TARGET_SECONDARY_RELOAD
6635#define TARGET_SECONDARY_RELOAD gcn_secondary_reload
6636#undef  TARGET_SECTION_TYPE_FLAGS
6637#define TARGET_SECTION_TYPE_FLAGS gcn_section_type_flags
6638#undef  TARGET_SCALAR_MODE_SUPPORTED_P
6639#define TARGET_SCALAR_MODE_SUPPORTED_P gcn_scalar_mode_supported_p
6640#undef  TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P
6641#define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P \
6642  gcn_small_register_classes_for_mode_p
6643#undef  TARGET_SPILL_CLASS
6644#define TARGET_SPILL_CLASS gcn_spill_class
6645#undef  TARGET_STRICT_ARGUMENT_NAMING
6646#define TARGET_STRICT_ARGUMENT_NAMING gcn_strict_argument_naming
6647#undef  TARGET_TRAMPOLINE_INIT
6648#define TARGET_TRAMPOLINE_INIT gcn_trampoline_init
6649#undef  TARGET_TRULY_NOOP_TRUNCATION
6650#define TARGET_TRULY_NOOP_TRUNCATION gcn_truly_noop_truncation
6651#undef  TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
6652#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST gcn_vectorization_cost
6653#undef  TARGET_VECTORIZE_GET_MASK_MODE
6654#define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
6655#undef  TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6656#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
6657#undef  TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
6658#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
6659  gcn_preferred_vector_alignment
6660#undef  TARGET_VECTORIZE_RELATED_MODE
6661#define TARGET_VECTORIZE_RELATED_MODE gcn_related_vector_mode
6662#undef  TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
6663#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
6664  gcn_vectorize_support_vector_misalignment
6665#undef  TARGET_VECTORIZE_VEC_PERM_CONST
6666#define TARGET_VECTORIZE_VEC_PERM_CONST gcn_vectorize_vec_perm_const
6667#undef  TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
6668#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
6669  gcn_vector_alignment_reachable
6670#undef  TARGET_VECTOR_MODE_SUPPORTED_P
6671#define TARGET_VECTOR_MODE_SUPPORTED_P gcn_vector_mode_supported_p
6672
6673struct gcc_target targetm = TARGET_INITIALIZER;
6674
6675#include "gt-gcn.h"
6676/* }}}  */
6677