1/* Machine description for AArch64 architecture.
2   Copyright (C) 2009-2015 Free Software Foundation, Inc.
3   Contributed by ARM Ltd.
4
5   This file is part of GCC.
6
7   GCC is free software; you can redistribute it and/or modify it
8   under the terms of the GNU General Public License as published by
9   the Free Software Foundation; either version 3, or (at your option)
10   any later version.
11
12   GCC is distributed in the hope that it will be useful, but
13   WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   General Public License for more details.
16
17   You should have received a copy of the GNU General Public License
18   along with GCC; see the file COPYING3.  If not see
19   <http://www.gnu.org/licenses/>.  */
20
21#include "config.h"
22#include "system.h"
23#include "coretypes.h"
24#include "tm.h"
25#include "insn-codes.h"
26#include "rtl.h"
27#include "insn-attr.h"
28#include "hash-set.h"
29#include "machmode.h"
30#include "vec.h"
31#include "double-int.h"
32#include "input.h"
33#include "alias.h"
34#include "symtab.h"
35#include "wide-int.h"
36#include "inchash.h"
37#include "tree.h"
38#include "fold-const.h"
39#include "stringpool.h"
40#include "stor-layout.h"
41#include "calls.h"
42#include "varasm.h"
43#include "regs.h"
44#include "dominance.h"
45#include "cfg.h"
46#include "cfgrtl.h"
47#include "cfganal.h"
48#include "lcm.h"
49#include "cfgbuild.h"
50#include "cfgcleanup.h"
51#include "predict.h"
52#include "basic-block.h"
53#include "df.h"
54#include "hard-reg-set.h"
55#include "output.h"
56#include "hashtab.h"
57#include "function.h"
58#include "flags.h"
59#include "statistics.h"
60#include "real.h"
61#include "fixed-value.h"
62#include "insn-config.h"
63#include "expmed.h"
64#include "dojump.h"
65#include "explow.h"
66#include "emit-rtl.h"
67#include "stmt.h"
68#include "expr.h"
69#include "reload.h"
70#include "toplev.h"
71#include "target.h"
72#include "target-def.h"
73#include "targhooks.h"
74#include "ggc.h"
75#include "tm_p.h"
76#include "recog.h"
77#include "langhooks.h"
78#include "diagnostic-core.h"
79#include "hash-table.h"
80#include "tree-ssa-alias.h"
81#include "internal-fn.h"
82#include "gimple-fold.h"
83#include "tree-eh.h"
84#include "gimple-expr.h"
85#include "is-a.h"
86#include "gimple.h"
87#include "gimplify.h"
88#include "optabs.h"
89#include "dwarf2.h"
90#include "cfgloop.h"
91#include "tree-vectorizer.h"
92#include "aarch64-cost-tables.h"
93#include "dumpfile.h"
94#include "builtins.h"
95#include "rtl-iter.h"
96#include "tm-constrs.h"
97#include "sched-int.h"
98
99/* Defined for convenience.  */
100#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
101
102/* Classifies an address.
103
104   ADDRESS_REG_IMM
105       A simple base register plus immediate offset.
106
107   ADDRESS_REG_WB
108       A base register indexed by immediate offset with writeback.
109
110   ADDRESS_REG_REG
111       A base register indexed by (optionally scaled) register.
112
113   ADDRESS_REG_UXTW
114       A base register indexed by (optionally scaled) zero-extended register.
115
116   ADDRESS_REG_SXTW
117       A base register indexed by (optionally scaled) sign-extended register.
118
119   ADDRESS_LO_SUM
120       A LO_SUM rtx with a base register and "LO12" symbol relocation.
121
   ADDRESS_SYMBOLIC
       A constant symbolic address, in the pc-relative literal pool.  */
124
125enum aarch64_address_type {
126  ADDRESS_REG_IMM,
127  ADDRESS_REG_WB,
128  ADDRESS_REG_REG,
129  ADDRESS_REG_UXTW,
130  ADDRESS_REG_SXTW,
131  ADDRESS_LO_SUM,
132  ADDRESS_SYMBOLIC
133};
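
/* For illustration, some typical operands (shown in assembly syntax) and
   the classification they receive; this is an informal sketch, not an
   exhaustive list:

     [x0, #16]            ADDRESS_REG_IMM
     [x0, #16]!           ADDRESS_REG_WB   (pre-index with writeback)
     [x0], #16            ADDRESS_REG_WB   (post-index with writeback)
     [x0, x1, lsl #3]     ADDRESS_REG_REG
     [x0, w1, uxtw #2]    ADDRESS_REG_UXTW
     [x0, w1, sxtw #2]    ADDRESS_REG_SXTW
     [x0, #:lo12:sym]     ADDRESS_LO_SUM
     ldr x0, .LC0         ADDRESS_SYMBOLIC (pc-relative literal pool)  */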
134
135struct aarch64_address_info {
136  enum aarch64_address_type type;
137  rtx base;
138  rtx offset;
139  int shift;
140  enum aarch64_symbol_type symbol_type;
141};
142
143struct simd_immediate_info
144{
145  rtx value;
146  int shift;
147  int element_width;
148  bool mvn;
149  bool msl;
150};
151
152/* The current code model.  */
153enum aarch64_code_model aarch64_cmodel;
154
155#ifdef HAVE_AS_TLS
156#undef TARGET_HAVE_TLS
157#define TARGET_HAVE_TLS 1
158#endif
159
160static bool aarch64_composite_type_p (const_tree, machine_mode);
161static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
162						     const_tree,
163						     machine_mode *, int *,
164						     bool *);
165static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
166static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
167static void aarch64_override_options_after_change (void);
168static bool aarch64_vector_mode_supported_p (machine_mode);
169static unsigned bit_count (unsigned HOST_WIDE_INT);
170static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
171						 const unsigned char *sel);
172static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
173
174/* Major revision number of the ARM Architecture implemented by the target.  */
175unsigned aarch64_architecture_version;
176
177/* The processor for which instructions should be scheduled.  */
178enum aarch64_processor aarch64_tune = cortexa53;
179
180/* The current tuning set.  */
181const struct tune_params *aarch64_tune_params;
182
183/* Mask to specify which instructions we are allowed to generate.  */
184unsigned long aarch64_isa_flags = 0;
185
186/* Mask to specify which instruction scheduling options should be used.  */
187unsigned long aarch64_tune_flags = 0;
188
189/* Tuning parameters.  */
190
191static const struct cpu_addrcost_table generic_addrcost_table =
192{
193    {
194      0, /* hi  */
195      0, /* si  */
196      0, /* di  */
197      0, /* ti  */
198    },
199  0, /* pre_modify  */
200  0, /* post_modify  */
201  0, /* register_offset  */
202  0, /* register_extend  */
203  0 /* imm_offset  */
204};
205
206static const struct cpu_addrcost_table cortexa57_addrcost_table =
207{
208    {
209      1, /* hi  */
210      0, /* si  */
211      0, /* di  */
212      1, /* ti  */
213    },
214  0, /* pre_modify  */
215  0, /* post_modify  */
216  0, /* register_offset  */
217  0, /* register_extend  */
218  0, /* imm_offset  */
219};
220
221static const struct cpu_addrcost_table xgene1_addrcost_table =
222{
223    {
224      1, /* hi  */
225      0, /* si  */
226      0, /* di  */
227      1, /* ti  */
228    },
229  1, /* pre_modify  */
230  0, /* post_modify  */
231  0, /* register_offset  */
232  1, /* register_extend  */
233  0, /* imm_offset  */
234};
235
236static const struct cpu_regmove_cost generic_regmove_cost =
237{
238  1, /* GP2GP  */
239  /* Avoid the use of slow int<->fp moves for spilling by setting
240     their cost higher than memmov_cost.  */
241  5, /* GP2FP  */
242  5, /* FP2GP  */
243  2 /* FP2FP  */
244};
245
246static const struct cpu_regmove_cost cortexa57_regmove_cost =
247{
248  1, /* GP2GP  */
249  /* Avoid the use of slow int<->fp moves for spilling by setting
250     their cost higher than memmov_cost.  */
251  5, /* GP2FP  */
252  5, /* FP2GP  */
253  2 /* FP2FP  */
254};
255
256static const struct cpu_regmove_cost cortexa53_regmove_cost =
257{
258  1, /* GP2GP  */
259  /* Avoid the use of slow int<->fp moves for spilling by setting
260     their cost higher than memmov_cost.  */
261  5, /* GP2FP  */
262  5, /* FP2GP  */
263  2 /* FP2FP  */
264};
265
266static const struct cpu_regmove_cost thunderx_regmove_cost =
267{
268  2, /* GP2GP  */
269  2, /* GP2FP  */
270  6, /* FP2GP  */
271  4 /* FP2FP  */
272};
273
274static const struct cpu_regmove_cost xgene1_regmove_cost =
275{
276  1, /* GP2GP  */
277  /* Avoid the use of slow int<->fp moves for spilling by setting
278     their cost higher than memmov_cost.  */
279  8, /* GP2FP  */
280  8, /* FP2GP  */
281  2 /* FP2FP  */
282};
283
284/* Generic costs for vector insn classes.  */
285static const struct cpu_vector_cost generic_vector_cost =
286{
287  1, /* scalar_stmt_cost  */
288  1, /* scalar_load_cost  */
289  1, /* scalar_store_cost  */
290  1, /* vec_stmt_cost  */
291  1, /* vec_to_scalar_cost  */
292  1, /* scalar_to_vec_cost  */
293  1, /* vec_align_load_cost  */
294  1, /* vec_unalign_load_cost  */
295  1, /* vec_unalign_store_cost  */
296  1, /* vec_store_cost  */
297  3, /* cond_taken_branch_cost  */
298  1 /* cond_not_taken_branch_cost  */
299};
300
/* Costs for Cortex-A57 vector insn classes.  */
302static const struct cpu_vector_cost cortexa57_vector_cost =
303{
304  1, /* scalar_stmt_cost  */
305  4, /* scalar_load_cost  */
306  1, /* scalar_store_cost  */
307  3, /* vec_stmt_cost  */
308  8, /* vec_to_scalar_cost  */
309  8, /* scalar_to_vec_cost  */
310  5, /* vec_align_load_cost  */
311  5, /* vec_unalign_load_cost  */
312  1, /* vec_unalign_store_cost  */
313  1, /* vec_store_cost  */
314  1, /* cond_taken_branch_cost  */
315  1 /* cond_not_taken_branch_cost  */
316};
317
/* Costs for Xgene1 vector insn classes.  */
319static const struct cpu_vector_cost xgene1_vector_cost =
320{
321  1, /* scalar_stmt_cost  */
322  5, /* scalar_load_cost  */
323  1, /* scalar_store_cost  */
324  2, /* vec_stmt_cost  */
325  4, /* vec_to_scalar_cost  */
326  4, /* scalar_to_vec_cost  */
327  10, /* vec_align_load_cost  */
328  10, /* vec_unalign_load_cost  */
329  2, /* vec_unalign_store_cost  */
330  2, /* vec_store_cost  */
331  2, /* cond_taken_branch_cost  */
332  1 /* cond_not_taken_branch_cost  */
333};
334
335#define AARCH64_FUSE_NOTHING	(0)
336#define AARCH64_FUSE_MOV_MOVK	(1 << 0)
337#define AARCH64_FUSE_ADRP_ADD	(1 << 1)
338#define AARCH64_FUSE_MOVK_MOVK	(1 << 2)
339#define AARCH64_FUSE_ADRP_LDR	(1 << 3)
340#define AARCH64_FUSE_CMP_BRANCH	(1 << 4)
341
342static const struct tune_params generic_tunings =
343{
344  &cortexa57_extra_costs,
345  &generic_addrcost_table,
346  &generic_regmove_cost,
347  &generic_vector_cost,
348  4, /* memmov_cost  */
349  2, /* issue_rate  */
350  AARCH64_FUSE_NOTHING, /* fuseable_ops  */
351  8,	/* function_align.  */
352  8,	/* jump_align.  */
353  4,	/* loop_align.  */
354  2,	/* int_reassoc_width.  */
355  4,	/* fp_reassoc_width.  */
356  1	/* vec_reassoc_width.  */
357};
358
359static const struct tune_params cortexa53_tunings =
360{
361  &cortexa53_extra_costs,
362  &generic_addrcost_table,
363  &cortexa53_regmove_cost,
364  &generic_vector_cost,
365  4, /* memmov_cost  */
366  2, /* issue_rate  */
367  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
368   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fuseable_ops  */
369  8,	/* function_align.  */
370  8,	/* jump_align.  */
371  4,	/* loop_align.  */
372  2,	/* int_reassoc_width.  */
373  4,	/* fp_reassoc_width.  */
374  1	/* vec_reassoc_width.  */
375};
376
377static const struct tune_params cortexa57_tunings =
378{
379  &cortexa57_extra_costs,
380  &cortexa57_addrcost_table,
381  &cortexa57_regmove_cost,
382  &cortexa57_vector_cost,
383  4, /* memmov_cost  */
384  3, /* issue_rate  */
385  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
386   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
387  16,	/* function_align.  */
388  8,	/* jump_align.  */
389  4,	/* loop_align.  */
390  2,	/* int_reassoc_width.  */
391  4,	/* fp_reassoc_width.  */
392  1	/* vec_reassoc_width.  */
393};
394
395static const struct tune_params thunderx_tunings =
396{
397  &thunderx_extra_costs,
398  &generic_addrcost_table,
399  &thunderx_regmove_cost,
400  &generic_vector_cost,
401  6, /* memmov_cost  */
402  2, /* issue_rate  */
403  AARCH64_FUSE_CMP_BRANCH, /* fuseable_ops  */
404  8,	/* function_align.  */
405  8,	/* jump_align.  */
406  8,	/* loop_align.  */
407  2,	/* int_reassoc_width.  */
408  4,	/* fp_reassoc_width.  */
409  1	/* vec_reassoc_width.  */
410};
411
412static const struct tune_params xgene1_tunings =
413{
414  &xgene1_extra_costs,
415  &xgene1_addrcost_table,
416  &xgene1_regmove_cost,
417  &xgene1_vector_cost,
418  6, /* memmov_cost  */
419  4, /* issue_rate  */
420  AARCH64_FUSE_NOTHING, /* fuseable_ops  */
421  16,	/* function_align.  */
422  8,	/* jump_align.  */
423  16,	/* loop_align.  */
424  2,	/* int_reassoc_width.  */
425  4,	/* fp_reassoc_width.  */
426  1	/* vec_reassoc_width.  */
427};
428
429/* A processor implementing AArch64.  */
430struct processor
431{
432  const char *const name;
433  enum aarch64_processor core;
434  const char *arch;
435  unsigned architecture_version;
436  const unsigned long flags;
437  const struct tune_params *const tune;
438};
439
440/* Processor cores implementing AArch64.  */
441static const struct processor all_cores[] =
442{
443#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS) \
444  {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
445#include "aarch64-cores.def"
446#undef AARCH64_CORE
447  {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
448  {NULL, aarch64_none, NULL, 0, 0, NULL}
449};
450
451/* Architectures implementing AArch64.  */
452static const struct processor all_architectures[] =
453{
454#define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
455  {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
456#include "aarch64-arches.def"
457#undef AARCH64_ARCH
458  {NULL, aarch64_none, NULL, 0, 0, NULL}
459};
460
/* Target specification.  These are populated as command-line arguments
   are processed, or NULL if not specified.  */
463static const struct processor *selected_arch;
464static const struct processor *selected_cpu;
465static const struct processor *selected_tune;
466
467#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
468
469/* An ISA extension in the co-processor and main instruction set space.  */
470struct aarch64_option_extension
471{
472  const char *const name;
473  const unsigned long flags_on;
474  const unsigned long flags_off;
475};
476
477/* ISA extensions in AArch64.  */
478static const struct aarch64_option_extension all_extensions[] =
479{
480#define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF) \
481  {NAME, FLAGS_ON, FLAGS_OFF},
482#include "aarch64-option-extensions.def"
483#undef AARCH64_OPT_EXTENSION
484  {NULL, 0, 0}
485};
486
487/* Used to track the size of an address when generating a pre/post
488   increment address.  */
489static machine_mode aarch64_memory_reference_mode;
490
491/* A table of valid AArch64 "bitmask immediate" values for
492   logical instructions.  */
493
494#define AARCH64_NUM_BITMASKS  5334
495static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
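
/* For illustration, a bitmask immediate is a value that AND/ORR/EOR can
   encode directly: a single contiguous run of ones, rotated, and then
   replicated across the register at a power-of-two element size.  For
   example 0x00ff00ff00ff00ff (16-bit element 0x00ff) and
   0x0003fffc0003fffc (32-bit element 0x0003fffc) are valid, whereas
   0x1234567812345678 is not.  The table above holds every valid 64-bit
   pattern.  */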
496
497typedef enum aarch64_cond_code
498{
499  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
500  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
501  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
502}
503aarch64_cc;
504
505#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
506
507/* The condition codes of the processor, and the inverse function.  */
508static const char * const aarch64_condition_codes[] =
509{
510  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
511  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
512};
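
/* The enumeration above is laid out so that each even/odd pair of codes
   are inverses of one another, so AARCH64_INVERSE_CONDITION_CODE only
   needs to flip the low bit.  For example:

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE  (0 ^ 1 == 1)
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT  (10 ^ 1 == 11)

   Note that "al" and "nv" both mean "always" in A64, so that final pair
   is not a true inverse.  */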
513
514static unsigned int
515aarch64_min_divisions_for_recip_mul (enum machine_mode mode ATTRIBUTE_UNUSED)
516{
517  return 2;
518}
519
520static int
521aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
522			     enum machine_mode mode)
523{
524  if (VECTOR_MODE_P (mode))
525    return aarch64_tune_params->vec_reassoc_width;
526  if (INTEGRAL_MODE_P (mode))
527    return aarch64_tune_params->int_reassoc_width;
528  if (FLOAT_MODE_P (mode))
529    return aarch64_tune_params->fp_reassoc_width;
530  return 1;
531}
532
533/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
534unsigned
535aarch64_dbx_register_number (unsigned regno)
536{
537   if (GP_REGNUM_P (regno))
538     return AARCH64_DWARF_R0 + regno - R0_REGNUM;
539   else if (regno == SP_REGNUM)
540     return AARCH64_DWARF_SP;
541   else if (FP_REGNUM_P (regno))
542     return AARCH64_DWARF_V0 + regno - V0_REGNUM;
543
544   /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
545      equivalent DWARF register.  */
546   return DWARF_FRAME_REGISTERS;
547}
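
/* For illustration, x0..x30 map to DWARF registers 0..30, sp to 31 and
   v0..v31 to 64..95; anything else (e.g. the condition-code register and
   the soft frame/argument pointers) has no DWARF equivalent and yields
   DWARF_FRAME_REGISTERS.  */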
548
549/* Return TRUE if MODE is any of the large INT modes.  */
550static bool
551aarch64_vect_struct_mode_p (machine_mode mode)
552{
553  return mode == OImode || mode == CImode || mode == XImode;
554}
555
556/* Return TRUE if MODE is any of the vector modes.  */
557static bool
558aarch64_vector_mode_p (machine_mode mode)
559{
560  return aarch64_vector_mode_supported_p (mode)
561	 || aarch64_vect_struct_mode_p (mode);
562}
563
564/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
565static bool
566aarch64_array_mode_supported_p (machine_mode mode,
567				unsigned HOST_WIDE_INT nelems)
568{
569  if (TARGET_SIMD
570      && AARCH64_VALID_SIMD_QREG_MODE (mode)
571      && (nelems >= 2 && nelems <= 4))
572    return true;
573
574  return false;
575}
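
/* For illustration, this accepts e.g. an array of two, three or four
   V4SImode vectors (roughly the layout behind arm_neon.h tuple types
   such as int32x4x2_t), allowing such arrays to live in consecutive Q
   registers instead of being forced into memory.  */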
576
577/* Implement HARD_REGNO_NREGS.  */
578
579int
580aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
581{
582  switch (aarch64_regno_regclass (regno))
583    {
584    case FP_REGS:
585    case FP_LO_REGS:
586      return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
587    default:
588      return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
589    }
590  gcc_unreachable ();
591}
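
/* For illustration, with 64-bit general registers (UNITS_PER_WORD == 8)
   and 128-bit vector registers (UNITS_PER_VREG == 16) the above gives:

     TImode (16 bytes) in a general register class:    (16 + 7) / 8   == 2
     TFmode (16 bytes) in FP_REGS:                      (16 + 15) / 16 == 1
     OImode (32 bytes, a two-vector tuple) in FP_REGS:  (32 + 15) / 16 == 2  */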
592
593/* Implement HARD_REGNO_MODE_OK.  */
594
595int
596aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
597{
598  if (GET_MODE_CLASS (mode) == MODE_CC)
599    return regno == CC_REGNUM;
600
601  if (regno == SP_REGNUM)
602    /* The purpose of comparing with ptr_mode is to support the
603       global register variable associated with the stack pointer
604       register via the syntax of asm ("wsp") in ILP32.  */
605    return mode == Pmode || mode == ptr_mode;
606
607  if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
608    return mode == Pmode;
609
610  if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
611    return 1;
612
613  if (FP_REGNUM_P (regno))
614    {
615      if (aarch64_vect_struct_mode_p (mode))
616	return
617	  (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
618      else
619	return 1;
620    }
621
622  return 0;
623}
624
625/* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
626machine_mode
627aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
628				     machine_mode mode)
629{
630  /* Handle modes that fit within single registers.  */
631  if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
632    {
633      if (GET_MODE_SIZE (mode) >= 4)
634        return mode;
635      else
636        return SImode;
637    }
638  /* Fall back to generic for multi-reg and very large modes.  */
639  else
640    return choose_hard_reg_mode (regno, nregs, false);
641}
642
643/* Return true if calls to DECL should be treated as
   long-calls (i.e. called via a register).  */
645static bool
646aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
647{
648  return false;
649}
650
651/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (i.e. called via a register).  */
653bool
654aarch64_is_long_call_p (rtx sym)
655{
656  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
657}
658
659/* Return true if the offsets to a zero/sign-extract operation
660   represent an expression that matches an extend operation.  The
   operands represent the parameters from
662
663   (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
664bool
665aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
666				rtx extract_imm)
667{
668  HOST_WIDE_INT mult_val, extract_val;
669
670  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
671    return false;
672
673  mult_val = INTVAL (mult_imm);
674  extract_val = INTVAL (extract_imm);
675
676  if (extract_val > 8
677      && extract_val < GET_MODE_BITSIZE (mode)
678      && exact_log2 (extract_val & ~7) > 0
679      && (extract_val & 7) <= 4
680      && mult_val == (1 << (extract_val & 7)))
681    return true;
682
683  return false;
684}
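
/* For illustration, in DImode the operands MULT_IMM == 4 and
   EXTRACT_IMM == 34 satisfy the checks above (34 > 8, 34 < 64,
   exact_log2 (34 & ~7) == 5 > 0, (34 & 7) == 2 <= 4, 4 == 1 << 2).
   That describes

     (zero_extract:DI (mult:DI (reg) (const_int 4))
                      (const_int 34) (const_int 0))

   i.e. a 32-bit value zero-extended and shifted left by 2, matching the
   "uxtw #2" extended-register operand form.  */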
685
/* Emit an insn that's a simple single-set.  Both operands must be
   known to be valid.  */
688inline static rtx
689emit_set_insn (rtx x, rtx y)
690{
691  return emit_insn (gen_rtx_SET (VOIDmode, x, y));
692}
693
694/* X and Y are two things to compare using CODE.  Emit the compare insn and
695   return the rtx for register 0 in the proper mode.  */
696rtx
697aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
698{
699  machine_mode mode = SELECT_CC_MODE (code, x, y);
700  rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
701
702  emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
703  return cc_reg;
704}
705
706/* Build the SYMBOL_REF for __tls_get_addr.  */
707
708static GTY(()) rtx tls_get_addr_libfunc;
709
710rtx
711aarch64_tls_get_addr (void)
712{
713  if (!tls_get_addr_libfunc)
714    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
715  return tls_get_addr_libfunc;
716}
717
718/* Return the TLS model to use for ADDR.  */
719
720static enum tls_model
721tls_symbolic_operand_type (rtx addr)
722{
723  enum tls_model tls_kind = TLS_MODEL_NONE;
724  rtx sym, addend;
725
726  if (GET_CODE (addr) == CONST)
727    {
728      split_const (addr, &sym, &addend);
729      if (GET_CODE (sym) == SYMBOL_REF)
730	tls_kind = SYMBOL_REF_TLS_MODEL (sym);
731    }
732  else if (GET_CODE (addr) == SYMBOL_REF)
733    tls_kind = SYMBOL_REF_TLS_MODEL (addr);
734
735  return tls_kind;
736}
737
/* We allow LO_SUMs in our legitimate addresses so that combine can
   take care of combining addresses where necessary, but for generation
   purposes the address is generated as:
742   RTL                               Absolute
743   tmp = hi (symbol_ref);            adrp  x1, foo
744   dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
745                                     nop
746
747   PIC                               TLS
748   adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
749   ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
750                                     bl   __tls_get_addr
751                                     nop
752
753   Load TLS symbol, depending on TLS mechanism and TLS access model.
754
755   Global Dynamic - Traditional TLS:
756   adrp tmp, :tlsgd:imm
757   add  dest, tmp, #:tlsgd_lo12:imm
758   bl   __tls_get_addr
759
760   Global Dynamic - TLS Descriptors:
761   adrp dest, :tlsdesc:imm
762   ldr  tmp, [dest, #:tlsdesc_lo12:imm]
763   add  dest, dest, #:tlsdesc_lo12:imm
764   blr  tmp
765   mrs  tp, tpidr_el0
766   add  dest, dest, tp
767
768   Initial Exec:
769   mrs  tp, tpidr_el0
770   adrp tmp, :gottprel:imm
771   ldr  dest, [tmp, #:gottprel_lo12:imm]
772   add  dest, dest, tp
773
774   Local Exec:
775   mrs  tp, tpidr_el0
776   add  t0, tp, #:tprel_hi12:imm, lsl #12
777   add  t0, t0, #:tprel_lo12_nc:imm
778*/
779
780static void
781aarch64_load_symref_appropriately (rtx dest, rtx imm,
782				   enum aarch64_symbol_type type)
783{
784  switch (type)
785    {
786    case SYMBOL_SMALL_ABSOLUTE:
787      {
788	/* In ILP32, the mode of dest can be either SImode or DImode.  */
789	rtx tmp_reg = dest;
790	machine_mode mode = GET_MODE (dest);
791
792	gcc_assert (mode == Pmode || mode == ptr_mode);
793
794	if (can_create_pseudo_p ())
795	  tmp_reg = gen_reg_rtx (mode);
796
797	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
798	emit_insn (gen_add_losym (dest, tmp_reg, imm));
799	return;
800      }
801
802    case SYMBOL_TINY_ABSOLUTE:
803      emit_insn (gen_rtx_SET (Pmode, dest, imm));
804      return;
805
806    case SYMBOL_SMALL_GOT:
807      {
808	/* In ILP32, the mode of dest can be either SImode or DImode,
809	   while the got entry is always of SImode size.  The mode of
810	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. stored in memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
813	   This is why we have to handle three different ldr_got_small
814	   patterns here (two patterns for ILP32).  */
815	rtx tmp_reg = dest;
816	machine_mode mode = GET_MODE (dest);
817
818	if (can_create_pseudo_p ())
819	  tmp_reg = gen_reg_rtx (mode);
820
821	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
822	if (mode == ptr_mode)
823	  {
824	    if (mode == DImode)
825	      emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
826	    else
827	      emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
828	  }
829	else
830	  {
831	    gcc_assert (mode == Pmode);
832	    emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
833	  }
834
835	return;
836      }
837
838    case SYMBOL_SMALL_TLSGD:
839      {
840	rtx_insn *insns;
841	rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
842
843	start_sequence ();
844	aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
845	insns = get_insns ();
846	end_sequence ();
847
848	RTL_CONST_CALL_P (insns) = 1;
849	emit_libcall_block (insns, dest, result, imm);
850	return;
851      }
852
853    case SYMBOL_SMALL_TLSDESC:
854      {
855	machine_mode mode = GET_MODE (dest);
856	rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
857	rtx tp;
858
859	gcc_assert (mode == Pmode || mode == ptr_mode);
860
861	/* In ILP32, the got entry is always of SImode size.  Unlike
862	   small GOT, the dest is fixed at reg 0.  */
863	if (TARGET_ILP32)
864	  emit_insn (gen_tlsdesc_small_si (imm));
865	else
866	  emit_insn (gen_tlsdesc_small_di (imm));
867	tp = aarch64_load_tp (NULL);
868
869	if (mode != Pmode)
870	  tp = gen_lowpart (mode, tp);
871
872	emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
873	set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
874	return;
875      }
876
877    case SYMBOL_SMALL_GOTTPREL:
878      {
879	/* In ILP32, the mode of dest can be either SImode or DImode,
880	   while the got entry is always of SImode size.  The mode of
881	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. stored in memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
884	   This is why we have to handle three different tlsie_small
885	   patterns here (two patterns for ILP32).  */
886	machine_mode mode = GET_MODE (dest);
887	rtx tmp_reg = gen_reg_rtx (mode);
888	rtx tp = aarch64_load_tp (NULL);
889
890	if (mode == ptr_mode)
891	  {
892	    if (mode == DImode)
893	      emit_insn (gen_tlsie_small_di (tmp_reg, imm));
894	    else
895	      {
896		emit_insn (gen_tlsie_small_si (tmp_reg, imm));
897		tp = gen_lowpart (mode, tp);
898	      }
899	  }
900	else
901	  {
902	    gcc_assert (mode == Pmode);
903	    emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
904	  }
905
906	emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
907	set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
908	return;
909      }
910
911    case SYMBOL_SMALL_TPREL:
912      {
913	rtx tp = aarch64_load_tp (NULL);
914
915	if (GET_MODE (dest) != Pmode)
916	  tp = gen_lowpart (GET_MODE (dest), tp);
917
918	emit_insn (gen_tlsle_small (dest, tp, imm));
919	set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
920	return;
921      }
922
923    case SYMBOL_TINY_GOT:
924      emit_insn (gen_ldr_got_tiny (dest, imm));
925      return;
926
927    default:
928      gcc_unreachable ();
929    }
930}
931
932/* Emit a move from SRC to DEST.  Assume that the move expanders can
933   handle all moves if !can_create_pseudo_p ().  The distinction is
934   important because, unlike emit_move_insn, the move expanders know
935   how to force Pmode objects into the constant pool even when the
936   constant pool address is not itself legitimate.  */
937static rtx
938aarch64_emit_move (rtx dest, rtx src)
939{
940  return (can_create_pseudo_p ()
941	  ? emit_move_insn (dest, src)
942	  : emit_move_insn_1 (dest, src));
943}
944
945/* Split a 128-bit move operation into two 64-bit move operations,
946   taking care to handle partial overlap of register to register
947   copies.  Special cases are needed when moving between GP regs and
948   FP regs.  SRC can be a register, constant or memory; DST a register
949   or memory.  If either operand is memory it must not have any side
950   effects.  */
951void
952aarch64_split_128bit_move (rtx dst, rtx src)
953{
954  rtx dst_lo, dst_hi;
955  rtx src_lo, src_hi;
956
957  machine_mode mode = GET_MODE (dst);
958
959  gcc_assert (mode == TImode || mode == TFmode);
960  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
961  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
962
963  if (REG_P (dst) && REG_P (src))
964    {
965      int src_regno = REGNO (src);
966      int dst_regno = REGNO (dst);
967
968      /* Handle FP <-> GP regs.  */
969      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
970	{
971	  src_lo = gen_lowpart (word_mode, src);
972	  src_hi = gen_highpart (word_mode, src);
973
974	  if (mode == TImode)
975	    {
976	      emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
977	      emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
978	    }
979	  else
980	    {
981	      emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
982	      emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
983	    }
984	  return;
985	}
986      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
987	{
988	  dst_lo = gen_lowpart (word_mode, dst);
989	  dst_hi = gen_highpart (word_mode, dst);
990
991	  if (mode == TImode)
992	    {
993	      emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
994	      emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
995	    }
996	  else
997	    {
998	      emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
999	      emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1000	    }
1001	  return;
1002	}
1003    }
1004
1005  dst_lo = gen_lowpart (word_mode, dst);
1006  dst_hi = gen_highpart (word_mode, dst);
1007  src_lo = gen_lowpart (word_mode, src);
1008  src_hi = gen_highpart_mode (word_mode, mode, src);
1009
1010  /* At most one pairing may overlap.  */
1011  if (reg_overlap_mentioned_p (dst_lo, src_hi))
1012    {
1013      aarch64_emit_move (dst_hi, src_hi);
1014      aarch64_emit_move (dst_lo, src_lo);
1015    }
1016  else
1017    {
1018      aarch64_emit_move (dst_lo, src_lo);
1019      aarch64_emit_move (dst_hi, src_hi);
1020    }
1021}
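
/* For illustration (little-endian register order assumed), splitting the
   register-to-register TImode copy x2:x1 := x1:x0 gives dst_lo == x1,
   dst_hi == x2, src_lo == x0 and src_hi == x1.  Since dst_lo overlaps
   src_hi the high half must be copied first,

     mov x2, x1     (dst_hi := src_hi)
     mov x1, x0     (dst_lo := src_lo)

   otherwise the first move would clobber src_hi.  In the non-overlapping
   case the low half is copied first, which likewise protects src_lo.  */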
1022
1023bool
1024aarch64_split_128bit_move_p (rtx dst, rtx src)
1025{
1026  return (! REG_P (src)
1027	  || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1028}
1029
1030/* Split a complex SIMD combine.  */
1031
1032void
1033aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1034{
1035  machine_mode src_mode = GET_MODE (src1);
1036  machine_mode dst_mode = GET_MODE (dst);
1037
1038  gcc_assert (VECTOR_MODE_P (dst_mode));
1039
1040  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1041    {
1042      rtx (*gen) (rtx, rtx, rtx);
1043
1044      switch (src_mode)
1045	{
1046	case V8QImode:
1047	  gen = gen_aarch64_simd_combinev8qi;
1048	  break;
1049	case V4HImode:
1050	  gen = gen_aarch64_simd_combinev4hi;
1051	  break;
1052	case V2SImode:
1053	  gen = gen_aarch64_simd_combinev2si;
1054	  break;
1055	case V2SFmode:
1056	  gen = gen_aarch64_simd_combinev2sf;
1057	  break;
1058	case DImode:
1059	  gen = gen_aarch64_simd_combinedi;
1060	  break;
1061	case DFmode:
1062	  gen = gen_aarch64_simd_combinedf;
1063	  break;
1064	default:
1065	  gcc_unreachable ();
1066	}
1067
1068      emit_insn (gen (dst, src1, src2));
1069      return;
1070    }
1071}
1072
1073/* Split a complex SIMD move.  */
1074
1075void
1076aarch64_split_simd_move (rtx dst, rtx src)
1077{
1078  machine_mode src_mode = GET_MODE (src);
1079  machine_mode dst_mode = GET_MODE (dst);
1080
1081  gcc_assert (VECTOR_MODE_P (dst_mode));
1082
1083  if (REG_P (dst) && REG_P (src))
1084    {
1085      rtx (*gen) (rtx, rtx);
1086
1087      gcc_assert (VECTOR_MODE_P (src_mode));
1088
1089      switch (src_mode)
1090	{
1091	case V16QImode:
1092	  gen = gen_aarch64_split_simd_movv16qi;
1093	  break;
1094	case V8HImode:
1095	  gen = gen_aarch64_split_simd_movv8hi;
1096	  break;
1097	case V4SImode:
1098	  gen = gen_aarch64_split_simd_movv4si;
1099	  break;
1100	case V2DImode:
1101	  gen = gen_aarch64_split_simd_movv2di;
1102	  break;
1103	case V4SFmode:
1104	  gen = gen_aarch64_split_simd_movv4sf;
1105	  break;
1106	case V2DFmode:
1107	  gen = gen_aarch64_split_simd_movv2df;
1108	  break;
1109	default:
1110	  gcc_unreachable ();
1111	}
1112
1113      emit_insn (gen (dst, src));
1114      return;
1115    }
1116}
1117
1118bool
1119aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1120			      machine_mode ymode, rtx y)
1121{
1122  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1123  gcc_assert (r != NULL);
1124  return rtx_equal_p (x, r);
1125}
1126
1127
1128static rtx
1129aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1130{
1131  if (can_create_pseudo_p ())
1132    return force_reg (mode, value);
1133  else
1134    {
1135      x = aarch64_emit_move (x, value);
1136      return x;
1137    }
1138}
1139
1140
1141static rtx
1142aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1143{
1144  if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1145    {
1146      rtx high;
1147      /* Load the full offset into a register.  This
1148         might be improvable in the future.  */
1149      high = GEN_INT (offset);
1150      offset = 0;
1151      high = aarch64_force_temporary (mode, temp, high);
1152      reg = aarch64_force_temporary (mode, temp,
1153				     gen_rtx_PLUS (mode, high, reg));
1154    }
1155  return plus_constant (mode, reg, offset);
1156}
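
/* For illustration, an offset of 16 fits the add-immediate encoding and
   simply yields (plus REG (const_int 16)), whereas 0x123456 does not fit
   a 12-bit (optionally shifted) immediate, so the code above first
   materializes it in TEMP and adds REG to it, which later expands to
   something like

     mov  temp, #0x3456
     movk temp, #0x12, lsl #16
     add  temp, temp, reg

   and then the (now zero) residual offset is folded away.  */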
1157
1158static int
1159aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1160				machine_mode mode)
1161{
1162  unsigned HOST_WIDE_INT mask;
1163  int i;
1164  bool first;
1165  unsigned HOST_WIDE_INT val;
1166  bool subtargets;
1167  rtx subtarget;
1168  int one_match, zero_match, first_not_ffff_match;
1169  int num_insns = 0;
1170
1171  if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1172    {
1173      if (generate)
1174	emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1175      num_insns++;
1176      return num_insns;
1177    }
1178
1179  if (mode == SImode)
1180    {
1181      /* We know we can't do this in 1 insn, and we must be able to do it
1182	 in two; so don't mess around looking for sequences that don't buy
1183	 us anything.  */
1184      if (generate)
1185	{
1186	  emit_insn (gen_rtx_SET (VOIDmode, dest,
1187				  GEN_INT (INTVAL (imm) & 0xffff)));
1188	  emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1189				     GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1190	}
1191      num_insns += 2;
1192      return num_insns;
1193    }
1194
1195  /* Remaining cases are all for DImode.  */
1196
1197  val = INTVAL (imm);
1198  subtargets = optimize && can_create_pseudo_p ();
1199
1200  one_match = 0;
1201  zero_match = 0;
1202  mask = 0xffff;
1203  first_not_ffff_match = -1;
1204
1205  for (i = 0; i < 64; i += 16, mask <<= 16)
1206    {
1207      if ((val & mask) == mask)
1208	one_match++;
1209      else
1210	{
1211	  if (first_not_ffff_match < 0)
1212	    first_not_ffff_match = i;
1213	  if ((val & mask) == 0)
1214	    zero_match++;
1215	}
1216    }
1217
1218  if (one_match == 2)
1219    {
1220      /* Set one of the quarters and then insert back into result.  */
1221      mask = 0xffffll << first_not_ffff_match;
1222      if (generate)
1223	{
1224	  emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1225	  emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1226				     GEN_INT ((val >> first_not_ffff_match)
1227					      & 0xffff)));
1228	}
1229      num_insns += 2;
1230      return num_insns;
1231    }
1232
1233  if (zero_match == 2)
1234    goto simple_sequence;
1235
1236  mask = 0x0ffff0000UL;
1237  for (i = 16; i < 64; i += 16, mask <<= 16)
1238    {
1239      HOST_WIDE_INT comp = mask & ~(mask - 1);
1240
1241      if (aarch64_uimm12_shift (val - (val & mask)))
1242	{
1243	  if (generate)
1244	    {
1245	      subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1246	      emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1247				      GEN_INT (val & mask)));
1248	      emit_insn (gen_adddi3 (dest, subtarget,
1249				     GEN_INT (val - (val & mask))));
1250	    }
1251	  num_insns += 2;
1252	  return num_insns;
1253	}
1254      else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1255	{
1256	  if (generate)
1257	    {
1258	      subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1259	      emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1260				      GEN_INT ((val + comp) & mask)));
1261	      emit_insn (gen_adddi3 (dest, subtarget,
1262				     GEN_INT (val - ((val + comp) & mask))));
1263	    }
1264	  num_insns += 2;
1265	  return num_insns;
1266	}
1267      else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1268	{
1269	  if (generate)
1270	    {
1271	      subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1272	      emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1273				      GEN_INT ((val - comp) | ~mask)));
1274	      emit_insn (gen_adddi3 (dest, subtarget,
1275				     GEN_INT (val - ((val - comp) | ~mask))));
1276	    }
1277	  num_insns += 2;
1278	  return num_insns;
1279	}
1280      else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1281	{
1282	  if (generate)
1283	    {
1284	      subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1285	      emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1286				      GEN_INT (val | ~mask)));
1287	      emit_insn (gen_adddi3 (dest, subtarget,
1288				     GEN_INT (val - (val | ~mask))));
1289	    }
1290	  num_insns += 2;
1291	  return num_insns;
1292	}
1293    }
1294
1295  /* See if we can do it by arithmetically combining two
1296     immediates.  */
1297  for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1298    {
1299      int j;
1300      mask = 0xffff;
1301
1302      if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1303	  || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1304	{
1305	  if (generate)
1306	    {
1307	      subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1308	      emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1309				      GEN_INT (aarch64_bitmasks[i])));
1310	      emit_insn (gen_adddi3 (dest, subtarget,
1311				     GEN_INT (val - aarch64_bitmasks[i])));
1312	    }
1313	  num_insns += 2;
1314	  return num_insns;
1315	}
1316
1317      for (j = 0; j < 64; j += 16, mask <<= 16)
1318	{
1319	  if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1320	    {
1321	      if (generate)
1322		{
1323		  emit_insn (gen_rtx_SET (VOIDmode, dest,
1324					  GEN_INT (aarch64_bitmasks[i])));
1325		  emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1326					     GEN_INT ((val >> j) & 0xffff)));
1327		}
1328	      num_insns += 2;
1329	      return num_insns;
1330	    }
1331	}
1332    }
1333
1334  /* See if we can do it by logically combining two immediates.  */
1335  for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1336    {
1337      if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1338	{
1339	  int j;
1340
1341	  for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1342	    if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1343	      {
1344		if (generate)
1345		  {
1346		    subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1347		    emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1348					    GEN_INT (aarch64_bitmasks[i])));
1349		    emit_insn (gen_iordi3 (dest, subtarget,
1350					   GEN_INT (aarch64_bitmasks[j])));
1351		  }
1352		num_insns += 2;
1353		return num_insns;
1354	      }
1355	}
1356      else if ((val & aarch64_bitmasks[i]) == val)
1357	{
1358	  int j;
1359
1360	  for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1361	    if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1362	      {
1363		if (generate)
1364		  {
1365		    subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1366		    emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1367					    GEN_INT (aarch64_bitmasks[j])));
1368		    emit_insn (gen_anddi3 (dest, subtarget,
1369					   GEN_INT (aarch64_bitmasks[i])));
1370		  }
1371		num_insns += 2;
1372		return num_insns;
1373	      }
1374	}
1375    }
1376
1377  if (one_match > zero_match)
1378    {
1379      /* Set either first three quarters or all but the third.	 */
1380      mask = 0xffffll << (16 - first_not_ffff_match);
1381      if (generate)
1382	emit_insn (gen_rtx_SET (VOIDmode, dest,
1383				GEN_INT (val | mask | 0xffffffff00000000ull)));
1384      num_insns ++;
1385
1386      /* Now insert other two quarters.	 */
1387      for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1388	   i < 64; i += 16, mask <<= 16)
1389	{
1390	  if ((val & mask) != mask)
1391	    {
1392	      if (generate)
1393		emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1394					   GEN_INT ((val >> i) & 0xffff)));
1395	      num_insns ++;
1396	    }
1397	}
1398      return num_insns;
1399    }
1400
1401 simple_sequence:
1402  first = true;
1403  mask = 0xffff;
1404  for (i = 0; i < 64; i += 16, mask <<= 16)
1405    {
1406      if ((val & mask) != 0)
1407	{
1408	  if (first)
1409	    {
1410	      if (generate)
1411		emit_insn (gen_rtx_SET (VOIDmode, dest,
1412					GEN_INT (val & mask)));
1413	      num_insns ++;
1414	      first = false;
1415	    }
1416	  else
1417	    {
1418	      if (generate)
1419		emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1420					   GEN_INT ((val >> i) & 0xffff)));
1421	      num_insns ++;
1422	    }
1423	}
1424    }
1425
1426  return num_insns;
1427}
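
/* For illustration, two DImode immediates and the sequences the code
   above builds for them (at most four MOVZ/MOVK/MOVN instructions are
   ever needed):

     0x0000000012345678   zero_match == 2, simple sequence:
                            mov  x0, #0x5678
                            movk x0, #0x1234, lsl #16

     0xffff1234ffff5678   one_match == 2, first_not_ffff_match == 0:
                            mov  x0, #0xffff1234ffffffff  (movn-encodable)
                            movk x0, #0x5678

   The same routine doubles as a cost function: with GENERATE false
   nothing is emitted and only the instruction count is returned.  */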
1428
1429
1430void
1431aarch64_expand_mov_immediate (rtx dest, rtx imm)
1432{
1433  machine_mode mode = GET_MODE (dest);
1434
1435  gcc_assert (mode == SImode || mode == DImode);
1436
1437  /* Check on what type of symbol it is.  */
1438  if (GET_CODE (imm) == SYMBOL_REF
1439      || GET_CODE (imm) == LABEL_REF
1440      || GET_CODE (imm) == CONST)
1441    {
1442      rtx mem, base, offset;
1443      enum aarch64_symbol_type sty;
1444
1445      /* If we have (const (plus symbol offset)), separate out the offset
1446	 before we start classifying the symbol.  */
1447      split_const (imm, &base, &offset);
1448
1449      sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1450      switch (sty)
1451	{
1452	case SYMBOL_FORCE_TO_MEM:
1453	  if (offset != const0_rtx
1454	      && targetm.cannot_force_const_mem (mode, imm))
1455	    {
1456	      gcc_assert (can_create_pseudo_p ());
1457	      base = aarch64_force_temporary (mode, dest, base);
1458	      base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1459	      aarch64_emit_move (dest, base);
1460	      return;
1461	    }
1462	  mem = force_const_mem (ptr_mode, imm);
1463	  gcc_assert (mem);
1464	  if (mode != ptr_mode)
1465	    mem = gen_rtx_ZERO_EXTEND (mode, mem);
1466	  emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1467	  return;
1468
1469        case SYMBOL_SMALL_TLSGD:
1470        case SYMBOL_SMALL_TLSDESC:
1471        case SYMBOL_SMALL_GOTTPREL:
1472	case SYMBOL_SMALL_GOT:
1473	case SYMBOL_TINY_GOT:
1474	  if (offset != const0_rtx)
1475	    {
1476	      gcc_assert(can_create_pseudo_p ());
1477	      base = aarch64_force_temporary (mode, dest, base);
1478	      base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1479	      aarch64_emit_move (dest, base);
1480	      return;
1481	    }
1482	  /* FALLTHRU */
1483
1484        case SYMBOL_SMALL_TPREL:
1485	case SYMBOL_SMALL_ABSOLUTE:
1486	case SYMBOL_TINY_ABSOLUTE:
1487	  aarch64_load_symref_appropriately (dest, imm, sty);
1488	  return;
1489
1490	default:
1491	  gcc_unreachable ();
1492	}
1493    }
1494
1495  if (!CONST_INT_P (imm))
1496    {
1497      if (GET_CODE (imm) == HIGH)
1498	emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1499      else
1500        {
1501	  rtx mem = force_const_mem (mode, imm);
1502	  gcc_assert (mem);
1503	  emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1504	}
1505
1506      return;
1507    }
1508
1509  aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1510}
1511
1512static bool
1513aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1514				 tree exp ATTRIBUTE_UNUSED)
1515{
1516  /* Currently, always true.  */
1517  return true;
1518}
1519
1520/* Implement TARGET_PASS_BY_REFERENCE.  */
1521
1522static bool
1523aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1524			   machine_mode mode,
1525			   const_tree type,
1526			   bool named ATTRIBUTE_UNUSED)
1527{
1528  HOST_WIDE_INT size;
1529  machine_mode dummymode;
1530  int nregs;
1531
1532  /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
1533  size = (mode == BLKmode && type)
1534    ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1535
1536  /* Aggregates are passed by reference based on their size.  */
1537  if (type && AGGREGATE_TYPE_P (type))
1538    {
1539      size = int_size_in_bytes (type);
1540    }
1541
  /* Variable sized arguments are always passed by reference.  */
1543  if (size < 0)
1544    return true;
1545
1546  /* Can this be a candidate to be passed in fp/simd register(s)?  */
1547  if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1548					       &dummymode, &nregs,
1549					       NULL))
1550    return false;
1551
1552  /* Arguments which are variable sized or larger than 2 registers are
     passed by reference unless they are a homogeneous floating-point
     aggregate.  */
1555  return size > 2 * UNITS_PER_WORD;
1556}
1557
/* Return TRUE if a value of type VALTYPE should be returned in the most
   significant bits of a register, i.e. padded at its least significant
   end.  */
1559static bool
1560aarch64_return_in_msb (const_tree valtype)
1561{
1562  machine_mode dummy_mode;
1563  int dummy_int;
1564
1565  /* Never happens in little-endian mode.  */
1566  if (!BYTES_BIG_ENDIAN)
1567    return false;
1568
1569  /* Only composite types smaller than or equal to 16 bytes can
1570     be potentially returned in registers.  */
1571  if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1572      || int_size_in_bytes (valtype) <= 0
1573      || int_size_in_bytes (valtype) > 16)
1574    return false;
1575
1576  /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1577     or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1578     is always passed/returned in the least significant bits of fp/simd
1579     register(s).  */
1580  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1581					       &dummy_mode, &dummy_int, NULL))
1582    return false;
1583
1584  return true;
1585}
1586
1587/* Implement TARGET_FUNCTION_VALUE.
1588   Define how to find the value returned by a function.  */
1589
1590static rtx
1591aarch64_function_value (const_tree type, const_tree func,
1592			bool outgoing ATTRIBUTE_UNUSED)
1593{
1594  machine_mode mode;
1595  int unsignedp;
1596  int count;
1597  machine_mode ag_mode;
1598
1599  mode = TYPE_MODE (type);
1600  if (INTEGRAL_TYPE_P (type))
1601    mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1602
1603  if (aarch64_return_in_msb (type))
1604    {
1605      HOST_WIDE_INT size = int_size_in_bytes (type);
1606
1607      if (size % UNITS_PER_WORD != 0)
1608	{
1609	  size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1610	  mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1611	}
1612    }
1613
1614  if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1615					       &ag_mode, &count, NULL))
1616    {
1617      if (!aarch64_composite_type_p (type, mode))
1618	{
1619	  gcc_assert (count == 1 && mode == ag_mode);
1620	  return gen_rtx_REG (mode, V0_REGNUM);
1621	}
1622      else
1623	{
1624	  int i;
1625	  rtx par;
1626
1627	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1628	  for (i = 0; i < count; i++)
1629	    {
1630	      rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1631	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1632				       GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1633	      XVECEXP (par, 0, i) = tmp;
1634	    }
1635	  return par;
1636	}
1637    }
1638  else
1639    return gen_rtx_REG (mode, R0_REGNUM);
1640}
1641
/* Implement TARGET_FUNCTION_VALUE_REGNO_P.
   Return true if REGNO is the number of a hard register in which the
   values of a called function may come back.  */
1645
1646static bool
1647aarch64_function_value_regno_p (const unsigned int regno)
1648{
1649  /* Maximum of 16 bytes can be returned in the general registers.  Examples
1650     of 16-byte return values are: 128-bit integers and 16-byte small
1651     structures (excluding homogeneous floating-point aggregates).  */
1652  if (regno == R0_REGNUM || regno == R1_REGNUM)
1653    return true;
1654
1655  /* Up to four fp/simd registers can return a function value, e.g. a
1656     homogeneous floating-point aggregate having four members.  */
1657  if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1658    return TARGET_FLOAT;
1659
1660  return false;
1661}
1662
1663/* Implement TARGET_RETURN_IN_MEMORY.
1664
1665   If the type T of the result of a function is such that
1666     void func (T arg)
1667   would require that arg be passed as a value in a register (or set of
1668   registers) according to the parameter passing rules, then the result
1669   is returned in the same registers as would be used for such an
1670   argument.  */
1671
1672static bool
1673aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1674{
1675  HOST_WIDE_INT size;
1676  machine_mode ag_mode;
1677  int count;
1678
1679  if (!AGGREGATE_TYPE_P (type)
1680      && TREE_CODE (type) != COMPLEX_TYPE
1681      && TREE_CODE (type) != VECTOR_TYPE)
1682    /* Simple scalar types always returned in registers.  */
1683    return false;
1684
1685  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1686					       type,
1687					       &ag_mode,
1688					       &count,
1689					       NULL))
1690    return false;
1691
1692  /* Types larger than 2 registers returned in memory.  */
1693  size = int_size_in_bytes (type);
1694  return (size < 0 || size > 2 * UNITS_PER_WORD);
1695}
1696
1697static bool
1698aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1699			       const_tree type, int *nregs)
1700{
1701  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1702  return aarch64_vfp_is_call_or_return_candidate (mode,
1703						  type,
1704						  &pcum->aapcs_vfp_rmode,
1705						  nregs,
1706						  NULL);
1707}
1708
1709/* Given MODE and TYPE of a function argument, return the alignment in
1710   bits.  The idea is to suppress any stronger alignment requested by
1711   the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1712   This is a helper function for local use only.  */
1713
1714static unsigned int
1715aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1716{
1717  unsigned int alignment;
1718
1719  if (type)
1720    {
1721      if (!integer_zerop (TYPE_SIZE (type)))
1722	{
1723	  if (TYPE_MODE (type) == mode)
1724	    alignment = TYPE_ALIGN (type);
1725	  else
1726	    alignment = GET_MODE_ALIGNMENT (mode);
1727	}
1728      else
1729	alignment = 0;
1730    }
1731  else
1732    alignment = GET_MODE_ALIGNMENT (mode);
1733
1734  return alignment;
1735}
1736
1737/* Layout a function argument according to the AAPCS64 rules.  The rule
1738   numbers refer to the rule numbers in the AAPCS64.  */
1739
1740static void
1741aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1742		    const_tree type,
1743		    bool named ATTRIBUTE_UNUSED)
1744{
1745  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1746  int ncrn, nvrn, nregs;
1747  bool allocate_ncrn, allocate_nvrn;
1748  HOST_WIDE_INT size;
1749
  /* We need to do this only once per argument.  */
1751  if (pcum->aapcs_arg_processed)
1752    return;
1753
1754  pcum->aapcs_arg_processed = true;
1755
  /* Size in bytes, rounded up to the nearest multiple of 8 bytes.  */
1757  size
1758    = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1759			UNITS_PER_WORD);
1760
1761  allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1762  allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1763						 mode,
1764						 type,
1765						 &nregs);
1766
  /* allocate_ncrn may be a false positive, but allocate_nvrn is quite
     reliable.  The following code thus handles passing by SIMD/FP
     registers first.  */
1769
1770  nvrn = pcum->aapcs_nvrn;
1771
  /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
     and homogeneous short-vector aggregates (HVA).  */
1774  if (allocate_nvrn)
1775    {
1776      if (nvrn + nregs <= NUM_FP_ARG_REGS)
1777	{
1778	  pcum->aapcs_nextnvrn = nvrn + nregs;
1779	  if (!aarch64_composite_type_p (type, mode))
1780	    {
1781	      gcc_assert (nregs == 1);
1782	      pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1783	    }
1784	  else
1785	    {
1786	      rtx par;
1787	      int i;
1788	      par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1789	      for (i = 0; i < nregs; i++)
1790		{
1791		  rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1792					 V0_REGNUM + nvrn + i);
1793		  tmp = gen_rtx_EXPR_LIST
1794		    (VOIDmode, tmp,
1795		     GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1796		  XVECEXP (par, 0, i) = tmp;
1797		}
1798	      pcum->aapcs_reg = par;
1799	    }
1800	  return;
1801	}
1802      else
1803	{
1804	  /* C.3 NSRN is set to 8.  */
1805	  pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1806	  goto on_stack;
1807	}
1808    }
1809
1810  ncrn = pcum->aapcs_ncrn;
1811  nregs = size / UNITS_PER_WORD;
1812
  /* C6 - C9, though the sign and zero extension semantics are
     handled elsewhere.  This is the case where the argument fits
     entirely in general registers.  */
1816  if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1817    {
1818      unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1819
1820      gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1821
1822      /* C.8 if the argument has an alignment of 16 then the NGRN is
1823         rounded up to the next even number.  */
1824      if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1825	{
1826	  ++ncrn;
1827	  gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1828	}
1829      /* NREGS can be 0 when e.g. an empty structure is to be passed.
1830         A reg is still generated for it, but the caller should be smart
1831	 enough not to use it.  */
1832      if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1833	{
1834	  pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1835	}
1836      else
1837	{
1838	  rtx par;
1839	  int i;
1840
1841	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1842	  for (i = 0; i < nregs; i++)
1843	    {
1844	      rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1845	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1846				       GEN_INT (i * UNITS_PER_WORD));
1847	      XVECEXP (par, 0, i) = tmp;
1848	    }
1849	  pcum->aapcs_reg = par;
1850	}
1851
1852      pcum->aapcs_nextncrn = ncrn + nregs;
1853      return;
1854    }
1855
1856  /* C.11  */
1857  pcum->aapcs_nextncrn = NUM_ARG_REGS;
1858
  /* The argument is passed on the stack; record the number of words
     needed for this argument and align the total size if necessary.  */
1861on_stack:
1862  pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1863  if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1864    pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1865					       16 / UNITS_PER_WORD);
1866  return;
1867}
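
/* For illustration, assuming no earlier arguments have been laid out:

     double                   -> V0 (allocate_nvrn, nregs == 1)
     struct { double x, y; }  -> an HFA; (V0, V1) wrapped in a PARALLEL
     __int128                 -> X0/X1; its 16-byte alignment first rounds
                                 the NGRN up to an even number (rule C.8)
     struct { char c[24]; }   -> larger than 16 bytes and not an HFA, so it
                                 is passed by reference elsewhere and only
                                 a pointer reaches this code.  */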
1868
1869/* Implement TARGET_FUNCTION_ARG.  */
1870
1871static rtx
1872aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1873		      const_tree type, bool named)
1874{
1875  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1876  gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1877
1878  if (mode == VOIDmode)
1879    return NULL_RTX;
1880
1881  aarch64_layout_arg (pcum_v, mode, type, named);
1882  return pcum->aapcs_reg;
1883}
1884
1885void
1886aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1887			   const_tree fntype ATTRIBUTE_UNUSED,
1888			   rtx libname ATTRIBUTE_UNUSED,
1889			   const_tree fndecl ATTRIBUTE_UNUSED,
1890			   unsigned n_named ATTRIBUTE_UNUSED)
1891{
1892  pcum->aapcs_ncrn = 0;
1893  pcum->aapcs_nvrn = 0;
1894  pcum->aapcs_nextncrn = 0;
1895  pcum->aapcs_nextnvrn = 0;
1896  pcum->pcs_variant = ARM_PCS_AAPCS64;
1897  pcum->aapcs_reg = NULL_RTX;
1898  pcum->aapcs_arg_processed = false;
1899  pcum->aapcs_stack_words = 0;
1900  pcum->aapcs_stack_size = 0;
1901
1902  return;
1903}
1904
1905static void
1906aarch64_function_arg_advance (cumulative_args_t pcum_v,
1907			      machine_mode mode,
1908			      const_tree type,
1909			      bool named)
1910{
1911  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1912  if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1913    {
1914      aarch64_layout_arg (pcum_v, mode, type, named);
1915      gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1916		  != (pcum->aapcs_stack_words != 0));
1917      pcum->aapcs_arg_processed = false;
1918      pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1919      pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1920      pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1921      pcum->aapcs_stack_words = 0;
1922      pcum->aapcs_reg = NULL_RTX;
1923    }
1924}
1925
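/* Implement FUNCTION_ARG_REGNO_P.  Return true if REGNO is a hard register
   in which function arguments may be passed, i.e. one of the first eight
   X or V argument registers.  */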
1926bool
1927aarch64_function_arg_regno_p (unsigned regno)
1928{
1929  return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1930	  || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1931}
1932
1933/* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
1934   PARM_BOUNDARY bits of alignment, but will be given anything up
1935   to STACK_BOUNDARY bits if the type requires it.  This makes sure
1936   that both before and after the layout of each argument, the Next
1937   Stacked Argument Address (NSAA) will have a minimum alignment of
1938   8 bytes.  */
1939
1940static unsigned int
1941aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1942{
1943  unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1944
1945  if (alignment < PARM_BOUNDARY)
1946    alignment = PARM_BOUNDARY;
1947  if (alignment > STACK_BOUNDARY)
1948    alignment = STACK_BOUNDARY;
1949  return alignment;
1950}
1951
1952/* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1953
1954   Return true if an argument passed on the stack should be padded upwards,
1955   i.e. if the least-significant byte of the stack slot has useful data.
1956
1957   Small aggregate types are placed in the lowest memory address.
1958
1959   The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
1960
1961bool
1962aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1963{
1964  /* On little-endian targets, the least significant byte of every stack
1965     argument is passed at the lowest byte address of the stack slot.  */
1966  if (!BYTES_BIG_ENDIAN)
1967    return true;
1968
1969  /* Otherwise, integral, floating-point and pointer types are padded downward:
1970     the least significant byte of a stack argument is passed at the highest
1971     byte address of the stack slot.  */
1972  if (type
1973      ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1974	 || POINTER_TYPE_P (type))
1975      : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1976    return false;
1977
1978  /* Everything else is padded upward, i.e. data in first byte of stack slot.  */
1979  return true;
1980}
1981
1982/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1983
1984   It specifies the padding for the last element (which may also be
1985   the only element) of a block move between registers and memory.
1986   Assuming the block is in memory, padding upward means that the
1987   last element is padded after its most significant byte, while
1988   with downward padding the last element is padded on its least
1989   significant byte side.
1990
1991   Small aggregates and small complex types are always padded
1992   upwards.
1993
1994   We don't need to worry about homogeneous floating-point or
1995   short-vector aggregates; their move is not affected by the
1996   padding direction determined here.  Regardless of endianness,
1997   each element of such an aggregate is put in the least
1998   significant bits of a fp/simd register.
1999
2000   Return !BYTES_BIG_ENDIAN if the least significant byte of the
2001   register has useful data, and return the opposite if the most
2002   significant byte does.  */
2003
2004bool
2005aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2006		     bool first ATTRIBUTE_UNUSED)
2007{
2008
2009  /* Small composite types are always padded upward.  */
2010  if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2011    {
2012      HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2013			    : GET_MODE_SIZE (mode));
2014      if (size < 2 * UNITS_PER_WORD)
2015	return true;
2016    }
2017
2018  /* Otherwise, use the default padding.  */
2019  return !BYTES_BIG_ENDIAN;
2020}
2021
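/* Implement TARGET_LIBGCC_CMP_RETURN_MODE: libgcc comparison routines
   return their result in SImode.  */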
2022static machine_mode
2023aarch64_libgcc_cmp_return_mode (void)
2024{
2025  return SImode;
2026}
2027
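/* Implement TARGET_FRAME_POINTER_REQUIRED.  Return true if the current
   function must establish a frame pointer.  */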
2028static bool
2029aarch64_frame_pointer_required (void)
2030{
2031  /* In aarch64_override_options_after_change
2032     flag_omit_leaf_frame_pointer turns off the frame pointer by
2033     default.  Turn it back on now if we've not got a leaf
2034     function.  */
2035  if (flag_omit_leaf_frame_pointer
2036      && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2037    return true;
2038
2039  return false;
2040}
2041
2042/* Mark the registers that need to be saved by the callee and calculate
2043   the size of the callee-saved registers area and frame record (both FP
2044   and LR may be omitted).  */
2045static void
2046aarch64_layout_frame (void)
2047{
2048  HOST_WIDE_INT offset = 0;
2049  int regno;
2050
2051  if (reload_completed && cfun->machine->frame.laid_out)
2052    return;
2053
2054#define SLOT_NOT_REQUIRED (-2)
2055#define SLOT_REQUIRED     (-1)
2056
2057  cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2058  cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2059
2060  /* First mark all the registers that really need to be saved...  */
2061  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2062    cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2063
2064  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2065    cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2066
2067  /* ... that includes the eh data registers (if needed)...  */
2068  if (crtl->calls_eh_return)
2069    for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2070      cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2071	= SLOT_REQUIRED;
2072
2073  /* ... and any callee saved register that dataflow says is live.  */
2074  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2075    if (df_regs_ever_live_p (regno)
2076	&& (regno == R30_REGNUM
2077	    || !call_used_regs[regno]))
2078      cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2079
2080  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2081    if (df_regs_ever_live_p (regno)
2082	&& !call_used_regs[regno])
2083      cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2084
2085  if (frame_pointer_needed)
2086    {
2087      /* FP and LR are placed in the linkage record.  */
2088      cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2089      cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2090      cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2091      cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2092      cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2093      offset += 2 * UNITS_PER_WORD;
2094    }
2095
2096  /* Now assign stack slots for them.  */
2097  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2098    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2099      {
2100	cfun->machine->frame.reg_offset[regno] = offset;
2101	if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2102	  cfun->machine->frame.wb_candidate1 = regno;
2103	else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2104	  cfun->machine->frame.wb_candidate2 = regno;
2105	offset += UNITS_PER_WORD;
2106      }
2107
2108  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2109    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2110      {
2111	cfun->machine->frame.reg_offset[regno] = offset;
2112	if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2113	  cfun->machine->frame.wb_candidate1 = regno;
2114	else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2115		 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2116	  cfun->machine->frame.wb_candidate2 = regno;
2117	offset += UNITS_PER_WORD;
2118      }
2119
2120  cfun->machine->frame.padding0 =
2121    (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2122  offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2123
2124  cfun->machine->frame.saved_regs_size = offset;
2125
2126  cfun->machine->frame.hard_fp_offset
2127    = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2128			+ get_frame_size ()
2129			+ cfun->machine->frame.saved_regs_size,
2130			STACK_BOUNDARY / BITS_PER_UNIT);
2131
2132  cfun->machine->frame.frame_size
2133    = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2134			+ crtl->outgoing_args_size,
2135			STACK_BOUNDARY / BITS_PER_UNIT);
2136
2137  cfun->machine->frame.laid_out = true;
2138}
2139
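/* Return true if REGNO was assigned a save slot by aarch64_layout_frame
   and so must be saved on entry to the current function.  */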
2140static bool
2141aarch64_register_saved_on_entry (int regno)
2142{
2143  return cfun->machine->frame.reg_offset[regno] >= 0;
2144}
2145
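/* Return the lowest register number in the range [REGNO, LIMIT] that needs
   to be saved on entry, or LIMIT + 1 if there is no such register.  */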
2146static unsigned
2147aarch64_next_callee_save (unsigned regno, unsigned limit)
2148{
2149  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2150    regno++;
2151  return regno;
2152}
2153
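/* Push register REGNO, with mode MODE, using a store with writeback that
   pre-decrements the stack pointer by ADJUSTMENT bytes, and mark the store
   as frame related.  */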
2154static void
2155aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2156			   HOST_WIDE_INT adjustment)
2157{
2158  rtx base_rtx = stack_pointer_rtx;
2159  rtx insn, reg, mem;
2160
2161  reg = gen_rtx_REG (mode, regno);
2162  mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2163			    plus_constant (Pmode, base_rtx, -adjustment));
2164  mem = gen_rtx_MEM (mode, mem);
2165
2166  insn = emit_move_insn (mem, reg);
2167  RTX_FRAME_RELATED_P (insn) = 1;
2168}
2169
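/* Generate a writeback store-pair instruction that stores REG and REG2
   while pre-decrementing BASE by ADJUSTMENT bytes.  */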
2170static rtx
2171aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2172			  HOST_WIDE_INT adjustment)
2173{
2174  switch (mode)
2175    {
2176    case DImode:
2177      return gen_storewb_pairdi_di (base, base, reg, reg2,
2178				    GEN_INT (-adjustment),
2179				    GEN_INT (UNITS_PER_WORD - adjustment));
2180    case DFmode:
2181      return gen_storewb_pairdf_di (base, base, reg, reg2,
2182				    GEN_INT (-adjustment),
2183				    GEN_INT (UNITS_PER_WORD - adjustment));
2184    default:
2185      gcc_unreachable ();
2186    }
2187}
2188
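/* Push the pair of registers REGNO1 and REGNO2, with mode MODE, using a
   store-pair with writeback that decrements the stack pointer by ADJUSTMENT
   bytes, and mark the stores as frame related.  */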
2189static void
2190aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2191			 unsigned regno2, HOST_WIDE_INT adjustment)
2192{
2193  rtx_insn *insn;
2194  rtx reg1 = gen_rtx_REG (mode, regno1);
2195  rtx reg2 = gen_rtx_REG (mode, regno2);
2196
2197  insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2198					      reg2, adjustment));
2199  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2200  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2201  RTX_FRAME_RELATED_P (insn) = 1;
2202}
2203
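/* Generate a writeback load-pair instruction that loads REG and REG2 and
   post-increments BASE by ADJUSTMENT bytes.  */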
2204static rtx
2205aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2206			 HOST_WIDE_INT adjustment)
2207{
2208  switch (mode)
2209    {
2210    case DImode:
2211      return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2212				   GEN_INT (UNITS_PER_WORD));
2213    case DFmode:
2214      return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2215				   GEN_INT (UNITS_PER_WORD));
2216    default:
2217      gcc_unreachable ();
2218    }
2219}
2220
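/* Generate a store-pair instruction that stores REG1 to MEM1 and REG2 to
   MEM2, for registers of mode MODE.  */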
2221static rtx
2222aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2223			rtx reg2)
2224{
2225  switch (mode)
2226    {
2227    case DImode:
2228      return gen_store_pairdi (mem1, reg1, mem2, reg2);
2229
2230    case DFmode:
2231      return gen_store_pairdf (mem1, reg1, mem2, reg2);
2232
2233    default:
2234      gcc_unreachable ();
2235    }
2236}
2237
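/* Generate a load-pair instruction that loads MEM1 into REG1 and MEM2 into
   REG2, for registers of mode MODE.  */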
2238static rtx
2239aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2240		       rtx mem2)
2241{
2242  switch (mode)
2243    {
2244    case DImode:
2245      return gen_load_pairdi (reg1, mem1, reg2, mem2);
2246
2247    case DFmode:
2248      return gen_load_pairdf (reg1, mem1, reg2, mem2);
2249
2250    default:
2251      gcc_unreachable ();
2252    }
2253}
2254
2255
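/* Emit code to save the callee-saved registers of mode MODE in the range
   [START, LIMIT], storing them at START_OFFSET from the stack pointer and
   pairing adjacent slots into store-pair instructions where possible.
   Registers chosen as writeback candidates are skipped when SKIP_WB is
   true.  */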
2256static void
2257aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2258			   unsigned start, unsigned limit, bool skip_wb)
2259{
2260  rtx_insn *insn;
2261  rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2262						 ? gen_frame_mem : gen_rtx_MEM);
2263  unsigned regno;
2264  unsigned regno2;
2265
2266  for (regno = aarch64_next_callee_save (start, limit);
2267       regno <= limit;
2268       regno = aarch64_next_callee_save (regno + 1, limit))
2269    {
2270      rtx reg, mem;
2271      HOST_WIDE_INT offset;
2272
2273      if (skip_wb
2274	  && (regno == cfun->machine->frame.wb_candidate1
2275	      || regno == cfun->machine->frame.wb_candidate2))
2276	continue;
2277
2278      reg = gen_rtx_REG (mode, regno);
2279      offset = start_offset + cfun->machine->frame.reg_offset[regno];
2280      mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2281					      offset));
2282
2283      regno2 = aarch64_next_callee_save (regno + 1, limit);
2284
2285      if (regno2 <= limit
2286	  && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2287	      == cfun->machine->frame.reg_offset[regno2]))
2288
2289	{
2290	  rtx reg2 = gen_rtx_REG (mode, regno2);
2291	  rtx mem2;
2292
2293	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2294	  mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2295						   offset));
2296	  insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2297						    reg2));
2298
2299	  /* The first part of a frame-related parallel insn is
2300	     always assumed to be relevant to the frame
2301	     calculations; subsequent parts are only
2302	     frame-related if explicitly marked.  */
2303	  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2304	  regno = regno2;
2305	}
2306      else
2307	insn = emit_move_insn (mem, reg);
2308
2309      RTX_FRAME_RELATED_P (insn) = 1;
2310    }
2311}
2312
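/* Emit code to restore the callee-saved registers of mode MODE in the range
   [START, LIMIT] from START_OFFSET off the stack pointer, pairing adjacent
   slots into load-pair instructions where possible.  A REG_CFA_RESTORE note
   is accumulated in *CFI_OPS for every restored register.  Registers chosen
   as writeback candidates are skipped when SKIP_WB is true.  */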
2313static void
2314aarch64_restore_callee_saves (machine_mode mode,
2315			      HOST_WIDE_INT start_offset, unsigned start,
2316			      unsigned limit, bool skip_wb, rtx *cfi_ops)
2317{
2318  rtx base_rtx = stack_pointer_rtx;
2319  rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2320						 ? gen_frame_mem : gen_rtx_MEM);
2321  unsigned regno;
2322  unsigned regno2;
2323  HOST_WIDE_INT offset;
2324
2325  for (regno = aarch64_next_callee_save (start, limit);
2326       regno <= limit;
2327       regno = aarch64_next_callee_save (regno + 1, limit))
2328    {
2329      rtx reg, mem;
2330
2331      if (skip_wb
2332	  && (regno == cfun->machine->frame.wb_candidate1
2333	      || regno == cfun->machine->frame.wb_candidate2))
2334	continue;
2335
2336      reg = gen_rtx_REG (mode, regno);
2337      offset = start_offset + cfun->machine->frame.reg_offset[regno];
2338      mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2339
2340      regno2 = aarch64_next_callee_save (regno + 1, limit);
2341
2342      if (regno2 <= limit
2343	  && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2344	      == cfun->machine->frame.reg_offset[regno2]))
2345	{
2346	  rtx reg2 = gen_rtx_REG (mode, regno2);
2347	  rtx mem2;
2348
2349	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2350	  mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2351	  emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2352
2353	  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2354	  regno = regno2;
2355	}
2356      else
2357	emit_move_insn (reg, mem);
2358      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2359    }
2360}
2361
2362/* AArch64 stack frames generated by this compiler look like:
2363
2364	+-------------------------------+
2365	|                               |
2366	|  incoming stack arguments     |
2367	|                               |
2368	+-------------------------------+
2369	|                               | <-- incoming stack pointer (aligned)
2370	|  callee-allocated save area   |
2371	|  for register varargs         |
2372	|                               |
2373	+-------------------------------+
2374	|  local variables              | <-- frame_pointer_rtx
2375	|                               |
2376	+-------------------------------+
2377	|  padding0                     | \
2378	+-------------------------------+  |
2379	|  callee-saved registers       |  | frame.saved_regs_size
2380	+-------------------------------+  |
2381	|  LR'                          |  |
2382	+-------------------------------+  |
2383	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
2384        +-------------------------------+
2385	|  dynamic allocation           |
2386	+-------------------------------+
2387	|  padding                      |
2388	+-------------------------------+
2389	|  outgoing stack arguments     | <-- arg_pointer
2390        |                               |
2391	+-------------------------------+
2392	|                               | <-- stack_pointer_rtx (aligned)
2393
2394   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2395   but leave frame_pointer_rtx and hard_frame_pointer_rtx
2396   unchanged.  */
2397
2398/* Generate the prologue instructions for entry into a function.
2399   Establish the stack frame by decreasing the stack pointer with a
2400   properly calculated size and, if necessary, create a frame record
2401   filled with the values of LR and previous frame pointer.  The
2402   current FP is also set up if it is in use.  */
2403
2404void
2405aarch64_expand_prologue (void)
2406{
2407  /* sub sp, sp, #<frame_size>
2408     stp {fp, lr}, [sp, #<frame_size> - 16]
2409     add fp, sp, #<frame_size> - hardfp_offset
2410     stp {cs_reg}, [fp, #-16] etc.
2411
2412     sub sp, sp, <final_adjustment_if_any>
2413  */
2414  HOST_WIDE_INT frame_size, offset;
2415  HOST_WIDE_INT fp_offset;		/* Offset from hard FP to SP.  */
2416  HOST_WIDE_INT hard_fp_offset;
2417  rtx_insn *insn;
2418
2419  aarch64_layout_frame ();
2420
2421  offset = frame_size = cfun->machine->frame.frame_size;
2422  hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2423  fp_offset = frame_size - hard_fp_offset;
2424
2425  if (flag_stack_usage_info)
2426    current_function_static_stack_size = frame_size;
2427
2428  /* Store pairs and load pairs have a range of only -512 to 504.  */
2429  if (offset >= 512)
2430    {
2431      /* When the frame has a large size, an initial decrease is done on
2432	 the stack pointer to jump over the callee-allocated save area for
2433	 register varargs, the local variable area and/or the callee-saved
2434	 register area.  This will allow the pre-index write-back
2435	 store pair instructions to be used for setting up the stack frame
2436	 efficiently.  */
2437      offset = hard_fp_offset;
2438      if (offset >= 512)
2439	offset = cfun->machine->frame.saved_regs_size;
2440
2441      frame_size -= (offset + crtl->outgoing_args_size);
2442      fp_offset = 0;
2443
2444      if (frame_size >= 0x1000000)
2445	{
2446	  rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2447	  emit_move_insn (op0, GEN_INT (-frame_size));
2448	  insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2449
2450	  add_reg_note (insn, REG_CFA_ADJUST_CFA,
2451			gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2452				     plus_constant (Pmode, stack_pointer_rtx,
2453						    -frame_size)));
2454	  RTX_FRAME_RELATED_P (insn) = 1;
2455	}
2456      else if (frame_size > 0)
2457	{
2458	  int hi_ofs = frame_size & 0xfff000;
2459	  int lo_ofs = frame_size & 0x000fff;
2460
2461	  if (hi_ofs)
2462	    {
2463	      insn = emit_insn (gen_add2_insn
2464				(stack_pointer_rtx, GEN_INT (-hi_ofs)));
2465	      RTX_FRAME_RELATED_P (insn) = 1;
2466	    }
2467	  if (lo_ofs)
2468	    {
2469	      insn = emit_insn (gen_add2_insn
2470				(stack_pointer_rtx, GEN_INT (-lo_ofs)));
2471	      RTX_FRAME_RELATED_P (insn) = 1;
2472	    }
2473	}
2474    }
2475  else
2476    frame_size = -1;
2477
2478  if (offset > 0)
2479    {
2480      bool skip_wb = false;
2481
2482      if (frame_pointer_needed)
2483	{
2484	  skip_wb = true;
2485
2486	  if (fp_offset)
2487	    {
2488	      insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2489					       GEN_INT (-offset)));
2490	      RTX_FRAME_RELATED_P (insn) = 1;
2491
2492	      aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2493					 R30_REGNUM, false);
2494	    }
2495	  else
2496	    aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2497
2498	  /* Set up frame pointer to point to the location of the
2499	     previous frame pointer on the stack.  */
2500	  insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2501					   stack_pointer_rtx,
2502					   GEN_INT (fp_offset)));
2503	  RTX_FRAME_RELATED_P (insn) = 1;
2504	  emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2505	}
2506      else
2507	{
2508	  unsigned reg1 = cfun->machine->frame.wb_candidate1;
2509	  unsigned reg2 = cfun->machine->frame.wb_candidate2;
2510
2511	  if (fp_offset
2512	      || reg1 == FIRST_PSEUDO_REGISTER
2513	      || (reg2 == FIRST_PSEUDO_REGISTER
2514		  && offset >= 256))
2515	    {
2516	      insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2517					       GEN_INT (-offset)));
2518	      RTX_FRAME_RELATED_P (insn) = 1;
2519	    }
2520	  else
2521	    {
2522	      machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2523
2524	      skip_wb = true;
2525
2526	      if (reg2 == FIRST_PSEUDO_REGISTER)
2527		aarch64_pushwb_single_reg (mode1, reg1, offset);
2528	      else
2529		aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2530	    }
2531	}
2532
2533      aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2534				 skip_wb);
2535      aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2536				 skip_wb);
2537    }
2538
2539  /* When offset >= 512,
2540     sub sp, sp, #<outgoing_args_size> */
2541  if (frame_size > -1)
2542    {
2543      if (crtl->outgoing_args_size > 0)
2544	{
2545	  insn = emit_insn (gen_add2_insn
2546			    (stack_pointer_rtx,
2547			     GEN_INT (- crtl->outgoing_args_size)));
2548	  RTX_FRAME_RELATED_P (insn) = 1;
2549	}
2550    }
2551}
2552
2553/* Return TRUE if we can use a simple_return insn.
2554
2555   This function checks whether the frame is empty, which implies that
2556   no restore actions are needed.  The pro_and_epilogue pass uses this to
2557   check whether the shrink-wrapping optimization is feasible.  */
2558
2559bool
2560aarch64_use_return_insn_p (void)
2561{
2562  if (!reload_completed)
2563    return false;
2564
2565  if (crtl->profile)
2566    return false;
2567
2568  aarch64_layout_frame ();
2569
2570  return cfun->machine->frame.frame_size == 0;
2571}
2572
2573/* Generate the epilogue instructions for returning from a function.  */
2574void
2575aarch64_expand_epilogue (bool for_sibcall)
2576{
2577  HOST_WIDE_INT frame_size, offset;
2578  HOST_WIDE_INT fp_offset;
2579  HOST_WIDE_INT hard_fp_offset;
2580  rtx_insn *insn;
2581  /* We need a memory barrier to prevent reads from the deallocated stack.  */
2582  bool need_barrier_p = (get_frame_size () != 0
2583			 || cfun->machine->frame.saved_varargs_size);
2584
2585  aarch64_layout_frame ();
2586
2587  offset = frame_size = cfun->machine->frame.frame_size;
2588  hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2589  fp_offset = frame_size - hard_fp_offset;
2590
2591  /* Store pairs and load pairs have a range of only -512 to 504.  */
2592  if (offset >= 512)
2593    {
2594      offset = hard_fp_offset;
2595      if (offset >= 512)
2596	offset = cfun->machine->frame.saved_regs_size;
2597
2598      frame_size -= (offset + crtl->outgoing_args_size);
2599      fp_offset = 0;
2600      if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2601	{
2602	  insn = emit_insn (gen_add2_insn
2603			    (stack_pointer_rtx,
2604			     GEN_INT (crtl->outgoing_args_size)));
2605	  RTX_FRAME_RELATED_P (insn) = 1;
2606	}
2607    }
2608  else
2609    frame_size = -1;
2610
2611  /* If there were outgoing arguments or we've done dynamic stack
2612     allocation, then restore the stack pointer from the frame
2613     pointer.  This is at most one insn and more efficient than using
2614     GCC's internal mechanism.  */
2615  if (frame_pointer_needed
2616      && (crtl->outgoing_args_size || cfun->calls_alloca))
2617    {
2618      if (cfun->calls_alloca)
2619	emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2620
2621      insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2622				       hard_frame_pointer_rtx,
2623				       GEN_INT (0)));
2624      offset = offset - fp_offset;
2625    }
2626
2627  if (offset > 0)
2628    {
2629      unsigned reg1 = cfun->machine->frame.wb_candidate1;
2630      unsigned reg2 = cfun->machine->frame.wb_candidate2;
2631      bool skip_wb = true;
2632      rtx cfi_ops = NULL;
2633
2634      if (frame_pointer_needed)
2635	fp_offset = 0;
2636      else if (fp_offset
2637	       || reg1 == FIRST_PSEUDO_REGISTER
2638	       || (reg2 == FIRST_PSEUDO_REGISTER
2639		   && offset >= 256))
2640	skip_wb = false;
2641
2642      aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2643				    skip_wb, &cfi_ops);
2644      aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2645				    skip_wb, &cfi_ops);
2646
2647      if (need_barrier_p)
2648	emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2649
2650      if (skip_wb)
2651	{
2652	  machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2653	  rtx rreg1 = gen_rtx_REG (mode1, reg1);
2654
2655	  cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2656	  if (reg2 == FIRST_PSEUDO_REGISTER)
2657	    {
2658	      rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2659	      mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2660	      mem = gen_rtx_MEM (mode1, mem);
2661	      insn = emit_move_insn (rreg1, mem);
2662	    }
2663	  else
2664	    {
2665	      rtx rreg2 = gen_rtx_REG (mode1, reg2);
2666
2667	      cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2668	      insn = emit_insn (aarch64_gen_loadwb_pair
2669				(mode1, stack_pointer_rtx, rreg1,
2670				 rreg2, offset));
2671	    }
2672	}
2673      else
2674	{
2675	  insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2676					   GEN_INT (offset)));
2677	}
2678
2679      /* Reset the CFA to be SP + FRAME_SIZE.  */
2680      rtx new_cfa = stack_pointer_rtx;
2681      if (frame_size > 0)
2682	new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2683      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2684      REG_NOTES (insn) = cfi_ops;
2685      RTX_FRAME_RELATED_P (insn) = 1;
2686    }
2687
2688  if (frame_size > 0)
2689    {
2690      if (need_barrier_p)
2691	emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2692
2693      if (frame_size >= 0x1000000)
2694	{
2695	  rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2696	  emit_move_insn (op0, GEN_INT (frame_size));
2697	  insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2698	}
2699      else
2700	{
2701          int hi_ofs = frame_size & 0xfff000;
2702          int lo_ofs = frame_size & 0x000fff;
2703
2704	  if (hi_ofs && lo_ofs)
2705	    {
2706	      insn = emit_insn (gen_add2_insn
2707				(stack_pointer_rtx, GEN_INT (hi_ofs)));
2708	      RTX_FRAME_RELATED_P (insn) = 1;
2709	      frame_size = lo_ofs;
2710	    }
2711	  insn = emit_insn (gen_add2_insn
2712			    (stack_pointer_rtx, GEN_INT (frame_size)));
2713	}
2714
2715      /* Reset the CFA to be SP + 0.  */
2716      add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2717      RTX_FRAME_RELATED_P (insn) = 1;
2718    }
2719
2720  /* Stack adjustment for exception handler.  */
2721  if (crtl->calls_eh_return)
2722    {
2723      /* We need to unwind the stack by the offset computed by
2724	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
2725	 to be SP; letting the CFA move during this adjustment
2726	 is just as correct as retaining the CFA from the body
2727	 of the function.  Therefore, do nothing special.  */
2728      emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2729    }
2730
2731  emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2732  if (!for_sibcall)
2733    emit_jump_insn (ret_rtx);
2734}
2735
2736/* Return the place to copy the exception unwinding return address to.
2737   This will probably be a stack slot, but could (in theory) be the
2738   return register.  */
2739rtx
2740aarch64_final_eh_return_addr (void)
2741{
2742  HOST_WIDE_INT fp_offset;
2743
2744  aarch64_layout_frame ();
2745
2746  fp_offset = cfun->machine->frame.frame_size
2747	      - cfun->machine->frame.hard_fp_offset;
2748
2749  if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2750    return gen_rtx_REG (DImode, LR_REGNUM);
2751
2752  /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2.  This can
2753     result in a store to save LR introduced by builtin_eh_return () being
2754     incorrectly deleted because the alias is not detected.
2755     So in the calculation of the address to copy the exception unwinding
2756     return address to, we note 2 cases.
2757     If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2758     we return a SP-relative location since all the addresses are SP-relative
2759     in this case.  This prevents the store from being optimized away.
2760     If the fp_offset is not 0, then the addresses will be FP-relative and
2761     therefore we return a FP-relative location.  */
2762
2763  if (frame_pointer_needed)
2764    {
2765      if (fp_offset)
2766        return gen_frame_mem (DImode,
2767			      plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2768      else
2769        return gen_frame_mem (DImode,
2770			      plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2771    }
2772
2773  /* If FP is not needed, we calculate the location of LR, which would be
2774     at the top of the saved registers block.  */
2775
2776  return gen_frame_mem (DImode,
2777			plus_constant (Pmode,
2778				       stack_pointer_rtx,
2779				       fp_offset
2780				       + cfun->machine->frame.saved_regs_size
2781				       - 2 * UNITS_PER_WORD));
2782}
2783
2784/* Possibly output code to build up a constant in a register.  For
2785   the benefit of the costs infrastructure, returns the number of
2786   instructions which would be emitted.  GENERATE inhibits or
2787   enables code generation.  */
2788
2789static int
2790aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2791{
2792  int insns = 0;
2793
2794  if (aarch64_bitmask_imm (val, DImode))
2795    {
2796      if (generate)
2797	emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2798      insns = 1;
2799    }
2800  else
2801    {
2802      int i;
2803      int ncount = 0;
2804      int zcount = 0;
2805      HOST_WIDE_INT valp = val >> 16;
2806      HOST_WIDE_INT valm;
2807      HOST_WIDE_INT tval;
2808
2809      for (i = 16; i < 64; i += 16)
2810	{
2811	  valm = (valp & 0xffff);
2812
2813	  if (valm != 0)
2814	    ++ zcount;
2815
2816	  if (valm != 0xffff)
2817	    ++ ncount;
2818
2819	  valp >>= 16;
2820	}
2821
2822      /* zcount contains the number of additional MOVK instructions
2823	 required if the constant is built up with an initial MOVZ instruction,
2824	 while ncount is the number of MOVK instructions required if starting
2825	 with a MOVN instruction.  Choose the sequence that yields the fewer
2826	 instructions, preferring MOVZ instructions when the two counts are
2827	 the same.  */
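      /* Illustrative example (not from the original source): for the value
	 0x0000cafe00001234 only one of the three upper 16-bit chunks is
	 non-zero, so ZCOUNT is 1 while NCOUNT is 3; the MOVZ-based sequence
	 "movz; movk #0xcafe, lsl #32" therefore wins over a MOVN-based one,
	 which would need three MOVKs.  */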
2828      if (ncount < zcount)
2829	{
2830	  if (generate)
2831	    emit_move_insn (gen_rtx_REG (Pmode, regnum),
2832			    GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2833	  tval = 0xffff;
2834	  insns++;
2835	}
2836      else
2837	{
2838	  if (generate)
2839	    emit_move_insn (gen_rtx_REG (Pmode, regnum),
2840			    GEN_INT (val & 0xffff));
2841	  tval = 0;
2842	  insns++;
2843	}
2844
2845      val >>= 16;
2846
2847      for (i = 16; i < 64; i += 16)
2848	{
2849	  if ((val & 0xffff) != tval)
2850	    {
2851	      if (generate)
2852		emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2853					   GEN_INT (i),
2854					   GEN_INT (val & 0xffff)));
2855	      insns++;
2856	    }
2857	  val >>= 16;
2858	}
2859    }
2860  return insns;
2861}
2862
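/* Add DELTA to the register numbered REGNUM, using the register numbered
   SCRATCHREG as a temporary if DELTA cannot be encoded directly in add/sub
   immediates.  */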
2863static void
2864aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2865{
2866  HOST_WIDE_INT mdelta = delta;
2867  rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2868  rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2869
2870  if (mdelta < 0)
2871    mdelta = -mdelta;
2872
2873  if (mdelta >= 4096 * 4096)
2874    {
2875      (void) aarch64_build_constant (scratchreg, delta, true);
2876      emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2877    }
2878  else if (mdelta > 0)
2879    {
2880      if (mdelta >= 4096)
2881	{
2882	  emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2883	  rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2884	  if (delta < 0)
2885	    emit_insn (gen_rtx_SET (Pmode, this_rtx,
2886				    gen_rtx_MINUS (Pmode, this_rtx, shift)));
2887	  else
2888	    emit_insn (gen_rtx_SET (Pmode, this_rtx,
2889				    gen_rtx_PLUS (Pmode, this_rtx, shift)));
2890	}
2891      if (mdelta % 4096 != 0)
2892	{
2893	  scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2894	  emit_insn (gen_rtx_SET (Pmode, this_rtx,
2895				  gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
2896	}
2897    }
2898}
2899
2900/* Output code to add DELTA to the first argument, and then jump
2901   to FUNCTION.  Used for C++ multiple inheritance.  */
2902static void
2903aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2904			 HOST_WIDE_INT delta,
2905			 HOST_WIDE_INT vcall_offset,
2906			 tree function)
2907{
2908  /* The this pointer is always in x0.  Note that this differs from
2909     Arm where the this pointer may be bumped to r1 if r0 is required
2910     to return a pointer to an aggregate.  On AArch64 a result value
2911     pointer will be in x8.  */
2912  int this_regno = R0_REGNUM;
2913  rtx this_rtx, temp0, temp1, addr, funexp;
2914  rtx_insn *insn;
2915
2916  reload_completed = 1;
2917  emit_note (NOTE_INSN_PROLOGUE_END);
2918
2919  if (vcall_offset == 0)
2920    aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2921  else
2922    {
2923      gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2924
2925      this_rtx = gen_rtx_REG (Pmode, this_regno);
2926      temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2927      temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2928
2929      addr = this_rtx;
2930      if (delta != 0)
2931	{
2932	  if (delta >= -256 && delta < 256)
2933	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2934				       plus_constant (Pmode, this_rtx, delta));
2935	  else
2936	    aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2937	}
2938
2939      if (Pmode == ptr_mode)
2940	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2941      else
2942	aarch64_emit_move (temp0,
2943			   gen_rtx_ZERO_EXTEND (Pmode,
2944						gen_rtx_MEM (ptr_mode, addr)));
2945
2946      if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2947	  addr = plus_constant (Pmode, temp0, vcall_offset);
2948      else
2949	{
2950	  (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2951	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2952	}
2953
2954      if (Pmode == ptr_mode)
2955	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
2956      else
2957	aarch64_emit_move (temp1,
2958			   gen_rtx_SIGN_EXTEND (Pmode,
2959						gen_rtx_MEM (ptr_mode, addr)));
2960
2961      emit_insn (gen_add2_insn (this_rtx, temp1));
2962    }
2963
2964  /* Generate a tail call to the target function.  */
2965  if (!TREE_USED (function))
2966    {
2967      assemble_external (function);
2968      TREE_USED (function) = 1;
2969    }
2970  funexp = XEXP (DECL_RTL (function), 0);
2971  funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2972  insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2973  SIBLING_CALL_P (insn) = 1;
2974
2975  insn = get_insns ();
2976  shorten_branches (insn);
2977  final_start_function (insn, file, 1);
2978  final (insn, file, 1);
2979  final_end_function ();
2980
2981  /* Stop pretending to be a post-reload pass.  */
2982  reload_completed = 0;
2983}
2984
2985static bool
2986aarch64_tls_referenced_p (rtx x)
2987{
2988  if (!TARGET_HAVE_TLS)
2989    return false;
2990  subrtx_iterator::array_type array;
2991  FOR_EACH_SUBRTX (iter, array, x, ALL)
2992    {
2993      const_rtx x = *iter;
2994      if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
2995	return true;
2996      /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2997	 TLS offsets, not real symbol references.  */
2998      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
2999	iter.skip_subrtxes ();
3000    }
3001  return false;
3002}
3003
3004
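/* qsort/bsearch comparison function for the table of bitmask immediates.  */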
3005static int
3006aarch64_bitmasks_cmp (const void *i1, const void *i2)
3007{
3008  const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3009  const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3010
3011  if (*imm1 < *imm2)
3012    return -1;
3013  if (*imm1 > *imm2)
3014    return +1;
3015  return 0;
3016}
3017
3018
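/* Build aarch64_bitmasks, the sorted table of every value that is a valid
   AArch64 bitmask immediate: each rotation of a run of S consecutive set
   bits within an element of E bits, replicated to fill 64 bits, for every
   element size E from 2 to 64.  */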
3019static void
3020aarch64_build_bitmask_table (void)
3021{
3022  unsigned HOST_WIDE_INT mask, imm;
3023  unsigned int log_e, e, s, r;
3024  unsigned int nimms = 0;
3025
3026  for (log_e = 1; log_e <= 6; log_e++)
3027    {
3028      e = 1 << log_e;
3029      if (e == 64)
3030	mask = ~(HOST_WIDE_INT) 0;
3031      else
3032	mask = ((HOST_WIDE_INT) 1 << e) - 1;
3033      for (s = 1; s < e; s++)
3034	{
3035	  for (r = 0; r < e; r++)
3036	    {
3037	      /* set s consecutive bits to 1 (s < 64) */
3038	      imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3039	      /* rotate right by r */
3040	      if (r != 0)
3041		imm = ((imm >> r) | (imm << (e - r))) & mask;
3042	      /* Replicate the pattern to fill 64 bits; the cases fall through.  */
3043	      switch (log_e) {
3044	      case 1: imm |= (imm <<  2);
3045	      case 2: imm |= (imm <<  4);
3046	      case 3: imm |= (imm <<  8);
3047	      case 4: imm |= (imm << 16);
3048	      case 5: imm |= (imm << 32);
3049	      case 6:
3050		break;
3051	      default:
3052		gcc_unreachable ();
3053	      }
3054	      gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3055	      aarch64_bitmasks[nimms++] = imm;
3056	    }
3057	}
3058    }
3059
3060  gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3061  qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3062	 aarch64_bitmasks_cmp);
3063}
3064
3065
3066/* Return true if val can be encoded as a 12-bit unsigned immediate with
3067   a left shift of 0 or 12 bits.  */
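/* For example (illustrative): 0xabc and 0xabc000 are encodable this way,
   while 0xabc00 is not.  */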
3068bool
3069aarch64_uimm12_shift (HOST_WIDE_INT val)
3070{
3071  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3072	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3073	  );
3074}
3075
3076
3077/* Return true if val is an immediate that can be loaded into a
3078   register by a MOVZ instruction.  */
3079static bool
3080aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3081{
3082  if (GET_MODE_SIZE (mode) > 4)
3083    {
3084      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3085	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3086	return true;
3087    }
3088  else
3089    {
3090      /* Ignore sign extension.  */
3091      val &= (HOST_WIDE_INT) 0xffffffff;
3092    }
3093  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3094	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3095}
3096
3097
3098/* Return true if val is a valid bitmask immediate.  */
3099bool
3100aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3101{
3102  if (GET_MODE_SIZE (mode) < 8)
3103    {
3104      /* Replicate bit pattern.  */
3105      val &= (HOST_WIDE_INT) 0xffffffff;
3106      val |= val << 32;
3107    }
3108  return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3109		  sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3110}
3111
3112
3113/* Return true if val is an immediate that can be loaded into a
3114   register in a single instruction.  */
3115bool
3116aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3117{
3118  if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3119    return true;
3120  return aarch64_bitmask_imm (val, mode);
3121}
3122
3123static bool
3124aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3125{
3126  rtx base, offset;
3127
3128  if (GET_CODE (x) == HIGH)
3129    return true;
3130
3131  split_const (x, &base, &offset);
3132  if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3133    {
3134      if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3135	  != SYMBOL_FORCE_TO_MEM)
3136	return true;
3137      else
3138	/* Avoid generating a 64-bit relocation in ILP32; leave it
3139	   to aarch64_expand_mov_immediate to handle properly.  */
3140	return mode != ptr_mode;
3141    }
3142
3143  return aarch64_tls_referenced_p (x);
3144}
3145
3146/* Return true if register REGNO is a valid index register.
3147   STRICT_P is true if REG_OK_STRICT is in effect.  */
3148
3149bool
3150aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3151{
3152  if (!HARD_REGISTER_NUM_P (regno))
3153    {
3154      if (!strict_p)
3155	return true;
3156
3157      if (!reg_renumber)
3158	return false;
3159
3160      regno = reg_renumber[regno];
3161    }
3162  return GP_REGNUM_P (regno);
3163}
3164
3165/* Return true if register REGNO is a valid base register.
3166   STRICT_P is true if REG_OK_STRICT is in effect.  */
3167
3168bool
3169aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3170{
3171  if (!HARD_REGISTER_NUM_P (regno))
3172    {
3173      if (!strict_p)
3174	return true;
3175
3176      if (!reg_renumber)
3177	return false;
3178
3179      regno = reg_renumber[regno];
3180    }
3181
3182  /* The fake registers will be eliminated to either the stack or
3183     hard frame pointer, both of which are usually valid base registers.
3184     Reload deals with the cases where the eliminated form isn't valid.  */
3185  return (GP_REGNUM_P (regno)
3186	  || regno == SP_REGNUM
3187	  || regno == FRAME_POINTER_REGNUM
3188	  || regno == ARG_POINTER_REGNUM);
3189}
3190
3191/* Return true if X is a valid base register.
3192   STRICT_P is true if REG_OK_STRICT is in effect.  */
3193
3194static bool
3195aarch64_base_register_rtx_p (rtx x, bool strict_p)
3196{
3197  if (!strict_p && GET_CODE (x) == SUBREG)
3198    x = SUBREG_REG (x);
3199
3200  return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3201}
3202
3203/* Return true if the address index X is valid for mode MODE.  If it is,
3204   fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
3205
3206static bool
3207aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3208			machine_mode mode, bool strict_p)
3209{
3210  enum aarch64_address_type type;
3211  rtx index;
3212  int shift;
3213
3214  /* (reg:P) */
3215  if ((REG_P (x) || GET_CODE (x) == SUBREG)
3216      && GET_MODE (x) == Pmode)
3217    {
3218      type = ADDRESS_REG_REG;
3219      index = x;
3220      shift = 0;
3221    }
3222  /* (sign_extend:DI (reg:SI)) */
3223  else if ((GET_CODE (x) == SIGN_EXTEND
3224	    || GET_CODE (x) == ZERO_EXTEND)
3225	   && GET_MODE (x) == DImode
3226	   && GET_MODE (XEXP (x, 0)) == SImode)
3227    {
3228      type = (GET_CODE (x) == SIGN_EXTEND)
3229	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3230      index = XEXP (x, 0);
3231      shift = 0;
3232    }
3233  /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3234  else if (GET_CODE (x) == MULT
3235	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3236	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3237	   && GET_MODE (XEXP (x, 0)) == DImode
3238	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3239	   && CONST_INT_P (XEXP (x, 1)))
3240    {
3241      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3242	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3243      index = XEXP (XEXP (x, 0), 0);
3244      shift = exact_log2 (INTVAL (XEXP (x, 1)));
3245    }
3246  /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3247  else if (GET_CODE (x) == ASHIFT
3248	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3249	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3250	   && GET_MODE (XEXP (x, 0)) == DImode
3251	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3252	   && CONST_INT_P (XEXP (x, 1)))
3253    {
3254      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3255	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3256      index = XEXP (XEXP (x, 0), 0);
3257      shift = INTVAL (XEXP (x, 1));
3258    }
3259  /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3260  else if ((GET_CODE (x) == SIGN_EXTRACT
3261	    || GET_CODE (x) == ZERO_EXTRACT)
3262	   && GET_MODE (x) == DImode
3263	   && GET_CODE (XEXP (x, 0)) == MULT
3264	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3265	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3266    {
3267      type = (GET_CODE (x) == SIGN_EXTRACT)
3268	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3269      index = XEXP (XEXP (x, 0), 0);
3270      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3271      if (INTVAL (XEXP (x, 1)) != 32 + shift
3272	  || INTVAL (XEXP (x, 2)) != 0)
3273	shift = -1;
3274    }
3275  /* (and:DI (mult:DI (reg:DI) (const_int scale))
3276     (const_int 0xffffffff<<shift)) */
3277  else if (GET_CODE (x) == AND
3278	   && GET_MODE (x) == DImode
3279	   && GET_CODE (XEXP (x, 0)) == MULT
3280	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3281	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3282	   && CONST_INT_P (XEXP (x, 1)))
3283    {
3284      type = ADDRESS_REG_UXTW;
3285      index = XEXP (XEXP (x, 0), 0);
3286      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3287      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3288	shift = -1;
3289    }
3290  /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3291  else if ((GET_CODE (x) == SIGN_EXTRACT
3292	    || GET_CODE (x) == ZERO_EXTRACT)
3293	   && GET_MODE (x) == DImode
3294	   && GET_CODE (XEXP (x, 0)) == ASHIFT
3295	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3296	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3297    {
3298      type = (GET_CODE (x) == SIGN_EXTRACT)
3299	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3300      index = XEXP (XEXP (x, 0), 0);
3301      shift = INTVAL (XEXP (XEXP (x, 0), 1));
3302      if (INTVAL (XEXP (x, 1)) != 32 + shift
3303	  || INTVAL (XEXP (x, 2)) != 0)
3304	shift = -1;
3305    }
3306  /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3307     (const_int 0xffffffff<<shift)) */
3308  else if (GET_CODE (x) == AND
3309	   && GET_MODE (x) == DImode
3310	   && GET_CODE (XEXP (x, 0)) == ASHIFT
3311	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3312	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3313	   && CONST_INT_P (XEXP (x, 1)))
3314    {
3315      type = ADDRESS_REG_UXTW;
3316      index = XEXP (XEXP (x, 0), 0);
3317      shift = INTVAL (XEXP (XEXP (x, 0), 1));
3318      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3319	shift = -1;
3320    }
3321  /* (mult:P (reg:P) (const_int scale)) */
3322  else if (GET_CODE (x) == MULT
3323	   && GET_MODE (x) == Pmode
3324	   && GET_MODE (XEXP (x, 0)) == Pmode
3325	   && CONST_INT_P (XEXP (x, 1)))
3326    {
3327      type = ADDRESS_REG_REG;
3328      index = XEXP (x, 0);
3329      shift = exact_log2 (INTVAL (XEXP (x, 1)));
3330    }
3331  /* (ashift:P (reg:P) (const_int shift)) */
3332  else if (GET_CODE (x) == ASHIFT
3333	   && GET_MODE (x) == Pmode
3334	   && GET_MODE (XEXP (x, 0)) == Pmode
3335	   && CONST_INT_P (XEXP (x, 1)))
3336    {
3337      type = ADDRESS_REG_REG;
3338      index = XEXP (x, 0);
3339      shift = INTVAL (XEXP (x, 1));
3340    }
3341  else
3342    return false;
3343
3344  if (GET_CODE (index) == SUBREG)
3345    index = SUBREG_REG (index);
3346
3347  if ((shift == 0 ||
3348       (shift > 0 && shift <= 3
3349	&& (1 << shift) == GET_MODE_SIZE (mode)))
3350      && REG_P (index)
3351      && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3352    {
3353      info->type = type;
3354      info->offset = index;
3355      info->shift = shift;
3356      return true;
3357    }
3358
3359  return false;
3360}
3361
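/* Return true if OFFSET is a multiple of the size of MODE and, once scaled
   by that size, fits in the signed 7-bit immediate field of a load/store
   pair instruction.  */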
3362bool
3363aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3364{
3365  return (offset >= -64 * GET_MODE_SIZE (mode)
3366	  && offset < 64 * GET_MODE_SIZE (mode)
3367	  && offset % GET_MODE_SIZE (mode) == 0);
3368}
3369
3370static inline bool
3371offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3372			       HOST_WIDE_INT offset)
3373{
3374  return offset >= -256 && offset < 256;
3375}
3376
3377static inline bool
3378offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3379{
3380  return (offset >= 0
3381	  && offset < 4096 * GET_MODE_SIZE (mode)
3382	  && offset % GET_MODE_SIZE (mode) == 0);
3383}
3384
3385/* Return true if X is a valid address for machine mode MODE.  If it is,
3386   fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
3387   effect.  OUTER_CODE is PARALLEL for a load/store pair.  */
3388
3389static bool
3390aarch64_classify_address (struct aarch64_address_info *info,
3391			  rtx x, machine_mode mode,
3392			  RTX_CODE outer_code, bool strict_p)
3393{
3394  enum rtx_code code = GET_CODE (x);
3395  rtx op0, op1;
3396
3397  /* On BE, we use load/store pair for all large int mode load/stores.  */
3398  bool load_store_pair_p = (outer_code == PARALLEL
3399			    || (BYTES_BIG_ENDIAN
3400				&& aarch64_vect_struct_mode_p (mode)));
3401
3402  bool allow_reg_index_p =
3403    !load_store_pair_p
3404    && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3405    && !aarch64_vect_struct_mode_p (mode);
3406
3407  /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3408     REG addressing.  */
3409  if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3410      && (code != POST_INC && code != REG))
3411    return false;
3412
3413  switch (code)
3414    {
3415    case REG:
3416    case SUBREG:
3417      info->type = ADDRESS_REG_IMM;
3418      info->base = x;
3419      info->offset = const0_rtx;
3420      return aarch64_base_register_rtx_p (x, strict_p);
3421
3422    case PLUS:
3423      op0 = XEXP (x, 0);
3424      op1 = XEXP (x, 1);
3425
3426      if (! strict_p
3427	  && REG_P (op0)
3428	  && (op0 == virtual_stack_vars_rtx
3429	      || op0 == frame_pointer_rtx
3430	      || op0 == arg_pointer_rtx)
3431	  && CONST_INT_P (op1))
3432	{
3433	  info->type = ADDRESS_REG_IMM;
3434	  info->base = op0;
3435	  info->offset = op1;
3436
3437	  return true;
3438	}
3439
3440      if (GET_MODE_SIZE (mode) != 0
3441	  && CONST_INT_P (op1)
3442	  && aarch64_base_register_rtx_p (op0, strict_p))
3443	{
3444	  HOST_WIDE_INT offset = INTVAL (op1);
3445
3446	  info->type = ADDRESS_REG_IMM;
3447	  info->base = op0;
3448	  info->offset = op1;
3449
3450	  /* TImode and TFmode values are allowed in both pairs of X
3451	     registers and individual Q registers.  The available
3452	     address modes are:
3453	     X,X: 7-bit signed scaled offset
3454	     Q:   9-bit signed offset
3455	     We conservatively require an offset representable in either mode.
3456	   */
3457	  if (mode == TImode || mode == TFmode)
3458	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3459		    && offset_9bit_signed_unscaled_p (mode, offset));
3460
3461	  /* A 7-bit offset check because OImode will be emitted as an ldp/stp
3462	     instruction (only big endian will get here).
3463	     For ldp/stp instructions, the offset is scaled by the size of a
3464	     single element of the pair.  */
3465	  if (mode == OImode)
3466	    return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3467
3468	  /* Three 9/12-bit offset checks because CImode will be emitted as
3469	     three ldr/str instructions (only big endian will get here).  */
3470	  if (mode == CImode)
3471	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3472		    && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3473			|| offset_12bit_unsigned_scaled_p (V16QImode,
3474							   offset + 32)));
3475
3476	  /* Two 7-bit offset checks because XImode will be emitted as two
3477	     ldp/stp instructions (only big endian will get here).  */
3478	  if (mode == XImode)
3479	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3480		    && aarch64_offset_7bit_signed_scaled_p (TImode,
3481							    offset + 32));
3482
3483	  if (load_store_pair_p)
3484	    return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3485		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3486	  else
3487	    return (offset_9bit_signed_unscaled_p (mode, offset)
3488		    || offset_12bit_unsigned_scaled_p (mode, offset));
3489	}
3490
3491      if (allow_reg_index_p)
3492	{
3493	  /* Look for base + (scaled/extended) index register.  */
3494	  if (aarch64_base_register_rtx_p (op0, strict_p)
3495	      && aarch64_classify_index (info, op1, mode, strict_p))
3496	    {
3497	      info->base = op0;
3498	      return true;
3499	    }
3500	  if (aarch64_base_register_rtx_p (op1, strict_p)
3501	      && aarch64_classify_index (info, op0, mode, strict_p))
3502	    {
3503	      info->base = op1;
3504	      return true;
3505	    }
3506	}
3507
3508      return false;
3509
3510    case POST_INC:
3511    case POST_DEC:
3512    case PRE_INC:
3513    case PRE_DEC:
3514      info->type = ADDRESS_REG_WB;
3515      info->base = XEXP (x, 0);
3516      info->offset = NULL_RTX;
3517      return aarch64_base_register_rtx_p (info->base, strict_p);
3518
3519    case POST_MODIFY:
3520    case PRE_MODIFY:
3521      info->type = ADDRESS_REG_WB;
3522      info->base = XEXP (x, 0);
3523      if (GET_CODE (XEXP (x, 1)) == PLUS
3524	  && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3525	  && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3526	  && aarch64_base_register_rtx_p (info->base, strict_p))
3527	{
3528	  HOST_WIDE_INT offset;
3529	  info->offset = XEXP (XEXP (x, 1), 1);
3530	  offset = INTVAL (info->offset);
3531
3532	  /* TImode and TFmode values are allowed in both pairs of X
3533	     registers and individual Q registers.  The available
3534	     address modes are:
3535	     X,X: 7-bit signed scaled offset
3536	     Q:   9-bit signed offset
3537	     We conservatively require an offset representable in either mode.
3538	   */
3539	  if (mode == TImode || mode == TFmode)
3540	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3541		    && offset_9bit_signed_unscaled_p (mode, offset));
3542
3543	  if (load_store_pair_p)
3544	    return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3545		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3546	  else
3547	    return offset_9bit_signed_unscaled_p (mode, offset);
3548	}
3549      return false;
3550
3551    case CONST:
3552    case SYMBOL_REF:
3553    case LABEL_REF:
3554      /* Load literal: a pc-relative constant pool entry.  Only supported
3555         for SImode or larger.  */
3556      info->type = ADDRESS_SYMBOLIC;
3557
3558      if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3559	{
3560	  rtx sym, addend;
3561
3562	  split_const (x, &sym, &addend);
3563	  return (GET_CODE (sym) == LABEL_REF
3564		  || (GET_CODE (sym) == SYMBOL_REF
3565		      && CONSTANT_POOL_ADDRESS_P (sym)));
3566	}
3567      return false;
3568
3569    case LO_SUM:
3570      info->type = ADDRESS_LO_SUM;
3571      info->base = XEXP (x, 0);
3572      info->offset = XEXP (x, 1);
3573      if (allow_reg_index_p
3574	  && aarch64_base_register_rtx_p (info->base, strict_p))
3575	{
3576	  rtx sym, offs;
3577	  split_const (info->offset, &sym, &offs);
3578	  if (GET_CODE (sym) == SYMBOL_REF
3579	      && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3580		  == SYMBOL_SMALL_ABSOLUTE))
3581	    {
3582	      /* The symbol and offset must be aligned to the access size.  */
3583	      unsigned int align;
3584	      unsigned int ref_size;
3585
3586	      if (CONSTANT_POOL_ADDRESS_P (sym))
3587		align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3588	      else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3589		{
3590		  tree exp = SYMBOL_REF_DECL (sym);
3591		  align = TYPE_ALIGN (TREE_TYPE (exp));
3592		  align = CONSTANT_ALIGNMENT (exp, align);
3593		}
3594	      else if (SYMBOL_REF_DECL (sym))
3595		align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3596	      else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3597		       && SYMBOL_REF_BLOCK (sym) != NULL)
3598		align = SYMBOL_REF_BLOCK (sym)->alignment;
3599	      else
3600		align = BITS_PER_UNIT;
3601
3602	      ref_size = GET_MODE_SIZE (mode);
3603	      if (ref_size == 0)
3604		ref_size = GET_MODE_SIZE (DImode);
3605
3606	      return ((INTVAL (offs) & (ref_size - 1)) == 0
3607		      && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3608	    }
3609	}
3610      return false;
3611
3612    default:
3613      return false;
3614    }
3615}
3616
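/* Return true if X is a symbolic address: a SYMBOL_REF or LABEL_REF,
   possibly with a constant offset folded in.  */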
3617bool
3618aarch64_symbolic_address_p (rtx x)
3619{
3620  rtx offset;
3621
3622  split_const (x, &x, &offset);
3623  return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3624}
3625
3626/* Classify the base of symbolic expression X, given that X appears in
3627   context CONTEXT.  */
3628
3629enum aarch64_symbol_type
3630aarch64_classify_symbolic_expression (rtx x,
3631				      enum aarch64_symbol_context context)
3632{
3633  rtx offset;
3634
3635  split_const (x, &x, &offset);
3636  return aarch64_classify_symbol (x, offset, context);
3637}
3638
3639
3640/* Return TRUE if X is a legitimate address for accessing memory in
3641   mode MODE.  */
3642static bool
3643aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3644{
3645  struct aarch64_address_info addr;
3646
3647  return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3648}
3649
3650/* Return TRUE if X is a legitimate address for accessing memory in
3651   mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
3652   pair operation.  */
3653bool
3654aarch64_legitimate_address_p (machine_mode mode, rtx x,
3655			      RTX_CODE outer_code, bool strict_p)
3656{
3657  struct aarch64_address_info addr;
3658
3659  return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3660}
3661
/* Return TRUE if rtx X is immediate constant 0.0.  */
3663bool
3664aarch64_float_const_zero_rtx_p (rtx x)
3665{
3666  REAL_VALUE_TYPE r;
3667
3668  if (GET_MODE (x) == VOIDmode)
3669    return false;
3670
3671  REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3672  if (REAL_VALUE_MINUS_ZERO (r))
3673    return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3674  return REAL_VALUES_EQUAL (r, dconst0);
3675}
3676
3677/* Return the fixed registers used for condition codes.  */
3678
3679static bool
3680aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3681{
3682  *p1 = CC_REGNUM;
3683  *p2 = INVALID_REGNUM;
3684  return true;
3685}
3686
3687/* Emit call insn with PAT and do aarch64-specific handling.  */
3688
3689void
3690aarch64_emit_call_insn (rtx pat)
3691{
3692  rtx insn = emit_call_insn (pat);
3693
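  /* The intra-procedure-call scratch registers IP0 and IP1 (x16/x17) may
     be clobbered by linker-generated veneers or PLT code on the way to
     the callee, so record them as clobbered across the call.  */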
3694  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3695  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3696  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3697}
3698
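/* Return the CC mode to use when comparing operands X and Y with RTX
   comparison code CODE.  */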
3699machine_mode
3700aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3701{
3702  /* All floating point compares return CCFP if it is an equality
3703     comparison, and CCFPE otherwise.  */
3704  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3705    {
3706      switch (code)
3707	{
3708	case EQ:
3709	case NE:
3710	case UNORDERED:
3711	case ORDERED:
3712	case UNLT:
3713	case UNLE:
3714	case UNGT:
3715	case UNGE:
3716	case UNEQ:
3717	case LTGT:
3718	  return CCFPmode;
3719
3720	case LT:
3721	case LE:
3722	case GT:
3723	case GE:
3724	  return CCFPEmode;
3725
3726	default:
3727	  gcc_unreachable ();
3728	}
3729    }
3730
3731  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3732      && y == const0_rtx
3733      && (code == EQ || code == NE || code == LT || code == GE)
3734      && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3735	  || GET_CODE (x) == NEG))
3736    return CC_NZmode;
3737
3738  /* A compare with a shifted operand.  Because of canonicalization,
3739     the comparison will have to be swapped when we emit the assembly
3740     code.  */
3741  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3742      && (REG_P (y) || GET_CODE (y) == SUBREG)
3743      && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3744	  || GET_CODE (x) == LSHIFTRT
3745	  || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3746    return CC_SWPmode;
3747
3748  /* Similarly for a negated operand, but we can only do this for
3749     equalities.  */
3750  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3751      && (REG_P (y) || GET_CODE (y) == SUBREG)
3752      && (code == EQ || code == NE)
3753      && GET_CODE (x) == NEG)
3754    return CC_Zmode;
3755
3756  /* A compare of a mode narrower than SI mode against zero can be done
3757     by extending the value in the comparison.  */
3758  if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3759      && y == const0_rtx)
3760    /* Only use sign-extension if we really need it.  */
3761    return ((code == GT || code == GE || code == LE || code == LT)
3762	    ? CC_SESWPmode : CC_ZESWPmode);
3763
3764  /* A test for unsigned overflow.  */
3765  if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
3766      && code == NE
3767      && GET_CODE (x) == PLUS
3768      && GET_CODE (y) == ZERO_EXTEND)
3769    return CC_Cmode;
3770
3771  /* For everything else, return CCmode.  */
3772  return CCmode;
3773}
3774
3775static int
3776aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3777
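/* Return the AArch64 condition code (an AARCH64_* value) corresponding to
   the comparison rtx X, or -1 if there is none.  */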
3778int
3779aarch64_get_condition_code (rtx x)
3780{
3781  machine_mode mode = GET_MODE (XEXP (x, 0));
3782  enum rtx_code comp_code = GET_CODE (x);
3783
3784  if (GET_MODE_CLASS (mode) != MODE_CC)
3785    mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3786  return aarch64_get_condition_code_1 (mode, comp_code);
3787}
3788
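/* Helper for aarch64_get_condition_code: map comparison code COMP_CODE,
   evaluated in CC mode MODE, to an AArch64 condition code, or return -1
   if there is no valid mapping.  */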
3789static int
3790aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3791{
3792  int ne = -1, eq = -1;
3793  switch (mode)
3794    {
3795    case CCFPmode:
3796    case CCFPEmode:
3797      switch (comp_code)
3798	{
3799	case GE: return AARCH64_GE;
3800	case GT: return AARCH64_GT;
3801	case LE: return AARCH64_LS;
3802	case LT: return AARCH64_MI;
3803	case NE: return AARCH64_NE;
3804	case EQ: return AARCH64_EQ;
3805	case ORDERED: return AARCH64_VC;
3806	case UNORDERED: return AARCH64_VS;
3807	case UNLT: return AARCH64_LT;
3808	case UNLE: return AARCH64_LE;
3809	case UNGT: return AARCH64_HI;
3810	case UNGE: return AARCH64_PL;
3811	default: return -1;
3812	}
3813      break;
3814
3815    case CC_DNEmode:
3816      ne = AARCH64_NE;
3817      eq = AARCH64_EQ;
3818      break;
3819
3820    case CC_DEQmode:
3821      ne = AARCH64_EQ;
3822      eq = AARCH64_NE;
3823      break;
3824
3825    case CC_DGEmode:
3826      ne = AARCH64_GE;
3827      eq = AARCH64_LT;
3828      break;
3829
3830    case CC_DLTmode:
3831      ne = AARCH64_LT;
3832      eq = AARCH64_GE;
3833      break;
3834
3835    case CC_DGTmode:
3836      ne = AARCH64_GT;
3837      eq = AARCH64_LE;
3838      break;
3839
3840    case CC_DLEmode:
3841      ne = AARCH64_LE;
3842      eq = AARCH64_GT;
3843      break;
3844
3845    case CC_DGEUmode:
3846      ne = AARCH64_CS;
3847      eq = AARCH64_CC;
3848      break;
3849
3850    case CC_DLTUmode:
3851      ne = AARCH64_CC;
3852      eq = AARCH64_CS;
3853      break;
3854
3855    case CC_DGTUmode:
3856      ne = AARCH64_HI;
3857      eq = AARCH64_LS;
3858      break;
3859
3860    case CC_DLEUmode:
3861      ne = AARCH64_LS;
3862      eq = AARCH64_HI;
3863      break;
3864
3865    case CCmode:
3866      switch (comp_code)
3867	{
3868	case NE: return AARCH64_NE;
3869	case EQ: return AARCH64_EQ;
3870	case GE: return AARCH64_GE;
3871	case GT: return AARCH64_GT;
3872	case LE: return AARCH64_LE;
3873	case LT: return AARCH64_LT;
3874	case GEU: return AARCH64_CS;
3875	case GTU: return AARCH64_HI;
3876	case LEU: return AARCH64_LS;
3877	case LTU: return AARCH64_CC;
3878	default: return -1;
3879	}
3880      break;
3881
3882    case CC_SWPmode:
3883    case CC_ZESWPmode:
3884    case CC_SESWPmode:
3885      switch (comp_code)
3886	{
3887	case NE: return AARCH64_NE;
3888	case EQ: return AARCH64_EQ;
3889	case GE: return AARCH64_LE;
3890	case GT: return AARCH64_LT;
3891	case LE: return AARCH64_GE;
3892	case LT: return AARCH64_GT;
3893	case GEU: return AARCH64_LS;
3894	case GTU: return AARCH64_CC;
3895	case LEU: return AARCH64_CS;
3896	case LTU: return AARCH64_HI;
3897	default: return -1;
3898	}
3899      break;
3900
3901    case CC_NZmode:
3902      switch (comp_code)
3903	{
3904	case NE: return AARCH64_NE;
3905	case EQ: return AARCH64_EQ;
3906	case GE: return AARCH64_PL;
3907	case LT: return AARCH64_MI;
3908	default: return -1;
3909	}
3910      break;
3911
3912    case CC_Zmode:
3913      switch (comp_code)
3914	{
3915	case NE: return AARCH64_NE;
3916	case EQ: return AARCH64_EQ;
3917	default: return -1;
3918	}
3919      break;
3920
3921    case CC_Cmode:
3922      switch (comp_code)
3923	{
3924	case NE: return AARCH64_CS;
3925	case EQ: return AARCH64_CC;
3926	default: return -1;
3927	}
3928      break;
3929
3930    default:
3931      return -1;
3932      break;
3933    }
3934
3935  if (comp_code == NE)
3936    return ne;
3937
3938  if (comp_code == EQ)
3939    return eq;
3940
3941  return -1;
3942}
3943
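/* Return true if X is a CONST_VECTOR of integers whose elements all have
   the same value, and that value lies in the range [MINVAL, MAXVAL].  */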
3944bool
3945aarch64_const_vec_all_same_in_range_p (rtx x,
3946				  HOST_WIDE_INT minval,
3947				  HOST_WIDE_INT maxval)
3948{
3949  HOST_WIDE_INT firstval;
3950  int count, i;
3951
3952  if (GET_CODE (x) != CONST_VECTOR
3953      || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3954    return false;
3955
3956  firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3957  if (firstval < minval || firstval > maxval)
3958    return false;
3959
3960  count = CONST_VECTOR_NUNITS (x);
3961  for (i = 1; i < count; i++)
3962    if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3963      return false;
3964
3965  return true;
3966}
3967
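/* Return true if X is a CONST_VECTOR whose integer elements are all equal
   to VAL.  */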
3968bool
3969aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3970{
3971  return aarch64_const_vec_all_same_in_range_p (x, val, val);
3972}
3973
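/* Return the number of set bits in VALUE, counted by repeatedly clearing
   the least significant set bit.  */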
3974static unsigned
3975bit_count (unsigned HOST_WIDE_INT value)
3976{
3977  unsigned count = 0;
3978
3979  while (value)
3980    {
3981      count++;
3982      value &= value - 1;
3983    }
3984
3985  return count;
3986}
3987
3988/* N Z C V.  */
3989#define AARCH64_CC_V 1
3990#define AARCH64_CC_C (1 << 1)
3991#define AARCH64_CC_Z (1 << 2)
3992#define AARCH64_CC_N (1 << 3)
3993
/* N Z C V flags for ccmp.  The first value is used when the combining
   op is AND and the second when it is IOR.  Indexed by AARCH64_COND_CODE.  */
3996static const int aarch64_nzcv_codes[][2] =
3997{
3998  {AARCH64_CC_Z, 0}, /* EQ, Z == 1.  */
3999  {0, AARCH64_CC_Z}, /* NE, Z == 0.  */
4000  {AARCH64_CC_C, 0}, /* CS, C == 1.  */
4001  {0, AARCH64_CC_C}, /* CC, C == 0.  */
4002  {AARCH64_CC_N, 0}, /* MI, N == 1.  */
4003  {0, AARCH64_CC_N}, /* PL, N == 0.  */
4004  {AARCH64_CC_V, 0}, /* VS, V == 1.  */
4005  {0, AARCH64_CC_V}, /* VC, V == 0.  */
  {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0.  */
4007  {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0).  */
4008  {0, AARCH64_CC_V}, /* GE, N == V.  */
4009  {AARCH64_CC_V, 0}, /* LT, N != V.  */
4010  {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V.  */
4011  {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V).  */
4012  {0, 0}, /* AL, Any.  */
4013  {0, 0}, /* NV, Any.  */
4014};
4015
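/* Return the RTX comparison code encoded by the CC_D* comparison mode MODE
   used for conditional compares.  */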
4016int
4017aarch64_ccmp_mode_to_code (enum machine_mode mode)
4018{
4019  switch (mode)
4020    {
4021    case CC_DNEmode:
4022      return NE;
4023
4024    case CC_DEQmode:
4025      return EQ;
4026
4027    case CC_DLEmode:
4028      return LE;
4029
4030    case CC_DGTmode:
4031      return GT;
4032
4033    case CC_DLTmode:
4034      return LT;
4035
4036    case CC_DGEmode:
4037      return GE;
4038
4039    case CC_DLEUmode:
4040      return LEU;
4041
4042    case CC_DGTUmode:
4043      return GTU;
4044
4045    case CC_DLTUmode:
4046      return LTU;
4047
4048    case CC_DGEUmode:
4049      return GEU;
4050
4051    default:
4052      gcc_unreachable ();
4053    }
4054}
4055
4056
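/* Print operand X to stream F, modified by the operand code CODE; each
   case below documents the modifier it implements.  */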
4057void
4058aarch64_print_operand (FILE *f, rtx x, char code)
4059{
4060  switch (code)
4061    {
4062    /* An integer or symbol address without a preceding # sign.  */
4063    case 'c':
4064      switch (GET_CODE (x))
4065	{
4066	case CONST_INT:
4067	  fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4068	  break;
4069
4070	case SYMBOL_REF:
4071	  output_addr_const (f, x);
4072	  break;
4073
4074	case CONST:
4075	  if (GET_CODE (XEXP (x, 0)) == PLUS
4076	      && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4077	    {
4078	      output_addr_const (f, x);
4079	      break;
4080	    }
4081	  /* Fall through.  */
4082
4083	default:
	  output_operand_lossage ("unsupported operand for code '%c'", code);
4085	}
4086      break;
4087
4088    case 'e':
4089      /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w.  */
4090      {
4091	int n;
4092
4093	if (!CONST_INT_P (x)
4094	    || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4095	  {
4096	    output_operand_lossage ("invalid operand for '%%%c'", code);
4097	    return;
4098	  }
4099
4100	switch (n)
4101	  {
4102	  case 3:
4103	    fputc ('b', f);
4104	    break;
4105	  case 4:
4106	    fputc ('h', f);
4107	    break;
4108	  case 5:
4109	    fputc ('w', f);
4110	    break;
4111	  default:
4112	    output_operand_lossage ("invalid operand for '%%%c'", code);
4113	    return;
4114	  }
4115      }
4116      break;
4117
4118    case 'p':
4119      {
4120	int n;
4121
4122	/* Print N such that 2^N == X.  */
4123	if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4124	  {
4125	    output_operand_lossage ("invalid operand for '%%%c'", code);
4126	    return;
4127	  }
4128
4129	asm_fprintf (f, "%d", n);
4130      }
4131      break;
4132
4133    case 'P':
4134      /* Print the number of non-zero bits in X (a const_int).  */
4135      if (!CONST_INT_P (x))
4136	{
4137	  output_operand_lossage ("invalid operand for '%%%c'", code);
4138	  return;
4139	}
4140
4141      asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4142      break;
4143
4144    case 'H':
4145      /* Print the higher numbered register of a pair (TImode) of regs.  */
4146      if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4147	{
4148	  output_operand_lossage ("invalid operand for '%%%c'", code);
4149	  return;
4150	}
4151
4152      asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4153      break;
4154
4155    case 'm':
4156      {
4157        int cond_code;
4158        /* Print a condition (eq, ne, etc).  */
4159
4160        /* CONST_TRUE_RTX means always -- that's the default.  */
4161        if (x == const_true_rtx)
4162	  return;
4163
4164        if (!COMPARISON_P (x))
4165	  {
4166	    output_operand_lossage ("invalid operand for '%%%c'", code);
4167	    return;
4168	  }
4169
4170        cond_code = aarch64_get_condition_code (x);
4171        gcc_assert (cond_code >= 0);
4172        fputs (aarch64_condition_codes[cond_code], f);
4173      }
4174      break;
4175
4176    case 'M':
4177      {
4178        int cond_code;
4179        /* Print the inverse of a condition (eq <-> ne, etc).  */
4180
4181        /* CONST_TRUE_RTX means never -- that's the default.  */
4182        if (x == const_true_rtx)
4183	  {
4184	    fputs ("nv", f);
4185	    return;
4186	  }
4187
4188        if (!COMPARISON_P (x))
4189	  {
4190	    output_operand_lossage ("invalid operand for '%%%c'", code);
4191	    return;
4192	  }
4193        cond_code = aarch64_get_condition_code (x);
4194        gcc_assert (cond_code >= 0);
4195        fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4196                                       (cond_code)], f);
4197      }
4198      break;
4199
4200    case 'b':
4201    case 'h':
4202    case 's':
4203    case 'd':
4204    case 'q':
4205      /* Print a scalar FP/SIMD register name.  */
4206      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4207	{
4208	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4209	  return;
4210	}
4211      asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4212      break;
4213
4214    case 'S':
4215    case 'T':
4216    case 'U':
4217    case 'V':
4218      /* Print the first FP/SIMD register name in a list.  */
4219      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4220	{
4221	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4222	  return;
4223	}
4224      asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4225      break;
4226
4227    case 'R':
4228      /* Print a scalar FP/SIMD register name + 1.  */
4229      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4230	{
4231	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4232	  return;
4233	}
4234      asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4235      break;
4236
4237    case 'X':
4238      /* Print bottom 16 bits of integer constant in hex.  */
4239      if (!CONST_INT_P (x))
4240	{
4241	  output_operand_lossage ("invalid operand for '%%%c'", code);
4242	  return;
4243	}
4244      asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4245      break;
4246
4247    case 'w':
4248    case 'x':
4249      /* Print a general register name or the zero register (32-bit or
4250         64-bit).  */
4251      if (x == const0_rtx
4252	  || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4253	{
4254	  asm_fprintf (f, "%czr", code);
4255	  break;
4256	}
4257
4258      if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4259	{
4260	  asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4261	  break;
4262	}
4263
4264      if (REG_P (x) && REGNO (x) == SP_REGNUM)
4265	{
4266	  asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4267	  break;
4268	}
4269
      /* Fall through.  */
4271
4272    case 0:
4273      /* Print a normal operand, if it's a general register, then we
4274	 assume DImode.  */
4275      if (x == NULL)
4276	{
4277	  output_operand_lossage ("missing operand");
4278	  return;
4279	}
4280
4281      switch (GET_CODE (x))
4282	{
4283	case REG:
4284	  asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4285	  break;
4286
4287	case MEM:
4288	  aarch64_memory_reference_mode = GET_MODE (x);
4289	  output_address (XEXP (x, 0));
4290	  break;
4291
4292	case LABEL_REF:
4293	case SYMBOL_REF:
4294	  output_addr_const (asm_out_file, x);
4295	  break;
4296
4297	case CONST_INT:
4298	  asm_fprintf (f, "%wd", INTVAL (x));
4299	  break;
4300
4301	case CONST_VECTOR:
4302	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4303	    {
4304	      gcc_assert (
4305		  aarch64_const_vec_all_same_in_range_p (x,
4306							 HOST_WIDE_INT_MIN,
4307							 HOST_WIDE_INT_MAX));
4308	      asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4309	    }
4310	  else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4311	    {
4312	      fputc ('0', f);
4313	    }
4314	  else
4315	    gcc_unreachable ();
4316	  break;
4317
4318	case CONST_DOUBLE:
4319	  /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4320	     be getting CONST_DOUBLEs holding integers.  */
4321	  gcc_assert (GET_MODE (x) != VOIDmode);
4322	  if (aarch64_float_const_zero_rtx_p (x))
4323	    {
4324	      fputc ('0', f);
4325	      break;
4326	    }
4327	  else if (aarch64_float_const_representable_p (x))
4328	    {
4329#define buf_size 20
4330	      char float_buf[buf_size] = {'\0'};
4331	      REAL_VALUE_TYPE r;
4332	      REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4333	      real_to_decimal_for_mode (float_buf, &r,
4334					buf_size, buf_size,
4335					1, GET_MODE (x));
4336	      asm_fprintf (asm_out_file, "%s", float_buf);
4337	      break;
4338#undef buf_size
4339	    }
4340	  output_operand_lossage ("invalid constant");
4341	  return;
4342	default:
4343	  output_operand_lossage ("invalid operand");
4344	  return;
4345	}
4346      break;
4347
4348    case 'A':
4349      if (GET_CODE (x) == HIGH)
4350	x = XEXP (x, 0);
4351
4352      switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4353	{
4354	case SYMBOL_SMALL_GOT:
4355	  asm_fprintf (asm_out_file, ":got:");
4356	  break;
4357
4358	case SYMBOL_SMALL_TLSGD:
4359	  asm_fprintf (asm_out_file, ":tlsgd:");
4360	  break;
4361
4362	case SYMBOL_SMALL_TLSDESC:
4363	  asm_fprintf (asm_out_file, ":tlsdesc:");
4364	  break;
4365
4366	case SYMBOL_SMALL_GOTTPREL:
4367	  asm_fprintf (asm_out_file, ":gottprel:");
4368	  break;
4369
4370	case SYMBOL_SMALL_TPREL:
4371	  asm_fprintf (asm_out_file, ":tprel:");
4372	  break;
4373
4374	case SYMBOL_TINY_GOT:
4375	  gcc_unreachable ();
4376	  break;
4377
4378	default:
4379	  break;
4380	}
4381      output_addr_const (asm_out_file, x);
4382      break;
4383
4384    case 'L':
4385      switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4386	{
4387	case SYMBOL_SMALL_GOT:
4388	  asm_fprintf (asm_out_file, ":lo12:");
4389	  break;
4390
4391	case SYMBOL_SMALL_TLSGD:
4392	  asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4393	  break;
4394
4395	case SYMBOL_SMALL_TLSDESC:
4396	  asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4397	  break;
4398
4399	case SYMBOL_SMALL_GOTTPREL:
4400	  asm_fprintf (asm_out_file, ":gottprel_lo12:");
4401	  break;
4402
4403	case SYMBOL_SMALL_TPREL:
4404	  asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4405	  break;
4406
4407	case SYMBOL_TINY_GOT:
4408	  asm_fprintf (asm_out_file, ":got:");
4409	  break;
4410
4411	default:
4412	  break;
4413	}
4414      output_addr_const (asm_out_file, x);
4415      break;
4416
4417    case 'G':
4418
4419      switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4420	{
4421	case SYMBOL_SMALL_TPREL:
4422	  asm_fprintf (asm_out_file, ":tprel_hi12:");
4423	  break;
4424	default:
4425	  break;
4426	}
4427      output_addr_const (asm_out_file, x);
4428      break;
4429
4430    case 'K':
4431      {
4432	int cond_code;
4433	/* Print nzcv.  */
4434
4435	if (!COMPARISON_P (x))
4436	  {
4437	    output_operand_lossage ("invalid operand for '%%%c'", code);
4438	    return;
4439	  }
4440
4441	cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4442	gcc_assert (cond_code >= 0);
4443	asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4444      }
4445      break;
4446
4447    case 'k':
4448      {
4449	int cond_code;
4450	/* Print nzcv.  */
4451
4452	if (!COMPARISON_P (x))
4453	  {
4454	    output_operand_lossage ("invalid operand for '%%%c'", code);
4455	    return;
4456	  }
4457
4458	cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4459	gcc_assert (cond_code >= 0);
4460	asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4461      }
4462      break;
4463
4464    default:
4465      output_operand_lossage ("invalid operand prefix '%%%c'", code);
4466      return;
4467    }
4468}
4469
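/* Print the address X of a memory operand to stream F, using the access
   mode recorded earlier in aarch64_memory_reference_mode.  */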
4470void
4471aarch64_print_operand_address (FILE *f, rtx x)
4472{
4473  struct aarch64_address_info addr;
4474
4475  if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4476			     MEM, true))
4477    switch (addr.type)
4478      {
4479      case ADDRESS_REG_IMM:
4480	if (addr.offset == const0_rtx)
4481	  asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4482	else
4483	  asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4484		       INTVAL (addr.offset));
4485	return;
4486
4487      case ADDRESS_REG_REG:
4488	if (addr.shift == 0)
4489	  asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4490		       reg_names [REGNO (addr.offset)]);
4491	else
4492	  asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4493		       reg_names [REGNO (addr.offset)], addr.shift);
4494	return;
4495
4496      case ADDRESS_REG_UXTW:
4497	if (addr.shift == 0)
4498	  asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4499		       REGNO (addr.offset) - R0_REGNUM);
4500	else
4501	  asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4502		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
4503	return;
4504
4505      case ADDRESS_REG_SXTW:
4506	if (addr.shift == 0)
4507	  asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4508		       REGNO (addr.offset) - R0_REGNUM);
4509	else
4510	  asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4511		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
4512	return;
4513
4514      case ADDRESS_REG_WB:
4515	switch (GET_CODE (x))
4516	  {
4517	  case PRE_INC:
4518	    asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4519			 GET_MODE_SIZE (aarch64_memory_reference_mode));
4520	    return;
4521	  case POST_INC:
4522	    asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4523			 GET_MODE_SIZE (aarch64_memory_reference_mode));
4524	    return;
4525	  case PRE_DEC:
4526	    asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4527			 GET_MODE_SIZE (aarch64_memory_reference_mode));
4528	    return;
4529	  case POST_DEC:
4530	    asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4531			 GET_MODE_SIZE (aarch64_memory_reference_mode));
4532	    return;
4533	  case PRE_MODIFY:
4534	    asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4535			 INTVAL (addr.offset));
4536	    return;
4537	  case POST_MODIFY:
4538	    asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4539			 INTVAL (addr.offset));
4540	    return;
4541	  default:
4542	    break;
4543	  }
4544	break;
4545
4546      case ADDRESS_LO_SUM:
4547	asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4548	output_addr_const (f, addr.offset);
4549	asm_fprintf (f, "]");
4550	return;
4551
4552      case ADDRESS_SYMBOLIC:
4553	break;
4554      }
4555
4556  output_addr_const (f, x);
4557}
4558
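/* Return nonzero if X, or any sub-rtx of X, mentions a LABEL_REF, ignoring
   the label references contained in UNSPEC_TLS.  */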
4559bool
4560aarch64_label_mentioned_p (rtx x)
4561{
4562  const char *fmt;
4563  int i;
4564
4565  if (GET_CODE (x) == LABEL_REF)
4566    return true;
4567
4568  /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4569     referencing instruction, but they are constant offsets, not
4570     symbols.  */
4571  if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4572    return false;
4573
4574  fmt = GET_RTX_FORMAT (GET_CODE (x));
4575  for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4576    {
4577      if (fmt[i] == 'E')
4578	{
4579	  int j;
4580
4581	  for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4582	    if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4583	      return 1;
4584	}
4585      else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4586	return 1;
4587    }
4588
4589  return 0;
4590}
4591
4592/* Implement REGNO_REG_CLASS.  */
4593
4594enum reg_class
4595aarch64_regno_regclass (unsigned regno)
4596{
4597  if (GP_REGNUM_P (regno))
4598    return GENERAL_REGS;
4599
4600  if (regno == SP_REGNUM)
4601    return STACK_REG;
4602
4603  if (regno == FRAME_POINTER_REGNUM
4604      || regno == ARG_POINTER_REGNUM)
4605    return POINTER_REGS;
4606
4607  if (FP_REGNUM_P (regno))
4608    return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
4609
4610  return NO_REGS;
4611}
4612
4613static rtx
4614aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
4615{
4616  /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4617     where mask is selected by alignment and size of the offset.
4618     We try to pick as large a range for the offset as possible to
4619     maximize the chance of a CSE.  However, for aligned addresses
4620     we limit the range to 4k so that structures with different sized
4621     elements are likely to use the same base.  */
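  /* For example, a hypothetical SImode access at base + 0x3004 takes the
     aligned branch below: base_offset becomes 0x3000 and the residual
     offset 4, so nearby word accesses can share the materialized base.  */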
4622
4623  if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4624    {
4625      HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4626      HOST_WIDE_INT base_offset;
4627
4628      /* Does it look like we'll need a load/store-pair operation?  */
4629      if (GET_MODE_SIZE (mode) > 16
4630	  || mode == TImode)
4631	base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4632		       & ~((128 * GET_MODE_SIZE (mode)) - 1));
      /* For offsets that aren't a multiple of the access size, the limit
	 is -256...255.  */
4635      else if (offset & (GET_MODE_SIZE (mode) - 1))
4636	base_offset = (offset + 0x100) & ~0x1ff;
4637      else
4638	base_offset = offset & ~0xfff;
4639
4640      if (base_offset == 0)
4641	return x;
4642
4643      offset -= base_offset;
4644      rtx base_reg = gen_reg_rtx (Pmode);
4645      rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4646			   NULL_RTX);
4647      emit_move_insn (base_reg, val);
4648      x = plus_constant (Pmode, base_reg, offset);
4649    }
4650
4651  return x;
4652}
4653
4654/* Try a machine-dependent way of reloading an illegitimate address
4655   operand.  If we find one, push the reload and return the new rtx.  */
4656
4657rtx
4658aarch64_legitimize_reload_address (rtx *x_p,
4659				   machine_mode mode,
4660				   int opnum, int type,
4661				   int ind_levels ATTRIBUTE_UNUSED)
4662{
4663  rtx x = *x_p;
4664
4665  /* Do not allow mem (plus (reg, const)) if vector struct mode.  */
4666  if (aarch64_vect_struct_mode_p (mode)
4667      && GET_CODE (x) == PLUS
4668      && REG_P (XEXP (x, 0))
4669      && CONST_INT_P (XEXP (x, 1)))
4670    {
4671      rtx orig_rtx = x;
4672      x = copy_rtx (x);
4673      push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4674		   BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4675		   opnum, (enum reload_type) type);
4676      return x;
4677    }
4678
4679  /* We must recognize output that we have already generated ourselves.  */
4680  if (GET_CODE (x) == PLUS
4681      && GET_CODE (XEXP (x, 0)) == PLUS
4682      && REG_P (XEXP (XEXP (x, 0), 0))
4683      && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4684      && CONST_INT_P (XEXP (x, 1)))
4685    {
4686      push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4687		   BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4688		   opnum, (enum reload_type) type);
4689      return x;
4690    }
4691
4692  /* We wish to handle large displacements off a base register by splitting
4693     the addend across an add and the mem insn.  This can cut the number of
4694     extra insns needed from 3 to 1.  It is only useful for load/store of a
4695     single register with 12 bit offset field.  */
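  /* For instance, a hypothetical SImode access at (reg + 0x13008) is
     rewritten below as (plus (plus reg 0x13000) 8): 0x13000 is a 12-bit
     immediate shifted by 12, so the high part needs a single ADD, while
     the low part 8 fits the scaled offset field of the LDR/STR.  */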
4696  if (GET_CODE (x) == PLUS
4697      && REG_P (XEXP (x, 0))
4698      && CONST_INT_P (XEXP (x, 1))
4699      && HARD_REGISTER_P (XEXP (x, 0))
4700      && mode != TImode
4701      && mode != TFmode
4702      && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4703    {
4704      HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4705      HOST_WIDE_INT low = val & 0xfff;
4706      HOST_WIDE_INT high = val - low;
4707      HOST_WIDE_INT offs;
4708      rtx cst;
4709      machine_mode xmode = GET_MODE (x);
4710
4711      /* In ILP32, xmode can be either DImode or SImode.  */
4712      gcc_assert (xmode == DImode || xmode == SImode);
4713
      /* Punt on BLKmode offsets, since we cannot ascertain BLKmode
	 alignment here; leave such addresses to the generic reload code.  */
4716      if (GET_MODE_SIZE (mode) == 0)
4717	return NULL_RTX;
4718
4719      offs = low % GET_MODE_SIZE (mode);
4720
4721      /* Align misaligned offset by adjusting high part to compensate.  */
4722      if (offs != 0)
4723	{
4724	  if (aarch64_uimm12_shift (high + offs))
4725	    {
4726	      /* Align down.  */
4727	      low = low - offs;
4728	      high = high + offs;
4729	    }
4730	  else
4731	    {
4732	      /* Align up.  */
4733	      offs = GET_MODE_SIZE (mode) - offs;
4734	      low = low + offs;
4735	      high = high + (low & 0x1000) - offs;
4736	      low &= 0xfff;
4737	    }
4738	}
4739
4740      /* Check for overflow.  */
4741      if (high + low != val)
4742	return NULL_RTX;
4743
4744      cst = GEN_INT (high);
4745      if (!aarch64_uimm12_shift (high))
4746	cst = force_const_mem (xmode, cst);
4747
4748      /* Reload high part into base reg, leaving the low part
4749	 in the mem instruction.
4750	 Note that replacing this gen_rtx_PLUS with plus_constant is
4751	 wrong in this case because we rely on the
4752	 (plus (plus reg c1) c2) structure being preserved so that
4753	 XEXP (*p, 0) in push_reload below uses the correct term.  */
4754      x = gen_rtx_PLUS (xmode,
4755			gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4756			GEN_INT (low));
4757
4758      push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4759		   BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4760		   opnum, (enum reload_type) type);
4761      return x;
4762    }
4763
4764  return NULL_RTX;
4765}
4766
4767
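/* Return the register class required for a secondary (intermediate)
   register when moving X of mode MODE to or from a register of class
   RCLASS, or NO_REGS if none is needed; alternatively a scratch-based
   reload pattern may be requested through SRI.  */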
4768static reg_class_t
4769aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4770			  reg_class_t rclass,
4771			  machine_mode mode,
4772			  secondary_reload_info *sri)
4773{
4774  /* Without the TARGET_SIMD instructions we cannot move a Q register
4775     to a Q register directly.  We need a scratch.  */
4776  if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4777      && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4778      && reg_class_subset_p (rclass, FP_REGS))
4779    {
4780      if (mode == TFmode)
4781        sri->icode = CODE_FOR_aarch64_reload_movtf;
4782      else if (mode == TImode)
4783        sri->icode = CODE_FOR_aarch64_reload_movti;
4784      return NO_REGS;
4785    }
4786
4787  /* A TFmode or TImode memory access should be handled via an FP_REGS
4788     because AArch64 has richer addressing modes for LDR/STR instructions
4789     than LDP/STP instructions.  */
4790  if (TARGET_FLOAT && rclass == GENERAL_REGS
4791      && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4792    return FP_REGS;
4793
  if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
    return GENERAL_REGS;
4796
4797  return NO_REGS;
4798}
4799
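/* Return true if register FROM may be eliminated in favour of register TO;
   the frame-pointer cases are spelled out in the body below.  */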
4800static bool
4801aarch64_can_eliminate (const int from, const int to)
4802{
4803  /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4804     HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */
4805
4806  if (frame_pointer_needed)
4807    {
4808      if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4809	return true;
4810      if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4811	return false;
4812      if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4813	  && !cfun->calls_alloca)
4814	return true;
4815      if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4816	return true;
4817
4818      return false;
4819    }
4820  else
4821    {
4822      /* If we decided that we didn't need a leaf frame pointer but then used
4823	 LR in the function, then we'll want a frame pointer after all, so
4824	 prevent this elimination to ensure a frame pointer is used.  */
4825      if (to == STACK_POINTER_REGNUM
4826	  && flag_omit_leaf_frame_pointer
4827	  && df_regs_ever_live_p (LR_REGNUM))
4828	return false;
4829    }
4830
4831  return true;
4832}
4833
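/* Return the offset to apply when eliminating register FROM into register
   TO, derived from the frame layout computed by aarch64_layout_frame.  */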
4834HOST_WIDE_INT
4835aarch64_initial_elimination_offset (unsigned from, unsigned to)
4836{
4837  aarch64_layout_frame ();
4838
4839  if (to == HARD_FRAME_POINTER_REGNUM)
4840    {
4841      if (from == ARG_POINTER_REGNUM)
4842	return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4843
4844      if (from == FRAME_POINTER_REGNUM)
4845	return (cfun->machine->frame.hard_fp_offset
4846		- cfun->machine->frame.saved_varargs_size);
4847    }
4848
4849  if (to == STACK_POINTER_REGNUM)
4850    {
4851      if (from == FRAME_POINTER_REGNUM)
4852	  return (cfun->machine->frame.frame_size
4853		  - cfun->machine->frame.saved_varargs_size);
4854    }
4855
4856  return cfun->machine->frame.frame_size;
4857}
4858
4859/* Implement RETURN_ADDR_RTX.  We do not support moving back to a
4860   previous frame.  */
4861
4862rtx
4863aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4864{
4865  if (count != 0)
4866    return const0_rtx;
4867  return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4868}
4869
4870
4871static void
4872aarch64_asm_trampoline_template (FILE *f)
4873{
4874  if (TARGET_ILP32)
4875    {
4876      asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4877      asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4878    }
4879  else
4880    {
4881      asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4882      asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4883    }
4884  asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4885  assemble_aligned_integer (4, const0_rtx);
4886  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4887  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4888}
4889
4890static void
4891aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4892{
4893  rtx fnaddr, mem, a_tramp;
4894  const int tramp_code_sz = 16;
4895
4896  /* Don't need to copy the trailing D-words, we fill those in below.  */
4897  emit_block_move (m_tramp, assemble_trampoline_template (),
4898		   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4899  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4900  fnaddr = XEXP (DECL_RTL (fndecl), 0);
4901  if (GET_MODE (fnaddr) != ptr_mode)
4902    fnaddr = convert_memory_address (ptr_mode, fnaddr);
4903  emit_move_insn (mem, fnaddr);
4904
4905  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4906  emit_move_insn (mem, chain_value);
4907
4908  /* XXX We should really define a "clear_cache" pattern and use
4909     gen_clear_cache().  */
4910  a_tramp = XEXP (m_tramp, 0);
4911  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4912		     LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4913		     plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4914		     ptr_mode);
4915}
4916
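/* Return the maximum number of consecutive registers of class REGCLASS
   needed to hold a value of mode MODE: 128-bit units for vector modes,
   64-bit units otherwise.  */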
4917static unsigned char
4918aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4919{
4920  switch (regclass)
4921    {
4922    case CALLER_SAVE_REGS:
4923    case POINTER_REGS:
4924    case GENERAL_REGS:
4925    case ALL_REGS:
4926    case FP_REGS:
4927    case FP_LO_REGS:
4928      return
4929	aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4930				       (GET_MODE_SIZE (mode) + 7) / 8;
4931    case STACK_REG:
4932      return 1;
4933
4934    case NO_REGS:
4935      return 0;
4936
4937    default:
4938      break;
4939    }
4940  gcc_unreachable ();
4941}
4942
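/* Return the register class to use when reloading X into a register of
   class REGCLASS; the special cases handled are documented inline.  */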
4943static reg_class_t
4944aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4945{
4946  if (regclass == POINTER_REGS)
4947    return GENERAL_REGS;
4948
4949  if (regclass == STACK_REG)
4950    {
      if (REG_P (x)
	  && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
	return regclass;
4954
4955      return NO_REGS;
4956    }
4957
4958  /* If it's an integer immediate that MOVI can't handle, then
4959     FP_REGS is not an option, so we return NO_REGS instead.  */
4960  if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4961      && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4962    return NO_REGS;
4963
  /* Register elimination can result in a request for
     SP+constant->FP_REGS.  We cannot support such operations, which
     use SP as the source and an FP_REG as the destination, so reject
     them right now.  */
4968  if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4969    {
4970      rtx lhs = XEXP (x, 0);
4971
4972      /* Look through a possible SUBREG introduced by ILP32.  */
4973      if (GET_CODE (lhs) == SUBREG)
4974	lhs = SUBREG_REG (lhs);
4975
4976      gcc_assert (REG_P (lhs));
4977      gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4978				      POINTER_REGS));
4979      return NO_REGS;
4980    }
4981
4982  return regclass;
4983}
4984
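/* Output the label reference NAME to stream F, prefixed with the user
   label prefix.  */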
4985void
4986aarch64_asm_output_labelref (FILE* f, const char *name)
4987{
4988  asm_fprintf (f, "%U%s", name);
4989}
4990
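/* Output a constructor-table entry for SYMBOL with priority PRIORITY,
   placing non-default priorities in their own .init_array.NNNNN section.
   The destructor counterpart below mirrors this using .fini_array.  */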
4991static void
4992aarch64_elf_asm_constructor (rtx symbol, int priority)
4993{
4994  if (priority == DEFAULT_INIT_PRIORITY)
4995    default_ctor_section_asm_out_constructor (symbol, priority);
4996  else
4997    {
4998      section *s;
4999      char buf[18];
5000      snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5001      s = get_section (buf, SECTION_WRITE, NULL);
5002      switch_to_section (s);
5003      assemble_align (POINTER_SIZE);
5004      assemble_aligned_integer (POINTER_BYTES, symbol);
5005    }
5006}
5007
5008static void
5009aarch64_elf_asm_destructor (rtx symbol, int priority)
5010{
5011  if (priority == DEFAULT_INIT_PRIORITY)
5012    default_dtor_section_asm_out_destructor (symbol, priority);
5013  else
5014    {
5015      section *s;
5016      char buf[18];
5017      snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5018      s = get_section (buf, SECTION_WRITE, NULL);
5019      switch_to_section (s);
5020      assemble_align (POINTER_SIZE);
5021      assemble_aligned_integer (POINTER_BYTES, symbol);
5022    }
5023}
5024
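/* Output the assembly for a casesi dispatch sequence.  OPERANDS[2] is the
   label of the jump table; the size of the table entries selects between
   the byte, half-word and word load patterns below.  */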
5025const char*
5026aarch64_output_casesi (rtx *operands)
5027{
5028  char buf[100];
5029  char label[100];
5030  rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5031  int index;
5032  static const char *const patterns[4][2] =
5033  {
5034    {
5035      "ldrb\t%w3, [%0,%w1,uxtw]",
5036      "add\t%3, %4, %w3, sxtb #2"
5037    },
5038    {
5039      "ldrh\t%w3, [%0,%w1,uxtw #1]",
5040      "add\t%3, %4, %w3, sxth #2"
5041    },
5042    {
5043      "ldr\t%w3, [%0,%w1,uxtw #2]",
5044      "add\t%3, %4, %w3, sxtw #2"
5045    },
5046    /* We assume that DImode is only generated when not optimizing and
5047       that we don't really need 64-bit address offsets.  That would
5048       imply an object file with 8GB of code in a single function!  */
5049    {
5050      "ldr\t%w3, [%0,%w1,uxtw #2]",
5051      "add\t%3, %4, %w3, sxtw #2"
5052    }
5053  };
5054
5055  gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5056
5057  index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5058
5059  gcc_assert (index >= 0 && index <= 3);
5060
  /* Need to implement table size reduction, by changing the code below.  */
5062  output_asm_insn (patterns[index][0], operands);
5063  ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5064  snprintf (buf, sizeof (buf),
5065	    "adr\t%%4, %s", targetm.strip_name_encoding (label));
5066  output_asm_insn (buf, operands);
5067  output_asm_insn (patterns[index][1], operands);
5068  output_asm_insn ("br\t%3", operands);
5069  assemble_label (asm_out_file, label);
5070  return "";
5071}
5072
5073
5074/* Return size in bits of an arithmetic operand which is shifted/scaled and
5075   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5076   operator.  */
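/* For example, aarch64_uxt_size (2, 0x3fc) returns 8, since 0x3fc is 0xff
   shifted left by 2 and therefore matches a UXTB operand shifted by 2.  */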
5077
5078int
5079aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5080{
5081  if (shift >= 0 && shift <= 3)
5082    {
5083      int size;
5084      for (size = 8; size <= 32; size *= 2)
5085	{
5086	  HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5087	  if (mask == bits << shift)
5088	    return size;
5089	}
5090    }
5091  return 0;
5092}
5093
5094static bool
5095aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5096				   const_rtx x ATTRIBUTE_UNUSED)
5097{
5098  /* We can't use blocks for constants when we're using a per-function
5099     constant pool.  */
5100  return false;
5101}
5102
5103static section *
5104aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5105			    rtx x ATTRIBUTE_UNUSED,
5106			    unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5107{
5108  /* Force all constant pool entries into the current function section.  */
5109  return function_section (current_function_decl);
5110}
5111
5112
5113/* Costs.  */
5114
5115/* Helper function for rtx cost calculation.  Strip a shift expression
5116   from X.  Returns the inner operand if successful, or the original
5117   expression on failure.  */
5118static rtx
5119aarch64_strip_shift (rtx x)
5120{
5121  rtx op = x;
5122
5123  /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5124     we can convert both to ROR during final output.  */
5125  if ((GET_CODE (op) == ASHIFT
5126       || GET_CODE (op) == ASHIFTRT
5127       || GET_CODE (op) == LSHIFTRT
5128       || GET_CODE (op) == ROTATERT
5129       || GET_CODE (op) == ROTATE)
5130      && CONST_INT_P (XEXP (op, 1)))
5131    return XEXP (op, 0);
5132
5133  if (GET_CODE (op) == MULT
5134      && CONST_INT_P (XEXP (op, 1))
5135      && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5136    return XEXP (op, 0);
5137
5138  return x;
5139}
5140
5141/* Helper function for rtx cost calculation.  Strip an extend
5142   expression from X.  Returns the inner operand if successful, or the
5143   original expression on failure.  We deal with a number of possible
5144   canonicalization variations here.  */
5145static rtx
5146aarch64_strip_extend (rtx x)
5147{
5148  rtx op = x;
5149
5150  /* Zero and sign extraction of a widened value.  */
5151  if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5152      && XEXP (op, 2) == const0_rtx
5153      && GET_CODE (XEXP (op, 0)) == MULT
5154      && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5155					 XEXP (op, 1)))
5156    return XEXP (XEXP (op, 0), 0);
5157
5158  /* It can also be represented (for zero-extend) as an AND with an
5159     immediate.  */
5160  if (GET_CODE (op) == AND
5161      && GET_CODE (XEXP (op, 0)) == MULT
5162      && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5163      && CONST_INT_P (XEXP (op, 1))
5164      && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5165			   INTVAL (XEXP (op, 1))) != 0)
5166    return XEXP (XEXP (op, 0), 0);
5167
5168  /* Now handle extended register, as this may also have an optional
5169     left shift by 1..4.  */
5170  if (GET_CODE (op) == ASHIFT
5171      && CONST_INT_P (XEXP (op, 1))
5172      && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5173    op = XEXP (op, 0);
5174
5175  if (GET_CODE (op) == ZERO_EXTEND
5176      || GET_CODE (op) == SIGN_EXTEND)
5177    op = XEXP (op, 0);
5178
5179  if (op != x)
5180    return op;
5181
5182  return x;
5183}
5184
5185/* Helper function for rtx cost calculation.  Calculate the cost of
5186   a MULT, which may be part of a multiply-accumulate rtx.  Return
5187   the calculated cost of the expression, recursing manually in to
5188   operands where needed.  */
5189
5190static int
5191aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5192{
5193  rtx op0, op1;
5194  const struct cpu_cost_table *extra_cost
5195    = aarch64_tune_params->insn_extra_cost;
5196  int cost = 0;
5197  bool maybe_fma = (outer == PLUS || outer == MINUS);
5198  machine_mode mode = GET_MODE (x);
5199
5200  gcc_checking_assert (code == MULT);
5201
5202  op0 = XEXP (x, 0);
5203  op1 = XEXP (x, 1);
5204
5205  if (VECTOR_MODE_P (mode))
5206    mode = GET_MODE_INNER (mode);
5207
5208  /* Integer multiply/fma.  */
5209  if (GET_MODE_CLASS (mode) == MODE_INT)
5210    {
5211      /* The multiply will be canonicalized as a shift, cost it as such.  */
5212      if (CONST_INT_P (op1)
5213	  && exact_log2 (INTVAL (op1)) > 0)
5214	{
5215	  if (speed)
5216	    {
5217	      if (maybe_fma)
5218		/* ADD (shifted register).  */
5219		cost += extra_cost->alu.arith_shift;
5220	      else
5221		/* LSL (immediate).  */
5222		cost += extra_cost->alu.shift;
5223	    }
5224
5225	  cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5226
5227	  return cost;
5228	}
5229
5230      /* Integer multiplies or FMAs have zero/sign extending variants.  */
5231      if ((GET_CODE (op0) == ZERO_EXTEND
5232	   && GET_CODE (op1) == ZERO_EXTEND)
5233	  || (GET_CODE (op0) == SIGN_EXTEND
5234	      && GET_CODE (op1) == SIGN_EXTEND))
5235	{
5236	  cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5237		  + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5238
5239	  if (speed)
5240	    {
5241	      if (maybe_fma)
5242		/* MADD/SMADDL/UMADDL.  */
5243		cost += extra_cost->mult[0].extend_add;
5244	      else
5245		/* MUL/SMULL/UMULL.  */
5246		cost += extra_cost->mult[0].extend;
5247	    }
5248
5249	  return cost;
5250	}
5251
5252      /* This is either an integer multiply or an FMA.  In both cases
5253	 we want to recurse and cost the operands.  */
5254      cost += rtx_cost (op0, MULT, 0, speed)
5255	      + rtx_cost (op1, MULT, 1, speed);
5256
5257      if (speed)
5258	{
5259	  if (maybe_fma)
5260	    /* MADD.  */
5261	    cost += extra_cost->mult[mode == DImode].add;
5262	  else
5263	    /* MUL.  */
5264	    cost += extra_cost->mult[mode == DImode].simple;
5265	}
5266
5267      return cost;
5268    }
5269  else
5270    {
5271      if (speed)
5272	{
5273	  /* Floating-point FMA/FMUL can also support negations of the
5274	     operands, unless the rounding mode is upward or downward in
5275	     which case FNMUL is different than FMUL with operand negation.  */
5276	  bool neg0 = GET_CODE (op0) == NEG;
5277	  bool neg1 = GET_CODE (op1) == NEG;
5278	  if (maybe_fma || !flag_rounding_math || (neg0 && neg1))
5279	    {
5280	      if (neg0)
5281		op0 = XEXP (op0, 0);
5282	      if (neg1)
5283		op1 = XEXP (op1, 0);
5284	    }
5285
5286	  if (maybe_fma)
5287	    /* FMADD/FNMADD/FNMSUB/FMSUB.  */
5288	    cost += extra_cost->fp[mode == DFmode].fma;
5289	  else
5290	    /* FMUL/FNMUL.  */
5291	    cost += extra_cost->fp[mode == DFmode].mult;
5292	}
5293
5294      cost += rtx_cost (op0, MULT, 0, speed)
5295	      + rtx_cost (op1, MULT, 1, speed);
5296      return cost;
5297    }
5298}
5299
5300static int
5301aarch64_address_cost (rtx x,
5302		      machine_mode mode,
5303		      addr_space_t as ATTRIBUTE_UNUSED,
5304		      bool speed)
5305{
5306  enum rtx_code c = GET_CODE (x);
5307  const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5308  struct aarch64_address_info info;
5309  int cost = 0;
5310  info.shift = 0;
5311
5312  if (!aarch64_classify_address (&info, x, mode, c, false))
5313    {
5314      if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5315	{
5316	  /* This is a CONST or SYMBOL ref which will be split
5317	     in a different way depending on the code model in use.
5318	     Cost it through the generic infrastructure.  */
5319	  int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5320	  /* Divide through by the cost of one instruction to
5321	     bring it to the same units as the address costs.  */
5322	  cost_symbol_ref /= COSTS_N_INSNS (1);
5323	  /* The cost is then the cost of preparing the address,
5324	     followed by an immediate (possibly 0) offset.  */
5325	  return cost_symbol_ref + addr_cost->imm_offset;
5326	}
5327      else
5328	{
5329	  /* This is most likely a jump table from a case
5330	     statement.  */
5331	  return addr_cost->register_offset;
5332	}
5333    }
5334
5335  switch (info.type)
5336    {
5337      case ADDRESS_LO_SUM:
5338      case ADDRESS_SYMBOLIC:
5339      case ADDRESS_REG_IMM:
5340	cost += addr_cost->imm_offset;
5341	break;
5342
5343      case ADDRESS_REG_WB:
5344	if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5345	  cost += addr_cost->pre_modify;
5346	else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5347	  cost += addr_cost->post_modify;
5348	else
5349	  gcc_unreachable ();
5350
5351	break;
5352
5353      case ADDRESS_REG_REG:
5354	cost += addr_cost->register_offset;
5355	break;
5356
5357      case ADDRESS_REG_UXTW:
5358      case ADDRESS_REG_SXTW:
5359	cost += addr_cost->register_extend;
5360	break;
5361
5362      default:
5363	gcc_unreachable ();
5364    }
5365
5366
5367  if (info.shift > 0)
5368    {
5369      /* For the sake of calculating the cost of the shifted register
5370	 component, we can treat same sized modes in the same way.  */
5371      switch (GET_MODE_BITSIZE (mode))
5372	{
5373	  case 16:
5374	    cost += addr_cost->addr_scale_costs.hi;
5375	    break;
5376
5377	  case 32:
5378	    cost += addr_cost->addr_scale_costs.si;
5379	    break;
5380
5381	  case 64:
5382	    cost += addr_cost->addr_scale_costs.di;
5383	    break;
5384
5385	  /* We can't tell, or this is a 128-bit vector.  */
5386	  default:
5387	    cost += addr_cost->addr_scale_costs.ti;
5388	    break;
5389	}
5390    }
5391
5392  return cost;
5393}
5394
5395/* Return true if the RTX X in mode MODE is a zero or sign extract
5396   usable in an ADD or SUB (extended register) instruction.  */
5397static bool
5398aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5399{
5400  /* Catch add with a sign extract.
5401     This is add_<optab><mode>_multp2.  */
5402  if (GET_CODE (x) == SIGN_EXTRACT
5403      || GET_CODE (x) == ZERO_EXTRACT)
5404    {
5405      rtx op0 = XEXP (x, 0);
5406      rtx op1 = XEXP (x, 1);
5407      rtx op2 = XEXP (x, 2);
5408
5409      if (GET_CODE (op0) == MULT
5410	  && CONST_INT_P (op1)
5411	  && op2 == const0_rtx
5412	  && CONST_INT_P (XEXP (op0, 1))
5413	  && aarch64_is_extend_from_extract (mode,
5414					     XEXP (op0, 1),
5415					     op1))
5416	{
5417	  return true;
5418	}
5419    }
5420
5421  return false;
5422}
5423
5424static bool
5425aarch64_frint_unspec_p (unsigned int u)
5426{
5427  switch (u)
5428    {
5429      case UNSPEC_FRINTZ:
5430      case UNSPEC_FRINTP:
5431      case UNSPEC_FRINTM:
5432      case UNSPEC_FRINTA:
5433      case UNSPEC_FRINTN:
5434      case UNSPEC_FRINTX:
5435      case UNSPEC_FRINTI:
5436        return true;
5437
5438      default:
5439        return false;
5440    }
5441}
5442
5443/* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5444   storing it in *COST.  Result is true if the total cost of the operation
5445   has now been calculated.  */
5446static bool
5447aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5448{
5449  rtx inner;
5450  rtx comparator;
5451  enum rtx_code cmpcode;
5452
5453  if (COMPARISON_P (op0))
5454    {
5455      inner = XEXP (op0, 0);
5456      comparator = XEXP (op0, 1);
5457      cmpcode = GET_CODE (op0);
5458    }
5459  else
5460    {
5461      inner = op0;
5462      comparator = const0_rtx;
5463      cmpcode = NE;
5464    }
5465
5466  if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5467    {
5468      /* Conditional branch.  */
5469      if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5470	return true;
5471      else
5472	{
5473	  if (cmpcode == NE || cmpcode == EQ)
5474	    {
5475	      if (comparator == const0_rtx)
5476		{
5477		  /* TBZ/TBNZ/CBZ/CBNZ.  */
5478		  if (GET_CODE (inner) == ZERO_EXTRACT)
5479		    /* TBZ/TBNZ.  */
5480		    *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5481			 	       0, speed);
		  else
		    /* CBZ/CBNZ.  */
		    *cost += rtx_cost (inner, cmpcode, 0, speed);

		  return true;
		}
5488	    }
5489	  else if (cmpcode == LT || cmpcode == GE)
5490	    {
5491	      /* TBZ/TBNZ.  */
5492	      if (comparator == const0_rtx)
5493		return true;
5494	    }
5495	}
5496    }
5497  else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5498    {
5499      /* It's a conditional operation based on the status flags,
5500	 so it must be some flavor of CSEL.  */
5501
5502      /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
5503      if (GET_CODE (op1) == NEG
5504          || GET_CODE (op1) == NOT
5505          || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5506	op1 = XEXP (op1, 0);
5507
5508      *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5509      *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5510      return true;
5511    }
5512
5513  /* We don't know what this is, cost all operands.  */
5514  return false;
5515}
5516
5517/* Calculate the cost of calculating X, storing it in *COST.  Result
5518   is true if the total cost of the operation has now been calculated.  */
5519static bool
5520aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5521		   int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5522{
5523  rtx op0, op1, op2;
5524  const struct cpu_cost_table *extra_cost
5525    = aarch64_tune_params->insn_extra_cost;
5526  machine_mode mode = GET_MODE (x);
5527
5528  /* By default, assume that everything has equivalent cost to the
5529     cheapest instruction.  Any additional costs are applied as a delta
5530     above this default.  */
5531  *cost = COSTS_N_INSNS (1);
5532
5533  /* TODO: The cost infrastructure currently does not handle
5534     vector operations.  Assume that all vector operations
5535     are equally expensive.  */
5536  if (VECTOR_MODE_P (mode))
5537    {
5538      if (speed)
5539	*cost += extra_cost->vect.alu;
5540      return true;
5541    }
5542
5543  switch (code)
5544    {
5545    case SET:
5546      /* The cost depends entirely on the operands to SET.  */
5547      *cost = 0;
5548      op0 = SET_DEST (x);
5549      op1 = SET_SRC (x);
5550
5551      switch (GET_CODE (op0))
5552	{
5553	case MEM:
5554	  if (speed)
5555	    {
5556	      rtx address = XEXP (op0, 0);
5557	      if (GET_MODE_CLASS (mode) == MODE_INT)
5558		*cost += extra_cost->ldst.store;
5559	      else if (mode == SFmode)
5560		*cost += extra_cost->ldst.storef;
5561	      else if (mode == DFmode)
5562		*cost += extra_cost->ldst.stored;
5563
5564	      *cost +=
5565		COSTS_N_INSNS (aarch64_address_cost (address, mode,
5566						     0, speed));
5567	    }
5568
5569	  *cost += rtx_cost (op1, SET, 1, speed);
5570	  return true;
5571
5572	case SUBREG:
5573	  if (! REG_P (SUBREG_REG (op0)))
5574	    *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5575
5576	  /* Fall through.  */
5577	case REG:
5578	  /* const0_rtx is in general free, but we will use an
5579	     instruction to set a register to 0.  */
5580          if (REG_P (op1) || op1 == const0_rtx)
5581            {
5582              /* The cost is 1 per register copied.  */
5583              int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5584			      / UNITS_PER_WORD;
5585              *cost = COSTS_N_INSNS (n_minus_1 + 1);
5586            }
5587          else
5588	    /* Cost is just the cost of the RHS of the set.  */
5589	    *cost += rtx_cost (op1, SET, 1, speed);
5590	  return true;
5591
5592	case ZERO_EXTRACT:
5593	case SIGN_EXTRACT:
5594	  /* Bit-field insertion.  Strip any redundant widening of
5595	     the RHS to meet the width of the target.  */
5596	  if (GET_CODE (op1) == SUBREG)
5597	    op1 = SUBREG_REG (op1);
5598	  if ((GET_CODE (op1) == ZERO_EXTEND
5599	       || GET_CODE (op1) == SIGN_EXTEND)
5600	      && CONST_INT_P (XEXP (op0, 1))
5601	      && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5602		  >= INTVAL (XEXP (op0, 1))))
5603	    op1 = XEXP (op1, 0);
5604
5605          if (CONST_INT_P (op1))
5606            {
5607              /* MOV immediate is assumed to always be cheap.  */
5608              *cost = COSTS_N_INSNS (1);
5609            }
5610          else
5611            {
5612              /* BFM.  */
5613	      if (speed)
5614		*cost += extra_cost->alu.bfi;
5615              *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5616            }
5617
5618	  return true;
5619
5620	default:
5621	  /* We can't make sense of this, assume default cost.  */
5622          *cost = COSTS_N_INSNS (1);
5623	  return false;
5624	}
5625      return false;
5626
5627    case CONST_INT:
5628      /* If an instruction can incorporate a constant within the
5629	 instruction, the instruction's expression avoids calling
5630	 rtx_cost() on the constant.  If rtx_cost() is called on a
5631	 constant, then it is usually because the constant must be
5632	 moved into a register by one or more instructions.
5633
5634	 The exception is constant 0, which can be expressed
5635	 as XZR/WZR and is therefore free.  The exception to this is
5636	 if we have (set (reg) (const0_rtx)) in which case we must cost
5637	 the move.  However, we can catch that when we cost the SET, so
5638	 we don't need to consider that here.  */
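      /* For example, a 64-bit constant such as 0x123456789abcdef0 is built
	 with a MOVZ plus three MOVKs, so it is costed as
	 COSTS_N_INSNS (4).  */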
5639      if (x == const0_rtx)
5640	*cost = 0;
5641      else
5642	{
5643	  /* To an approximation, building any other constant is
5644	     proportionally expensive to the number of instructions
5645	     required to build that constant.  This is true whether we
5646	     are compiling for SPEED or otherwise.  */
5647	  *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5648				 (NULL_RTX, x, false, mode));
5649	}
5650      return true;
5651
5652    case CONST_DOUBLE:
5653      if (speed)
5654	{
5655	  /* mov[df,sf]_aarch64.  */
5656	  if (aarch64_float_const_representable_p (x))
5657	    /* FMOV (scalar immediate).  */
5658	    *cost += extra_cost->fp[mode == DFmode].fpconst;
5659	  else if (!aarch64_float_const_zero_rtx_p (x))
5660	    {
5661	      /* This will be a load from memory.  */
5662	      if (mode == DFmode)
5663		*cost += extra_cost->ldst.loadd;
5664	      else
5665		*cost += extra_cost->ldst.loadf;
5666	    }
5667	  else
5668	    /* Otherwise this is +0.0.  We get this using MOVI d0, #0
5669	       or MOV v0.s[0], wzr - neither of which are modeled by the
5670	       cost tables.  Just use the default cost.  */
5671	    {
5672	    }
5673	}
5674
5675      return true;
5676
5677    case MEM:
5678      if (speed)
5679	{
5680	  /* For loads we want the base cost of a load, plus an
5681	     approximation for the additional cost of the addressing
5682	     mode.  */
5683	  rtx address = XEXP (x, 0);
5684	  if (GET_MODE_CLASS (mode) == MODE_INT)
5685	    *cost += extra_cost->ldst.load;
5686	  else if (mode == SFmode)
5687	    *cost += extra_cost->ldst.loadf;
5688	  else if (mode == DFmode)
5689	    *cost += extra_cost->ldst.loadd;
5690
5691	  *cost +=
5692		COSTS_N_INSNS (aarch64_address_cost (address, mode,
5693						     0, speed));
5694	}
5695
5696      return true;
5697
5698    case NEG:
5699      op0 = XEXP (x, 0);
5700
5701      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5702       {
5703          if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5704              || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5705            {
5706              /* CSETM.  */
5707              *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5708              return true;
5709            }
5710
5711	  /* Cost this as SUB wzr, X.  */
5712          op0 = CONST0_RTX (GET_MODE (x));
5713          op1 = XEXP (x, 0);
5714          goto cost_minus;
5715        }
5716
5717      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5718        {
5719          /* Support (neg(fma...)) as a single instruction only if
5720             sign of zeros is unimportant.  This matches the decision
5721             making in aarch64.md.  */
5722          if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5723            {
5724	      /* FNMADD.  */
5725              *cost = rtx_cost (op0, NEG, 0, speed);
5726              return true;
5727            }
5728	  if (GET_CODE (op0) == MULT)
5729	    {
5730	      /* FNMUL.  */
5731	      *cost = rtx_cost (op0, NEG, 0, speed);
5732	      return true;
5733	    }
5734	  if (speed)
5735	    /* FNEG.  */
5736	    *cost += extra_cost->fp[mode == DFmode].neg;
5737          return false;
5738        }
5739
5740      return false;
5741
5742    case CLRSB:
5743    case CLZ:
5744      if (speed)
5745        *cost += extra_cost->alu.clz;
5746
5747      return false;
5748
5749    case COMPARE:
5750      op0 = XEXP (x, 0);
5751      op1 = XEXP (x, 1);
5752
5753      if (op1 == const0_rtx
5754	  && GET_CODE (op0) == AND)
5755	{
5756	  x = op0;
5757	  goto cost_logic;
5758	}
5759
5760      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5761        {
5762          /* TODO: A write to the CC flags possibly costs extra; this
5763	     needs encoding in the cost tables.  */
5764
5765          /* CC_ZESWPmode supports zero extend for free.  */
5766          if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5767            op0 = XEXP (op0, 0);
5768
5769          /* ANDS.  */
5770          if (GET_CODE (op0) == AND)
5771            {
5772              x = op0;
5773              goto cost_logic;
5774            }
5775
5776          if (GET_CODE (op0) == PLUS)
5777            {
5778	      /* ADDS (and CMN alias).  */
5779              x = op0;
5780              goto cost_plus;
5781            }
5782
5783          if (GET_CODE (op0) == MINUS)
5784            {
5785	      /* SUBS.  */
5786              x = op0;
5787              goto cost_minus;
5788            }
5789
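          /* For example, (compare (reg:SI) (neg:SI (reg:SI))) matches the
	     CMN (compare negative) instruction.  */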
5790          if (GET_CODE (op1) == NEG)
5791            {
5792	      /* CMN.  */
5793	      if (speed)
5794		*cost += extra_cost->alu.arith;
5795
5796              *cost += rtx_cost (op0, COMPARE, 0, speed);
5797	      *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5798              return true;
5799            }
5800
5801          /* CMP.
5802
5803	     Compare can freely swap the order of operands, and
5804             canonicalization puts the more complex operation first.
5805             But the integer MINUS logic expects the shift/extend
5806             operation in op1.  */
5807          if (! (REG_P (op0)
5808                 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5809          {
5810            op0 = XEXP (x, 1);
5811            op1 = XEXP (x, 0);
5812          }
5813          goto cost_minus;
5814        }
5815
5816      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5817        {
5818	  /* FCMP.  */
5819	  if (speed)
5820	    *cost += extra_cost->fp[mode == DFmode].compare;
5821
5822          if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5823            {
5824              /* FCMP supports constant 0.0 for no extra cost. */
5825              return true;
5826            }
5827          return false;
5828        }
5829
5830      return false;
5831
5832    case MINUS:
5833      {
5834	op0 = XEXP (x, 0);
5835	op1 = XEXP (x, 1);
5836
5837cost_minus:
5838	/* Detect valid immediates.  */
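	/* aarch64_uimm12_shift accepts an unsigned 12-bit value, optionally
	   shifted left by 12 bits (e.g. 42 or 42 << 12), i.e. the range of
	   SUB (immediate).  */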
5839	if ((GET_MODE_CLASS (mode) == MODE_INT
5840	     || (GET_MODE_CLASS (mode) == MODE_CC
5841		 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5842	    && CONST_INT_P (op1)
5843	    && aarch64_uimm12_shift (INTVAL (op1)))
5844	  {
5845	    *cost += rtx_cost (op0, MINUS, 0, speed);
5846
5847	    if (speed)
5848	      /* SUB(S) (immediate).  */
5849	      *cost += extra_cost->alu.arith;
5850	    return true;
5851
5852	  }
5853
5854	/* Look for SUB (extended register).  */
5855        if (aarch64_rtx_arith_op_extract_p (op1, mode))
5856	  {
5857	    if (speed)
5858	      *cost += extra_cost->alu.arith_shift;
5859
5860	    *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5861			       (enum rtx_code) GET_CODE (op1),
5862			       0, speed);
5863	    return true;
5864	  }
5865
5866	rtx new_op1 = aarch64_strip_extend (op1);
5867
5868	/* Cost this as an FMA-alike operation.  */
5869	if ((GET_CODE (new_op1) == MULT
5870	     || GET_CODE (new_op1) == ASHIFT)
5871	    && code != COMPARE)
5872	  {
5873	    *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5874					    (enum rtx_code) code,
5875					    speed);
5876	    *cost += rtx_cost (op0, MINUS, 0, speed);
5877	    return true;
5878	  }
5879
5880	*cost += rtx_cost (new_op1, MINUS, 1, speed);
5881
5882	if (speed)
5883	  {
5884	    if (GET_MODE_CLASS (mode) == MODE_INT)
5885	      /* SUB(S).  */
5886	      *cost += extra_cost->alu.arith;
5887	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5888	      /* FSUB.  */
5889	      *cost += extra_cost->fp[mode == DFmode].addsub;
5890	  }
5891	return true;
5892      }
5893
5894    case PLUS:
5895      {
5896	rtx new_op0;
5897
5898	op0 = XEXP (x, 0);
5899	op1 = XEXP (x, 1);
5900
5901cost_plus:
5902	if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5903	    || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5904	  {
5905	    /* CSINC.  */
5906	    *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5907	    *cost += rtx_cost (op1, PLUS, 1, speed);
5908	    return true;
5909	  }
5910
5911	if (GET_MODE_CLASS (mode) == MODE_INT
5912	    && CONST_INT_P (op1)
5913	    && aarch64_uimm12_shift (INTVAL (op1)))
5914	  {
5915	    *cost += rtx_cost (op0, PLUS, 0, speed);
5916
5917	    if (speed)
5918	      /* ADD (immediate).  */
5919	      *cost += extra_cost->alu.arith;
5920	    return true;
5921	  }
5922
5923	/* Look for ADD (extended register).  */
5924        if (aarch64_rtx_arith_op_extract_p (op0, mode))
5925	  {
5926	    if (speed)
5927	      *cost += extra_cost->alu.arith_shift;
5928
5929	    *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5930			       (enum rtx_code) GET_CODE (op0),
5931			       0, speed);
5932	    return true;
5933	  }
5934
5935	/* Strip any extend, leave shifts behind as we will
5936	   cost them through mult_cost.  */
5937	new_op0 = aarch64_strip_extend (op0);
5938
5939	if (GET_CODE (new_op0) == MULT
5940	    || GET_CODE (new_op0) == ASHIFT)
5941	  {
5942	    *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5943					    speed);
5944	    *cost += rtx_cost (op1, PLUS, 1, speed);
5945	    return true;
5946	  }
5947
5948	*cost += (rtx_cost (new_op0, PLUS, 0, speed)
5949		  + rtx_cost (op1, PLUS, 1, speed));
5950
5951	if (speed)
5952	  {
5953	    if (GET_MODE_CLASS (mode) == MODE_INT)
5954	      /* ADD.  */
5955	      *cost += extra_cost->alu.arith;
5956	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5957	      /* FADD.  */
5958	      *cost += extra_cost->fp[mode == DFmode].addsub;
5959	  }
5960	return true;
5961      }
5962
5963    case BSWAP:
5964      *cost = COSTS_N_INSNS (1);
5965
5966      if (speed)
5967        *cost += extra_cost->alu.rev;
5968
5969      return false;
5970
5971    case IOR:
5972      if (aarch_rev16_p (x))
5973        {
5974          *cost = COSTS_N_INSNS (1);
5975
5976          if (speed)
5977            *cost += extra_cost->alu.rev;
5978
5979          return true;
5980        }
5981    /* Fall through.  */
5982    case XOR:
5983    case AND:
5984    cost_logic:
5985      op0 = XEXP (x, 0);
5986      op1 = XEXP (x, 1);
5987
5988      if (code == AND
5989          && GET_CODE (op0) == MULT
5990          && CONST_INT_P (XEXP (op0, 1))
5991          && CONST_INT_P (op1)
5992          && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5993                               INTVAL (op1)) != 0)
5994        {
5995          /* This is a UBFM/SBFM.  */
5996          *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
5997	  if (speed)
5998	    *cost += extra_cost->alu.bfx;
5999          return true;
6000        }
6001
6002      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6003	{
6004	  /* We possibly get the immediate for free, this is not
6005	     modelled.  */
6006	  if (CONST_INT_P (op1)
6007	      && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6008	    {
6009	      *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6010
6011	      if (speed)
6012		*cost += extra_cost->alu.logical;
6013
6014	      return true;
6015	    }
6016	  else
6017	    {
6018	      rtx new_op0 = op0;
6019
6020	      /* Handle ORN, EON, or BIC.  */
6021	      if (GET_CODE (op0) == NOT)
6022		op0 = XEXP (op0, 0);
6023
6024	      new_op0 = aarch64_strip_shift (op0);
6025
6026	      /* If we had a shift on op0 then this is a logical-shift-
6027		 by-register/immediate operation.  Otherwise, this is just
6028		 a logical operation.  */
6029	      if (speed)
6030		{
6031		  if (new_op0 != op0)
6032		    {
6033		      /* Shift by immediate.  */
6034		      if (CONST_INT_P (XEXP (op0, 1)))
6035			*cost += extra_cost->alu.log_shift;
6036		      else
6037			*cost += extra_cost->alu.log_shift_reg;
6038		    }
6039		  else
6040		    *cost += extra_cost->alu.logical;
6041		}
6042
6043	      /* In both cases we want to cost both operands.  */
6044	      *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6045		       + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6046
6047	      return true;
6048	    }
6049	}
6050      return false;
6051
6052    case NOT:
6053      /* MVN.  */
6054      if (speed)
6055	*cost += extra_cost->alu.logical;
6056
6057      /* The logical instruction could have the shifted register form,
6058         but the cost is the same if the shift is processed as a separate
6059         instruction, so we don't bother with it here.  */
6060      return false;
6061
6062    case ZERO_EXTEND:
6063
6064      op0 = XEXP (x, 0);
6065      /* If a value is written in SI mode, then zero extended to DI
6066	 mode, the operation will in general be free as a write to
6067	 a 'w' register implicitly zeroes the upper bits of an 'x'
6068	 register.  However, if this is
6069
6070	   (set (reg) (zero_extend (reg)))
6071
6072	 we must cost the explicit register move.  */
6073      if (mode == DImode
6074	  && GET_MODE (op0) == SImode
6075	  && outer == SET)
6076	{
6077	  int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6078
6079	  if (!op_cost && speed)
6080	    /* MOV.  */
6081	    *cost += extra_cost->alu.extend;
6082	  else
6083	    /* Free, the cost is that of the SI mode operation.  */
6084	    *cost = op_cost;
6085
6086	  return true;
6087	}
6088      else if (MEM_P (XEXP (x, 0)))
6089	{
6090	  /* All loads can zero extend to any size for free.  */
6091	  *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6092	  return true;
6093	}
6094
6095      /* UXTB/UXTH.  */
6096      if (speed)
6097	*cost += extra_cost->alu.extend;
6098
6099      return false;
6100
6101    case SIGN_EXTEND:
6102      if (MEM_P (XEXP (x, 0)))
6103	{
6104	  /* LDRSH.  */
6105	  if (speed)
6106	    {
6107	      rtx address = XEXP (XEXP (x, 0), 0);
6108	      *cost += extra_cost->ldst.load_sign_extend;
6109
6110	      *cost +=
6111		COSTS_N_INSNS (aarch64_address_cost (address, mode,
6112						     0, speed));
6113	    }
6114	  return true;
6115	}
6116
6117      if (speed)
6118	*cost += extra_cost->alu.extend;
6119      return false;
6120
6121    case ASHIFT:
6122      op0 = XEXP (x, 0);
6123      op1 = XEXP (x, 1);
6124
6125      if (CONST_INT_P (op1))
6126        {
6127	  /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
6128	     aliases.  */
6129	  if (speed)
6130	    *cost += extra_cost->alu.shift;
6131
6132          /* We can incorporate zero/sign extend for free.  */
6133          if (GET_CODE (op0) == ZERO_EXTEND
6134              || GET_CODE (op0) == SIGN_EXTEND)
6135            op0 = XEXP (op0, 0);
6136
6137          *cost += rtx_cost (op0, ASHIFT, 0, speed);
6138          return true;
6139        }
6140      else
6141        {
6142	  /* LSLV.  */
6143	  if (speed)
6144	    *cost += extra_cost->alu.shift_reg;
6145
6146	  return false;  /* All arguments need to be in registers.  */
6147        }
6148
6149    case ROTATE:
6150    case ROTATERT:
6151    case LSHIFTRT:
6152    case ASHIFTRT:
6153      op0 = XEXP (x, 0);
6154      op1 = XEXP (x, 1);
6155
6156      if (CONST_INT_P (op1))
6157	{
6158	  /* ASR (immediate) and friends.  */
6159	  if (speed)
6160	    *cost += extra_cost->alu.shift;
6161
6162	  *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6163	  return true;
6164	}
6165      else
6166	{
6167
6168	  /* ASR (register) and friends.  */
6169	  if (speed)
6170	    *cost += extra_cost->alu.shift_reg;
6171
6172	  return false;  /* All arguments need to be in registers.  */
6173	}
6174
6175    case SYMBOL_REF:
6176
6177      if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6178	{
6179	  /* LDR.  */
6180	  if (speed)
6181	    *cost += extra_cost->ldst.load;
6182	}
6183      else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6184	       || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6185	{
6186	  /* ADRP, followed by ADD.  */
6187	  *cost += COSTS_N_INSNS (1);
6188	  if (speed)
6189	    *cost += 2 * extra_cost->alu.arith;
6190	}
6191      else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6192	       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6193	{
6194	  /* ADR.  */
6195	  if (speed)
6196	    *cost += extra_cost->alu.arith;
6197	}
6198
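      /* In the small PIC model this is typically an
	 adrp x0, :got:sym / ldr x0, [x0, #:got_lo12:sym] pair, with the
	 second instruction being the extra load costed here.  */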
6199      if (flag_pic)
6200	{
6201	  /* One extra load instruction, after accessing the GOT.  */
6202	  *cost += COSTS_N_INSNS (1);
6203	  if (speed)
6204	    *cost += extra_cost->ldst.load;
6205	}
6206      return true;
6207
6208    case HIGH:
6209    case LO_SUM:
6210      /* ADRP/ADD (immediate).  */
6211      if (speed)
6212	*cost += extra_cost->alu.arith;
6213      return true;
6214
6215    case ZERO_EXTRACT:
6216    case SIGN_EXTRACT:
6217      /* UBFX/SBFX.  */
6218      if (speed)
6219	*cost += extra_cost->alu.bfx;
6220
6221      /* We can trust that the immediates used will be correct (there
6222	 are no by-register forms), so we need only cost op0.  */
6223      *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6224      return true;
6225
6226    case MULT:
6227      *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6228      /* aarch64_rtx_mult_cost always handles recursion to its
6229	 operands.  */
6230      return true;
6231
6232    case MOD:
6233    case UMOD:
6234      if (speed)
6235	{
6236	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6237	    *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6238		      + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6239	  else if (GET_MODE (x) == DFmode)
6240	    *cost += (extra_cost->fp[1].mult
6241		      + extra_cost->fp[1].div);
6242	  else if (GET_MODE (x) == SFmode)
6243	    *cost += (extra_cost->fp[0].mult
6244		      + extra_cost->fp[0].div);
6245	}
6246      return false;  /* All arguments need to be in registers.  */
6247
6248    case DIV:
6249    case UDIV:
6250    case SQRT:
6251      if (speed)
6252	{
6253	  if (GET_MODE_CLASS (mode) == MODE_INT)
6254	    /* There is no integer SQRT, so only DIV and UDIV can get
6255	       here.  */
6256	    *cost += extra_cost->mult[mode == DImode].idiv;
6257	  else
6258	    *cost += extra_cost->fp[mode == DFmode].div;
6259	}
6260      return false;  /* All arguments need to be in registers.  */
6261
6262    case IF_THEN_ELSE:
6263      return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6264					 XEXP (x, 2), cost, speed);
6265
6266    case EQ:
6267    case NE:
6268    case GT:
6269    case GTU:
6270    case LT:
6271    case LTU:
6272    case GE:
6273    case GEU:
6274    case LE:
6275    case LEU:
6276
6277      return false; /* All arguments must be in registers.  */
6278
6279    case FMA:
6280      op0 = XEXP (x, 0);
6281      op1 = XEXP (x, 1);
6282      op2 = XEXP (x, 2);
6283
6284      if (speed)
6285	*cost += extra_cost->fp[mode == DFmode].fma;
6286
6287      /* FMSUB, FNMADD, and FNMSUB are free.  */
6288      if (GET_CODE (op0) == NEG)
6289        op0 = XEXP (op0, 0);
6290
6291      if (GET_CODE (op2) == NEG)
6292        op2 = XEXP (op2, 0);
6293
6294      /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6295	 and the by-element operand as operand 0.  */
6296      if (GET_CODE (op1) == NEG)
6297        op1 = XEXP (op1, 0);
6298
6299      /* Catch vector-by-element operations.  The by-element operand can
6300	 either be (vec_duplicate (vec_select (x))) or just
6301	 (vec_select (x)), depending on whether we are multiplying by
6302	 a vector or a scalar.
6303
6304	 Canonicalization is not very good in these cases: FMA4 will put the
6305	 by-element operand as operand 0, FNMA4 as operand 1.  */
6306      if (GET_CODE (op0) == VEC_DUPLICATE)
6307	op0 = XEXP (op0, 0);
6308      else if (GET_CODE (op1) == VEC_DUPLICATE)
6309	op1 = XEXP (op1, 0);
6310
6311      if (GET_CODE (op0) == VEC_SELECT)
6312	op0 = XEXP (op0, 0);
6313      else if (GET_CODE (op1) == VEC_SELECT)
6314	op1 = XEXP (op1, 0);
6315
6316      /* If the remaining parameters are not registers,
6317         get the cost to put them into registers.  */
6318      *cost += rtx_cost (op0, FMA, 0, speed);
6319      *cost += rtx_cost (op1, FMA, 1, speed);
6320      *cost += rtx_cost (op2, FMA, 2, speed);
6321      return true;
6322
6323    case FLOAT_EXTEND:
6324      if (speed)
6325	*cost += extra_cost->fp[mode == DFmode].widen;
6326      return false;
6327
6328    case FLOAT_TRUNCATE:
6329      if (speed)
6330	*cost += extra_cost->fp[mode == DFmode].narrow;
6331      return false;
6332
6333    case FIX:
6334    case UNSIGNED_FIX:
6335      x = XEXP (x, 0);
6336      /* Strip the rounding part.  They will all be implemented
6337         by the fcvt* family of instructions anyway.  */
6338      if (GET_CODE (x) == UNSPEC)
6339        {
6340          unsigned int uns_code = XINT (x, 1);
6341
6342          if (uns_code == UNSPEC_FRINTA
6343              || uns_code == UNSPEC_FRINTM
6344              || uns_code == UNSPEC_FRINTN
6345              || uns_code == UNSPEC_FRINTP
6346              || uns_code == UNSPEC_FRINTZ)
6347            x = XVECEXP (x, 0, 0);
6348        }
6349
6350      if (speed)
6351        *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6352
6353      *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6354      return true;
6355
6356    case ABS:
6357      if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6358	{
6359	  /* FABS and FNEG are analogous.  */
6360	  if (speed)
6361	    *cost += extra_cost->fp[mode == DFmode].neg;
6362	}
6363      else
6364	{
6365	  /* Integer ABS will either be split to
6366	     two arithmetic instructions, or will be an ABS
6367	     (scalar), which we don't model.  */
6368	  *cost = COSTS_N_INSNS (2);
6369	  if (speed)
6370	    *cost += 2 * extra_cost->alu.arith;
6371	}
6372      return false;
6373
6374    case SMAX:
6375    case SMIN:
6376      if (speed)
6377	{
6378	  /* FMAXNM/FMINNM/FMAX/FMIN.
6379	     TODO: This may not be accurate for all implementations, but
6380	     we do not model this in the cost tables.  */
6381	  *cost += extra_cost->fp[mode == DFmode].addsub;
6382	}
6383      return false;
6384
6385    case UNSPEC:
6386      /* The floating point round to integer frint* instructions.  */
6387      if (aarch64_frint_unspec_p (XINT (x, 1)))
6388        {
6389          if (speed)
6390            *cost += extra_cost->fp[mode == DFmode].roundint;
6391
6392          return false;
6393        }
6394
6395      if (XINT (x, 1) == UNSPEC_RBIT)
6396        {
6397          if (speed)
6398            *cost += extra_cost->alu.rev;
6399
6400          return false;
6401        }
6402      break;
6403
6404    case TRUNCATE:
6405
6406      /* Decompose <su>muldi3_highpart.  */
6407      if (/* (truncate:DI  */
6408	  mode == DImode
6409	  /*   (lshiftrt:TI  */
6410          && GET_MODE (XEXP (x, 0)) == TImode
6411          && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6412	  /*      (mult:TI  */
6413          && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6414	  /*        (ANY_EXTEND:TI (reg:DI))
6415	            (ANY_EXTEND:TI (reg:DI)))  */
6416          && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6417               && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6418              || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6419                  && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6420          && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6421          && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6422	  /*     (const_int 64)  */
6423          && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6424          && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6425        {
6426          /* UMULH/SMULH.  */
6427	  if (speed)
6428	    *cost += extra_cost->mult[mode == DImode].extend;
6429          *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6430			     MULT, 0, speed);
6431          *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6432			     MULT, 1, speed);
6433          return true;
6434        }
6435
6436      /* Fall through.  */
6437    default:
6438      break;
6439    }
6440
6441  if (dump_file && (dump_flags & TDF_DETAILS))
6442    fprintf (dump_file,
6443      "\nFailed to cost RTX.  Assuming default cost.\n");
6444
6445  return true;
6446}
6447
6448/* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6449   calculated for X.  This cost is stored in *COST.  Returns true
6450   if the total cost of X was calculated.  */
6451static bool
6452aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6453		   int param, int *cost, bool speed)
6454{
6455  bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6456
6457  if (dump_file && (dump_flags & TDF_DETAILS))
6458    {
6459      print_rtl_single (dump_file, x);
6460      fprintf (dump_file, "\n%s cost: %d (%s)\n",
6461	       speed ? "Hot" : "Cold",
6462	       *cost, result ? "final" : "partial");
6463    }
6464
6465  return result;
6466}
6467
6468static int
6469aarch64_register_move_cost (machine_mode mode,
6470			    reg_class_t from_i, reg_class_t to_i)
6471{
6472  enum reg_class from = (enum reg_class) from_i;
6473  enum reg_class to = (enum reg_class) to_i;
6474  const struct cpu_regmove_cost *regmove_cost
6475    = aarch64_tune_params->regmove_cost;
6476
6477  /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
6478  if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6479    to = GENERAL_REGS;
6480
6481  if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6482    from = GENERAL_REGS;
6483
6484  /* Moving between GPR and stack cost is the same as GP2GP.  */
6485  if ((from == GENERAL_REGS && to == STACK_REG)
6486      || (to == GENERAL_REGS && from == STACK_REG))
6487    return regmove_cost->GP2GP;
6488
6489  /* To/From the stack register, we move via the gprs.  */
6490  if (to == STACK_REG || from == STACK_REG)
6491    return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6492            + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6493
6494  if (GET_MODE_SIZE (mode) == 16)
6495    {
6496      /* 128-bit operations on general registers require 2 instructions.  */
6497      if (from == GENERAL_REGS && to == GENERAL_REGS)
6498	return regmove_cost->GP2GP * 2;
6499      else if (from == GENERAL_REGS)
6500	return regmove_cost->GP2FP * 2;
6501      else if (to == GENERAL_REGS)
6502	return regmove_cost->FP2GP * 2;
6503
6504      /* When AdvSIMD instructions are disabled it is not possible to move
6505	 a 128-bit value directly between Q registers.  This is handled in
6506	 secondary reload.  A general register is used as a scratch to move
6507	 the upper DI value and the lower DI value is moved directly,
6508	 hence the cost is the sum of three moves. */
6509      if (! TARGET_SIMD)
6510	return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6511
6512      return regmove_cost->FP2FP;
6513    }
6514
6515  if (from == GENERAL_REGS && to == GENERAL_REGS)
6516    return regmove_cost->GP2GP;
6517  else if (from == GENERAL_REGS)
6518    return regmove_cost->GP2FP;
6519  else if (to == GENERAL_REGS)
6520    return regmove_cost->FP2GP;
6521
6522  return regmove_cost->FP2FP;
6523}
6524
6525static int
6526aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6527			  reg_class_t rclass ATTRIBUTE_UNUSED,
6528			  bool in ATTRIBUTE_UNUSED)
6529{
6530  return aarch64_tune_params->memmov_cost;
6531}
6532
6533/* Return the number of instructions that can be issued per cycle.  */
6534static int
6535aarch64_sched_issue_rate (void)
6536{
6537  return aarch64_tune_params->issue_rate;
6538}
6539
6540static int
6541aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6542{
6543  int issue_rate = aarch64_sched_issue_rate ();
6544
6545  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6546}
6547
6548/* Vectorizer cost model target hooks.  */
6549
6550/* Implement targetm.vectorize.builtin_vectorization_cost.  */
6551static int
6552aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6553				    tree vectype,
6554				    int misalign ATTRIBUTE_UNUSED)
6555{
6556  unsigned elements;
6557
6558  switch (type_of_cost)
6559    {
6560      case scalar_stmt:
6561	return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6562
6563      case scalar_load:
6564	return aarch64_tune_params->vec_costs->scalar_load_cost;
6565
6566      case scalar_store:
6567	return aarch64_tune_params->vec_costs->scalar_store_cost;
6568
6569      case vector_stmt:
6570	return aarch64_tune_params->vec_costs->vec_stmt_cost;
6571
6572      case vector_load:
6573	return aarch64_tune_params->vec_costs->vec_align_load_cost;
6574
6575      case vector_store:
6576	return aarch64_tune_params->vec_costs->vec_store_cost;
6577
6578      case vec_to_scalar:
6579	return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6580
6581      case scalar_to_vec:
6582	return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6583
6584      case unaligned_load:
6585	return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6586
6587      case unaligned_store:
6588	return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6589
6590      case cond_branch_taken:
6591	return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6592
6593      case cond_branch_not_taken:
6594	return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6595
6596      case vec_perm:
6597      case vec_promote_demote:
6598	return aarch64_tune_params->vec_costs->vec_stmt_cost;
6599
6600      case vec_construct:
6601        elements = TYPE_VECTOR_SUBPARTS (vectype);
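	/* Cost roughly one insert per pair of elements, plus one; e.g. a
	   four-element construction is costed as 3.  */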
6602	return elements / 2 + 1;
6603
6604      default:
6605	gcc_unreachable ();
6606    }
6607}
6608
6609/* Implement targetm.vectorize.add_stmt_cost.  */
6610static unsigned
6611aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6612		       struct _stmt_vec_info *stmt_info, int misalign,
6613		       enum vect_cost_model_location where)
6614{
6615  unsigned *cost = (unsigned *) data;
6616  unsigned retval = 0;
6617
6618  if (flag_vect_cost_model)
6619    {
6620      tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6621      int stmt_cost =
6622	    aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6623
6624      /* Statements in an inner loop relative to the loop being
6625	 vectorized are weighted more heavily.  The value here is
6626	 a function (linear for now) of the loop nest level.  */
6627      if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6628	{
6629	  loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6630	  struct loop *loop =  LOOP_VINFO_LOOP (loop_info);
6631	  unsigned nest_level = loop_depth (loop);
6632
6633	  count *= nest_level;
6634	}
6635
6636      retval = (unsigned) (count * stmt_cost);
6637      cost[where] += retval;
6638    }
6639
6640  return retval;
6641}
6642
6643static void initialize_aarch64_code_model (void);
6644
6645/* Parse the architecture extension string.  */
6646
6647static void
6648aarch64_parse_extension (char *str)
6649{
6650  /* The extension string is parsed left to right.  */
6651  const struct aarch64_option_extension *opt = NULL;
6652
6653  /* Flag to say whether we are adding or removing an extension.  */
6654  int adding_ext = -1;
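  /* For example, "+crc+nocrypto" first sets the CRC feature flags and then
     clears the crypto flags; each iteration of the loop below consumes one
     '+'-prefixed token.  */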
6655
6656  while (str != NULL && *str != 0)
6657    {
6658      char *ext;
6659      size_t len;
6660
6661      str++;
6662      ext = strchr (str, '+');
6663
6664      if (ext != NULL)
6665	len = ext - str;
6666      else
6667	len = strlen (str);
6668
6669      if (len >= 2 && strncmp (str, "no", 2) == 0)
6670	{
6671	  adding_ext = 0;
6672	  len -= 2;
6673	  str += 2;
6674	}
6675      else if (len > 0)
6676	adding_ext = 1;
6677
6678      if (len == 0)
6679	{
6680	  error ("missing feature modifier after %qs", adding_ext ? "+"
6681	                                                          : "+no");
6682	  return;
6683	}
6684
6685      /* Scan over the extensions table trying to find an exact match.  */
6686      for (opt = all_extensions; opt->name != NULL; opt++)
6687	{
6688	  if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6689	    {
6690	      /* Add or remove the extension.  */
6691	      if (adding_ext)
6692		aarch64_isa_flags |= opt->flags_on;
6693	      else
6694		aarch64_isa_flags &= ~(opt->flags_off);
6695	      break;
6696	    }
6697	}
6698
6699      if (opt->name == NULL)
6700	{
6701	  /* Extension not found in list.  */
6702	  error ("unknown feature modifier %qs", str);
6703	  return;
6704	}
6705
6706      str = ext;
6707    };
6708
6709  return;
6710}
6711
6712/* Parse the ARCH string.  */
6713
6714static void
6715aarch64_parse_arch (void)
6716{
6717  char *ext;
6718  const struct processor *arch;
6719  char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6720  size_t len;
6721
6722  strcpy (str, aarch64_arch_string);
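  /* For example, given -march=armv8-a+crc, everything before the first '+'
     names the architecture and the remainder is handed to
     aarch64_parse_extension.  */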
6723
6724  ext = strchr (str, '+');
6725
6726  if (ext != NULL)
6727    len = ext - str;
6728  else
6729    len = strlen (str);
6730
6731  if (len == 0)
6732    {
6733      error ("missing arch name in -march=%qs", str);
6734      return;
6735    }
6736
6737  /* Loop through the list of supported ARCHs to find a match.  */
6738  for (arch = all_architectures; arch->name != NULL; arch++)
6739    {
6740      if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6741	{
6742	  selected_arch = arch;
6743	  aarch64_isa_flags = selected_arch->flags;
6744
6745	  if (!selected_cpu)
6746	    selected_cpu = &all_cores[selected_arch->core];
6747
6748	  if (ext != NULL)
6749	    {
6750	      /* ARCH string contains at least one extension.  */
6751	      aarch64_parse_extension (ext);
6752	    }
6753
6754	  if (strcmp (selected_arch->arch, selected_cpu->arch))
6755	    {
6756	      warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6757		       selected_cpu->name, selected_arch->name);
6758	    }
6759
6760	  return;
6761	}
6762    }
6763
6764  /* ARCH name not found in list.  */
6765  error ("unknown value %qs for -march", str);
6766  return;
6767}
6768
6769/* Parse the CPU string.  */
6770
6771static void
6772aarch64_parse_cpu (void)
6773{
6774  char *ext;
6775  const struct processor *cpu;
6776  char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6777  size_t len;
6778
6779  strcpy (str, aarch64_cpu_string);
6780
6781  ext = strchr (str, '+');
6782
6783  if (ext != NULL)
6784    len = ext - str;
6785  else
6786    len = strlen (str);
6787
6788  if (len == 0)
6789    {
6790      error ("missing cpu name in -mcpu=%qs", str);
6791      return;
6792    }
6793
6794  /* Loop through the list of supported CPUs to find a match.  */
6795  for (cpu = all_cores; cpu->name != NULL; cpu++)
6796    {
6797      if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6798	{
6799	  selected_cpu = cpu;
6800	  aarch64_isa_flags = selected_cpu->flags;
6801
6802	  if (ext != NULL)
6803	    {
6804	      /* CPU string contains at least one extension.  */
6805	      aarch64_parse_extension (ext);
6806	    }
6807
6808	  return;
6809	}
6810    }
6811
6812  /* CPU name not found in list.  */
6813  error ("unknown value %qs for -mcpu", str);
6814  return;
6815}
6816
6817/* Parse the TUNE string.  */
6818
6819static void
6820aarch64_parse_tune (void)
6821{
6822  const struct processor *cpu;
6823  char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6824  strcpy (str, aarch64_tune_string);
6825
6826  /* Loop through the list of supported CPUs to find a match.  */
6827  for (cpu = all_cores; cpu->name != NULL; cpu++)
6828    {
6829      if (strcmp (cpu->name, str) == 0)
6830	{
6831	  selected_tune = cpu;
6832	  return;
6833	}
6834    }
6835
6836  /* CPU name not found in list.  */
6837  error ("unknown value %qs for -mtune", str);
6838  return;
6839}
6840
6841
6842/* Implement TARGET_OPTION_OVERRIDE.  */
6843
6844static void
6845aarch64_override_options (void)
6846{
6847  /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6848     If either of -march or -mtune is given, they override their
6849     respective component of -mcpu.
6850
6851     So, first parse AARCH64_CPU_STRING, then the others; be careful
6852     with -march as, if -mcpu is not present on the command line, -march
6853     must set a sensible default CPU.  */
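  /* For example, "-mcpu=cortex-a57 -mtune=cortex-a53" takes the architecture
     and ISA flags from cortex-a57 but the tuning parameters from
     cortex-a53.  */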
6854  if (aarch64_cpu_string)
6855    {
6856      aarch64_parse_cpu ();
6857    }
6858
6859  if (aarch64_arch_string)
6860    {
6861      aarch64_parse_arch ();
6862    }
6863
6864  if (aarch64_tune_string)
6865    {
6866      aarch64_parse_tune ();
6867    }
6868
6869#ifndef HAVE_AS_MABI_OPTION
6870  /* The compiler may have been configured with 2.23.* binutils, which does
6871     not have support for ILP32.  */
6872  if (TARGET_ILP32)
6873    error ("Assembler does not support -mabi=ilp32");
6874#endif
6875
6876  initialize_aarch64_code_model ();
6877
6878  aarch64_build_bitmask_table ();
6879
6880  /* This target defaults to strict volatile bitfields.  */
6881  if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6882    flag_strict_volatile_bitfields = 1;
6883
6884  /* If the user did not specify a processor, choose the default
6885     one for them.  This will be the CPU set during configuration using
6886     --with-cpu, otherwise it is "generic".  */
6887  if (!selected_cpu)
6888    {
6889      selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6890      aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6891    }
6892
6893  gcc_assert (selected_cpu);
6894
6895  if (!selected_tune)
6896    selected_tune = selected_cpu;
6897
6898  aarch64_tune_flags = selected_tune->flags;
6899  aarch64_tune = selected_tune->core;
6900  aarch64_tune_params = selected_tune->tune;
6901  aarch64_architecture_version = selected_cpu->architecture_version;
6902
6903  if (aarch64_fix_a53_err835769 == 2)
6904    {
6905#ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
6906      aarch64_fix_a53_err835769 = 1;
6907#else
6908      aarch64_fix_a53_err835769 = 0;
6909#endif
6910    }
6911
6912  aarch64_override_options_after_change ();
6913}
6914
6915/* Implement targetm.override_options_after_change.  */
6916
6917static void
6918aarch64_override_options_after_change (void)
6919{
6920  /* The logic here is that if we are disabling all frame pointer generation
6921     then we do not need to disable leaf frame pointer generation as a
6922     separate operation.  But if we are *only* disabling leaf frame pointer
6923     generation then we set flag_omit_frame_pointer to true, but in
6924     aarch64_frame_pointer_required we return false only for leaf functions.
6925
6926     PR 70044: We have to be careful about being called multiple times for the
6927     same function.  Once we have decided to set flag_omit_frame_pointer just
6928     so that we can omit leaf frame pointers, we must then not interpret a
6929     second call as meaning that all frame pointer generation should be
6930     omitted.  We do this by setting flag_omit_frame_pointer to a special,
6931     non-zero value.  */
6932
6933  if (flag_omit_frame_pointer == 2)
6934    flag_omit_frame_pointer = 0;
6935
6936  if (flag_omit_frame_pointer)
6937    flag_omit_leaf_frame_pointer = false;
6938  else if (flag_omit_leaf_frame_pointer)
6939    flag_omit_frame_pointer = 2;
6940
6941  /* If not optimizing for size, set the default
6942     alignment to what the target wants.  */
6943  if (!optimize_size)
6944    {
6945      if (align_loops <= 0)
6946	align_loops = aarch64_tune_params->loop_align;
6947      if (align_jumps <= 0)
6948	align_jumps = aarch64_tune_params->jump_align;
6949      if (align_functions <= 0)
6950	align_functions = aarch64_tune_params->function_align;
6951    }
6952}
6953
6954static struct machine_function *
6955aarch64_init_machine_status (void)
6956{
6957  struct machine_function *machine;
6958  machine = ggc_cleared_alloc<machine_function> ();
6959  return machine;
6960}
6961
6962void
6963aarch64_init_expanders (void)
6964{
6965  init_machine_status = aarch64_init_machine_status;
6966}
6967
6968/* A checking mechanism for the implementation of the various code models.  */
6969static void
6970initialize_aarch64_code_model (void)
6971{
6972   if (flag_pic)
6973     {
6974       switch (aarch64_cmodel_var)
6975	 {
6976	 case AARCH64_CMODEL_TINY:
6977	   aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6978	   break;
6979	 case AARCH64_CMODEL_SMALL:
6980	   aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6981	   break;
6982	 case AARCH64_CMODEL_LARGE:
6983	   sorry ("code model %qs with -f%s", "large",
6984		  flag_pic > 1 ? "PIC" : "pic");
6985	 default:
6986	   gcc_unreachable ();
6987	 }
6988     }
6989   else
6990     aarch64_cmodel = aarch64_cmodel_var;
6991}
6992
6993/* Return true if SYMBOL_REF X binds locally.  */
6994
6995static bool
6996aarch64_symbol_binds_local_p (const_rtx x)
6997{
6998  return (SYMBOL_REF_DECL (x)
6999	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7000	  : SYMBOL_REF_LOCAL_P (x));
7001}
7002
7003/* Return true if SYMBOL_REF X is thread local.  */
7004static bool
7005aarch64_tls_symbol_p (rtx x)
7006{
7007  if (! TARGET_HAVE_TLS)
7008    return false;
7009
7010  if (GET_CODE (x) != SYMBOL_REF)
7011    return false;
7012
7013  return SYMBOL_REF_TLS_MODEL (x) != 0;
7014}
7015
7016/* Classify a TLS symbol into one of the TLS kinds.  */
7017enum aarch64_symbol_type
7018aarch64_classify_tls_symbol (rtx x)
7019{
7020  enum tls_model tls_kind = tls_symbolic_operand_type (x);
7021
7022  switch (tls_kind)
7023    {
7024    case TLS_MODEL_GLOBAL_DYNAMIC:
7025    case TLS_MODEL_LOCAL_DYNAMIC:
7026      return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7027
7028    case TLS_MODEL_INITIAL_EXEC:
7029      return SYMBOL_SMALL_GOTTPREL;
7030
7031    case TLS_MODEL_LOCAL_EXEC:
7032      return SYMBOL_SMALL_TPREL;
7033
7034    case TLS_MODEL_EMULATED:
7035    case TLS_MODEL_NONE:
7036      return SYMBOL_FORCE_TO_MEM;
7037
7038    default:
7039      gcc_unreachable ();
7040    }
7041}
7042
7043/* Return the method that should be used to access SYMBOL_REF or
7044   LABEL_REF X in context CONTEXT.  */
7045
7046enum aarch64_symbol_type
7047aarch64_classify_symbol (rtx x, rtx offset,
7048			 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7049{
7050  if (GET_CODE (x) == LABEL_REF)
7051    {
7052      switch (aarch64_cmodel)
7053	{
7054	case AARCH64_CMODEL_LARGE:
7055	  return SYMBOL_FORCE_TO_MEM;
7056
7057	case AARCH64_CMODEL_TINY_PIC:
7058	case AARCH64_CMODEL_TINY:
7059	  return SYMBOL_TINY_ABSOLUTE;
7060
7061	case AARCH64_CMODEL_SMALL_PIC:
7062	case AARCH64_CMODEL_SMALL:
7063	  return SYMBOL_SMALL_ABSOLUTE;
7064
7065	default:
7066	  gcc_unreachable ();
7067	}
7068    }
7069
7070  if (GET_CODE (x) == SYMBOL_REF)
7071    {
7072      if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7073	  return SYMBOL_FORCE_TO_MEM;
7074
7075      if (aarch64_tls_symbol_p (x))
7076	return aarch64_classify_tls_symbol (x);
7077
7078      switch (aarch64_cmodel)
7079	{
7080	case AARCH64_CMODEL_TINY:
7081	  /* When we retrieve a symbol + offset address, we have to make sure
7082	     the offset does not cause overflow of the final address.  But
7083	     we have no way of knowing the address of the symbol at compile
7084	     time, so we can't accurately say if the distance between the PC
7085	     and symbol + offset is outside the addressable range of +/-1M in
7086	     the TINY code model.  So we rely on images not being larger than
7087	     1M, cap the offset at 1M, and anything beyond that has to be
7088	     loaded using an alternative mechanism.  */
7089	  if (SYMBOL_REF_WEAK (x)
7090	      || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7091	    return SYMBOL_FORCE_TO_MEM;
7092	  return SYMBOL_TINY_ABSOLUTE;
7093
7094	case AARCH64_CMODEL_SMALL:
7095	  /* Same reasoning as the tiny code model, but the offset cap here is
7096	     4G.  */
7097	  if (SYMBOL_REF_WEAK (x)
7098	      || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7099			    HOST_WIDE_INT_C (4294967264)))
7100	    return SYMBOL_FORCE_TO_MEM;
7101	  return SYMBOL_SMALL_ABSOLUTE;
7102
7103	case AARCH64_CMODEL_TINY_PIC:
7104	  if (!aarch64_symbol_binds_local_p (x))
7105	    return SYMBOL_TINY_GOT;
7106	  return SYMBOL_TINY_ABSOLUTE;
7107
7108	case AARCH64_CMODEL_SMALL_PIC:
7109	  if (!aarch64_symbol_binds_local_p (x))
7110	    return SYMBOL_SMALL_GOT;
7111	  return SYMBOL_SMALL_ABSOLUTE;
7112
7113	default:
7114	  gcc_unreachable ();
7115	}
7116    }
7117
7118  /* By default push everything into the constant pool.  */
7119  return SYMBOL_FORCE_TO_MEM;
7120}
7121
7122bool
7123aarch64_constant_address_p (rtx x)
7124{
7125  return (CONSTANT_P (x) && memory_address_p (DImode, x));
7126}
7127
7128bool
7129aarch64_legitimate_pic_operand_p (rtx x)
7130{
7131  if (GET_CODE (x) == SYMBOL_REF
7132      || (GET_CODE (x) == CONST
7133	  && GET_CODE (XEXP (x, 0)) == PLUS
7134	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7135     return false;
7136
7137  return true;
7138}
7139
7140/* Return true if X holds either a quarter-precision or
7141   floating-point +0.0 constant.  */
7142static bool
7143aarch64_valid_floating_const (machine_mode mode, rtx x)
7144{
7145  if (!CONST_DOUBLE_P (x))
7146    return false;
7147
7148  /* TODO: We could handle moving 0.0 to a TFmode register,
7149     but first we would like to refactor movtf_aarch64
7150     to be more amenable to splitting moves properly and
7151     to gate correctly on TARGET_SIMD.  For now, reject all
7152     constants that are not for SFmode or DFmode registers.  */
7153  if (!(mode == SFmode || mode == DFmode))
7154    return false;
7155
7156  if (aarch64_float_const_zero_rtx_p (x))
7157    return true;
7158  return aarch64_float_const_representable_p (x);
7159}
7160
7161static bool
7162aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7163{
7164  /* Do not allow vector struct mode constants.  We could support
7165     0 and -1 easily, but they need support in aarch64-simd.md.  */
7166  if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7167    return false;
7168
7169  /* This could probably go away because
7170     we now decompose CONST_INTs according to expand_mov_immediate.  */
7171  if ((GET_CODE (x) == CONST_VECTOR
7172       && aarch64_simd_valid_immediate (x, mode, false, NULL))
7173      || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7174	return !targetm.cannot_force_const_mem (mode, x);
7175
7176  if (GET_CODE (x) == HIGH
7177      && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7178    return true;
7179
7180  return aarch64_constant_address_p (x);
7181}
7182
7183rtx
7184aarch64_load_tp (rtx target)
7185{
7186  if (!target
7187      || GET_MODE (target) != Pmode
7188      || !register_operand (target, Pmode))
7189    target = gen_reg_rtx (Pmode);
7190
7191  /* Can return in any reg.  */
7192  emit_insn (gen_aarch64_load_tp_hard (target));
7193  return target;
7194}
7195
7196/* On AAPCS systems, this is the "struct __va_list".  */
7197static GTY(()) tree va_list_type;
7198
7199/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7200   Return the type to use as __builtin_va_list.
7201
7202   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7203
7204   struct __va_list
7205   {
7206     void *__stack;
7207     void *__gr_top;
7208     void *__vr_top;
7209     int   __gr_offs;
7210     int   __vr_offs;
7211   };  */
7212
7213static tree
7214aarch64_build_builtin_va_list (void)
7215{
7216  tree va_list_name;
7217  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7218
7219  /* Create the type.  */
7220  va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7221  /* Give it the required name.  */
7222  va_list_name = build_decl (BUILTINS_LOCATION,
7223			     TYPE_DECL,
7224			     get_identifier ("__va_list"),
7225			     va_list_type);
7226  DECL_ARTIFICIAL (va_list_name) = 1;
7227  TYPE_NAME (va_list_type) = va_list_name;
7228  TYPE_STUB_DECL (va_list_type) = va_list_name;
7229
7230  /* Create the fields.  */
7231  f_stack = build_decl (BUILTINS_LOCATION,
7232			FIELD_DECL, get_identifier ("__stack"),
7233			ptr_type_node);
7234  f_grtop = build_decl (BUILTINS_LOCATION,
7235			FIELD_DECL, get_identifier ("__gr_top"),
7236			ptr_type_node);
7237  f_vrtop = build_decl (BUILTINS_LOCATION,
7238			FIELD_DECL, get_identifier ("__vr_top"),
7239			ptr_type_node);
7240  f_groff = build_decl (BUILTINS_LOCATION,
7241			FIELD_DECL, get_identifier ("__gr_offs"),
7242			integer_type_node);
7243  f_vroff = build_decl (BUILTINS_LOCATION,
7244			FIELD_DECL, get_identifier ("__vr_offs"),
7245			integer_type_node);
7246
7247  DECL_ARTIFICIAL (f_stack) = 1;
7248  DECL_ARTIFICIAL (f_grtop) = 1;
7249  DECL_ARTIFICIAL (f_vrtop) = 1;
7250  DECL_ARTIFICIAL (f_groff) = 1;
7251  DECL_ARTIFICIAL (f_vroff) = 1;
7252
7253  DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7254  DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7255  DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7256  DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7257  DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7258
7259  TYPE_FIELDS (va_list_type) = f_stack;
7260  DECL_CHAIN (f_stack) = f_grtop;
7261  DECL_CHAIN (f_grtop) = f_vrtop;
7262  DECL_CHAIN (f_vrtop) = f_groff;
7263  DECL_CHAIN (f_groff) = f_vroff;
7264
7265  /* Compute its layout.  */
7266  layout_type (va_list_type);
7267
7268  return va_list_type;
7269}
7270
7271/* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
7272static void
7273aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7274{
7275  const CUMULATIVE_ARGS *cum;
7276  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7277  tree stack, grtop, vrtop, groff, vroff;
7278  tree t;
7279  int gr_save_area_size;
7280  int vr_save_area_size;
7281  int vr_offset;
7282
7283  cum = &crtl->args.info;
7284  gr_save_area_size
7285    = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7286  vr_save_area_size
7287    = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
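  /* For example, if the named arguments consume x0-x2 and v0, then
     cum->aapcs_ncrn is 3 and cum->aapcs_nvrn is 1, giving a 40-byte GR save
     area and a 112-byte VR save area for the remaining argument
     registers.  */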
7288
7289  if (!TARGET_FLOAT)
7290    {
7291      if (cum->aapcs_nvrn > 0)
7292	sorry ("%qs and floating point or vector arguments",
7293	       "-mgeneral-regs-only");
7294      vr_save_area_size = 0;
7295    }
7296
7297  f_stack = TYPE_FIELDS (va_list_type_node);
7298  f_grtop = DECL_CHAIN (f_stack);
7299  f_vrtop = DECL_CHAIN (f_grtop);
7300  f_groff = DECL_CHAIN (f_vrtop);
7301  f_vroff = DECL_CHAIN (f_groff);
7302
7303  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7304		  NULL_TREE);
7305  grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7306		  NULL_TREE);
7307  vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7308		  NULL_TREE);
7309  groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7310		  NULL_TREE);
7311  vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7312		  NULL_TREE);
7313
7314  /* Emit code to initialize STACK, which points to the next varargs stack
7315     argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
7316     by named arguments.  STACK is 8-byte aligned.  */
7317  t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7318  if (cum->aapcs_stack_size > 0)
7319    t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7320  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7321  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7322
7323  /* Emit code to initialize GRTOP, the top of the GR save area.
7324     virtual_incoming_args_rtx should have been 16 byte aligned.  */
7325  t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7326  t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7327  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7328
7329  /* Emit code to initialize VRTOP, the top of the VR save area.
7330     This address is gr_save_area_bytes below GRTOP, rounded
7331     down to the next 16-byte boundary.  */
7332  t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7333  vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7334			     STACK_BOUNDARY / BITS_PER_UNIT);
7335
7336  if (vr_offset)
7337    t = fold_build_pointer_plus_hwi (t, -vr_offset);
7338  t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7339  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7340
7341  /* Emit code to initialize GROFF, the offset from GRTOP of the
7342     next GPR argument.  */
7343  t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7344	      build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7345  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7346
7347  /* Likewise emit code to initialize VROFF, the offset from FTOP
7348     of the next VR argument.  */
7349  t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7350	      build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7351  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7352}
7353
7354/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
7355
7356static tree
7357aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7358			      gimple_seq *post_p ATTRIBUTE_UNUSED)
7359{
7360  tree addr;
7361  bool indirect_p;
7362  bool is_ha;		/* is HFA or HVA.  */
7363  bool dw_align;	/* double-word align.  */
7364  machine_mode ag_mode = VOIDmode;
7365  int nregs;
7366  machine_mode mode;
7367
7368  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7369  tree stack, f_top, f_off, off, arg, roundup, on_stack;
7370  HOST_WIDE_INT size, rsize, adjust, align;
7371  tree t, u, cond1, cond2;
7372
7373  indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7374  if (indirect_p)
7375    type = build_pointer_type (type);
7376
7377  mode = TYPE_MODE (type);
7378
7379  f_stack = TYPE_FIELDS (va_list_type_node);
7380  f_grtop = DECL_CHAIN (f_stack);
7381  f_vrtop = DECL_CHAIN (f_grtop);
7382  f_groff = DECL_CHAIN (f_vrtop);
7383  f_vroff = DECL_CHAIN (f_groff);
7384
7385  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7386		  f_stack, NULL_TREE);
7387  size = int_size_in_bytes (type);
7388  align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7389
7390  dw_align = false;
7391  adjust = 0;
7392  if (aarch64_vfp_is_call_or_return_candidate (mode,
7393					       type,
7394					       &ag_mode,
7395					       &nregs,
7396					       &is_ha))
7397    {
7398      /* TYPE passed in fp/simd registers.  */
7399      if (!TARGET_FLOAT)
7400	sorry ("%qs and floating point or vector arguments",
7401	       "-mgeneral-regs-only");
7402
7403      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7404		      unshare_expr (valist), f_vrtop, NULL_TREE);
7405      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7406		      unshare_expr (valist), f_vroff, NULL_TREE);
7407
7408      rsize = nregs * UNITS_PER_VREG;
7409
7410      if (is_ha)
7411	{
7412	  if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7413	    adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7414	}
7415      else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7416	       && size < UNITS_PER_VREG)
7417	{
7418	  adjust = UNITS_PER_VREG - size;
7419	}
7420    }
7421  else
7422    {
7423      /* TYPE passed in general registers.  */
7424      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7425		      unshare_expr (valist), f_grtop, NULL_TREE);
7426      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7427		      unshare_expr (valist), f_groff, NULL_TREE);
7428      rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7429      nregs = rsize / UNITS_PER_WORD;
7430
7431      if (align > 8)
7432	dw_align = true;
7433
7434      if (BLOCK_REG_PADDING (mode, type, 1) == downward
7435	  && size < UNITS_PER_WORD)
7436	{
	  adjust = UNITS_PER_WORD - size;
7438	}
7439    }
7440
7441  /* Get a local temporary for the field value.  */
7442  off = get_initialized_tmp_var (f_off, pre_p, NULL);
7443
7444  /* Emit code to branch if off >= 0.  */
7445  t = build2 (GE_EXPR, boolean_type_node, off,
7446	      build_int_cst (TREE_TYPE (off), 0));
7447  cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7448
7449  if (dw_align)
7450    {
7451      /* Emit: offs = (offs + 15) & -16.  */
7452      t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7453		  build_int_cst (TREE_TYPE (off), 15));
7454      t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7455		  build_int_cst (TREE_TYPE (off), -16));
7456      roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7457    }
7458  else
7459    roundup = NULL;
7460
  /* Update ap.__[g|v]r_offs.  */
7462  t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7463	      build_int_cst (TREE_TYPE (off), rsize));
7464  t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7465
7466  /* String up.  */
7467  if (roundup)
7468    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7469
7470  /* [cond2] if (ap.__[g|v]r_offs > 0)  */
7471  u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7472	      build_int_cst (TREE_TYPE (f_off), 0));
7473  cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7474
7475  /* String up: make sure the assignment happens before the use.  */
7476  t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7477  COND_EXPR_ELSE (cond1) = t;
7478
7479  /* Prepare the trees handling the argument that is passed on the stack;
     the top-level node will be stored in ON_STACK.  */
7481  arg = get_initialized_tmp_var (stack, pre_p, NULL);
7482  if (align > 8)
7483    {
7484      /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
7485      t = fold_convert (intDI_type_node, arg);
7486      t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7487		  build_int_cst (TREE_TYPE (t), 15));
7488      t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7489		  build_int_cst (TREE_TYPE (t), -16));
7490      t = fold_convert (TREE_TYPE (arg), t);
7491      roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7492    }
7493  else
7494    roundup = NULL;
  /* Advance ap.__stack.  */
7496  t = fold_convert (intDI_type_node, arg);
7497  t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7498	      build_int_cst (TREE_TYPE (t), size + 7));
7499  t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7500	      build_int_cst (TREE_TYPE (t), -8));
7501  t = fold_convert (TREE_TYPE (arg), t);
7502  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7503  /* String up roundup and advance.  */
7504  if (roundup)
7505    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
  /* String up with arg.  */
7507  on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7508  /* Big-endianness related address adjustment.  */
7509  if (BLOCK_REG_PADDING (mode, type, 1) == downward
7510      && size < UNITS_PER_WORD)
7511  {
7512    t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7513		size_int (UNITS_PER_WORD - size));
7514    on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7515  }
7516
7517  COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7518  COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7519
7520  /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
7521  t = off;
7522  if (adjust)
7523    t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7524		build_int_cst (TREE_TYPE (off), adjust));
7525
7526  t = fold_convert (sizetype, t);
7527  t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7528
7529  if (is_ha)
7530    {
7531      /* type ha; // treat as "struct {ftype field[n];}"
7532         ... [computing offs]
7533         for (i = 0; i <nregs; ++i, offs += 16)
7534	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7535	 return ha;  */
7536      int i;
7537      tree tmp_ha, field_t, field_ptr_t;
7538
7539      /* Declare a local variable.  */
7540      tmp_ha = create_tmp_var_raw (type, "ha");
7541      gimple_add_tmp_var (tmp_ha);
7542
7543      /* Establish the base type.  */
7544      switch (ag_mode)
7545	{
7546	case SFmode:
7547	  field_t = float_type_node;
7548	  field_ptr_t = float_ptr_type_node;
7549	  break;
7550	case DFmode:
7551	  field_t = double_type_node;
7552	  field_ptr_t = double_ptr_type_node;
7553	  break;
7554	case TFmode:
7555	  field_t = long_double_type_node;
7556	  field_ptr_t = long_double_ptr_type_node;
7557	  break;
7558/* The half precision and quad precision are not fully supported yet.  Enable
7559   the following code after the support is complete.  Need to find the correct
7560   type node for __fp16 *.  */
7561#if 0
7562	case HFmode:
7563	  field_t = float_type_node;
7564	  field_ptr_t = float_ptr_type_node;
7565	  break;
7566#endif
7567	case V2SImode:
7568	case V4SImode:
7569	    {
7570	      tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7571	      field_t = build_vector_type_for_mode (innertype, ag_mode);
7572	      field_ptr_t = build_pointer_type (field_t);
7573	    }
7574	  break;
7575	default:
7576	  gcc_assert (0);
7577	}
7578
      /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area).  */
7580      tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7581      addr = t;
7582      t = fold_convert (field_ptr_t, addr);
7583      t = build2 (MODIFY_EXPR, field_t,
7584		  build1 (INDIRECT_REF, field_t, tmp_ha),
7585		  build1 (INDIRECT_REF, field_t, t));
7586
      /* ha.field[i] = *((field_ptr_t)vr_saved_area + i).  */
7588      for (i = 1; i < nregs; ++i)
7589	{
7590	  addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7591	  u = fold_convert (field_ptr_t, addr);
7592	  u = build2 (MODIFY_EXPR, field_t,
7593		      build2 (MEM_REF, field_t, tmp_ha,
7594			      build_int_cst (field_ptr_t,
7595					     (i *
7596					      int_size_in_bytes (field_t)))),
7597		      build1 (INDIRECT_REF, field_t, u));
7598	  t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7599	}
7600
7601      u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7602      t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7603    }
7604
7605  COND_EXPR_ELSE (cond2) = t;
7606  addr = fold_convert (build_pointer_type (type), cond1);
7607  addr = build_va_arg_indirect_ref (addr);
7608
7609  if (indirect_p)
7610    addr = build_va_arg_indirect_ref (addr);
7611
7612  return addr;
7613}
7614
7615/* Implement TARGET_SETUP_INCOMING_VARARGS.  */
7616
7617static void
7618aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7619				tree type, int *pretend_size ATTRIBUTE_UNUSED,
7620				int no_rtl)
7621{
7622  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7623  CUMULATIVE_ARGS local_cum;
7624  int gr_saved, vr_saved;
7625
7626  /* The caller has advanced CUM up to, but not beyond, the last named
7627     argument.  Advance a local copy of CUM past the last "real" named
7628     argument, to find out how many registers are left over.  */
7629  local_cum = *cum;
  aarch64_function_arg_advance (pack_cumulative_args (&local_cum),
				mode, type, true);
7631
  /* Find out how many registers we need to save.  */
7633  gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7634  vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7635
7636  if (!TARGET_FLOAT)
7637    {
7638      if (local_cum.aapcs_nvrn > 0)
7639	sorry ("%qs and floating point or vector arguments",
7640	       "-mgeneral-regs-only");
7641      vr_saved = 0;
7642    }
7643
7644  if (!no_rtl)
7645    {
7646      if (gr_saved > 0)
7647	{
7648	  rtx ptr, mem;
7649
7650	  /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
7651	  ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7652			       - gr_saved * UNITS_PER_WORD);
7653	  mem = gen_frame_mem (BLKmode, ptr);
7654	  set_mem_alias_set (mem, get_varargs_alias_set ());
7655
7656	  move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7657			       mem, gr_saved);
7658	}
7659      if (vr_saved > 0)
7660	{
7661	  /* We can't use move_block_from_reg, because it will use
7662	     the wrong mode, storing D regs only.  */
7663	  machine_mode mode = TImode;
7664	  int off, i;
7665
7666	  /* Set OFF to the offset from virtual_incoming_args_rtx of
7667	     the first vector register.  The VR save area lies below
7668	     the GR one, and is aligned to 16 bytes.  */
7669	  off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7670				   STACK_BOUNDARY / BITS_PER_UNIT);
7671	  off -= vr_saved * UNITS_PER_VREG;
7672
7673	  for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7674	    {
7675	      rtx ptr, mem;
7676
7677	      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7678	      mem = gen_frame_mem (mode, ptr);
7679	      set_mem_alias_set (mem, get_varargs_alias_set ());
7680	      aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7681	      off += UNITS_PER_VREG;
7682	    }
7683	}
7684    }
7685
7686  /* We don't save the size into *PRETEND_SIZE because we want to avoid
7687     any complication of having crtl->args.pretend_args_size changed.  */
7688  cfun->machine->frame.saved_varargs_size
7689    = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7690		      STACK_BOUNDARY / BITS_PER_UNIT)
7691       + vr_saved * UNITS_PER_VREG);
7692}
7693
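/* Adjust register usage: when FP/SIMD is not available (!TARGET_FLOAT),
   mark V0-V31 as fixed and call-used so that the register allocator never
   touches them.  */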
7694static void
7695aarch64_conditional_register_usage (void)
7696{
7697  int i;
7698  if (!TARGET_FLOAT)
7699    {
7700      for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7701	{
7702	  fixed_regs[i] = 1;
7703	  call_used_regs[i] = 1;
7704	}
7705    }
7706}
7707
7708/* Walk down the type tree of TYPE counting consecutive base elements.
7709   If *MODEP is VOIDmode, then set it to the first valid floating point
7710   type.  If a non-floating point type is found, or if a floating point
7711   type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7712   otherwise return the count in the sub-tree.  */
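/* For illustration (the C types below are examples only), with *MODEP
   initially VOIDmode:
     struct { float x[3]; }         returns 3, *MODEP == SFmode
     _Complex double                returns 2, *MODEP == DFmode
     struct { float f; double d; }  returns -1 (mixed base types).  */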
7713static int
7714aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
7715{
7716  machine_mode mode;
7717  HOST_WIDE_INT size;
7718
7719  switch (TREE_CODE (type))
7720    {
7721    case REAL_TYPE:
7722      mode = TYPE_MODE (type);
7723      if (mode != DFmode && mode != SFmode && mode != TFmode)
7724	return -1;
7725
7726      if (*modep == VOIDmode)
7727	*modep = mode;
7728
7729      if (*modep == mode)
7730	return 1;
7731
7732      break;
7733
7734    case COMPLEX_TYPE:
7735      mode = TYPE_MODE (TREE_TYPE (type));
7736      if (mode != DFmode && mode != SFmode && mode != TFmode)
7737	return -1;
7738
7739      if (*modep == VOIDmode)
7740	*modep = mode;
7741
7742      if (*modep == mode)
7743	return 2;
7744
7745      break;
7746
7747    case VECTOR_TYPE:
7748      /* Use V2SImode and V4SImode as representatives of all 64-bit
7749	 and 128-bit vector types.  */
7750      size = int_size_in_bytes (type);
7751      switch (size)
7752	{
7753	case 8:
7754	  mode = V2SImode;
7755	  break;
7756	case 16:
7757	  mode = V4SImode;
7758	  break;
7759	default:
7760	  return -1;
7761	}
7762
7763      if (*modep == VOIDmode)
7764	*modep = mode;
7765
7766      /* Vector modes are considered to be opaque: two vectors are
7767	 equivalent for the purposes of being homogeneous aggregates
7768	 if they are the same size.  */
7769      if (*modep == mode)
7770	return 1;
7771
7772      break;
7773
7774    case ARRAY_TYPE:
7775      {
7776	int count;
7777	tree index = TYPE_DOMAIN (type);
7778
7779	/* Can't handle incomplete types nor sizes that are not
7780	   fixed.  */
7781	if (!COMPLETE_TYPE_P (type)
7782	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7783	  return -1;
7784
7785	count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7786	if (count == -1
7787	    || !index
7788	    || !TYPE_MAX_VALUE (index)
7789	    || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7790	    || !TYPE_MIN_VALUE (index)
7791	    || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7792	    || count < 0)
7793	  return -1;
7794
7795	count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7796		      - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7797
7798	/* There must be no padding.  */
7799	if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7800	  return -1;
7801
7802	return count;
7803      }
7804
7805    case RECORD_TYPE:
7806      {
7807	int count = 0;
7808	int sub_count;
7809	tree field;
7810
7811	/* Can't handle incomplete types nor sizes that are not
7812	   fixed.  */
7813	if (!COMPLETE_TYPE_P (type)
7814	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7815	  return -1;
7816
7817	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7818	  {
7819	    if (TREE_CODE (field) != FIELD_DECL)
7820	      continue;
7821
7822	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7823	    if (sub_count < 0)
7824	      return -1;
7825	    count += sub_count;
7826	  }
7827
7828	/* There must be no padding.  */
7829	if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7830	  return -1;
7831
7832	return count;
7833      }
7834
7835    case UNION_TYPE:
7836    case QUAL_UNION_TYPE:
7837      {
7838	/* These aren't very interesting except in a degenerate case.  */
7839	int count = 0;
7840	int sub_count;
7841	tree field;
7842
7843	/* Can't handle incomplete types nor sizes that are not
7844	   fixed.  */
7845	if (!COMPLETE_TYPE_P (type)
7846	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7847	  return -1;
7848
7849	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7850	  {
7851	    if (TREE_CODE (field) != FIELD_DECL)
7852	      continue;
7853
7854	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7855	    if (sub_count < 0)
7856	      return -1;
7857	    count = count > sub_count ? count : sub_count;
7858	  }
7859
7860	/* There must be no padding.  */
7861	if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7862	  return -1;
7863
7864	return count;
7865      }
7866
7867    default:
7868      break;
7869    }
7870
7871  return -1;
7872}
7873
7874/* Return TRUE if the type, as described by TYPE and MODE, is a composite
7875   type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
7876   array types.  The C99 floating-point complex types are also considered
7877   as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
7878   types, which are GCC extensions and out of the scope of AAPCS64, are
7879   treated as composite types here as well.
7880
7881   Note that MODE itself is not sufficient in determining whether a type
7882   is such a composite type or not.  This is because
7883   stor-layout.c:compute_record_mode may have already changed the MODE
7884   (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
7885   structure with only one field may have its MODE set to the mode of the
7886   field.  Also an integer mode whose size matches the size of the
7887   RECORD_TYPE type may be used to substitute the original mode
7888   (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
7889   solely relied on.  */
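/* For example, 'struct { float f; }' may be given TYPE_MODE SFmode by
   stor-layout, yet it is still a composite type for AAPCS64 purposes.  */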
7890
7891static bool
7892aarch64_composite_type_p (const_tree type,
7893			  machine_mode mode)
7894{
7895  if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7896    return true;
7897
7898  if (mode == BLKmode
7899      || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7900      || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7901    return true;
7902
7903  return false;
7904}
7905
7906/* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7907   type as described in AAPCS64 \S 4.1.2.
7908
7909   See the comment above aarch64_composite_type_p for the notes on MODE.  */
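/* For example, an 8-byte GNU vector such as
   'int __attribute__ ((vector_size (8)))' or a 16-byte one such as
   'float __attribute__ ((vector_size (16)))' is a short vector; any
   other size is not.  */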
7910
7911static bool
7912aarch64_short_vector_p (const_tree type,
7913			machine_mode mode)
7914{
7915  HOST_WIDE_INT size = -1;
7916
7917  if (type && TREE_CODE (type) == VECTOR_TYPE)
7918    size = int_size_in_bytes (type);
7919  else if (!aarch64_composite_type_p (type, mode)
7920	   && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7921	       || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7922    size = GET_MODE_SIZE (mode);
7923
7924  return (size == 8 || size == 16) ? true : false;
7925}
7926
7927/* Return TRUE if an argument, whose type is described by TYPE and MODE,
7928   shall be passed or returned in simd/fp register(s) (providing these
7929   parameter passing registers are available).
7930
7931   Upon successful return, *COUNT returns the number of needed registers,
   *BASE_MODE returns the mode of the individual register and when IS_HA
7933   is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7934   floating-point aggregate or a homogeneous short-vector aggregate.  */
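/* For example (the C types are purely illustrative):
     struct { double x, y, z; }  -> *COUNT = 3, *BASE_MODE = DFmode, an HA
     _Complex float              -> *COUNT = 2, *BASE_MODE = SFmode, an HA
     double                      -> *COUNT = 1, *BASE_MODE = DFmode.  */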
7935
7936static bool
7937aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
7938					 const_tree type,
7939					 machine_mode *base_mode,
7940					 int *count,
7941					 bool *is_ha)
7942{
7943  machine_mode new_mode = VOIDmode;
7944  bool composite_p = aarch64_composite_type_p (type, mode);
7945
7946  if (is_ha != NULL) *is_ha = false;
7947
7948  if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7949      || aarch64_short_vector_p (type, mode))
7950    {
7951      *count = 1;
7952      new_mode = mode;
7953    }
7954  else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7955    {
7956      if (is_ha != NULL) *is_ha = true;
7957      *count = 2;
7958      new_mode = GET_MODE_INNER (mode);
7959    }
7960  else if (type && composite_p)
7961    {
7962      int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7963
7964      if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7965	{
7966	  if (is_ha != NULL) *is_ha = true;
7967	  *count = ag_count;
7968	}
7969      else
7970	return false;
7971    }
7972  else
7973    return false;
7974
7975  *base_mode = new_mode;
7976  return true;
7977}
7978
7979/* Implement TARGET_STRUCT_VALUE_RTX.  */
7980
7981static rtx
7982aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
7983			  int incoming ATTRIBUTE_UNUSED)
7984{
7985  return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
7986}
7987
7988/* Implements target hook vector_mode_supported_p.  */
7989static bool
7990aarch64_vector_mode_supported_p (machine_mode mode)
7991{
7992  if (TARGET_SIMD
7993      && (mode == V4SImode  || mode == V8HImode
7994	  || mode == V16QImode || mode == V2DImode
7995	  || mode == V2SImode  || mode == V4HImode
7996	  || mode == V8QImode || mode == V2SFmode
7997	  || mode == V4SFmode || mode == V2DFmode
7998	  || mode == V1DFmode))
7999    return true;
8000
8001  return false;
8002}
8003
8004/* Return appropriate SIMD container
8005   for MODE within a vector of WIDTH bits.  */
8006static machine_mode
8007aarch64_simd_container_mode (machine_mode mode, unsigned width)
8008{
8009  gcc_assert (width == 64 || width == 128);
8010  if (TARGET_SIMD)
8011    {
8012      if (width == 128)
8013	switch (mode)
8014	  {
8015	  case DFmode:
8016	    return V2DFmode;
8017	  case SFmode:
8018	    return V4SFmode;
8019	  case SImode:
8020	    return V4SImode;
8021	  case HImode:
8022	    return V8HImode;
8023	  case QImode:
8024	    return V16QImode;
8025	  case DImode:
8026	    return V2DImode;
8027	  default:
8028	    break;
8029	  }
8030      else
8031	switch (mode)
8032	  {
8033	  case SFmode:
8034	    return V2SFmode;
8035	  case SImode:
8036	    return V2SImode;
8037	  case HImode:
8038	    return V4HImode;
8039	  case QImode:
8040	    return V8QImode;
8041	  default:
8042	    break;
8043	  }
8044    }
8045  return word_mode;
8046}
8047
8048/* Return 128-bit container as the preferred SIMD mode for MODE.  */
8049static machine_mode
8050aarch64_preferred_simd_mode (machine_mode mode)
8051{
8052  return aarch64_simd_container_mode (mode, 128);
8053}
8054
8055/* Return the bitmask of possible vector sizes for the vectorizer
8056   to iterate over.  */
8057static unsigned int
8058aarch64_autovectorize_vector_sizes (void)
8059{
8060  return (16 | 8);
8061}
8062
8063/* Implement TARGET_MANGLE_TYPE.  */
8064
8065static const char *
8066aarch64_mangle_type (const_tree type)
8067{
8068  /* The AArch64 ABI documents say that "__va_list" has to be
     mangled as if it were in the "std" namespace.  */
8070  if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8071    return "St9__va_list";
8072
8073  /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
8074     builtin types.  */
8075  if (TYPE_NAME (type) != NULL)
8076    return aarch64_mangle_builtin_type (type);
8077
8078  /* Use the default mangling.  */
8079  return NULL;
8080}
8081
8082
8083/* Return true if the rtx_insn contains a MEM RTX somewhere
8084   in it.  */
8085
8086static bool
8087has_memory_op (rtx_insn *mem_insn)
8088{
8089  subrtx_iterator::array_type array;
8090  FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8091    if (MEM_P (*iter))
8092      return true;
8093
8094  return false;
8095}
8096
/* Find the first rtx_insn before INSN that will generate an assembly
8098   instruction.  */
8099
8100static rtx_insn *
8101aarch64_prev_real_insn (rtx_insn *insn)
8102{
8103  if (!insn)
8104    return NULL;
8105
8106  do
8107    {
8108      insn = prev_real_insn (insn);
8109    }
8110  while (insn && recog_memoized (insn) < 0);
8111
8112  return insn;
8113}
8114
8115static bool
8116is_madd_op (enum attr_type t1)
8117{
8118  unsigned int i;
8119  /* A number of these may be AArch32 only.  */
8120  enum attr_type mlatypes[] = {
8121    TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8122    TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
    TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8124  };
8125
8126  for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8127    {
8128      if (t1 == mlatypes[i])
8129	return true;
8130    }
8131
8132  return false;
8133}
8134
8135/* Check if there is a register dependency between a load and the insn
8136   for which we hold recog_data.  */
8137
8138static bool
8139dep_between_memop_and_curr (rtx memop)
8140{
8141  rtx load_reg;
8142  int opno;
8143
8144  gcc_assert (GET_CODE (memop) == SET);
8145
8146  if (!REG_P (SET_DEST (memop)))
8147    return false;
8148
8149  load_reg = SET_DEST (memop);
8150  for (opno = 1; opno < recog_data.n_operands; opno++)
8151    {
8152      rtx operand = recog_data.operand[opno];
8153      if (REG_P (operand)
8154          && reg_overlap_mentioned_p (load_reg, operand))
        return true;
    }
8158  return false;
8159}
8160
8161
8162/* When working around the Cortex-A53 erratum 835769,
8163   given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8164   instruction and has a preceding memory instruction such that a NOP
8165   should be inserted between them.  */
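/* For example (register numbers purely illustrative), the sequence

       ldr   x2, [x0]
       madd  x1, x3, x4, x5

   has a NOP inserted between the two instructions when the 64-bit
   multiply-accumulate does not depend on the loaded register:

       ldr   x2, [x0]
       nop
       madd  x1, x3, x4, x5  */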
8166
8167bool
8168aarch64_madd_needs_nop (rtx_insn* insn)
8169{
8170  enum attr_type attr_type;
8171  rtx_insn *prev;
8172  rtx body;
8173
8174  if (!aarch64_fix_a53_err835769)
8175    return false;
8176
8177  if (!INSN_P (insn) || recog_memoized (insn) < 0)
8178    return false;
8179
8180  attr_type = get_attr_type (insn);
8181  if (!is_madd_op (attr_type))
8182    return false;
8183
8184  prev = aarch64_prev_real_insn (insn);
8185  /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8186     Restore recog state to INSN to avoid state corruption.  */
8187  extract_constrain_insn_cached (insn);
8188
8189  if (!prev || !has_memory_op (prev))
8190    return false;
8191
8192  body = single_set (prev);
8193
8194  /* If the previous insn is a memory op and there is no dependency between
8195     it and the DImode madd, emit a NOP between them.  If body is NULL then we
8196     have a complex memory operation, probably a load/store pair.
8197     Be conservative for now and emit a NOP.  */
8198  if (GET_MODE (recog_data.operand[0]) == DImode
8199      && (!body || !dep_between_memop_and_curr (body)))
8200    return true;
8201
  return false;
}
8205
8206
8207/* Implement FINAL_PRESCAN_INSN.  */
8208
8209void
8210aarch64_final_prescan_insn (rtx_insn *insn)
8211{
8212  if (aarch64_madd_needs_nop (insn))
8213    fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8214}
8215
8216
8217/* Return the equivalent letter for size.  */
8218static char
8219sizetochar (int size)
8220{
8221  switch (size)
8222    {
8223    case 64: return 'd';
8224    case 32: return 's';
8225    case 16: return 'h';
8226    case 8 : return 'b';
8227    default: gcc_unreachable ();
8228    }
8229}
8230
/* Return true iff X is a uniform vector of floating-point
   constants, and the constant can be represented in
   quarter-precision form.  Note, as aarch64_float_const_representable_p
   rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
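/* For example, a V2SF constant { 1.0, 1.0 } is accepted, since 1.0 is
   representable, whereas { 1.0, 2.0 } (not uniform) and { 0.0, 0.0 }
   (zero is rejected) are not.  */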
8235static bool
8236aarch64_vect_float_const_representable_p (rtx x)
8237{
8238  int i = 0;
8239  REAL_VALUE_TYPE r0, ri;
8240  rtx x0, xi;
8241
8242  if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8243    return false;
8244
8245  x0 = CONST_VECTOR_ELT (x, 0);
8246  if (!CONST_DOUBLE_P (x0))
8247    return false;
8248
8249  REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8250
8251  for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8252    {
8253      xi = CONST_VECTOR_ELT (x, i);
8254      if (!CONST_DOUBLE_P (xi))
8255	return false;
8256
8257      REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8258      if (!REAL_VALUES_EQUAL (r0, ri))
8259	return false;
8260    }
8261
8262  return aarch64_float_const_representable_p (x0);
8263}
8264
/* Return true if OP is a valid AdvSIMD immediate for vectors of mode MODE,
   false otherwise.  If INVERSE is true, check the bitwise inverse of OP
   instead.  When INFO is nonnull and OP is valid, fill in *INFO with a
   description of the immediate (value, element width, shift and whether
   MVN/MSL forms are needed).  */
8266bool
8267aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8268			      struct simd_immediate_info *info)
8269{
8270#define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG)	\
8271  matches = 1;						\
8272  for (i = 0; i < idx; i += (STRIDE))			\
8273    if (!(TEST))					\
8274      matches = 0;					\
8275  if (matches)						\
8276    {							\
8277      immtype = (CLASS);				\
8278      elsize = (ELSIZE);				\
8279      eshift = (SHIFT);					\
8280      emvn = (NEG);					\
8281      break;						\
8282    }
8283
8284  unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8285  unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8286  unsigned char bytes[16];
8287  int immtype = -1, matches;
8288  unsigned int invmask = inverse ? 0xff : 0;
8289  int eshift, emvn;
8290
8291  if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8292    {
8293      if (! (aarch64_simd_imm_zero_p (op, mode)
8294	     || aarch64_vect_float_const_representable_p (op)))
8295	return false;
8296
8297      if (info)
8298	{
8299	  info->value = CONST_VECTOR_ELT (op, 0);
8300	  info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8301	  info->mvn = false;
8302	  info->shift = 0;
8303	}
8304
8305      return true;
8306    }
8307
8308  /* Splat vector constant out into a byte vector.  */
8309  for (i = 0; i < n_elts; i++)
8310    {
8311      /* The vector is provided in gcc endian-neutral fashion.  For aarch64_be,
8312         it must be laid out in the vector register in reverse order.  */
8313      rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8314      unsigned HOST_WIDE_INT elpart;
8315      unsigned int part, parts;
8316
8317      if (CONST_INT_P (el))
8318        {
8319          elpart = INTVAL (el);
8320          parts = 1;
8321        }
8322      else if (GET_CODE (el) == CONST_DOUBLE)
8323        {
8324          elpart = CONST_DOUBLE_LOW (el);
8325          parts = 2;
8326        }
8327      else
8328        gcc_unreachable ();
8329
8330      for (part = 0; part < parts; part++)
8331        {
8332          unsigned int byte;
8333          for (byte = 0; byte < innersize; byte++)
8334            {
8335              bytes[idx++] = (elpart & 0xff) ^ invmask;
8336              elpart >>= BITS_PER_UNIT;
8337            }
8338          if (GET_CODE (el) == CONST_DOUBLE)
8339            elpart = CONST_DOUBLE_HIGH (el);
8340        }
8341    }
8342
8343  /* Sanity check.  */
8344  gcc_assert (idx == GET_MODE_SIZE (mode));
8345
8346  do
8347    {
8348      CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8349	     && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8350
8351      CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8352	     && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8353
8354      CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8355	     && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8356
8357      CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8358	     && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8359
8360      CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8361
8362      CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8363
8364      CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8365	     && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8366
8367      CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8368	     && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8369
8370      CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8371	     && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8372
8373      CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8374	     && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8375
8376      CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8377
8378      CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8379
8380      CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8381	     && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8382
8383      CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8384	     && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8385
8386      CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8387	     && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8388
8389      CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8390	     && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8391
8392      CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8393
8394      CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8395	     && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8396    }
8397  while (0);
8398
8399  if (immtype == -1)
8400    return false;
8401
8402  if (info)
8403    {
8404      info->element_width = elsize;
8405      info->mvn = emvn != 0;
8406      info->shift = eshift;
8407
8408      unsigned HOST_WIDE_INT imm = 0;
8409
8410      if (immtype >= 12 && immtype <= 15)
8411	info->msl = true;
8412
8413      /* Un-invert bytes of recognized vector, if necessary.  */
8414      if (invmask != 0)
8415        for (i = 0; i < idx; i++)
8416          bytes[i] ^= invmask;
8417
8418      if (immtype == 17)
8419        {
8420          /* FIXME: Broken on 32-bit H_W_I hosts.  */
8421          gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8422
8423          for (i = 0; i < 8; i++)
8424            imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8425	      << (i * BITS_PER_UNIT);
8426
8427
8428	  info->value = GEN_INT (imm);
8429	}
8430      else
8431	{
8432	  for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8433	    imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8434
8435	  /* Construct 'abcdefgh' because the assembler cannot handle
8436	     generic constants.	 */
8437	  if (info->mvn)
8438	    imm = ~imm;
8439	  imm = (imm >> info->shift) & 0xff;
8440	  info->value = GEN_INT (imm);
8441	}
8442    }
8443
8444  return true;
8445#undef CHECK
8446}
8447
/* Check whether the immediate shift constants in X are within range for
   MODE: [0, width - 1] if LEFT is true (left shifts), [1, width] for
   right shifts, where width is the element width in bits.  */
8449bool
8450aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8451{
8452  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8453  if (left)
8454    return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8455  else
8456    return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
8457}
8458
8459/* Return true if X is a uniform vector where all elements
8460   are either the floating-point constant 0.0 or the
8461   integer constant 0.  */
8462bool
8463aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8464{
8465  return x == CONST0_RTX (mode);
8466}
8467
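/* Return true if X, a CONST_INT, is usable as a 64-bit scalar SIMD
   immediate, i.e. every one of its eight bytes is either 0x00 or 0xff
   (the form accepted by the 64-bit MOVI immediate encoding).  */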
8468bool
8469aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8470{
8471  HOST_WIDE_INT imm = INTVAL (x);
8472  int i;
8473
8474  for (i = 0; i < 8; i++)
8475    {
8476      unsigned int byte = imm & 0xff;
8477      if (byte != 0xff && byte != 0)
8478       return false;
8479      imm >>= 8;
8480    }
8481
8482  return true;
8483}
8484
8485bool
8486aarch64_mov_operand_p (rtx x,
8487		       enum aarch64_symbol_context context,
8488		       machine_mode mode)
8489{
8490  if (GET_CODE (x) == HIGH
8491      && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8492    return true;
8493
8494  if (CONST_INT_P (x))
8495    return true;
8496
8497  if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8498    return true;
8499
8500  return aarch64_classify_symbolic_expression (x, context)
8501    == SYMBOL_TINY_ABSOLUTE;
8502}
8503
/* Return a CONST_VECTOR of mode MODE in which every element is the
   CONST_INT VAL.  */
8505rtx
8506aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8507{
8508  int nunits = GET_MODE_NUNITS (mode);
8509  rtvec v = rtvec_alloc (nunits);
8510  int i;
8511
  for (i = 0; i < nunits; i++)
8513    RTVEC_ELT (v, i) = GEN_INT (val);
8514
8515  return gen_rtx_CONST_VECTOR (mode, v);
8516}
8517
8518/* Check OP is a legal scalar immediate for the MOVI instruction.  */
8519
8520bool
8521aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8522{
8523  machine_mode vmode;
8524
8525  gcc_assert (!VECTOR_MODE_P (mode));
8526  vmode = aarch64_preferred_simd_mode (mode);
8527  rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8528  return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8529}
8530
8531/* Construct and return a PARALLEL RTX vector with elements numbering the
8532   lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8533   the vector - from the perspective of the architecture.  This does not
8534   line up with GCC's perspective on lane numbers, so we end up with
8535   different masks depending on our target endian-ness.  The diagram
8536   below may help.  We must draw the distinction when building masks
8537   which select one half of the vector.  An instruction selecting
architectural low-lanes for a big-endian target must be described using
8539   a mask selecting GCC high-lanes.
8540
8541                 Big-Endian             Little-Endian
8542
8543GCC             0   1   2   3           3   2   1   0
8544              | x | x | x | x |       | x | x | x | x |
8545Architecture    3   2   1   0           3   2   1   0
8546
8547Low Mask:         { 2, 3 }                { 0, 1 }
8548High Mask:        { 0, 1 }                { 2, 3 }
8549*/
8550
8551rtx
8552aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8553{
8554  int nunits = GET_MODE_NUNITS (mode);
8555  rtvec v = rtvec_alloc (nunits / 2);
8556  int high_base = nunits / 2;
8557  int low_base = 0;
8558  int base;
8559  rtx t1;
8560  int i;
8561
8562  if (BYTES_BIG_ENDIAN)
8563    base = high ? low_base : high_base;
8564  else
8565    base = high ? high_base : low_base;
8566
8567  for (i = 0; i < nunits / 2; i++)
8568    RTVEC_ELT (v, i) = GEN_INT (base + i);
8569
8570  t1 = gen_rtx_PARALLEL (mode, v);
8571  return t1;
8572}
8573
8574/* Check OP for validity as a PARALLEL RTX vector with elements
8575   numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8576   from the perspective of the architecture.  See the diagram above
8577   aarch64_simd_vect_par_cnst_half for more details.  */
8578
8579bool
8580aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8581				       bool high)
8582{
8583  rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8584  HOST_WIDE_INT count_op = XVECLEN (op, 0);
8585  HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8586  int i = 0;
8587
8588  if (!VECTOR_MODE_P (mode))
8589    return false;
8590
8591  if (count_op != count_ideal)
8592    return false;
8593
8594  for (i = 0; i < count_ideal; i++)
8595    {
8596      rtx elt_op = XVECEXP (op, 0, i);
8597      rtx elt_ideal = XVECEXP (ideal, 0, i);
8598
8599      if (!CONST_INT_P (elt_op)
8600	  || INTVAL (elt_ideal) != INTVAL (elt_op))
8601	return false;
8602    }
8603  return true;
8604}
8605
8606/* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
8607   HIGH (exclusive).  */
8608void
8609aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8610			  const_tree exp)
8611{
8612  HOST_WIDE_INT lane;
8613  gcc_assert (CONST_INT_P (operand));
8614  lane = INTVAL (operand);
8615
8616  if (lane < low || lane >= high)
8617  {
8618    if (exp)
8619      error ("%Klane %ld out of range %ld - %ld", exp, lane, low, high - 1);
8620    else
8621      error ("lane %ld out of range %ld - %ld", lane, low, high - 1);
8622  }
8623}
8624
/* Emit code to place an AdvSIMD pair result in memory locations (with equal
8626   registers).  */
8627void
8628aarch64_simd_emit_pair_result_insn (machine_mode mode,
8629			    rtx (*intfn) (rtx, rtx, rtx), rtx destaddr,
8630                            rtx op1)
8631{
8632  rtx mem = gen_rtx_MEM (mode, destaddr);
8633  rtx tmp1 = gen_reg_rtx (mode);
8634  rtx tmp2 = gen_reg_rtx (mode);
8635
8636  emit_insn (intfn (tmp1, op1, tmp2));
8637
8638  emit_move_insn (mem, tmp1);
8639  mem = adjust_address (mem, mode, GET_MODE_SIZE (mode));
8640  emit_move_insn (mem, tmp2);
8641}
8642
8643/* Return TRUE if OP is a valid vector addressing mode.  */
8644bool
8645aarch64_simd_mem_operand_p (rtx op)
8646{
8647  return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8648			|| REG_P (XEXP (op, 0)));
8649}
8650
8651/* Emit a register copy from operand to operand, taking care not to
8652   early-clobber source registers in the process.
8653
8654   COUNT is the number of components into which the copy needs to be
8655   decomposed.  */
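/* For example (register numbers purely illustrative), copying a
   three-register value from V1-V3 to V2-V4 overlaps with the destination
   above the source, so the copy is emitted from the highest component
   downwards: V4 <- V3, V3 <- V2, V2 <- V1.  */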
8656void
8657aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
8658				unsigned int count)
8659{
8660  unsigned int i;
8661  int rdest = REGNO (operands[0]);
8662  int rsrc = REGNO (operands[1]);
8663
8664  if (!reg_overlap_mentioned_p (operands[0], operands[1])
8665      || rdest < rsrc)
8666    for (i = 0; i < count; i++)
8667      emit_move_insn (gen_rtx_REG (mode, rdest + i),
8668		      gen_rtx_REG (mode, rsrc + i));
8669  else
8670    for (i = 0; i < count; i++)
8671      emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
8672		      gen_rtx_REG (mode, rsrc + count - i - 1));
8673}
8674
8675/* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8676   one of VSTRUCT modes: OI, CI or XI.  */
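/* For example, a register-to-register OImode move expands to two moves
   (8 bytes), CImode to three (12 bytes) and XImode to four (16 bytes);
   any other form is a single 4-byte instruction.  */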
8677int
8678aarch64_simd_attr_length_move (rtx_insn *insn)
8679{
8680  machine_mode mode;
8681
8682  extract_insn_cached (insn);
8683
8684  if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8685    {
8686      mode = GET_MODE (recog_data.operand[0]);
8687      switch (mode)
8688	{
8689	case OImode:
8690	  return 8;
8691	case CImode:
8692	  return 12;
8693	case XImode:
8694	  return 16;
8695	default:
8696	  gcc_unreachable ();
8697	}
8698    }
8699  return 4;
8700}
8701
8702/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8703   one of VSTRUCT modes: OI, CI, EI, or XI.  */
8704int
8705aarch64_simd_attr_length_rglist (enum machine_mode mode)
8706{
8707  return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
8708}
8709
8710/* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
8711   alignment of a vector to 128 bits.  */
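/* For example, a 64-bit vector type gets 64-bit alignment, while a
   256-bit GNU vector type is capped at 128-bit alignment.  */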
8712static HOST_WIDE_INT
8713aarch64_simd_vector_alignment (const_tree type)
8714{
8715  HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8716  return MIN (align, 128);
8717}
8718
8719/* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
8720static bool
8721aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8722{
8723  if (is_packed)
8724    return false;
8725
8726  /* We guarantee alignment for vectors up to 128-bits.  */
8727  if (tree_int_cst_compare (TYPE_SIZE (type),
8728			    bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8729    return false;
8730
8731  /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
8732  return true;
8733}
8734
8735/* If VALS is a vector constant that can be loaded into a register
8736   using DUP, generate instructions to do so and return an RTX to
8737   assign to the register.  Otherwise return NULL_RTX.  */
8738static rtx
8739aarch64_simd_dup_constant (rtx vals)
8740{
8741  machine_mode mode = GET_MODE (vals);
8742  machine_mode inner_mode = GET_MODE_INNER (mode);
8743  int n_elts = GET_MODE_NUNITS (mode);
8744  bool all_same = true;
8745  rtx x;
8746  int i;
8747
8748  if (GET_CODE (vals) != CONST_VECTOR)
8749    return NULL_RTX;
8750
8751  for (i = 1; i < n_elts; ++i)
8752    {
8753      x = CONST_VECTOR_ELT (vals, i);
8754      if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8755	all_same = false;
8756    }
8757
8758  if (!all_same)
8759    return NULL_RTX;
8760
8761  /* We can load this constant by using DUP and a constant in a
8762     single ARM register.  This will be cheaper than a vector
8763     load.  */
8764  x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8765  return gen_rtx_VEC_DUPLICATE (mode, x);
8766}
8767
8768
8769/* Generate code to load VALS, which is a PARALLEL containing only
8770   constants (for vec_init) or CONST_VECTOR, efficiently into a
8771   register.  Returns an RTX to copy into the register, or NULL_RTX
   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
8773static rtx
8774aarch64_simd_make_constant (rtx vals)
8775{
8776  machine_mode mode = GET_MODE (vals);
8777  rtx const_dup;
8778  rtx const_vec = NULL_RTX;
8779  int n_elts = GET_MODE_NUNITS (mode);
8780  int n_const = 0;
8781  int i;
8782
8783  if (GET_CODE (vals) == CONST_VECTOR)
8784    const_vec = vals;
8785  else if (GET_CODE (vals) == PARALLEL)
8786    {
8787      /* A CONST_VECTOR must contain only CONST_INTs and
8788	 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8789	 Only store valid constants in a CONST_VECTOR.  */
8790      for (i = 0; i < n_elts; ++i)
8791	{
8792	  rtx x = XVECEXP (vals, 0, i);
8793	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8794	    n_const++;
8795	}
8796      if (n_const == n_elts)
8797	const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8798    }
8799  else
8800    gcc_unreachable ();
8801
8802  if (const_vec != NULL_RTX
8803      && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8804    /* Load using MOVI/MVNI.  */
8805    return const_vec;
8806  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8807    /* Loaded using DUP.  */
8808    return const_dup;
8809  else if (const_vec != NULL_RTX)
    /* Load from constant pool.  We cannot take advantage of single-cycle
8811       LD1 because we need a PC-relative addressing mode.  */
8812    return const_vec;
8813  else
8814    /* A PARALLEL containing something not valid inside CONST_VECTOR.
       We cannot construct an initializer.  */
8816    return NULL_RTX;
8817}
8818
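/* Expand an initialization of vector register TARGET from the elements of
   VALS, using a constant move, a DUP of a single scalar, an insert of the
   one varying element, or a build via the stack, as appropriate.  */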
8819void
8820aarch64_expand_vector_init (rtx target, rtx vals)
8821{
8822  machine_mode mode = GET_MODE (target);
8823  machine_mode inner_mode = GET_MODE_INNER (mode);
8824  int n_elts = GET_MODE_NUNITS (mode);
8825  int n_var = 0, one_var = -1;
8826  bool all_same = true;
8827  rtx x, mem;
8828  int i;
8829
8830  x = XVECEXP (vals, 0, 0);
8831  if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8832    n_var = 1, one_var = 0;
8833
8834  for (i = 1; i < n_elts; ++i)
8835    {
8836      x = XVECEXP (vals, 0, i);
8837      if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8838	++n_var, one_var = i;
8839
8840      if (!rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8841	all_same = false;
8842    }
8843
8844  if (n_var == 0)
8845    {
8846      rtx constant = aarch64_simd_make_constant (vals);
8847      if (constant != NULL_RTX)
8848	{
8849	  emit_move_insn (target, constant);
8850	  return;
8851	}
8852    }
8853
8854  /* Splat a single non-constant element if we can.  */
8855  if (all_same)
8856    {
8857      x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8858      aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8859      return;
8860    }
8861
8862  /* One field is non-constant.  Load constant then overwrite varying
8863     field.  This is more efficient than using the stack.  */
8864  if (n_var == 1)
8865    {
8866      rtx copy = copy_rtx (vals);
8867      rtx index = GEN_INT (one_var);
8868      enum insn_code icode;
8869
8870      /* Load constant part of vector, substitute neighboring value for
8871	 varying element.  */
8872      XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1);
8873      aarch64_expand_vector_init (target, copy);
8874
8875      /* Insert variable.  */
8876      x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
8877      icode = optab_handler (vec_set_optab, mode);
8878      gcc_assert (icode != CODE_FOR_nothing);
8879      emit_insn (GEN_FCN (icode) (target, x, index));
8880      return;
8881    }
8882
8883  /* Construct the vector in memory one field at a time
8884     and load the whole vector.  */
8885  mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8886  for (i = 0; i < n_elts; i++)
8887    emit_move_insn (adjust_address_nv (mem, inner_mode,
8888				    i * GET_MODE_SIZE (inner_mode)),
8889		    XVECEXP (vals, 0, i));
  emit_move_insn (target, mem);
}
8893
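/* Return the mask that may be implicitly applied to shift amounts in MODE:
   zero for vector and vector-structure modes (no implicit truncation),
   otherwise GET_MODE_BITSIZE (MODE) - 1.  */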
8894static unsigned HOST_WIDE_INT
8895aarch64_shift_truncation_mask (machine_mode mode)
8896{
8897  return
8898    (aarch64_vector_mode_supported_p (mode)
8899     || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8900}
8901
8902#ifndef TLS_SECTION_ASM_FLAG
8903#define TLS_SECTION_ASM_FLAG 'T'
8904#endif
8905
8906void
8907aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8908			       tree decl ATTRIBUTE_UNUSED)
8909{
8910  char flagchars[10], *f = flagchars;
8911
8912  /* If we have already declared this section, we can use an
8913     abbreviated form to switch back to it -- unless this section is
     part of a COMDAT group, in which case GAS requires the full
8915     declaration every time.  */
8916  if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8917      && (flags & SECTION_DECLARED))
8918    {
8919      fprintf (asm_out_file, "\t.section\t%s\n", name);
8920      return;
8921    }
8922
8923  if (!(flags & SECTION_DEBUG))
8924    *f++ = 'a';
8925  if (flags & SECTION_WRITE)
8926    *f++ = 'w';
8927  if (flags & SECTION_CODE)
8928    *f++ = 'x';
8929  if (flags & SECTION_SMALL)
8930    *f++ = 's';
8931  if (flags & SECTION_MERGE)
8932    *f++ = 'M';
8933  if (flags & SECTION_STRINGS)
8934    *f++ = 'S';
8935  if (flags & SECTION_TLS)
8936    *f++ = TLS_SECTION_ASM_FLAG;
8937  if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8938    *f++ = 'G';
8939  *f = '\0';
8940
8941  fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8942
8943  if (!(flags & SECTION_NOTYPE))
8944    {
8945      const char *type;
8946      const char *format;
8947
8948      if (flags & SECTION_BSS)
8949	type = "nobits";
8950      else
8951	type = "progbits";
8952
8953#ifdef TYPE_OPERAND_FMT
8954      format = "," TYPE_OPERAND_FMT;
8955#else
8956      format = ",@%s";
8957#endif
8958
8959      fprintf (asm_out_file, format, type);
8960
8961      if (flags & SECTION_ENTSIZE)
8962	fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8963      if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8964	{
8965	  if (TREE_CODE (decl) == IDENTIFIER_NODE)
8966	    fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8967	  else
8968	    fprintf (asm_out_file, ",%s,comdat",
8969		     IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8970	}
8971    }
8972
8973  putc ('\n', asm_out_file);
8974}
8975
8976/* Select a format to encode pointers in exception handling data.  */
8977int
8978aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
8979{
8980   int type;
8981   switch (aarch64_cmodel)
8982     {
8983     case AARCH64_CMODEL_TINY:
8984     case AARCH64_CMODEL_TINY_PIC:
8985     case AARCH64_CMODEL_SMALL:
8986     case AARCH64_CMODEL_SMALL_PIC:
8987       /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
8988	  for everything.  */
8989       type = DW_EH_PE_sdata4;
8990       break;
8991     default:
8992       /* No assumptions here.  8-byte relocs required.  */
8993       type = DW_EH_PE_sdata8;
8994       break;
8995     }
8996   return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
8997}
8998
8999/* Emit load exclusive.  */
9000
9001static void
9002aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9003			     rtx mem, rtx model_rtx)
9004{
9005  rtx (*gen) (rtx, rtx, rtx);
9006
9007  switch (mode)
9008    {
9009    case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9010    case HImode: gen = gen_aarch64_load_exclusivehi; break;
9011    case SImode: gen = gen_aarch64_load_exclusivesi; break;
9012    case DImode: gen = gen_aarch64_load_exclusivedi; break;
9013    default:
9014      gcc_unreachable ();
9015    }
9016
9017  emit_insn (gen (rval, mem, model_rtx));
9018}
9019
9020/* Emit store exclusive.  */
9021
9022static void
9023aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9024			      rtx rval, rtx mem, rtx model_rtx)
9025{
9026  rtx (*gen) (rtx, rtx, rtx, rtx);
9027
9028  switch (mode)
9029    {
9030    case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9031    case HImode: gen = gen_aarch64_store_exclusivehi; break;
9032    case SImode: gen = gen_aarch64_store_exclusivesi; break;
9033    case DImode: gen = gen_aarch64_store_exclusivedi; break;
9034    default:
9035      gcc_unreachable ();
9036    }
9037
9038  emit_insn (gen (bval, rval, mem, model_rtx));
9039}
9040
9041/* Mark the previous jump instruction as unlikely.  */
9042
9043static void
9044aarch64_emit_unlikely_jump (rtx insn)
9045{
9046  int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9047
9048  insn = emit_jump_insn (insn);
9049  add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9050}
9051
9052/* Expand a compare and swap pattern.  */
9053
9054void
9055aarch64_expand_compare_and_swap (rtx operands[])
9056{
9057  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9058  machine_mode mode, cmp_mode;
9059  rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9060
9061  bval = operands[0];
9062  rval = operands[1];
9063  mem = operands[2];
9064  oldval = operands[3];
9065  newval = operands[4];
9066  is_weak = operands[5];
9067  mod_s = operands[6];
9068  mod_f = operands[7];
9069  mode = GET_MODE (mem);
9070  cmp_mode = mode;
9071
9072  /* Normally the succ memory model must be stronger than fail, but in the
9073     unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9074     promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
9075
9076  if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
9077      && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
9078    mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9079
9080  switch (mode)
9081    {
9082    case QImode:
9083    case HImode:
9084      /* For short modes, we're going to perform the comparison in SImode,
9085	 so do the zero-extension now.  */
9086      cmp_mode = SImode;
9087      rval = gen_reg_rtx (SImode);
9088      oldval = convert_modes (SImode, mode, oldval, true);
9089      /* Fall through.  */
9090
9091    case SImode:
9092    case DImode:
9093      /* Force the value into a register if needed.  */
9094      if (!aarch64_plus_operand (oldval, mode))
9095	oldval = force_reg (cmp_mode, oldval);
9096      break;
9097
9098    default:
9099      gcc_unreachable ();
9100    }
9101
9102  switch (mode)
9103    {
9104    case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9105    case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9106    case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9107    case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9108    default:
9109      gcc_unreachable ();
9110    }
9111
9112  emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9113
9114  if (mode == QImode || mode == HImode)
9115    emit_move_insn (operands[1], gen_lowpart (mode, rval));
9116
9117  x = gen_rtx_REG (CCmode, CC_REGNUM);
9118  x = gen_rtx_EQ (SImode, x, const0_rtx);
9119  emit_insn (gen_rtx_SET (VOIDmode, bval, x));
9120}
9121
9122/* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
9123   sequence implementing an atomic operation.  */
9124
9125static void
9126aarch64_emit_post_barrier (enum memmodel model)
9127{
9128  const enum memmodel base_model = memmodel_base (model);
9129
9130  if (is_mm_sync (model)
9131      && (base_model == MEMMODEL_ACQUIRE
9132	  || base_model == MEMMODEL_ACQ_REL
9133	  || base_model == MEMMODEL_SEQ_CST))
9134    {
9135      emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
9136    }
9137}
9138
9139/* Split a compare and swap pattern.  */
9140
9141void
9142aarch64_split_compare_and_swap (rtx operands[])
9143{
9144  rtx rval, mem, oldval, newval, scratch;
9145  machine_mode mode;
9146  bool is_weak;
9147  rtx_code_label *label1, *label2;
9148  rtx x, cond;
9149  enum memmodel model;
9150  rtx model_rtx;
9151
9152  rval = operands[0];
9153  mem = operands[1];
9154  oldval = operands[2];
9155  newval = operands[3];
9156  is_weak = (operands[4] != const0_rtx);
9157  model_rtx = operands[5];
9158  scratch = operands[7];
9159  mode = GET_MODE (mem);
9160  model = memmodel_from_int (INTVAL (model_rtx));
9161
9162  label1 = NULL;
9163  if (!is_weak)
9164    {
9165      label1 = gen_label_rtx ();
9166      emit_label (label1);
9167    }
9168  label2 = gen_label_rtx ();
9169
9170  /* The initial load can be relaxed for a __sync operation since a final
9171     barrier will be emitted to stop code hoisting.  */
9172  if (is_mm_sync (model))
9173    aarch64_emit_load_exclusive (mode, rval, mem,
9174				 GEN_INT (MEMMODEL_RELAXED));
9175  else
9176    aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
9177
9178  cond = aarch64_gen_compare_reg (NE, rval, oldval);
9179  x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9180  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9181			    gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9182  aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9183
9184  aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
9185
9186  if (!is_weak)
9187    {
9188      x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9189      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9190				gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9191      aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9192    }
9193  else
9194    {
9195      cond = gen_rtx_REG (CCmode, CC_REGNUM);
9196      x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9197      emit_insn (gen_rtx_SET (VOIDmode, cond, x));
9198    }
9199
9200  emit_label (label2);
9201
9202  /* Emit any final barrier needed for a __sync operation.  */
9203  if (is_mm_sync (model))
9204    aarch64_emit_post_barrier (model);
9205}
9206
9207/* Split an atomic operation.  */
9208
9209void
9210aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9211		     rtx value, rtx model_rtx, rtx cond)
9212{
9213  machine_mode mode = GET_MODE (mem);
9214  machine_mode wmode = (mode == DImode ? DImode : SImode);
9215  const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
9216  const bool is_sync = is_mm_sync (model);
9217  rtx_code_label *label;
9218  rtx x;
9219
9220  label = gen_label_rtx ();
9221  emit_label (label);
9222
9223  if (new_out)
9224    new_out = gen_lowpart (wmode, new_out);
9225  if (old_out)
9226    old_out = gen_lowpart (wmode, old_out);
9227  else
9228    old_out = new_out;
9229  value = simplify_gen_subreg (wmode, value, mode, 0);
9230
9231  /* The initial load can be relaxed for a __sync operation since a final
9232     barrier will be emitted to stop code hoisting.  */
9233  if (is_sync)
9234    aarch64_emit_load_exclusive (mode, old_out, mem,
9235				 GEN_INT (MEMMODEL_RELAXED));
9236  else
9237    aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9238
9239  switch (code)
9240    {
9241    case SET:
9242      new_out = value;
9243      break;
9244
9245    case NOT:
9246      x = gen_rtx_AND (wmode, old_out, value);
9247      emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9248      x = gen_rtx_NOT (wmode, new_out);
9249      emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9250      break;
9251
9252    case MINUS:
9253      if (CONST_INT_P (value))
9254	{
9255	  value = GEN_INT (-INTVAL (value));
9256	  code = PLUS;
9257	}
9258      /* Fall through.  */
9259
9260    default:
9261      x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9262      emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9263      break;
9264    }
9265
9266  aarch64_emit_store_exclusive (mode, cond, mem,
9267				gen_lowpart (mode, new_out), model_rtx);
9268
9269  x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9270  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9271			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9272  aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9273
9274  /* Emit any final barrier needed for a __sync operation.  */
9275  if (is_sync)
9276    aarch64_emit_post_barrier (model);
9277}
9278
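/* Emit the architecture extensions enabled in aarch64_isa_flags as a
   "+extension" suffix to the .arch or .cpu directive just printed.  */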
9279static void
9280aarch64_print_extension (void)
9281{
9282  const struct aarch64_option_extension *opt = NULL;
9283
9284  for (opt = all_extensions; opt->name != NULL; opt++)
9285    if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9286      asm_fprintf (asm_out_file, "+%s", opt->name);
9287
9288  asm_fprintf (asm_out_file, "\n");
9289}
9290
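/* Emit the target description at the start of the assembly file: a .arch
   directive if an architecture was selected explicitly, otherwise a .cpu
   directive, followed by the default file-start output.  */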
9291static void
9292aarch64_start_file (void)
9293{
9294  if (selected_arch)
9295    {
9296      asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9297      aarch64_print_extension ();
9298    }
9299  else if (selected_cpu)
9300    {
9301      const char *truncated_name
9302	    = aarch64_rewrite_selected_cpu (selected_cpu->name);
9303      asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9304      aarch64_print_extension ();
9305    }
9306  default_file_start ();
9307}
9308
9309/* Target hook for c_mode_for_suffix.  */
9310static machine_mode
9311aarch64_c_mode_for_suffix (char suffix)
9312{
9313  if (suffix == 'q')
9314    return TFmode;
9315
9316  return VOIDmode;
9317}
9318
9319/* We can only represent floating point constants which will fit in
9320   "quarter-precision" values.  These values are characterised by
9321   a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
9322   by:
9323
9324   (-1)^s * (n/16) * 2^r
9325
9326   Where:
9327     's' is the sign bit.
9328     'n' is an integer in the range 16 <= n <= 31.
9329     'r' is an integer in the range -3 <= r <= 4.  */
9330
9331/* Return true iff X can be represented as a quarter-precision
9332   floating point immediate operand.  Note that we cannot represent 0.0.  */
9333bool
9334aarch64_float_const_representable_p (rtx x)
9335{
9336  /* This represents our current view of how many bits
9337     make up the mantissa.  */
9338  int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9339  int exponent;
9340  unsigned HOST_WIDE_INT mantissa, mask;
9341  REAL_VALUE_TYPE r, m;
9342  bool fail;
9343
9344  if (!CONST_DOUBLE_P (x))
9345    return false;
9346
9347  if (GET_MODE (x) == VOIDmode)
9348    return false;
9349
9350  REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9351
9352  /* We cannot represent infinities, NaNs or +/-zero.  We won't
9353     know if we have +zero until we analyse the mantissa, but we
9354     can reject the other invalid values.  */
9355  if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9356      || REAL_VALUE_MINUS_ZERO (r))
9357    return false;
9358
9359  /* Extract exponent.  */
9360  r = real_value_abs (&r);
9361  exponent = REAL_EXP (&r);
9362
9363  /* For the mantissa, we expand into two HOST_WIDE_INTs, apart from the
9364     highest (sign) bit, with a fixed binary point at bit point_pos.
9365     w.elt (0) holds the low part of the mantissa, w.elt (1) the high part.
9366     WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9367     bits for the mantissa, this can fail (low bits will be lost).  */
9368  real_ldexp (&m, &r, point_pos - exponent);
9369  wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9370
9371  /* If the low part of the mantissa has bits set we cannot represent
9372     the value.  */
9373  if (w.elt (0) != 0)
9374    return false;
9375  /* We have rejected the lower HOST_WIDE_INT, so update our
9376     understanding of how many bits lie in the mantissa and
9377     look only at the high HOST_WIDE_INT.  */
9378  mantissa = w.elt (1);
9379  point_pos -= HOST_BITS_PER_WIDE_INT;
9380
9381  /* We can only represent values with a mantissa of the form 1.xxxx.  */
9382  mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9383  if ((mantissa & mask) != 0)
9384    return false;
9385
9386  /* Having filtered unrepresentable values, we may now remove all
9387     but the highest 5 bits.  */
9388  mantissa >>= point_pos - 5;
9389
9390  /* We cannot represent the value 0.0, so reject it.  This is handled
9391     elsewhere.  */
9392  if (mantissa == 0)
9393    return false;
9394
9395  /* Then, as bit 4 is always set, we can mask it off, leaving
9396     the mantissa in the range [0, 15].  */
9397  mantissa &= ~(1 << 4);
9398  gcc_assert (mantissa <= 15);
9399
9400  /* GCC internally does not use IEEE754-like encoding (where normalized
9401     significands are in the range [1, 2)).  GCC uses [0.5, 1) (see real.c).
9402     Our mantissa values are shifted 4 places to the left relative to
9403     normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9404     by 5 places to correct for GCC's representation.  */
9405  exponent = 5 - exponent;
9406
9407  return (exponent >= 0 && exponent <= 7);
9408}
9409
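/* Return the output template for moving immediate CONST_VECTOR, a valid
   AdvSIMD immediate for a WIDTH-bit vector of mode MODE, using MOVI, MVNI
   or FMOV as appropriate.  */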
9410char*
9411aarch64_output_simd_mov_immediate (rtx const_vector,
9412				   machine_mode mode,
9413				   unsigned width)
9414{
9415  bool is_valid;
9416  static char templ[40];
9417  const char *mnemonic;
9418  const char *shift_op;
9419  unsigned int lane_count = 0;
9420  char element_char;
9421
9422  struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9423
9424  /* This will return true to show that CONST_VECTOR is legal for use as
9425     either an AdvSIMD MOVI immediate or, implicitly, an MVNI immediate.
9426     It will also update INFO to show how the immediate should be generated.  */
9427  is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9428  gcc_assert (is_valid);
9429
9430  element_char = sizetochar (info.element_width);
9431  lane_count = width / info.element_width;
9432
9433  mode = GET_MODE_INNER (mode);
9434  if (mode == SFmode || mode == DFmode)
9435    {
9436      gcc_assert (info.shift == 0 && ! info.mvn);
9437      if (aarch64_float_const_zero_rtx_p (info.value))
9438        info.value = GEN_INT (0);
9439      else
9440	{
9441#define buf_size 20
9442	  REAL_VALUE_TYPE r;
9443	  REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9444	  char float_buf[buf_size] = {'\0'};
9445	  real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9446#undef buf_size
9447
9448	  if (lane_count == 1)
9449	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9450	  else
9451	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9452		      lane_count, element_char, float_buf);
9453	  return templ;
9454	}
9455    }
9456
9457  mnemonic = info.mvn ? "mvni" : "movi";
9458  shift_op = info.msl ? "msl" : "lsl";
9459
9460  if (lane_count == 1)
9461    snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9462	      mnemonic, UINTVAL (info.value));
9463  else if (info.shift)
9464    snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9465	      ", %s %d", mnemonic, lane_count, element_char,
9466	      UINTVAL (info.value), shift_op, info.shift);
9467  else
9468    snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9469	      mnemonic, lane_count, element_char, UINTVAL (info.value));
9470  return templ;
9471}
9472
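/* Return the output template for moving scalar IMMEDIATE of mode MODE into
   a SIMD register, by duplicating it across a 64-bit vector and reusing the
   vector immediate-move logic above.  */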
9473char*
9474aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9475					  machine_mode mode)
9476{
9477  machine_mode vmode;
9478
9479  gcc_assert (!VECTOR_MODE_P (mode));
9480  vmode = aarch64_simd_container_mode (mode, 64);
9481  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9482  return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9483}
9484
9485/* Split operands into moves from op[1] + op[2] into op[0].  */
9486
9487void
9488aarch64_split_combinev16qi (rtx operands[3])
9489{
9490  unsigned int dest = REGNO (operands[0]);
9491  unsigned int src1 = REGNO (operands[1]);
9492  unsigned int src2 = REGNO (operands[2]);
9493  machine_mode halfmode = GET_MODE (operands[1]);
9494  unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9495  rtx destlo, desthi;
9496
9497  gcc_assert (halfmode == V16QImode);
9498
9499  if (src1 == dest && src2 == dest + halfregs)
9500    {
9501      /* No-op move.  Can't split to nothing; emit something.  */
9502      emit_note (NOTE_INSN_DELETED);
9503      return;
9504    }
9505
9506  /* Preserve register attributes for variable tracking.  */
9507  destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9508  desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9509			       GET_MODE_SIZE (halfmode));
9510
9511  /* Special case of reversed high/low parts.  */
9512  if (reg_overlap_mentioned_p (operands[2], destlo)
9513      && reg_overlap_mentioned_p (operands[1], desthi))
9514    {
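      /* Swap the two halves in place with three XORs so that no scratch
	 register is needed.  */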
9515      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9516      emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9517      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9518    }
9519  else if (!reg_overlap_mentioned_p (operands[2], destlo))
9520    {
9521      /* Try to avoid unnecessary moves if part of the result
9522	 is in the right place already.  */
9523      if (src1 != dest)
9524	emit_move_insn (destlo, operands[1]);
9525      if (src2 != dest + halfregs)
9526	emit_move_insn (desthi, operands[2]);
9527    }
9528  else
9529    {
9530      if (src2 != dest + halfregs)
9531	emit_move_insn (desthi, operands[2]);
9532      if (src1 != dest)
9533	emit_move_insn (destlo, operands[1]);
9534    }
9535}
9536
9537/* vec_perm support.  */
9538
9539#define MAX_VECT_LEN 16
9540
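/* Describes a constant permutation to be expanded: the target and source
   operands, the permutation indices, and whether we are only testing
   whether the permutation is supported.  */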
9541struct expand_vec_perm_d
9542{
9543  rtx target, op0, op1;
9544  unsigned char perm[MAX_VECT_LEN];
9545  machine_mode vmode;
9546  unsigned char nelt;
9547  bool one_vector_p;
9548  bool testing_p;
9549};
9550
9551/* Generate a variable permutation.  */
9552
9553static void
9554aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9555{
9556  machine_mode vmode = GET_MODE (target);
9557  bool one_vector_p = rtx_equal_p (op0, op1);
9558
9559  gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9560  gcc_checking_assert (GET_MODE (op0) == vmode);
9561  gcc_checking_assert (GET_MODE (op1) == vmode);
9562  gcc_checking_assert (GET_MODE (sel) == vmode);
9563  gcc_checking_assert (TARGET_SIMD);
9564
9565  if (one_vector_p)
9566    {
9567      if (vmode == V8QImode)
9568	{
9569	  /* Expand the argument to a V16QI mode by duplicating it.  */
9570	  rtx pair = gen_reg_rtx (V16QImode);
9571	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9572	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9573	}
9574      else
9575	{
9576	  emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9577	}
9578    }
9579  else
9580    {
9581      rtx pair;
9582
9583      if (vmode == V8QImode)
9584	{
9585	  pair = gen_reg_rtx (V16QImode);
9586	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9587	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9588	}
9589      else
9590	{
9591	  pair = gen_reg_rtx (OImode);
9592	  emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9593	  emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9594	}
9595    }
9596}
9597
9598void
9599aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9600{
9601  machine_mode vmode = GET_MODE (target);
9602  unsigned int nelt = GET_MODE_NUNITS (vmode);
9603  bool one_vector_p = rtx_equal_p (op0, op1);
9604  rtx mask;
9605
9606  /* The TBL instruction does not use a modulo index, so we must take care
9607     of that ourselves.  */
9608  mask = aarch64_simd_gen_const_vector_dup (vmode,
9609      one_vector_p ? nelt - 1 : 2 * nelt - 1);
9610  sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9611
9612  /* For big-endian, we also need to reverse the index within the vector
9613     (but not which vector).  */
9614  if (BYTES_BIG_ENDIAN)
9615    {
9616      /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
9617      if (!one_vector_p)
9618        mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9619      sel = expand_simple_binop (vmode, XOR, sel, mask,
9620				 NULL, 0, OPTAB_LIB_WIDEN);
9621    }
9622  aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9623}
9624
9625/* Recognize patterns suitable for the TRN instructions.  */
9626static bool
9627aarch64_evpc_trn (struct expand_vec_perm_d *d)
9628{
9629  unsigned int i, odd, mask, nelt = d->nelt;
9630  rtx out, in0, in1, x;
9631  rtx (*gen) (rtx, rtx, rtx);
9632  machine_mode vmode = d->vmode;
9633
9634  if (GET_MODE_UNIT_SIZE (vmode) > 8)
9635    return false;
9636
9637  /* Note that these are little-endian tests.
9638     We correct for big-endian later.  */
9639  if (d->perm[0] == 0)
9640    odd = 0;
9641  else if (d->perm[0] == 1)
9642    odd = 1;
9643  else
9644    return false;
9645  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9646
9647  for (i = 0; i < nelt; i += 2)
9648    {
9649      if (d->perm[i] != i + odd)
9650	return false;
9651      if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9652	return false;
9653    }
9654
9655  /* Success!  */
9656  if (d->testing_p)
9657    return true;
9658
9659  in0 = d->op0;
9660  in1 = d->op1;
9661  if (BYTES_BIG_ENDIAN)
9662    {
9663      x = in0, in0 = in1, in1 = x;
9664      odd = !odd;
9665    }
9666  out = d->target;
9667
9668  if (odd)
9669    {
9670      switch (vmode)
9671	{
9672	case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9673	case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9674	case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9675	case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9676	case V4SImode: gen = gen_aarch64_trn2v4si; break;
9677	case V2SImode: gen = gen_aarch64_trn2v2si; break;
9678	case V2DImode: gen = gen_aarch64_trn2v2di; break;
9679	case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9680	case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9681	case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9682	default:
9683	  return false;
9684	}
9685    }
9686  else
9687    {
9688      switch (vmode)
9689	{
9690	case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9691	case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9692	case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9693	case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9694	case V4SImode: gen = gen_aarch64_trn1v4si; break;
9695	case V2SImode: gen = gen_aarch64_trn1v2si; break;
9696	case V2DImode: gen = gen_aarch64_trn1v2di; break;
9697	case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9698	case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9699	case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9700	default:
9701	  return false;
9702	}
9703    }
9704
9705  emit_insn (gen (out, in0, in1));
9706  return true;
9707}
9708
9709/* Recognize patterns suitable for the UZP instructions.  */
9710static bool
9711aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9712{
9713  unsigned int i, odd, mask, nelt = d->nelt;
9714  rtx out, in0, in1, x;
9715  rtx (*gen) (rtx, rtx, rtx);
9716  machine_mode vmode = d->vmode;
9717
9718  if (GET_MODE_UNIT_SIZE (vmode) > 8)
9719    return false;
9720
9721  /* Note that these are little-endian tests.
9722     We correct for big-endian later.  */
9723  if (d->perm[0] == 0)
9724    odd = 0;
9725  else if (d->perm[0] == 1)
9726    odd = 1;
9727  else
9728    return false;
9729  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9730
9731  for (i = 0; i < nelt; i++)
9732    {
9733      unsigned elt = (i * 2 + odd) & mask;
9734      if (d->perm[i] != elt)
9735	return false;
9736    }
9737
9738  /* Success!  */
9739  if (d->testing_p)
9740    return true;
9741
9742  in0 = d->op0;
9743  in1 = d->op1;
9744  if (BYTES_BIG_ENDIAN)
9745    {
9746      x = in0, in0 = in1, in1 = x;
9747      odd = !odd;
9748    }
9749  out = d->target;
9750
9751  if (odd)
9752    {
9753      switch (vmode)
9754	{
9755	case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9756	case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9757	case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9758	case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9759	case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9760	case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9761	case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9762	case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9763	case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9764	case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9765	default:
9766	  return false;
9767	}
9768    }
9769  else
9770    {
9771      switch (vmode)
9772	{
9773	case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9774	case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9775	case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9776	case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9777	case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9778	case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9779	case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9780	case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9781	case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9782	case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9783	default:
9784	  return false;
9785	}
9786    }
9787
9788  emit_insn (gen (out, in0, in1));
9789  return true;
9790}
9791
9792/* Recognize patterns suitable for the ZIP instructions.  */
9793static bool
9794aarch64_evpc_zip (struct expand_vec_perm_d *d)
9795{
9796  unsigned int i, high, mask, nelt = d->nelt;
9797  rtx out, in0, in1, x;
9798  rtx (*gen) (rtx, rtx, rtx);
9799  machine_mode vmode = d->vmode;
9800
9801  if (GET_MODE_UNIT_SIZE (vmode) > 8)
9802    return false;
9803
9804  /* Note that these are little-endian tests.
9805     We correct for big-endian later.  */
9806  high = nelt / 2;
9807  if (d->perm[0] == high)
9808    /* Do Nothing.  */
9809    ;
9810  else if (d->perm[0] == 0)
9811    high = 0;
9812  else
9813    return false;
9814  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9815
9816  for (i = 0; i < nelt / 2; i++)
9817    {
9818      unsigned elt = (i + high) & mask;
9819      if (d->perm[i * 2] != elt)
9820	return false;
9821      elt = (elt + nelt) & mask;
9822      if (d->perm[i * 2 + 1] != elt)
9823	return false;
9824    }
9825
9826  /* Success!  */
9827  if (d->testing_p)
9828    return true;
9829
9830  in0 = d->op0;
9831  in1 = d->op1;
9832  if (BYTES_BIG_ENDIAN)
9833    {
9834      x = in0, in0 = in1, in1 = x;
9835      high = !high;
9836    }
9837  out = d->target;
9838
9839  if (high)
9840    {
9841      switch (vmode)
9842	{
9843	case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9844	case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9845	case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9846	case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9847	case V4SImode: gen = gen_aarch64_zip2v4si; break;
9848	case V2SImode: gen = gen_aarch64_zip2v2si; break;
9849	case V2DImode: gen = gen_aarch64_zip2v2di; break;
9850	case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9851	case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9852	case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9853	default:
9854	  return false;
9855	}
9856    }
9857  else
9858    {
9859      switch (vmode)
9860	{
9861	case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9862	case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9863	case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9864	case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9865	case V4SImode: gen = gen_aarch64_zip1v4si; break;
9866	case V2SImode: gen = gen_aarch64_zip1v2si; break;
9867	case V2DImode: gen = gen_aarch64_zip1v2di; break;
9868	case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9869	case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9870	case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9871	default:
9872	  return false;
9873	}
9874    }
9875
9876  emit_insn (gen (out, in0, in1));
9877  return true;
9878}
9879
9880/* Recognize patterns for the EXT insn.  */
9881
9882static bool
9883aarch64_evpc_ext (struct expand_vec_perm_d *d)
9884{
9885  unsigned int i, nelt = d->nelt;
9886  rtx (*gen) (rtx, rtx, rtx, rtx);
9887  rtx offset;
9888
9889  unsigned int location = d->perm[0]; /* Always < nelt.  */
9890
9891  /* Check if the extracted indices are increasing by one.  */
9892  for (i = 1; i < nelt; i++)
9893    {
9894      unsigned int required = location + i;
9895      if (d->one_vector_p)
9896        {
9897          /* We'll pass the same vector in twice, so allow indices to wrap.  */
9898	  required &= (nelt - 1);
9899	}
9900      if (d->perm[i] != required)
9901        return false;
9902    }
9903
9904  switch (d->vmode)
9905    {
9906    case V16QImode: gen = gen_aarch64_extv16qi; break;
9907    case V8QImode: gen = gen_aarch64_extv8qi; break;
9908    case V4HImode: gen = gen_aarch64_extv4hi; break;
9909    case V8HImode: gen = gen_aarch64_extv8hi; break;
9910    case V2SImode: gen = gen_aarch64_extv2si; break;
9911    case V4SImode: gen = gen_aarch64_extv4si; break;
9912    case V2SFmode: gen = gen_aarch64_extv2sf; break;
9913    case V4SFmode: gen = gen_aarch64_extv4sf; break;
9914    case V2DImode: gen = gen_aarch64_extv2di; break;
9915    case V2DFmode: gen = gen_aarch64_extv2df; break;
9916    default:
9917      return false;
9918    }
9919
9920  /* Success! */
9921  if (d->testing_p)
9922    return true;
9923
9924  /* The case where (location == 0) is a no-op for both big- and little-endian,
9925     and is removed by the mid-end at optimization levels -O1 and higher.  */
9926
9927  if (BYTES_BIG_ENDIAN && (location != 0))
9928    {
9929      /* After setup, we want the high elements of the first vector (stored
9930         at the LSB end of the register), and the low elements of the second
9931         vector (stored at the MSB end of the register). So swap.  */
9932      std::swap (d->op0, d->op1);
9933      /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
9934      location = nelt - location;
9935    }
9936
9937  offset = GEN_INT (location);
9938  emit_insn (gen (d->target, d->op0, d->op1, offset));
9939  return true;
9940}
9941
9942/* Recognize patterns for the REV insns.  */
9943
9944static bool
9945aarch64_evpc_rev (struct expand_vec_perm_d *d)
9946{
9947  unsigned int i, j, diff, nelt = d->nelt;
9948  rtx (*gen) (rtx, rtx);
9949
9950  if (!d->one_vector_p)
9951    return false;
9952
9953  diff = d->perm[0];
9954  switch (diff)
9955    {
9956    case 7:
9957      switch (d->vmode)
9958	{
9959	case V16QImode: gen = gen_aarch64_rev64v16qi; break;
9960	case V8QImode: gen = gen_aarch64_rev64v8qi;  break;
9961	default:
9962	  return false;
9963	}
9964      break;
9965    case 3:
9966      switch (d->vmode)
9967	{
9968	case V16QImode: gen = gen_aarch64_rev32v16qi; break;
9969	case V8QImode: gen = gen_aarch64_rev32v8qi;  break;
9970	case V8HImode: gen = gen_aarch64_rev64v8hi;  break;
9971	case V4HImode: gen = gen_aarch64_rev64v4hi;  break;
9972	default:
9973	  return false;
9974	}
9975      break;
9976    case 1:
9977      switch (d->vmode)
9978	{
9979	case V16QImode: gen = gen_aarch64_rev16v16qi; break;
9980	case V8QImode: gen = gen_aarch64_rev16v8qi;  break;
9981	case V8HImode: gen = gen_aarch64_rev32v8hi;  break;
9982	case V4HImode: gen = gen_aarch64_rev32v4hi;  break;
9983	case V4SImode: gen = gen_aarch64_rev64v4si;  break;
9984	case V2SImode: gen = gen_aarch64_rev64v2si;  break;
9985	case V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
9986	case V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
9987	default:
9988	  return false;
9989	}
9990      break;
9991    default:
9992      return false;
9993    }
9994
9995  for (i = 0; i < nelt ; i += diff + 1)
9996    for (j = 0; j <= diff; j += 1)
9997      {
9998	/* This is guaranteed to be true as the value of diff
9999	   is 7, 3 or 1 and we should have enough elements in the
10000	   queue to generate this.  Getting a vector mask with a
10001	   value of diff other than these values implies that
10002	   something is wrong by the time we get here.  */
10003	gcc_assert (i + j < nelt);
10004	if (d->perm[i + j] != i + diff - j)
10005	  return false;
10006      }
10007
10008  /* Success! */
10009  if (d->testing_p)
10010    return true;
10011
10012  emit_insn (gen (d->target, d->op0));
10013  return true;
10014}
10015
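/* Recognize patterns suitable for the DUP instructions.  */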
10016static bool
10017aarch64_evpc_dup (struct expand_vec_perm_d *d)
10018{
10019  rtx (*gen) (rtx, rtx, rtx);
10020  rtx out = d->target;
10021  rtx in0;
10022  machine_mode vmode = d->vmode;
10023  unsigned int i, elt, nelt = d->nelt;
10024  rtx lane;
10025
10026  elt = d->perm[0];
10027  for (i = 1; i < nelt; i++)
10028    {
10029      if (elt != d->perm[i])
10030	return false;
10031    }
10032
10033  /* The generic preparation in aarch64_expand_vec_perm_const_1
10034     swaps the operand order and the permute indices if it finds
10035     d->perm[0] to be in the second operand.  Thus, we can always
10036     use d->op0 and need not do any extra arithmetic to get the
10037     correct lane number.  */
10038  in0 = d->op0;
10039  lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
10040
10041  switch (vmode)
10042    {
10043    case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10044    case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10045    case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10046    case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10047    case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10048    case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10049    case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10050    case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10051    case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10052    case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10053    default:
10054      return false;
10055    }
10056
10057  emit_insn (gen (out, in0, lane));
10058  return true;
10059}
10060
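/* Expand the permutation as a TBL instruction; this is the catch-all used
   when no specialized permute instruction matches.  */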
10061static bool
10062aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10063{
10064  rtx rperm[MAX_VECT_LEN], sel;
10065  machine_mode vmode = d->vmode;
10066  unsigned int i, nelt = d->nelt;
10067
10068  if (d->testing_p)
10069    return true;
10070
10071  /* Generic code will try constant permutation twice: once with the
10072     original mode and again with the elements lowered to QImode.
10073     So wait and don't do the selector expansion ourselves.  */
10074  if (vmode != V8QImode && vmode != V16QImode)
10075    return false;
10076
10077  for (i = 0; i < nelt; ++i)
10078    {
10079      int nunits = GET_MODE_NUNITS (vmode);
10080
10081      /* If big-endian and two vectors we end up with a weird mixed-endian
10082	 mode on NEON.  Reverse the index within each word but not the word
10083	 itself.  */
10084      rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10085					   : d->perm[i]);
10086    }
10087  sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10088  sel = force_reg (vmode, sel);
10089
10090  aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10091  return true;
10092}
10093
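/* Try to expand the constant permutation described by D.  The specialized
   recognizers are tried first, falling back to a TBL-based expansion.
   Return true on success.  */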
10094static bool
10095aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10096{
10097  /* The pattern matching functions above are written to look for a small
10098     number to begin the sequence (0, 1, N/2).  If we begin with an index
10099     from the second operand, we can swap the operands.  */
10100  if (d->perm[0] >= d->nelt)
10101    {
10102      unsigned i, nelt = d->nelt;
10103
10104      gcc_assert (nelt == (nelt & -nelt));
10105      for (i = 0; i < nelt; ++i)
10106	d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */
10107
10108      std::swap (d->op0, d->op1);
10109    }
10110
10111  if (TARGET_SIMD)
10112    {
10113      if (aarch64_evpc_rev (d))
10114	return true;
10115      else if (aarch64_evpc_ext (d))
10116	return true;
10117      else if (aarch64_evpc_dup (d))
10118	return true;
10119      else if (aarch64_evpc_zip (d))
10120	return true;
10121      else if (aarch64_evpc_uzp (d))
10122	return true;
10123      else if (aarch64_evpc_trn (d))
10124	return true;
10125      return aarch64_evpc_tbl (d);
10126    }
10127  return false;
10128}
10129
10130/* Expand a vec_perm_const pattern.  */
10131
10132bool
10133aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10134{
10135  struct expand_vec_perm_d d;
10136  int i, nelt, which;
10137
10138  d.target = target;
10139  d.op0 = op0;
10140  d.op1 = op1;
10141
10142  d.vmode = GET_MODE (target);
10143  gcc_assert (VECTOR_MODE_P (d.vmode));
10144  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10145  d.testing_p = false;
10146
10147  for (i = which = 0; i < nelt; ++i)
10148    {
10149      rtx e = XVECEXP (sel, 0, i);
10150      int ei = INTVAL (e) & (2 * nelt - 1);
10151      which |= (ei < nelt ? 1 : 2);
10152      d.perm[i] = ei;
10153    }
10154
10155  switch (which)
10156    {
10157    default:
10158      gcc_unreachable ();
10159
10160    case 3:
10161      d.one_vector_p = false;
10162      if (!rtx_equal_p (op0, op1))
10163	break;
10164
10165      /* The elements of PERM do not suggest that only the first operand
10166	 is used, but both operands are identical.  Allow easier matching
10167	 of the permutation by folding the permutation into the single
10168	 input vector.  */
10169      /* Fall Through.  */
10170    case 2:
10171      for (i = 0; i < nelt; ++i)
10172	d.perm[i] &= nelt - 1;
10173      d.op0 = op1;
10174      d.one_vector_p = true;
10175      break;
10176
10177    case 1:
10178      d.op1 = op0;
10179      d.one_vector_p = true;
10180      break;
10181    }
10182
10183  return aarch64_expand_vec_perm_const_1 (&d);
10184}
10185
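/* Check whether the constant permutation SEL of a VMODE vector can be
   expanded, without emitting any instructions.  */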
10186static bool
10187aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10188				     const unsigned char *sel)
10189{
10190  struct expand_vec_perm_d d;
10191  unsigned int i, nelt, which;
10192  bool ret;
10193
10194  d.vmode = vmode;
10195  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10196  d.testing_p = true;
10197  memcpy (d.perm, sel, nelt);
10198
10199  /* Calculate whether all elements are in one vector.  */
10200  for (i = which = 0; i < nelt; ++i)
10201    {
10202      unsigned char e = d.perm[i];
10203      gcc_assert (e < 2 * nelt);
10204      which |= (e < nelt ? 1 : 2);
10205    }
10206
10207  /* If all elements are from the second vector, reindex as if from the
10208     first vector.  */
10209  if (which == 2)
10210    for (i = 0; i < nelt; ++i)
10211      d.perm[i] -= nelt;
10212
10213  /* Check whether the mask can be applied to a single vector.  */
10214  d.one_vector_p = (which != 3);
10215
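  /* Expand with dummy registers as operands; the generated instruction
     sequence is discarded below, we only care whether the expansion
     succeeded.  */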
10216  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10217  d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10218  if (!d.one_vector_p)
10219    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10220
10221  start_sequence ();
10222  ret = aarch64_expand_vec_perm_const_1 (&d);
10223  end_sequence ();
10224
10225  return ret;
10226}
10227
10228/* Implement target hook CANNOT_CHANGE_MODE_CLASS.  */
10229bool
10230aarch64_cannot_change_mode_class (machine_mode from,
10231				  machine_mode to,
10232				  enum reg_class rclass)
10233{
10234  /* We cannot allow word_mode subregs of full vector modes.
10235     Otherwise the middle-end will assume it's ok to store to
10236     (subreg:DI (reg:TI 100) 0) in order to modify only the low 64 bits
10237     of the 128-bit register.  However, after reload the subreg will
10238     be dropped leaving a plain DImode store.  See PR67609 for a more
10239     detailed discussion.  In all other cases, we want to be permissive
10240     and return false.  */
10241  return (reg_classes_intersect_p (FP_REGS, rclass)
10242	  && GET_MODE_SIZE (to) == UNITS_PER_WORD
10243	  && GET_MODE_SIZE (from) > UNITS_PER_WORD);
10244}
10245
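/* Return a V16QI mask suitable for reversing the bytes within each element
   of a big-endian Q-register vector of mode MODE.  */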
10246rtx
10247aarch64_reverse_mask (enum machine_mode mode)
10248{
10249  /* We have to reverse each vector because we don't have
10250     a permuted load that can reverse-load according to ABI rules.  */
10251  rtx mask;
10252  rtvec v = rtvec_alloc (16);
10253  int i, j;
10254  int nunits = GET_MODE_NUNITS (mode);
10255  int usize = GET_MODE_UNIT_SIZE (mode);
10256
10257  gcc_assert (BYTES_BIG_ENDIAN);
10258  gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10259
10260  for (i = 0; i < nunits; i++)
10261    for (j = 0; j < usize; j++)
10262      RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
10263  mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10264  return force_reg (V16QImode, mask);
10265}
10266
10267/* Implement MODES_TIEABLE_P.  */
10268
10269bool
10270aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10271{
10272  if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10273    return true;
10274
10275  /* We specifically want to allow elements of "structure" modes to
10276     be tieable to the structure.  This more general condition allows
10277     other rarer situations too.  */
10278  if (TARGET_SIMD
10279      && aarch64_vector_mode_p (mode1)
10280      && aarch64_vector_mode_p (mode2))
10281    return true;
10282
10283  return false;
10284}
10285
10286/* Return a new RTX holding the result of moving POINTER forward by
10287   AMOUNT bytes.  */
10288
10289static rtx
10290aarch64_move_pointer (rtx pointer, int amount)
10291{
10292  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10293
10294  return adjust_automodify_address (pointer, GET_MODE (pointer),
10295				    next, amount);
10296}
10297
10298/* Return a new RTX holding the result of moving POINTER forward by the
10299   size of the mode it points to.  */
10300
10301static rtx
10302aarch64_progress_pointer (rtx pointer)
10303{
10304  HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10305
10306  return aarch64_move_pointer (pointer, amount);
10307}
10308
10309/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10310   MODE bytes.  */
10311
10312static void
10313aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10314					      machine_mode mode)
10315{
10316  rtx reg = gen_reg_rtx (mode);
10317
10318  /* "Cast" the pointers to the correct mode.  */
10319  *src = adjust_address (*src, mode, 0);
10320  *dst = adjust_address (*dst, mode, 0);
10321  /* Emit the memcpy.  */
10322  emit_move_insn (reg, *src);
10323  emit_move_insn (*dst, reg);
10324  /* Move the pointers forward.  */
10325  *src = aarch64_progress_pointer (*src);
10326  *dst = aarch64_progress_pointer (*dst);
10327}
10328
10329/* Expand movmem, as if from a __builtin_memcpy.  Return true if
10330   we succeed, otherwise return false.  */
10331
10332bool
10333aarch64_expand_movmem (rtx *operands)
10334{
10335  unsigned int n;
10336  rtx dst = operands[0];
10337  rtx src = operands[1];
10338  rtx base;
10339  bool speed_p = !optimize_function_for_size_p (cfun);
10340
10341  /* When optimizing for size, give a better estimate of the length of a
10342     memcpy call, but use the default otherwise.  */
10343  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10344
10345  /* We can't do anything smart if the amount to copy is not constant.  */
10346  if (!CONST_INT_P (operands[2]))
10347    return false;
10348
10349  n = UINTVAL (operands[2]);
10350
10351  /* Try to keep the number of instructions low.  For cases below 16 bytes we
10352     need to make at most two moves.  For cases above 16 bytes it will be one
10353     move for each 16 byte chunk, then at most two additional moves.  */
10354  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10355    return false;
10356
10357  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10358  dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10359
10360  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10361  src = adjust_automodify_address (src, VOIDmode, base, 0);
10362
10363  /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10364     1-byte chunk.  */
10365  if (n < 4)
10366    {
10367      if (n >= 2)
10368	{
10369	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10370	  n -= 2;
10371	}
10372
10373      if (n == 1)
10374	aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10375
10376      return true;
10377    }
10378
10379  /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
10380     4-byte chunk, partially overlapping with the previously copied chunk.  */
10381  if (n < 8)
10382    {
10383      aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10384      n -= 4;
10385      if (n > 0)
10386	{
10387	  int move = n - 4;
10388
10389	  src = aarch64_move_pointer (src, move);
10390	  dst = aarch64_move_pointer (dst, move);
10391	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10392	}
10393      return true;
10394    }
10395
10396  /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
10397     them, then (if applicable) an 8-byte chunk.  */
10398  while (n >= 8)
10399    {
10400      if (n / 16)
10401	{
10402	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10403	  n -= 16;
10404	}
10405      else
10406	{
10407	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10408	  n -= 8;
10409	}
10410    }
10411
10412  /* Finish the final bytes of the copy.  We can always do this in one
10413     instruction.  We either copy the exact amount we need, or partially
10414     overlap with the previous chunk we copied and copy 8 bytes.  */
10415  if (n == 0)
10416    return true;
10417  else if (n == 1)
10418    aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10419  else if (n == 2)
10420    aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10421  else if (n == 4)
10422    aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10423  else
10424    {
10425      if (n == 3)
10426	{
10427	  src = aarch64_move_pointer (src, -1);
10428	  dst = aarch64_move_pointer (dst, -1);
10429	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10430	}
10431      else
10432	{
10433	  int move = n - 8;
10434
10435	  src = aarch64_move_pointer (src, move);
10436	  dst = aarch64_move_pointer (dst, move);
10437	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10438	}
10439    }
10440
10441  return true;
10442}
10443
10444/* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
10445
10446static unsigned HOST_WIDE_INT
10447aarch64_asan_shadow_offset (void)
10448{
10449  return (HOST_WIDE_INT_1 << 36);
10450}
10451
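/* Decide whether to use the by-pieces infrastructure for an operation OP of
   SIZE bytes with alignment ALIGN.  */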
10452static bool
10453aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10454					unsigned int align,
10455					enum by_pieces_operation op,
10456					bool speed_p)
10457{
10458  /* STORE_BY_PIECES can be used when copying a constant string, but
10459     in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10460     For now we always fail this and let the move_by_pieces code copy
10461     the string from read-only memory.  */
10462  if (op == STORE_BY_PIECES)
10463    return false;
10464
10465  return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
10466}
10467
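/* Map comparison code CODE onto the CC_D* mode used for a conditional
   compare, or return CCmode if CODE is not supported.  */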
10468static enum machine_mode
10469aarch64_code_to_ccmode (enum rtx_code code)
10470{
10471  switch (code)
10472    {
10473    case NE:
10474      return CC_DNEmode;
10475
10476    case EQ:
10477      return CC_DEQmode;
10478
10479    case LE:
10480      return CC_DLEmode;
10481
10482    case LT:
10483      return CC_DLTmode;
10484
10485    case GE:
10486      return CC_DGEmode;
10487
10488    case GT:
10489      return CC_DGTmode;
10490
10491    case LEU:
10492      return CC_DLEUmode;
10493
10494    case LTU:
10495      return CC_DLTUmode;
10496
10497    case GEU:
10498      return CC_DGEUmode;
10499
10500    case GTU:
10501      return CC_DGTUmode;
10502
10503    default:
10504      return CCmode;
10505    }
10506}
10507
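/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first comparison of a
   conditional-compare chain, CODE applied to TREEOP0 and TREEOP1.  Store the
   preparation and comparison sequences in *PREP_SEQ and *GEN_SEQ and return
   the CC register holding the result, or NULL_RTX if the comparison cannot
   be handled.  */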
10508static rtx
10509aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10510			int code, tree treeop0, tree treeop1)
10511{
10512  enum machine_mode op_mode, cmp_mode, cc_mode;
10513  rtx op0, op1, cmp, target;
10514  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10515  enum insn_code icode;
10516  struct expand_operand ops[4];
10517
10518  cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10519  if (cc_mode == CCmode)
10520    return NULL_RTX;
10521
10522  start_sequence ();
10523  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10524
10525  op_mode = GET_MODE (op0);
10526  if (op_mode == VOIDmode)
10527    op_mode = GET_MODE (op1);
10528
10529  switch (op_mode)
10530    {
10531    case QImode:
10532    case HImode:
10533    case SImode:
10534      cmp_mode = SImode;
10535      icode = CODE_FOR_cmpsi;
10536      break;
10537
10538    case DImode:
10539      cmp_mode = DImode;
10540      icode = CODE_FOR_cmpdi;
10541      break;
10542
10543    default:
10544      end_sequence ();
10545      return NULL_RTX;
10546    }
10547
10548  op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10549  op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10550  if (!op0 || !op1)
10551    {
10552      end_sequence ();
10553      return NULL_RTX;
10554    }
10555  *prep_seq = get_insns ();
10556  end_sequence ();
10557
10558  cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10559  target = gen_rtx_REG (CCmode, CC_REGNUM);
10560
10561  create_output_operand (&ops[0], target, CCmode);
10562  create_fixed_operand (&ops[1], cmp);
10563  create_fixed_operand (&ops[2], op0);
10564  create_fixed_operand (&ops[3], op1);
10565
10566  start_sequence ();
10567  if (!maybe_expand_insn (icode, 4, ops))
10568    {
10569      end_sequence ();
10570      return NULL_RTX;
10571    }
10572  *gen_seq = get_insns ();
10573  end_sequence ();
10574
10575  return gen_rtx_REG (cc_mode, CC_REGNUM);
10576}
10577
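/* Implement TARGET_GEN_CCMP_NEXT.  Expand a subsequent comparison of a
   conditional-compare chain, CMP_CODE applied to TREEOP0 and TREEOP1,
   conditional on the result PREV of the previous comparison and combined
   with it by BIT_CODE (AND or IOR).  Return the CC register holding the new
   result, or NULL_RTX on failure.  */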
10578static rtx
10579aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10580		       tree treeop0, tree treeop1, int bit_code)
10581{
10582  rtx op0, op1, cmp0, cmp1, target;
10583  enum machine_mode op_mode, cmp_mode, cc_mode;
10584  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10585  enum insn_code icode = CODE_FOR_ccmp_andsi;
10586  struct expand_operand ops[6];
10587
10588  cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10589  if (cc_mode == CCmode)
10590    return NULL_RTX;
10591
10592  push_to_sequence ((rtx_insn*) *prep_seq);
10593  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10594
10595  op_mode = GET_MODE (op0);
10596  if (op_mode == VOIDmode)
10597    op_mode = GET_MODE (op1);
10598
10599  switch (op_mode)
10600    {
10601    case QImode:
10602    case HImode:
10603    case SImode:
10604      cmp_mode = SImode;
10605      icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10606						: CODE_FOR_ccmp_iorsi;
10607      break;
10608
10609    case DImode:
10610      cmp_mode = DImode;
10611      icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10612						: CODE_FOR_ccmp_iordi;
10613      break;
10614
10615    default:
10616      end_sequence ();
10617      return NULL_RTX;
10618    }
10619
10620  op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10621  op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10622  if (!op0 || !op1)
10623    {
10624      end_sequence ();
10625      return NULL_RTX;
10626    }
10627  *prep_seq = get_insns ();
10628  end_sequence ();
10629
10630  target = gen_rtx_REG (cc_mode, CC_REGNUM);
10631  cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10632  cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10633
10634  create_fixed_operand (&ops[0], prev);
10635  create_fixed_operand (&ops[1], target);
10636  create_fixed_operand (&ops[2], op0);
10637  create_fixed_operand (&ops[3], op1);
10638  create_fixed_operand (&ops[4], cmp0);
10639  create_fixed_operand (&ops[5], cmp1);
10640
10641  push_to_sequence ((rtx_insn*) *gen_seq);
10642  if (!maybe_expand_insn (icode, 6, ops))
10643    {
10644      end_sequence ();
10645      return NULL_RTX;
10646    }
10647
10648  *gen_seq = get_insns ();
10649  end_sequence ();
10650
10651  return target;
10652}
10653
10654#undef TARGET_GEN_CCMP_FIRST
10655#define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10656
10657#undef TARGET_GEN_CCMP_NEXT
10658#define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
10659
10660/* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
10661   instruction fusion of some sort.  */
10662
10663static bool
10664aarch64_macro_fusion_p (void)
10665{
10666  return aarch64_tune_params->fuseable_ops != AARCH64_FUSE_NOTHING;
10667}
10668
10669
10670/* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
10671   should be kept together during scheduling.  */
10672
10673static bool
10674aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10675{
10676  rtx set_dest;
10677  rtx prev_set = single_set (prev);
10678  rtx curr_set = single_set (curr);
10679  /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
10680  bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10681
10682  if (!aarch64_macro_fusion_p ())
10683    return false;
10684
10685  if (simple_sets_p
10686      && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOV_MOVK))
10687    {
10688      /* We are trying to match:
10689         prev (mov)  == (set (reg r0) (const_int imm16))
10690         curr (movk) == (set (zero_extract (reg r0)
10691                                           (const_int 16)
10692                                           (const_int 16))
10693                             (const_int imm16_1))  */
10694
10695      set_dest = SET_DEST (curr_set);
10696
10697      if (GET_CODE (set_dest) == ZERO_EXTRACT
10698          && CONST_INT_P (SET_SRC (curr_set))
10699          && CONST_INT_P (SET_SRC (prev_set))
10700          && CONST_INT_P (XEXP (set_dest, 2))
10701          && INTVAL (XEXP (set_dest, 2)) == 16
10702          && REG_P (XEXP (set_dest, 0))
10703          && REG_P (SET_DEST (prev_set))
10704          && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10705        {
10706          return true;
10707        }
10708    }
10709
10710  if (simple_sets_p
10711      && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_ADD))
10712    {
10713
10714      /*  We're trying to match:
10715          prev (adrp) == (set (reg r1)
10716                              (high (symbol_ref ("SYM"))))
10717          curr (add) == (set (reg r0)
10718                             (lo_sum (reg r1)
10719                                     (symbol_ref ("SYM"))))
10720          Note that r0 need not necessarily be the same as r1, especially
10721          during pre-regalloc scheduling.  */
10722
10723      if (satisfies_constraint_Ush (SET_SRC (prev_set))
10724          && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10725        {
10726          if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
10727              && REG_P (XEXP (SET_SRC (curr_set), 0))
10728              && REGNO (XEXP (SET_SRC (curr_set), 0))
10729                 == REGNO (SET_DEST (prev_set))
10730              && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
10731                              XEXP (SET_SRC (curr_set), 1)))
10732            return true;
10733        }
10734    }
10735
10736  if (simple_sets_p
10737      && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOVK_MOVK))
10738    {
10739
10740      /* We're trying to match:
10741         prev (movk) == (set (zero_extract (reg r0)
10742                                           (const_int 16)
10743                                           (const_int 32))
10744                             (const_int imm16_1))
10745         curr (movk) == (set (zero_extract (reg r0)
10746                                           (const_int 16)
10747                                           (const_int 48))
10748                             (const_int imm16_2))  */
10749
10750      if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
10751          && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
10752          && REG_P (XEXP (SET_DEST (prev_set), 0))
10753          && REG_P (XEXP (SET_DEST (curr_set), 0))
10754          && REGNO (XEXP (SET_DEST (prev_set), 0))
10755             == REGNO (XEXP (SET_DEST (curr_set), 0))
10756          && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
10757          && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
10758          && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
10759          && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
10760          && CONST_INT_P (SET_SRC (prev_set))
10761          && CONST_INT_P (SET_SRC (curr_set)))
10762        return true;
10763
10764    }
10765  if (simple_sets_p
10766      && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_LDR))
10767    {
10768      /* We're trying to match:
10769          prev (adrp) == (set (reg r0)
10770                              (high (symbol_ref ("SYM"))))
10771          curr (ldr) == (set (reg r1)
10772                             (mem (lo_sum (reg r0)
10773                                             (symbol_ref ("SYM")))))
10774                 or
10775          curr (ldr) == (set (reg r1)
10776                             (zero_extend (mem
10777                                           (lo_sum (reg r0)
10778                                                   (symbol_ref ("SYM"))))))  */
10779      if (satisfies_constraint_Ush (SET_SRC (prev_set))
10780          && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10781        {
10782          rtx curr_src = SET_SRC (curr_set);
10783
10784          if (GET_CODE (curr_src) == ZERO_EXTEND)
10785            curr_src = XEXP (curr_src, 0);
10786
10787          if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
10788              && REG_P (XEXP (XEXP (curr_src, 0), 0))
10789              && REGNO (XEXP (XEXP (curr_src, 0), 0))
10790                 == REGNO (SET_DEST (prev_set))
10791              && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
10792                              XEXP (SET_SRC (prev_set), 0)))
10793              return true;
10794        }
10795    }
10796
10797  if ((aarch64_tune_params->fuseable_ops & AARCH64_FUSE_CMP_BRANCH)
10798      && any_condjump_p (curr))
10799    {
10800      enum attr_type prev_type = get_attr_type (prev);
10801
10802      /* FIXME: this misses some instructions which are considered simple
10803         arithmetic instructions for ThunderX.  Simple shifts are missed here.  */
10804      if (prev_type == TYPE_ALUS_SREG
10805          || prev_type == TYPE_ALUS_IMM
10806          || prev_type == TYPE_LOGICS_REG
10807          || prev_type == TYPE_LOGICS_IMM)
10808        return true;
10809    }
10810
10811  return false;
10812}
10813
10814/* If MEM is in the form of [base+offset], extract the two parts of the
10815   address into BASE and OFFSET and return true; otherwise return false
10816   after clearing BASE and OFFSET.  */
10817
10818bool
10819extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
10820{
10821  rtx addr;
10822
10823  gcc_assert (MEM_P (mem));
10824
10825  addr = XEXP (mem, 0);
10826
10827  if (REG_P (addr))
10828    {
10829      *base = addr;
10830      *offset = const0_rtx;
10831      return true;
10832    }
10833
10834  if (GET_CODE (addr) == PLUS
10835      && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
10836    {
10837      *base = XEXP (addr, 0);
10838      *offset = XEXP (addr, 1);
10839      return true;
10840    }
10841
10842  *base = NULL_RTX;
10843  *offset = NULL_RTX;
10844
10845  return false;
10846}
10847
10848/* Types for scheduling fusion.  */
10849enum sched_fusion_type
10850{
10851  SCHED_FUSION_NONE = 0,
10852  SCHED_FUSION_LD_SIGN_EXTEND,
10853  SCHED_FUSION_LD_ZERO_EXTEND,
10854  SCHED_FUSION_LD,
10855  SCHED_FUSION_ST,
10856  SCHED_FUSION_NUM
10857};
10858
10859/* If INSN is a load or store with an address in the form of [base+offset],
10860   extract the two parts into BASE and OFFSET.  Return the scheduling
10861   fusion type of this INSN.  */
10862
10863static enum sched_fusion_type
10864fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
10865{
10866  rtx x, dest, src;
10867  enum sched_fusion_type fusion = SCHED_FUSION_LD;
10868
10869  gcc_assert (INSN_P (insn));
10870  x = PATTERN (insn);
10871  if (GET_CODE (x) != SET)
10872    return SCHED_FUSION_NONE;
10873
10874  src = SET_SRC (x);
10875  dest = SET_DEST (x);
10876
10877  if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
10878      && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
10879    return SCHED_FUSION_NONE;
10880
10881  if (GET_CODE (src) == SIGN_EXTEND)
10882    {
10883      fusion = SCHED_FUSION_LD_SIGN_EXTEND;
10884      src = XEXP (src, 0);
10885      if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10886	return SCHED_FUSION_NONE;
10887    }
10888  else if (GET_CODE (src) == ZERO_EXTEND)
10889    {
10890      fusion = SCHED_FUSION_LD_ZERO_EXTEND;
10891      src = XEXP (src, 0);
10892      if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10893	return SCHED_FUSION_NONE;
10894    }
10895
10896  if (GET_CODE (src) == MEM && REG_P (dest))
10897    extract_base_offset_in_addr (src, base, offset);
10898  else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
10899    {
10900      fusion = SCHED_FUSION_ST;
10901      extract_base_offset_in_addr (dest, base, offset);
10902    }
10903  else
10904    return SCHED_FUSION_NONE;
10905
10906  if (*base == NULL_RTX || *offset == NULL_RTX)
10907    fusion = SCHED_FUSION_NONE;
10908
10909  return fusion;
10910}
10911
10912/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10913
10914   Currently we only support fusing ldr and str instructions, so FUSION_PRI
10915   and PRI are only calculated for these instructions.  For other instructions,
10916   FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
10917   types of instruction fusion can be added by returning different priorities.
10918
10919   It's important that irrelevant instructions get the largest FUSION_PRI.  */
10920
10921static void
10922aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
10923			       int *fusion_pri, int *pri)
10924{
10925  int tmp, off_val;
10926  rtx base, offset;
10927  enum sched_fusion_type fusion;
10928
10929  gcc_assert (INSN_P (insn));
10930
10931  tmp = max_pri - 1;
10932  fusion = fusion_load_store (insn, &base, &offset);
10933  if (fusion == SCHED_FUSION_NONE)
10934    {
10935      *pri = tmp;
10936      *fusion_pri = tmp;
10937      return;
10938    }
10939
10940  /* Set FUSION_PRI according to fusion type and base register.  */
10941  *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
10942
10943  /* Calculate PRI.  */
10944  tmp /= 2;
10945
10946  /* INSN with smaller offset goes first.  */
10947  off_val = (int)(INTVAL (offset));
10948  if (off_val >= 0)
10949    tmp -= (off_val & 0xfffff);
10950  else
10951    tmp += ((- off_val) & 0xfffff);
10952
10953  *pri = tmp;
10954  return;
10955}
10956
10957/* Given OPERANDS of consecutive load/store, check if we can merge
10958   them into ldp/stp.  LOAD is true if they are load instructions.
10959   MODE is the mode of memory operands.  */
10960
10961bool
10962aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
10963				enum machine_mode mode)
10964{
10965  HOST_WIDE_INT offval_1, offval_2, msize;
10966  enum reg_class rclass_1, rclass_2;
10967  rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
10968
10969  if (load)
10970    {
10971      mem_1 = operands[1];
10972      mem_2 = operands[3];
10973      reg_1 = operands[0];
10974      reg_2 = operands[2];
10975      gcc_assert (REG_P (reg_1) && REG_P (reg_2));
10976      if (REGNO (reg_1) == REGNO (reg_2))
10977	return false;
10978    }
10979  else
10980    {
10981      mem_1 = operands[0];
10982      mem_2 = operands[2];
10983      reg_1 = operands[1];
10984      reg_2 = operands[3];
10985    }
10986
10987  /* The mems cannot be volatile.  */
10988  if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
10989    return false;
10990
10991  /* Check if the addresses are in the form of [base+offset].  */
10992  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
10993  if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
10994    return false;
10995  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
10996  if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
10997    return false;
10998
10999  /* Check if the bases are the same.  */
11000  if (!rtx_equal_p (base_1, base_2))
11001    return false;
11002
11003  offval_1 = INTVAL (offset_1);
11004  offval_2 = INTVAL (offset_2);
11005  msize = GET_MODE_SIZE (mode);
11006  /* Check if the offsets are consecutive.  */
11007  if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
11008    return false;
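  /* For instance (illustrative values), with SImode accesses (msize == 4)
     offsets 16 and 20 are consecutive in either order and may be paired,
     whereas offsets 16 and 24 are rejected here.  */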
11009
11010  /* Check if the addresses are clobbered by load.  */
11011  if (load)
11012    {
11013      if (reg_mentioned_p (reg_1, mem_1))
11014	return false;
11015
11016      /* In increasing order, the last load can clobber the address.  */
11017      if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
11018	return false;
11019    }
11020
11021  if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11022    rclass_1 = FP_REGS;
11023  else
11024    rclass_1 = GENERAL_REGS;
11025
11026  if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11027    rclass_2 = FP_REGS;
11028  else
11029    rclass_2 = GENERAL_REGS;
11030
11031  /* Check if the registers are of the same class.  */
11032  if (rclass_1 != rclass_2)
11033    return false;
11034
11035  return true;
11036}
11037
11038/* Given OPERANDS of consecutive load/store, check if we can merge
11039   them into ldp/stp by adjusting the offset.  LOAD is true if they
11040   are load instructions.  MODE is the mode of memory operands.
11041
11042   Given the consecutive stores below:
11043
11044     str  w1, [xb, 0x100]
11045     str  w1, [xb, 0x104]
11046     str  w1, [xb, 0x108]
11047     str  w1, [xb, 0x10c]
11048
11049   Though the offsets are out of the range supported by stp, we can
11050   still pair them after adjusting the offset, like:
11051
11052     add  scratch, xb, 0x100
11053     stp  w1, w1, [scratch]
11054     stp  w1, w1, [scratch, 0x8]
11055
11056   The peephole patterns detecting this opportunity should guarantee
11057   the scratch register is available.  */
11058
11059bool
11060aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11061				       enum machine_mode mode)
11062{
11063  enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11064  HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11065  rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11066  rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11067
11068  if (load)
11069    {
11070      reg_1 = operands[0];
11071      mem_1 = operands[1];
11072      reg_2 = operands[2];
11073      mem_2 = operands[3];
11074      reg_3 = operands[4];
11075      mem_3 = operands[5];
11076      reg_4 = operands[6];
11077      mem_4 = operands[7];
11078      gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11079		  && REG_P (reg_3) && REG_P (reg_4));
11080      if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11081	return false;
11082    }
11083  else
11084    {
11085      mem_1 = operands[0];
11086      reg_1 = operands[1];
11087      mem_2 = operands[2];
11088      reg_2 = operands[3];
11089      mem_3 = operands[4];
11090      reg_3 = operands[5];
11091      mem_4 = operands[6];
11092      reg_4 = operands[7];
11093    }
11094  /* Skip if memory operand is by itself valid for ldp/stp.  */
11095  if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11096    return false;
11097
11098  /* The mems cannot be volatile.  */
11099  if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11100      || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
11101    return false;
11102
11103  /* Check if the addresses are in the form of [base+offset].  */
11104  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11105  if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11106    return false;
11107  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11108  if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11109    return false;
11110  extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11111  if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11112    return false;
11113  extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11114  if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11115    return false;
11116
11117  /* Check if the bases are the same.  */
11118  if (!rtx_equal_p (base_1, base_2)
11119      || !rtx_equal_p (base_2, base_3)
11120      || !rtx_equal_p (base_3, base_4))
11121    return false;
11122
11123  offval_1 = INTVAL (offset_1);
11124  offval_2 = INTVAL (offset_2);
11125  offval_3 = INTVAL (offset_3);
11126  offval_4 = INTVAL (offset_4);
11127  msize = GET_MODE_SIZE (mode);
11128  /* Check if the offsets are consecutive.  */
11129  if ((offval_1 != (offval_2 + msize)
11130       || offval_1 != (offval_3 + msize * 2)
11131       || offval_1 != (offval_4 + msize * 3))
11132      && (offval_4 != (offval_3 + msize)
11133	  || offval_4 != (offval_2 + msize * 2)
11134	  || offval_4 != (offval_1 + msize * 3)))
11135    return false;
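  /* E.g. for SImode accesses (msize == 4), the four offsets must form a
     run such as 0x100, 0x104, 0x108, 0x10c, either in ascending or in
     descending operand order.  */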
11136
11137  /* Check if the addresses are clobbered by load.  */
11138  if (load)
11139    {
11140      if (reg_mentioned_p (reg_1, mem_1)
11141	  || reg_mentioned_p (reg_2, mem_2)
11142	  || reg_mentioned_p (reg_3, mem_3))
11143	return false;
11144
11145      /* In increasing order, the last load can clobber the address.  */
11146      if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11147	return false;
11148    }
11149
11150  if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11151    rclass_1 = FP_REGS;
11152  else
11153    rclass_1 = GENERAL_REGS;
11154
11155  if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11156    rclass_2 = FP_REGS;
11157  else
11158    rclass_2 = GENERAL_REGS;
11159
11160  if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11161    rclass_3 = FP_REGS;
11162  else
11163    rclass_3 = GENERAL_REGS;
11164
11165  if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11166    rclass_4 = FP_REGS;
11167  else
11168    rclass_4 = GENERAL_REGS;
11169
11170  /* Check if the registers are of the same class.  */
11171  if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11172    return false;
11173
11174  return true;
11175}
11176
11177/* Given OPERANDS of consecutive load/store, this function pairs them
11178   into ldp/stp after adjusting the offset.  It depends on the fact
11179   that addresses of load/store instructions are in increasing order.
11180   MODE is the mode of memory operands.  CODE is the rtl operator
11181   which should be applied to all memory operands; it is SIGN_EXTEND,
11182   ZERO_EXTEND or UNKNOWN.  */
11183
11184bool
11185aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11186			     enum machine_mode mode, RTX_CODE code)
11187{
11188  rtx base, offset, t1, t2;
11189  rtx mem_1, mem_2, mem_3, mem_4;
11190  HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11191
11192  if (load)
11193    {
11194      mem_1 = operands[1];
11195      mem_2 = operands[3];
11196      mem_3 = operands[5];
11197      mem_4 = operands[7];
11198    }
11199  else
11200    {
11201      mem_1 = operands[0];
11202      mem_2 = operands[2];
11203      mem_3 = operands[4];
11204      mem_4 = operands[6];
11205      gcc_assert (code == UNKNOWN);
11206    }
11207
11208  extract_base_offset_in_addr (mem_1, &base, &offset);
11209  gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11210
11211  /* Adjust the offset so it can fit in an ldp/stp instruction.  */
11212  msize = GET_MODE_SIZE (mode);
11213  stp_off_limit = msize * 0x40;
11214  off_val = INTVAL (offset);
11215  abs_off = (off_val < 0) ? -off_val : off_val;
11216  new_off = abs_off % stp_off_limit;
11217  adj_off = abs_off - new_off;
11218
11219  /* Further adjust to make sure all offsets are OK.  */
11220  if ((new_off + msize * 2) >= stp_off_limit)
11221    {
11222      adj_off += stp_off_limit;
11223      new_off -= stp_off_limit;
11224    }
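  /* Worked example for the stores shown in the comment before
     aarch64_operands_adjust_ok_for_ldpstp: with SImode (msize == 4),
     stp_off_limit is 0x100, so for off_val 0x100 we get new_off == 0 and
     adj_off == 0x100; the extra adjustment above does not trigger since
     0 + 2 * 4 < 0x100.  The scratch register then holds xb + 0x100 and
     the paired accesses use offsets 0 and 8.  */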
11225
11226  /* Make sure the adjustment can be done with ADD/SUB instructions.  */
11227  if (adj_off >= 0x1000)
11228    return false;
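  /* The 0x1000 bound corresponds to the 12-bit unsigned immediate field of
     the AArch64 ADD/SUB (immediate) instructions used for the adjustment.  */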
11229
11230  if (off_val < 0)
11231    {
11232      adj_off = -adj_off;
11233      new_off = -new_off;
11234    }
11235
11236  /* Create new memory references.  */
11237  mem_1 = change_address (mem_1, VOIDmode,
11238			  plus_constant (DImode, operands[8], new_off));
11239
11240  /* Check if the adjusted address is OK for ldp/stp.  */
11241  if (!aarch64_mem_pair_operand (mem_1, mode))
11242    return false;
11243
11244  msize = GET_MODE_SIZE (mode);
11245  mem_2 = change_address (mem_2, VOIDmode,
11246			  plus_constant (DImode,
11247					 operands[8],
11248					 new_off + msize));
11249  mem_3 = change_address (mem_3, VOIDmode,
11250			  plus_constant (DImode,
11251					 operands[8],
11252					 new_off + msize * 2));
11253  mem_4 = change_address (mem_4, VOIDmode,
11254			  plus_constant (DImode,
11255					 operands[8],
11256					 new_off + msize * 3));
11257
11258  if (code == ZERO_EXTEND)
11259    {
11260      mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11261      mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11262      mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11263      mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11264    }
11265  else if (code == SIGN_EXTEND)
11266    {
11267      mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11268      mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11269      mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11270      mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11271    }
11272
11273  if (load)
11274    {
11275      operands[1] = mem_1;
11276      operands[3] = mem_2;
11277      operands[5] = mem_3;
11278      operands[7] = mem_4;
11279    }
11280  else
11281    {
11282      operands[0] = mem_1;
11283      operands[2] = mem_2;
11284      operands[4] = mem_3;
11285      operands[6] = mem_4;
11286    }
11287
11288  /* Emit adjusting instruction.  */
11289  emit_insn (gen_rtx_SET (VOIDmode, operands[8],
11290			  plus_constant (DImode, base, adj_off)));
11291  /* Emit ldp/stp instructions.  */
11292  t1 = gen_rtx_SET (VOIDmode, operands[0], operands[1]);
11293  t2 = gen_rtx_SET (VOIDmode, operands[2], operands[3]);
11294  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11295  t1 = gen_rtx_SET (VOIDmode, operands[4], operands[5]);
11296  t2 = gen_rtx_SET (VOIDmode, operands[6], operands[7]);
11297  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11298  return true;
11299}
11300
11301#undef TARGET_ADDRESS_COST
11302#define TARGET_ADDRESS_COST aarch64_address_cost
11303
11304/* This hook determines whether unnamed bitfields affect the alignment
11305   of the containing structure.  The hook returns true if the structure
11306   should inherit the alignment requirements of an unnamed bitfield's
11307   type.  */
11308#undef TARGET_ALIGN_ANON_BITFIELD
11309#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11310
11311#undef TARGET_ASM_ALIGNED_DI_OP
11312#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11313
11314#undef TARGET_ASM_ALIGNED_HI_OP
11315#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11316
11317#undef TARGET_ASM_ALIGNED_SI_OP
11318#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11319
11320#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11321#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11322  hook_bool_const_tree_hwi_hwi_const_tree_true
11323
11324#undef TARGET_ASM_FILE_START
11325#define TARGET_ASM_FILE_START aarch64_start_file
11326
11327#undef TARGET_ASM_OUTPUT_MI_THUNK
11328#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11329
11330#undef TARGET_ASM_SELECT_RTX_SECTION
11331#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11332
11333#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11334#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11335
11336#undef TARGET_BUILD_BUILTIN_VA_LIST
11337#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11338
11339#undef TARGET_CALLEE_COPIES
11340#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11341
11342#undef TARGET_CAN_ELIMINATE
11343#define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11344
11345#undef TARGET_CANNOT_FORCE_CONST_MEM
11346#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11347
11348#undef TARGET_CONDITIONAL_REGISTER_USAGE
11349#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11350
11351/* Only the least significant bit is used for initialization guard
11352   variables.  */
11353#undef TARGET_CXX_GUARD_MASK_BIT
11354#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11355
11356#undef TARGET_C_MODE_FOR_SUFFIX
11357#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11358
11359#ifdef TARGET_BIG_ENDIAN_DEFAULT
11360#undef  TARGET_DEFAULT_TARGET_FLAGS
11361#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11362#endif
11363
11364#undef TARGET_CLASS_MAX_NREGS
11365#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11366
11367#undef TARGET_BUILTIN_DECL
11368#define TARGET_BUILTIN_DECL aarch64_builtin_decl
11369
11370#undef  TARGET_EXPAND_BUILTIN
11371#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11372
11373#undef TARGET_EXPAND_BUILTIN_VA_START
11374#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11375
11376#undef TARGET_FOLD_BUILTIN
11377#define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11378
11379#undef TARGET_FUNCTION_ARG
11380#define TARGET_FUNCTION_ARG aarch64_function_arg
11381
11382#undef TARGET_FUNCTION_ARG_ADVANCE
11383#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11384
11385#undef TARGET_FUNCTION_ARG_BOUNDARY
11386#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11387
11388#undef TARGET_FUNCTION_OK_FOR_SIBCALL
11389#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11390
11391#undef TARGET_FUNCTION_VALUE
11392#define TARGET_FUNCTION_VALUE aarch64_function_value
11393
11394#undef TARGET_FUNCTION_VALUE_REGNO_P
11395#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11396
11397#undef TARGET_FRAME_POINTER_REQUIRED
11398#define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11399
11400#undef TARGET_GIMPLE_FOLD_BUILTIN
11401#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11402
11403#undef TARGET_GIMPLIFY_VA_ARG_EXPR
11404#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11405
11406#undef  TARGET_INIT_BUILTINS
11407#define TARGET_INIT_BUILTINS  aarch64_init_builtins
11408
11409#undef TARGET_LEGITIMATE_ADDRESS_P
11410#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11411
11412#undef TARGET_LEGITIMATE_CONSTANT_P
11413#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11414
11415#undef TARGET_LIBGCC_CMP_RETURN_MODE
11416#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11417
11418#undef TARGET_LRA_P
11419#define TARGET_LRA_P hook_bool_void_true
11420
11421#undef TARGET_MANGLE_TYPE
11422#define TARGET_MANGLE_TYPE aarch64_mangle_type
11423
11424#undef TARGET_MEMORY_MOVE_COST
11425#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11426
11427#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11428#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11429
11430#undef TARGET_MUST_PASS_IN_STACK
11431#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11432
11433/* This target hook should return true if accesses to volatile bitfields
11434   should use the narrowest mode possible.  It should return false if these
11435   accesses should use the bitfield container type.  */
11436#undef TARGET_NARROW_VOLATILE_BITFIELD
11437#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11438
11439#undef  TARGET_OPTION_OVERRIDE
11440#define TARGET_OPTION_OVERRIDE aarch64_override_options
11441
11442#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11443#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11444  aarch64_override_options_after_change
11445
11446#undef TARGET_PASS_BY_REFERENCE
11447#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11448
11449#undef TARGET_PREFERRED_RELOAD_CLASS
11450#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11451
11452#undef TARGET_SCHED_REASSOCIATION_WIDTH
11453#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11454
11455#undef TARGET_SECONDARY_RELOAD
11456#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11457
11458#undef TARGET_SHIFT_TRUNCATION_MASK
11459#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11460
11461#undef TARGET_SETUP_INCOMING_VARARGS
11462#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11463
11464#undef TARGET_STRUCT_VALUE_RTX
11465#define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
11466
11467#undef TARGET_REGISTER_MOVE_COST
11468#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11469
11470#undef TARGET_RETURN_IN_MEMORY
11471#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11472
11473#undef TARGET_RETURN_IN_MSB
11474#define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11475
11476#undef TARGET_RTX_COSTS
11477#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11478
11479#undef TARGET_SCHED_ISSUE_RATE
11480#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11481
11482#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
11483#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
11484  aarch64_sched_first_cycle_multipass_dfa_lookahead
11485
11486#undef TARGET_TRAMPOLINE_INIT
11487#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11488
11489#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11490#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11491
11492#undef TARGET_VECTOR_MODE_SUPPORTED_P
11493#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11494
11495#undef TARGET_ARRAY_MODE_SUPPORTED_P
11496#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11497
11498#undef TARGET_VECTORIZE_ADD_STMT_COST
11499#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11500
11501#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11502#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11503  aarch64_builtin_vectorization_cost
11504
11505#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11506#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11507
11508#undef TARGET_VECTORIZE_BUILTINS
11509#define TARGET_VECTORIZE_BUILTINS
11510
11511#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11512#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11513  aarch64_builtin_vectorized_function
11514
11515#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11516#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11517  aarch64_autovectorize_vector_sizes
11518
11519#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11520#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11521  aarch64_atomic_assign_expand_fenv
11522
11523/* Section anchor support.  */
11524
11525#undef TARGET_MIN_ANCHOR_OFFSET
11526#define TARGET_MIN_ANCHOR_OFFSET -256
11527
11528/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11529   byte offset; we can do much more for larger data types, but have no way
11530   to determine the size of the access.  We assume accesses are aligned.  */
11531#undef TARGET_MAX_ANCHOR_OFFSET
11532#define TARGET_MAX_ANCHOR_OFFSET 4095
11533
11534#undef TARGET_VECTOR_ALIGNMENT
11535#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11536
11537#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11538#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11539  aarch64_simd_vector_alignment_reachable
11540
11541/* vec_perm support.  */
11542
11543#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11544#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11545  aarch64_vectorize_vec_perm_const_ok
11546
11547
11548#undef TARGET_FIXED_CONDITION_CODE_REGS
11549#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11550
11551#undef TARGET_FLAGS_REGNUM
11552#define TARGET_FLAGS_REGNUM CC_REGNUM
11553
11554#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11555#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11556
11557#undef TARGET_ASAN_SHADOW_OFFSET
11558#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11559
11560#undef TARGET_LEGITIMIZE_ADDRESS
11561#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11562
11563#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11564#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11565  aarch64_use_by_pieces_infrastructure_p
11566
11567#undef TARGET_CAN_USE_DOLOOP_P
11568#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11569
11570#undef TARGET_SCHED_MACRO_FUSION_P
11571#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11572
11573#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11574#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11575
11576#undef TARGET_SCHED_FUSION_PRIORITY
11577#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11578
11579#undef TARGET_RELAXED_ORDERING
11580#define TARGET_RELAXED_ORDERING true
11581
11582struct gcc_target targetm = TARGET_INITIALIZER;
11583
11584#include "gt-aarch64.h"
11585