1/* Machine description for AArch64 architecture.
2   Copyright (C) 2009-2020 Free Software Foundation, Inc.
3   Contributed by ARM Ltd.
4
5   This file is part of GCC.
6
7   GCC is free software; you can redistribute it and/or modify it
8   under the terms of the GNU General Public License as published by
9   the Free Software Foundation; either version 3, or (at your option)
10   any later version.
11
12   GCC is distributed in the hope that it will be useful, but
13   WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   General Public License for more details.
16
17   You should have received a copy of the GNU General Public License
18   along with GCC; see the file COPYING3.  If not see
19   <http://www.gnu.org/licenses/>.  */
20
21#define IN_TARGET_CODE 1
22
23#include "config.h"
24#define INCLUDE_STRING
25#include "system.h"
26#include "coretypes.h"
27#include "backend.h"
28#include "target.h"
29#include "rtl.h"
30#include "tree.h"
31#include "memmodel.h"
32#include "gimple.h"
33#include "cfghooks.h"
34#include "cfgloop.h"
35#include "df.h"
36#include "tm_p.h"
37#include "stringpool.h"
38#include "attribs.h"
39#include "optabs.h"
40#include "regs.h"
41#include "emit-rtl.h"
42#include "recog.h"
43#include "cgraph.h"
44#include "diagnostic.h"
45#include "insn-attr.h"
46#include "alias.h"
47#include "fold-const.h"
48#include "stor-layout.h"
49#include "calls.h"
50#include "varasm.h"
51#include "output.h"
52#include "flags.h"
53#include "explow.h"
54#include "expr.h"
55#include "reload.h"
56#include "langhooks.h"
57#include "opts.h"
58#include "gimplify.h"
59#include "dwarf2.h"
60#include "gimple-iterator.h"
61#include "tree-vectorizer.h"
62#include "aarch64-cost-tables.h"
63#include "dumpfile.h"
64#include "builtins.h"
65#include "rtl-iter.h"
66#include "tm-constrs.h"
67#include "sched-int.h"
68#include "target-globals.h"
69#include "common/common-target.h"
70#include "cfgrtl.h"
71#include "selftest.h"
72#include "selftest-rtl.h"
73#include "rtx-vector-builder.h"
74#include "intl.h"
75#include "expmed.h"
76#include "function-abi.h"
77
78/* This file should be included last.  */
79#include "target-def.h"
80
81/* Defined for convenience.  */
82#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
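/* For illustration only: with the default LP64 ABI, POINTER_SIZE is 64 and
   BITS_PER_UNIT is 8, so POINTER_BYTES evaluates to 8; under -mabi=ilp32 it
   would evaluate to 4.  */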
83
84/* Information about a legitimate vector immediate operand.  */
85struct simd_immediate_info
86{
87  enum insn_type { MOV, MVN, INDEX, PTRUE };
88  enum modifier_type { LSL, MSL };
89
90  simd_immediate_info () {}
91  simd_immediate_info (scalar_float_mode, rtx);
92  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93		       insn_type = MOV, modifier_type = LSL,
94		       unsigned int = 0);
95  simd_immediate_info (scalar_mode, rtx, rtx);
96  simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97
98  /* The mode of the elements.  */
99  scalar_mode elt_mode;
100
101  /* The instruction to use to move the immediate into a vector.  */
102  insn_type insn;
103
104  union
105  {
106    /* For MOV and MVN.  */
107    struct
108    {
109      /* The value of each element.  */
110      rtx value;
111
112      /* The kind of shift modifier to use, and the number of bits to shift.
113	 This is (LSL, 0) if no shift is needed.  */
114      modifier_type modifier;
115      unsigned int shift;
116    } mov;
117
118    /* For INDEX.  */
119    struct
120    {
121      /* The value of the first element and the step to be added for each
122	 subsequent element.  */
123      rtx base, step;
124    } index;
125
126    /* For PTRUE.  */
127    aarch64_svpattern pattern;
128  } u;
129};
130
131/* Construct a floating-point immediate in which each element has mode
132   ELT_MODE_IN and value VALUE_IN.  */
133inline simd_immediate_info
134::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135  : elt_mode (elt_mode_in), insn (MOV)
136{
137  u.mov.value = value_in;
138  u.mov.modifier = LSL;
139  u.mov.shift = 0;
140}
141
142/* Construct an integer immediate in which each element has mode ELT_MODE_IN
143   and value VALUE_IN.  The other parameters are as for the structure
144   fields.  */
145inline simd_immediate_info
146::simd_immediate_info (scalar_int_mode elt_mode_in,
147		       unsigned HOST_WIDE_INT value_in,
148		       insn_type insn_in, modifier_type modifier_in,
149		       unsigned int shift_in)
150  : elt_mode (elt_mode_in), insn (insn_in)
151{
152  u.mov.value = gen_int_mode (value_in, elt_mode_in);
153  u.mov.modifier = modifier_in;
154  u.mov.shift = shift_in;
155}
156
157/* Construct an integer immediate in which each element has mode ELT_MODE_IN
158   and where element I is equal to BASE_IN + I * STEP_IN.  */
159inline simd_immediate_info
160::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161  : elt_mode (elt_mode_in), insn (INDEX)
162{
163  u.index.base = base_in;
164  u.index.step = step_in;
165}
166
167/* Construct a predicate that controls elements of mode ELT_MODE_IN
168   and has PTRUE pattern PATTERN_IN.  */
169inline simd_immediate_info
170::simd_immediate_info (scalar_int_mode elt_mode_in,
171		       aarch64_svpattern pattern_in)
172  : elt_mode (elt_mode_in), insn (PTRUE)
173{
174  u.pattern = pattern_in;
175}
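
/* For illustration only (hypothetical example, not used by the build):
   the Advanced SIMD instruction

       movi  v0.4s, #0xab, lsl #8

   could be described with the constructors above as

       simd_immediate_info info (SImode, 0xab, simd_immediate_info::MOV,
				 simd_immediate_info::LSL, 8);

   i.e. each 32-bit element holds the value 0xab shifted left by 8 bits.  */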
176
177namespace {
178
179/* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64.  */
180class pure_scalable_type_info
181{
182public:
183  /* Represents the result of analyzing a type.  All values are nonzero,
184     in the possibly forlorn hope that accidental conversions to bool
185     trigger a warning.  */
186  enum analysis_result
187  {
188    /* The type does not have an ABI identity; i.e. it doesn't contain
189       at least one object whose type is a Fundamental Data Type.  */
190    NO_ABI_IDENTITY = 1,
191
192    /* The type is definitely a Pure Scalable Type.  */
193    IS_PST,
194
195    /* The type is definitely not a Pure Scalable Type.  */
196    ISNT_PST,
197
198    /* It doesn't matter for PCS purposes whether the type is a Pure
199       Scalable Type or not, since the type will be handled the same
200       way regardless.
201
202       Specifically, this means that if the type is a Pure Scalable Type,
203       there aren't enough argument registers to hold it, and so it will
204       need to be passed or returned in memory.  If the type isn't a
205       Pure Scalable Type, it's too big to be passed or returned in core
206       or SIMD&FP registers, and so again will need to go in memory.  */
207    DOESNT_MATTER
208  };
209
210  /* Aggregates of 17 bytes or more are normally passed and returned
211     in memory, so aggregates of that size can safely be analyzed as
212     DOESNT_MATTER.  We need to be able to collect enough pieces to
213     represent a PST that is smaller than that.  Since predicates are
214     2 bytes in size for -msve-vector-bits=128, that means we need to be
215     able to store at least 8 pieces.
216
217     We also need to be able to store enough pieces to represent
218     a single vector in each vector argument register and a single
219     predicate in each predicate argument register.  This means that
220     we need at least 12 pieces.  */
221  static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
222#if __cplusplus >= 201103L
223  static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
224#endif
225
226  /* Describes one piece of a PST.  Each piece is one of:
227
228     - a single Scalable Vector Type (SVT)
229     - a single Scalable Predicate Type (SPT)
230     - a PST containing 2, 3 or 4 SVTs, with no padding
231
232     It either represents a single built-in type or a PST formed from
233     multiple homogeneous built-in types.  */
234  struct piece
235  {
236    rtx get_rtx (unsigned int, unsigned int) const;
237
238    /* The number of vector and predicate registers that the piece
239       occupies.  One of the two is always zero.  */
240    unsigned int num_zr;
241    unsigned int num_pr;
242
243    /* The mode of the registers described above.  */
244    machine_mode mode;
245
246    /* If this piece is formed from multiple homogeneous built-in types,
247       this is the mode of the built-in types, otherwise it is MODE.  */
248    machine_mode orig_mode;
249
250    /* The offset in bytes of the piece from the start of the type.  */
251    poly_uint64_pod offset;
252  };
253
254  /* Divides types analyzed as IS_PST into individual pieces.  The pieces
255     are in memory order.  */
256  auto_vec<piece, MAX_PIECES> pieces;
257
258  unsigned int num_zr () const;
259  unsigned int num_pr () const;
260
261  rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
262
263  analysis_result analyze (const_tree);
264  bool analyze_registers (const_tree);
265
266private:
267  analysis_result analyze_array (const_tree);
268  analysis_result analyze_record (const_tree);
269  void add_piece (const piece &);
270};
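
/* For illustration only: assuming SVE is enabled, a type such as

       struct pst { svfloat32_t lo; svfloat32_t hi; svbool_t pred; };

   would be analyzed as IS_PST and recorded as three pieces: two that each
   occupy one Z register and one that occupies a P register.  */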
271}
272
273/* The current code model.  */
274enum aarch64_code_model aarch64_cmodel;
275
276/* The number of 64-bit elements in an SVE vector.  */
277poly_uint16 aarch64_sve_vg;
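
/* For example, with -msve-vector-bits=256 each vector contains 256 / 64 == 4
   such elements, so aarch64_sve_vg is the constant 4; for the default
   length-agnostic setting it is a runtime-determined poly_uint16 with a
   minimum value of 2.  */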
278
279#ifdef HAVE_AS_TLS
280#undef TARGET_HAVE_TLS
281#define TARGET_HAVE_TLS 1
282#endif
283
284static bool aarch64_composite_type_p (const_tree, machine_mode);
285static bool aarch64_return_in_memory_1 (const_tree);
286static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
287						     const_tree,
288						     machine_mode *, int *,
289						     bool *, bool);
290static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
291static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
292static void aarch64_override_options_after_change (void);
293static bool aarch64_vector_mode_supported_p (machine_mode);
294static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
295static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
296							 const_tree type,
297							 int misalignment,
298							 bool is_packed);
299static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
300static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
301					    aarch64_addr_query_type);
302static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
303
304/* Major revision number of the ARM Architecture implemented by the target.  */
305unsigned aarch64_architecture_version;
306
307/* The processor for which instructions should be scheduled.  */
308enum aarch64_processor aarch64_tune = cortexa53;
309
310/* Mask to specify which instruction scheduling options should be used.  */
311uint64_t aarch64_tune_flags = 0;
312
313/* Global flag for PC relative loads.  */
314bool aarch64_pcrelative_literal_loads;
315
316/* Global flag for whether frame pointer is enabled.  */
317bool aarch64_use_frame_pointer;
318
319#define BRANCH_PROTECT_STR_MAX 255
320char *accepted_branch_protection_string = NULL;
321
322static enum aarch64_parse_opt_result
323aarch64_parse_branch_protection (const char*, char**);
324
325/* Support for command line parsing of boolean flags in the tuning
326   structures.  */
327struct aarch64_flag_desc
328{
329  const char* name;
330  unsigned int flag;
331};
332
333#define AARCH64_FUSION_PAIR(name, internal_name) \
334  { name, AARCH64_FUSE_##internal_name },
335static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
336{
337  { "none", AARCH64_FUSE_NOTHING },
338#include "aarch64-fusion-pairs.def"
339  { "all", AARCH64_FUSE_ALL },
340  { NULL, AARCH64_FUSE_NOTHING }
341};
342
343#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
344  { name, AARCH64_EXTRA_TUNE_##internal_name },
345static const struct aarch64_flag_desc aarch64_tuning_flags[] =
346{
347  { "none", AARCH64_EXTRA_TUNE_NONE },
348#include "aarch64-tuning-flags.def"
349  { "all", AARCH64_EXTRA_TUNE_ALL },
350  { NULL, AARCH64_EXTRA_TUNE_NONE }
351};
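
/* As an illustration (the option names below are hypothetical): a string such
   as "fuse=NAME1+NAME2" passed via -moverride is split into tokens that
   aarch64_parse_fuse_string looks up in aarch64_fusible_pairs, ORing the
   matching AARCH64_FUSE_* flags into the current tuning structure;
   "tune=..." is handled in the same way using aarch64_tuning_flags.  */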
352
353/* Tuning parameters.  */
354
355static const struct cpu_addrcost_table generic_addrcost_table =
356{
357    {
358      1, /* hi  */
359      0, /* si  */
360      0, /* di  */
361      1, /* ti  */
362    },
363  0, /* pre_modify  */
364  0, /* post_modify  */
365  0, /* register_offset  */
366  0, /* register_sextend  */
367  0, /* register_zextend  */
368  0 /* imm_offset  */
369};
370
371static const struct cpu_addrcost_table exynosm1_addrcost_table =
372{
373    {
374      0, /* hi  */
375      0, /* si  */
376      0, /* di  */
377      2, /* ti  */
378    },
379  0, /* pre_modify  */
380  0, /* post_modify  */
381  1, /* register_offset  */
382  1, /* register_sextend  */
383  2, /* register_zextend  */
384  0, /* imm_offset  */
385};
386
387static const struct cpu_addrcost_table xgene1_addrcost_table =
388{
389    {
390      1, /* hi  */
391      0, /* si  */
392      0, /* di  */
393      1, /* ti  */
394    },
395  1, /* pre_modify  */
396  1, /* post_modify  */
397  0, /* register_offset  */
398  1, /* register_sextend  */
399  1, /* register_zextend  */
400  0, /* imm_offset  */
401};
402
403static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
404{
405    {
406      1, /* hi  */
407      1, /* si  */
408      1, /* di  */
409      2, /* ti  */
410    },
411  0, /* pre_modify  */
412  0, /* post_modify  */
413  2, /* register_offset  */
414  3, /* register_sextend  */
415  3, /* register_zextend  */
416  0, /* imm_offset  */
417};
418
419static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
420{
421    {
422      1, /* hi  */
423      1, /* si  */
424      1, /* di  */
425      2, /* ti  */
426    },
427  0, /* pre_modify  */
428  0, /* post_modify  */
429  2, /* register_offset  */
430  3, /* register_sextend  */
431  3, /* register_zextend  */
432  0, /* imm_offset  */
433};
434
435static const struct cpu_addrcost_table tsv110_addrcost_table =
436{
437    {
438      1, /* hi  */
439      0, /* si  */
440      0, /* di  */
441      1, /* ti  */
442    },
443  0, /* pre_modify  */
444  0, /* post_modify  */
445  0, /* register_offset  */
446  1, /* register_sextend  */
447  1, /* register_zextend  */
448  0, /* imm_offset  */
449};
450
451static const struct cpu_addrcost_table qdf24xx_addrcost_table =
452{
453    {
454      1, /* hi  */
455      1, /* si  */
456      1, /* di  */
457      2, /* ti  */
458    },
459  1, /* pre_modify  */
460  1, /* post_modify  */
461  3, /* register_offset  */
462  3, /* register_sextend  */
463  3, /* register_zextend  */
464  2, /* imm_offset  */
465};
466
467static const struct cpu_addrcost_table a64fx_addrcost_table =
468{
469    {
470      1, /* hi  */
471      1, /* si  */
472      1, /* di  */
473      2, /* ti  */
474    },
475  0, /* pre_modify  */
476  0, /* post_modify  */
477  2, /* register_offset  */
478  3, /* register_sextend  */
479  3, /* register_zextend  */
480  0, /* imm_offset  */
481};
482
483static const struct cpu_regmove_cost generic_regmove_cost =
484{
485  1, /* GP2GP  */
486  /* Avoid the use of slow int<->fp moves for spilling by setting
487     their cost higher than memmov_cost.  */
488  5, /* GP2FP  */
489  5, /* FP2GP  */
490  2 /* FP2FP  */
491};
492
493static const struct cpu_regmove_cost cortexa57_regmove_cost =
494{
495  1, /* GP2GP  */
496  /* Avoid the use of slow int<->fp moves for spilling by setting
497     their cost higher than memmov_cost.  */
498  5, /* GP2FP  */
499  5, /* FP2GP  */
500  2 /* FP2FP  */
501};
502
503static const struct cpu_regmove_cost cortexa53_regmove_cost =
504{
505  1, /* GP2GP  */
506  /* Avoid the use of slow int<->fp moves for spilling by setting
507     their cost higher than memmov_cost.  */
508  5, /* GP2FP  */
509  5, /* FP2GP  */
510  2 /* FP2FP  */
511};
512
513static const struct cpu_regmove_cost exynosm1_regmove_cost =
514{
515  1, /* GP2GP  */
516  /* Avoid the use of slow int<->fp moves for spilling by setting
517     their cost higher than memmov_cost (actual costs are 4 and 9).  */
518  9, /* GP2FP  */
519  9, /* FP2GP  */
520  1 /* FP2FP  */
521};
522
523static const struct cpu_regmove_cost thunderx_regmove_cost =
524{
525  2, /* GP2GP  */
526  2, /* GP2FP  */
527  6, /* FP2GP  */
528  4 /* FP2FP  */
529};
530
531static const struct cpu_regmove_cost xgene1_regmove_cost =
532{
533  1, /* GP2GP  */
534  /* Avoid the use of slow int<->fp moves for spilling by setting
535     their cost higher than memmov_cost.  */
536  8, /* GP2FP  */
537  8, /* FP2GP  */
538  2 /* FP2FP  */
539};
540
541static const struct cpu_regmove_cost qdf24xx_regmove_cost =
542{
543  2, /* GP2GP  */
544  /* Avoid the use of int<->fp moves for spilling.  */
545  6, /* GP2FP  */
546  6, /* FP2GP  */
547  4 /* FP2FP  */
548};
549
550static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
551{
552  1, /* GP2GP  */
553  /* Avoid the use of int<->fp moves for spilling.  */
554  5, /* GP2FP  */
555  6, /* FP2GP  */
556  3, /* FP2FP  */
557};
558
559static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
560{
561  1, /* GP2GP  */
562  /* Avoid the use of int<->fp moves for spilling.  */
563  4, /* GP2FP  */
564  5, /* FP2GP  */
565  4  /* FP2FP  */
566};
567
568static const struct cpu_regmove_cost tsv110_regmove_cost =
569{
570  1, /* GP2GP  */
571  /* Avoid the use of slow int<->fp moves for spilling by setting
572     their cost higher than memmov_cost.  */
573  2, /* GP2FP  */
574  3, /* FP2GP  */
575  2  /* FP2FP  */
576};
577
578static const struct cpu_regmove_cost a64fx_regmove_cost =
579{
580  1, /* GP2GP  */
581  /* Avoid the use of slow int<->fp moves for spilling by setting
582     their cost higher than memmov_cost.  */
583  5, /* GP2FP  */
584  7, /* FP2GP  */
585  2 /* FP2FP  */
586};
587
588/* Generic costs for vector insn classes.  */
589static const struct cpu_vector_cost generic_vector_cost =
590{
591  1, /* scalar_int_stmt_cost  */
592  1, /* scalar_fp_stmt_cost  */
593  1, /* scalar_load_cost  */
594  1, /* scalar_store_cost  */
595  1, /* vec_int_stmt_cost  */
596  1, /* vec_fp_stmt_cost  */
597  2, /* vec_permute_cost  */
598  2, /* vec_to_scalar_cost  */
599  1, /* scalar_to_vec_cost  */
600  1, /* vec_align_load_cost  */
601  1, /* vec_unalign_load_cost  */
602  1, /* vec_unalign_store_cost  */
603  1, /* vec_store_cost  */
604  3, /* cond_taken_branch_cost  */
605  1 /* cond_not_taken_branch_cost  */
606};
607
608/* QDF24XX costs for vector insn classes.  */
609static const struct cpu_vector_cost qdf24xx_vector_cost =
610{
611  1, /* scalar_int_stmt_cost  */
612  1, /* scalar_fp_stmt_cost  */
613  1, /* scalar_load_cost  */
614  1, /* scalar_store_cost  */
615  1, /* vec_int_stmt_cost  */
616  3, /* vec_fp_stmt_cost  */
617  2, /* vec_permute_cost  */
618  1, /* vec_to_scalar_cost  */
619  1, /* scalar_to_vec_cost  */
620  1, /* vec_align_load_cost  */
621  1, /* vec_unalign_load_cost  */
622  1, /* vec_unalign_store_cost  */
623  1, /* vec_store_cost  */
624  3, /* cond_taken_branch_cost  */
625  1 /* cond_not_taken_branch_cost  */
626};
627
628/* ThunderX costs for vector insn classes.  */
629static const struct cpu_vector_cost thunderx_vector_cost =
630{
631  1, /* scalar_int_stmt_cost  */
632  1, /* scalar_fp_stmt_cost  */
633  3, /* scalar_load_cost  */
634  1, /* scalar_store_cost  */
635  4, /* vec_int_stmt_cost  */
636  1, /* vec_fp_stmt_cost  */
637  4, /* vec_permute_cost  */
638  2, /* vec_to_scalar_cost  */
639  2, /* scalar_to_vec_cost  */
640  3, /* vec_align_load_cost  */
641  5, /* vec_unalign_load_cost  */
642  5, /* vec_unalign_store_cost  */
643  1, /* vec_store_cost  */
644  3, /* cond_taken_branch_cost  */
645  3 /* cond_not_taken_branch_cost  */
646};
647
648static const struct cpu_vector_cost tsv110_vector_cost =
649{
650  1, /* scalar_int_stmt_cost  */
651  1, /* scalar_fp_stmt_cost  */
652  5, /* scalar_load_cost  */
653  1, /* scalar_store_cost  */
654  2, /* vec_int_stmt_cost  */
655  2, /* vec_fp_stmt_cost  */
656  2, /* vec_permute_cost  */
657  3, /* vec_to_scalar_cost  */
658  2, /* scalar_to_vec_cost  */
659  5, /* vec_align_load_cost  */
660  5, /* vec_unalign_load_cost  */
661  1, /* vec_unalign_store_cost  */
662  1, /* vec_store_cost  */
663  1, /* cond_taken_branch_cost  */
664  1 /* cond_not_taken_branch_cost  */
665};
666
667/* Cortex-A57 costs for vector insn classes.  */
668static const struct cpu_vector_cost cortexa57_vector_cost =
669{
670  1, /* scalar_int_stmt_cost  */
671  1, /* scalar_fp_stmt_cost  */
672  4, /* scalar_load_cost  */
673  1, /* scalar_store_cost  */
674  2, /* vec_int_stmt_cost  */
675  2, /* vec_fp_stmt_cost  */
676  3, /* vec_permute_cost  */
677  8, /* vec_to_scalar_cost  */
678  8, /* scalar_to_vec_cost  */
679  4, /* vec_align_load_cost  */
680  4, /* vec_unalign_load_cost  */
681  1, /* vec_unalign_store_cost  */
682  1, /* vec_store_cost  */
683  1, /* cond_taken_branch_cost  */
684  1 /* cond_not_taken_branch_cost  */
685};
686
687static const struct cpu_vector_cost exynosm1_vector_cost =
688{
689  1, /* scalar_int_stmt_cost  */
690  1, /* scalar_fp_stmt_cost  */
691  5, /* scalar_load_cost  */
692  1, /* scalar_store_cost  */
693  3, /* vec_int_stmt_cost  */
694  3, /* vec_fp_stmt_cost  */
695  3, /* vec_permute_cost  */
696  3, /* vec_to_scalar_cost  */
697  3, /* scalar_to_vec_cost  */
698  5, /* vec_align_load_cost  */
699  5, /* vec_unalign_load_cost  */
700  1, /* vec_unalign_store_cost  */
701  1, /* vec_store_cost  */
702  1, /* cond_taken_branch_cost  */
703  1 /* cond_not_taken_branch_cost  */
704};
705
706/* X-Gene 1 costs for vector insn classes.  */
707static const struct cpu_vector_cost xgene1_vector_cost =
708{
709  1, /* scalar_int_stmt_cost  */
710  1, /* scalar_fp_stmt_cost  */
711  5, /* scalar_load_cost  */
712  1, /* scalar_store_cost  */
713  2, /* vec_int_stmt_cost  */
714  2, /* vec_fp_stmt_cost  */
715  2, /* vec_permute_cost  */
716  4, /* vec_to_scalar_cost  */
717  4, /* scalar_to_vec_cost  */
718  10, /* vec_align_load_cost  */
719  10, /* vec_unalign_load_cost  */
720  2, /* vec_unalign_store_cost  */
721  2, /* vec_store_cost  */
722  2, /* cond_taken_branch_cost  */
723  1 /* cond_not_taken_branch_cost  */
724};
725
726/* ThunderX2 T99 (formerly Vulcan) costs for vector insn classes.  */
727static const struct cpu_vector_cost thunderx2t99_vector_cost =
728{
729  1, /* scalar_int_stmt_cost  */
730  6, /* scalar_fp_stmt_cost  */
731  4, /* scalar_load_cost  */
732  1, /* scalar_store_cost  */
733  4, /* vec_int_stmt_cost  */
734  5, /* vec_fp_stmt_cost  */
735  10, /* vec_permute_cost  */
736  6, /* vec_to_scalar_cost  */
737  5, /* scalar_to_vec_cost  */
738  4, /* vec_align_load_cost  */
739  4, /* vec_unalign_load_cost  */
740  1, /* vec_unalign_store_cost  */
741  1, /* vec_store_cost  */
742  2, /* cond_taken_branch_cost  */
743  1  /* cond_not_taken_branch_cost  */
744};
745
746static const struct cpu_vector_cost thunderx3t110_vector_cost =
747{
748  1, /* scalar_int_stmt_cost  */
749  5, /* scalar_fp_stmt_cost  */
750  4, /* scalar_load_cost  */
751  1, /* scalar_store_cost  */
752  5, /* vec_int_stmt_cost  */
753  5, /* vec_fp_stmt_cost  */
754  10, /* vec_permute_cost  */
755  5, /* vec_to_scalar_cost  */
756  5, /* scalar_to_vec_cost  */
757  4, /* vec_align_load_cost  */
758  4, /* vec_unalign_load_cost  */
759  4, /* vec_unalign_store_cost  */
760  4, /* vec_store_cost  */
761  2, /* cond_taken_branch_cost  */
762  1  /* cond_not_taken_branch_cost  */
763};
764
765static const struct cpu_vector_cost a64fx_vector_cost =
766{
767  1, /* scalar_int_stmt_cost  */
768  5, /* scalar_fp_stmt_cost  */
769  4, /* scalar_load_cost  */
770  1, /* scalar_store_cost  */
771  2, /* vec_int_stmt_cost  */
772  5, /* vec_fp_stmt_cost  */
773  3, /* vec_permute_cost  */
774  13, /* vec_to_scalar_cost  */
775  4, /* scalar_to_vec_cost  */
776  6, /* vec_align_load_cost  */
777  6, /* vec_unalign_load_cost  */
778  1, /* vec_unalign_store_cost  */
779  1, /* vec_store_cost  */
780  3, /* cond_taken_branch_cost  */
781  1 /* cond_not_taken_branch_cost  */
782};
783
784/* Ampere-1 costs for vector insn classes.  */
785static const struct cpu_vector_cost ampere1_vector_cost =
786{
787  1, /* scalar_int_stmt_cost  */
788  3, /* scalar_fp_stmt_cost  */
789  4, /* scalar_load_cost  */
790  1, /* scalar_store_cost  */
791  1, /* vec_int_stmt_cost  */
792  3, /* vec_fp_stmt_cost  */
793  2, /* vec_permute_cost  */
794  6, /* vec_to_scalar_cost  */
795  7, /* scalar_to_vec_cost  */
796  4, /* vec_align_load_cost  */
797  4, /* vec_unalign_load_cost  */
798  1, /* vec_unalign_store_cost  */
799  1, /* vec_store_cost  */
800  1, /* cond_taken_branch_cost  */
801  1 /* cond_not_taken_branch_cost  */
802};
803
804/* Generic costs for branch instructions.  */
805static const struct cpu_branch_cost generic_branch_cost =
806{
807  1,  /* Predictable.  */
808  3   /* Unpredictable.  */
809};
810
811/* Generic approximation modes.  */
812static const cpu_approx_modes generic_approx_modes =
813{
814  AARCH64_APPROX_NONE,	/* division  */
815  AARCH64_APPROX_NONE,	/* sqrt  */
816  AARCH64_APPROX_NONE	/* recip_sqrt  */
817};
818
819/* Approximation modes for Exynos M1.  */
820static const cpu_approx_modes exynosm1_approx_modes =
821{
822  AARCH64_APPROX_NONE,	/* division  */
823  AARCH64_APPROX_ALL,	/* sqrt  */
824  AARCH64_APPROX_ALL	/* recip_sqrt  */
825};
826
827/* Approximation modes for X-Gene 1.  */
828static const cpu_approx_modes xgene1_approx_modes =
829{
830  AARCH64_APPROX_NONE,	/* division  */
831  AARCH64_APPROX_NONE,	/* sqrt  */
832  AARCH64_APPROX_ALL	/* recip_sqrt  */
833};
834
835/* Generic prefetch settings (which disable prefetch).  */
836static const cpu_prefetch_tune generic_prefetch_tune =
837{
838  0,			/* num_slots  */
839  -1,			/* l1_cache_size  */
840  -1,			/* l1_cache_line_size  */
841  -1,			/* l2_cache_size  */
842  true,			/* prefetch_dynamic_strides */
843  -1,			/* minimum_stride */
844  -1			/* default_opt_level  */
845};
846
847static const cpu_prefetch_tune exynosm1_prefetch_tune =
848{
849  0,			/* num_slots  */
850  -1,			/* l1_cache_size  */
851  64,			/* l1_cache_line_size  */
852  -1,			/* l2_cache_size  */
853  true,			/* prefetch_dynamic_strides */
854  -1,			/* minimum_stride */
855  -1			/* default_opt_level  */
856};
857
858static const cpu_prefetch_tune qdf24xx_prefetch_tune =
859{
860  4,			/* num_slots  */
861  32,			/* l1_cache_size  */
862  64,			/* l1_cache_line_size  */
863  512,			/* l2_cache_size  */
864  false,		/* prefetch_dynamic_strides */
865  2048,			/* minimum_stride */
866  3			/* default_opt_level  */
867};
868
869static const cpu_prefetch_tune thunderxt88_prefetch_tune =
870{
871  8,			/* num_slots  */
872  32,			/* l1_cache_size  */
873  128,			/* l1_cache_line_size  */
874  16*1024,		/* l2_cache_size  */
875  true,			/* prefetch_dynamic_strides */
876  -1,			/* minimum_stride */
877  3			/* default_opt_level  */
878};
879
880static const cpu_prefetch_tune thunderx_prefetch_tune =
881{
882  8,			/* num_slots  */
883  32,			/* l1_cache_size  */
884  128,			/* l1_cache_line_size  */
885  -1,			/* l2_cache_size  */
886  true,			/* prefetch_dynamic_strides */
887  -1,			/* minimum_stride */
888  -1			/* default_opt_level  */
889};
890
891static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
892{
893  8,			/* num_slots  */
894  32,			/* l1_cache_size  */
895  64,			/* l1_cache_line_size  */
896  256,			/* l2_cache_size  */
897  true,			/* prefetch_dynamic_strides */
898  -1,			/* minimum_stride */
899  -1			/* default_opt_level  */
900};
901
902static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
903{
904  8,			/* num_slots  */
905  32,			/* l1_cache_size  */
906  64,			/* l1_cache_line_size  */
907  256,			/* l2_cache_size  */
908  true,			/* prefetch_dynamic_strides */
909  -1,			/* minimum_stride */
910  -1			/* default_opt_level  */
911};
912
913static const cpu_prefetch_tune tsv110_prefetch_tune =
914{
915  0,                    /* num_slots  */
916  64,                   /* l1_cache_size  */
917  64,                   /* l1_cache_line_size  */
918  512,                  /* l2_cache_size  */
919  true,                 /* prefetch_dynamic_strides */
920  -1,                   /* minimum_stride */
921  -1                    /* default_opt_level  */
922};
923
924static const cpu_prefetch_tune xgene1_prefetch_tune =
925{
926  8,			/* num_slots  */
927  32,			/* l1_cache_size  */
928  64,			/* l1_cache_line_size  */
929  256,			/* l2_cache_size  */
930  true,                 /* prefetch_dynamic_strides */
931  -1,                   /* minimum_stride */
932  -1			/* default_opt_level  */
933};
934
935static const cpu_prefetch_tune a64fx_prefetch_tune =
936{
937  8,			/* num_slots  */
938  64,			/* l1_cache_size  */
939  256,			/* l1_cache_line_size  */
940  32768,		/* l2_cache_size  */
941  true,			/* prefetch_dynamic_strides */
942  -1,			/* minimum_stride */
943  -1			/* default_opt_level  */
944};
945
946static const cpu_prefetch_tune ampere1_prefetch_tune =
947{
948  0,			/* num_slots  */
949  64,			/* l1_cache_size  */
950  64,			/* l1_cache_line_size  */
951  2048,			/* l2_cache_size  */
952  true,			/* prefetch_dynamic_strides */
953  -1,			/* minimum_stride */
954  -1			/* default_opt_level  */
955};
956
957static const struct tune_params generic_tunings =
958{
959  &cortexa57_extra_costs,
960  &generic_addrcost_table,
961  &generic_regmove_cost,
962  &generic_vector_cost,
963  &generic_branch_cost,
964  &generic_approx_modes,
965  SVE_NOT_IMPLEMENTED, /* sve_width  */
966  4, /* memmov_cost  */
967  2, /* issue_rate  */
968  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
969  "16:12",	/* function_align.  */
970  "4",	/* jump_align.  */
971  "8",	/* loop_align.  */
972  2,	/* int_reassoc_width.  */
973  4,	/* fp_reassoc_width.  */
974  1,	/* vec_reassoc_width.  */
975  2,	/* min_div_recip_mul_sf.  */
976  2,	/* min_div_recip_mul_df.  */
977  0,	/* max_case_values.  */
978  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
979  /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
980     Neoverse V1.  It does not have a noticeable effect on A64FX and should
981     have at most a very minor effect on SVE2 cores.  */
982  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS),	/* tune_flags.  */
983  &generic_prefetch_tune
984};
985
986static const struct tune_params cortexa35_tunings =
987{
988  &cortexa53_extra_costs,
989  &generic_addrcost_table,
990  &cortexa53_regmove_cost,
991  &generic_vector_cost,
992  &generic_branch_cost,
993  &generic_approx_modes,
994  SVE_NOT_IMPLEMENTED, /* sve_width  */
995  4, /* memmov_cost  */
996  1, /* issue_rate  */
997  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
998   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
999  "16",	/* function_align.  */
1000  "4",	/* jump_align.  */
1001  "8",	/* loop_align.  */
1002  2,	/* int_reassoc_width.  */
1003  4,	/* fp_reassoc_width.  */
1004  1,	/* vec_reassoc_width.  */
1005  2,	/* min_div_recip_mul_sf.  */
1006  2,	/* min_div_recip_mul_df.  */
1007  0,	/* max_case_values.  */
1008  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1009  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1010  &generic_prefetch_tune
1011};
1012
1013static const struct tune_params cortexa53_tunings =
1014{
1015  &cortexa53_extra_costs,
1016  &generic_addrcost_table,
1017  &cortexa53_regmove_cost,
1018  &generic_vector_cost,
1019  &generic_branch_cost,
1020  &generic_approx_modes,
1021  SVE_NOT_IMPLEMENTED, /* sve_width  */
1022  4, /* memmov_cost  */
1023  2, /* issue_rate  */
1024  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1025   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
1026  "16",	/* function_align.  */
1027  "4",	/* jump_align.  */
1028  "8",	/* loop_align.  */
1029  2,	/* int_reassoc_width.  */
1030  4,	/* fp_reassoc_width.  */
1031  1,	/* vec_reassoc_width.  */
1032  2,	/* min_div_recip_mul_sf.  */
1033  2,	/* min_div_recip_mul_df.  */
1034  0,	/* max_case_values.  */
1035  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1036  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1037  &generic_prefetch_tune
1038};
1039
1040static const struct tune_params cortexa57_tunings =
1041{
1042  &cortexa57_extra_costs,
1043  &generic_addrcost_table,
1044  &cortexa57_regmove_cost,
1045  &cortexa57_vector_cost,
1046  &generic_branch_cost,
1047  &generic_approx_modes,
1048  SVE_NOT_IMPLEMENTED, /* sve_width  */
1049  4, /* memmov_cost  */
1050  3, /* issue_rate  */
1051  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1052   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1053  "16",	/* function_align.  */
1054  "4",	/* jump_align.  */
1055  "8",	/* loop_align.  */
1056  2,	/* int_reassoc_width.  */
1057  4,	/* fp_reassoc_width.  */
1058  1,	/* vec_reassoc_width.  */
1059  2,	/* min_div_recip_mul_sf.  */
1060  2,	/* min_div_recip_mul_df.  */
1061  0,	/* max_case_values.  */
1062  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1063  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
1064  &generic_prefetch_tune
1065};
1066
1067static const struct tune_params cortexa72_tunings =
1068{
1069  &cortexa57_extra_costs,
1070  &generic_addrcost_table,
1071  &cortexa57_regmove_cost,
1072  &cortexa57_vector_cost,
1073  &generic_branch_cost,
1074  &generic_approx_modes,
1075  SVE_NOT_IMPLEMENTED, /* sve_width  */
1076  4, /* memmov_cost  */
1077  3, /* issue_rate  */
1078  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1079   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1080  "16",	/* function_align.  */
1081  "4",	/* jump_align.  */
1082  "8",	/* loop_align.  */
1083  2,	/* int_reassoc_width.  */
1084  4,	/* fp_reassoc_width.  */
1085  1,	/* vec_reassoc_width.  */
1086  2,	/* min_div_recip_mul_sf.  */
1087  2,	/* min_div_recip_mul_df.  */
1088  0,	/* max_case_values.  */
1089  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1090  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1091  &generic_prefetch_tune
1092};
1093
1094static const struct tune_params cortexa73_tunings =
1095{
1096  &cortexa57_extra_costs,
1097  &generic_addrcost_table,
1098  &cortexa57_regmove_cost,
1099  &cortexa57_vector_cost,
1100  &generic_branch_cost,
1101  &generic_approx_modes,
1102  SVE_NOT_IMPLEMENTED, /* sve_width  */
1103  4, /* memmov_cost.  */
1104  2, /* issue_rate.  */
1105  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1106   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
1107  "16",	/* function_align.  */
1108  "4",	/* jump_align.  */
1109  "8",	/* loop_align.  */
1110  2,	/* int_reassoc_width.  */
1111  4,	/* fp_reassoc_width.  */
1112  1,	/* vec_reassoc_width.  */
1113  2,	/* min_div_recip_mul_sf.  */
1114  2,	/* min_div_recip_mul_df.  */
1115  0,	/* max_case_values.  */
1116  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1117  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1118  &generic_prefetch_tune
1119};
1120
1121
1122
1123static const struct tune_params exynosm1_tunings =
1124{
1125  &exynosm1_extra_costs,
1126  &exynosm1_addrcost_table,
1127  &exynosm1_regmove_cost,
1128  &exynosm1_vector_cost,
1129  &generic_branch_cost,
1130  &exynosm1_approx_modes,
1131  SVE_NOT_IMPLEMENTED, /* sve_width  */
1132  4,	/* memmov_cost  */
1133  3,	/* issue_rate  */
1134  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
1135  "4",	/* function_align.  */
1136  "4",	/* jump_align.  */
1137  "4",	/* loop_align.  */
1138  2,	/* int_reassoc_width.  */
1139  4,	/* fp_reassoc_width.  */
1140  1,	/* vec_reassoc_width.  */
1141  2,	/* min_div_recip_mul_sf.  */
1142  2,	/* min_div_recip_mul_df.  */
1143  48,	/* max_case_values.  */
1144  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
1145  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
1146  &exynosm1_prefetch_tune
1147};
1148
1149static const struct tune_params thunderxt88_tunings =
1150{
1151  &thunderx_extra_costs,
1152  &generic_addrcost_table,
1153  &thunderx_regmove_cost,
1154  &thunderx_vector_cost,
1155  &generic_branch_cost,
1156  &generic_approx_modes,
1157  SVE_NOT_IMPLEMENTED, /* sve_width  */
1158  6, /* memmov_cost  */
1159  2, /* issue_rate  */
1160  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
1161  "8",	/* function_align.  */
1162  "8",	/* jump_align.  */
1163  "8",	/* loop_align.  */
1164  2,	/* int_reassoc_width.  */
1165  4,	/* fp_reassoc_width.  */
1166  1,	/* vec_reassoc_width.  */
1167  2,	/* min_div_recip_mul_sf.  */
1168  2,	/* min_div_recip_mul_df.  */
1169  0,	/* max_case_values.  */
1170  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1171  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),	/* tune_flags.  */
1172  &thunderxt88_prefetch_tune
1173};
1174
1175static const struct tune_params thunderx_tunings =
1176{
1177  &thunderx_extra_costs,
1178  &generic_addrcost_table,
1179  &thunderx_regmove_cost,
1180  &thunderx_vector_cost,
1181  &generic_branch_cost,
1182  &generic_approx_modes,
1183  SVE_NOT_IMPLEMENTED, /* sve_width  */
1184  6, /* memmov_cost  */
1185  2, /* issue_rate  */
1186  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
1187  "8",	/* function_align.  */
1188  "8",	/* jump_align.  */
1189  "8",	/* loop_align.  */
1190  2,	/* int_reassoc_width.  */
1191  4,	/* fp_reassoc_width.  */
1192  1,	/* vec_reassoc_width.  */
1193  2,	/* min_div_recip_mul_sf.  */
1194  2,	/* min_div_recip_mul_df.  */
1195  0,	/* max_case_values.  */
1196  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1197  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1198   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
1199  &thunderx_prefetch_tune
1200};
1201
1202static const struct tune_params tsv110_tunings =
1203{
1204  &tsv110_extra_costs,
1205  &tsv110_addrcost_table,
1206  &tsv110_regmove_cost,
1207  &tsv110_vector_cost,
1208  &generic_branch_cost,
1209  &generic_approx_modes,
1210  SVE_NOT_IMPLEMENTED, /* sve_width  */
1211  4,    /* memmov_cost  */
1212  4,    /* issue_rate  */
1213  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1214   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
1215  "16", /* function_align.  */
1216  "4",  /* jump_align.  */
1217  "8",  /* loop_align.  */
1218  2,    /* int_reassoc_width.  */
1219  4,    /* fp_reassoc_width.  */
1220  1,    /* vec_reassoc_width.  */
1221  2,    /* min_div_recip_mul_sf.  */
1222  2,    /* min_div_recip_mul_df.  */
1223  0,    /* max_case_values.  */
1224  tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1225  (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
1226  &tsv110_prefetch_tune
1227};
1228
1229static const struct tune_params xgene1_tunings =
1230{
1231  &xgene1_extra_costs,
1232  &xgene1_addrcost_table,
1233  &xgene1_regmove_cost,
1234  &xgene1_vector_cost,
1235  &generic_branch_cost,
1236  &xgene1_approx_modes,
1237  SVE_NOT_IMPLEMENTED, /* sve_width  */
1238  6, /* memmov_cost  */
1239  4, /* issue_rate  */
1240  AARCH64_FUSE_NOTHING, /* fusible_ops  */
1241  "16",	/* function_align.  */
1242  "16",	/* jump_align.  */
1243  "16",	/* loop_align.  */
1244  2,	/* int_reassoc_width.  */
1245  4,	/* fp_reassoc_width.  */
1246  1,	/* vec_reassoc_width.  */
1247  2,	/* min_div_recip_mul_sf.  */
1248  2,	/* min_div_recip_mul_df.  */
1249  17,	/* max_case_values.  */
1250  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1251  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
1252  &xgene1_prefetch_tune
1253};
1254
1255static const struct tune_params emag_tunings =
1256{
1257  &xgene1_extra_costs,
1258  &xgene1_addrcost_table,
1259  &xgene1_regmove_cost,
1260  &xgene1_vector_cost,
1261  &generic_branch_cost,
1262  &xgene1_approx_modes,
1263  SVE_NOT_IMPLEMENTED, /* sve_width  */
1264  6, /* memmov_cost  */
1265  4, /* issue_rate  */
1266  AARCH64_FUSE_NOTHING, /* fusible_ops  */
1267  "16",	/* function_align.  */
1268  "16",	/* jump_align.  */
1269  "16",	/* loop_align.  */
1270  2,	/* int_reassoc_width.  */
1271  4,	/* fp_reassoc_width.  */
1272  1,	/* vec_reassoc_width.  */
1273  2,	/* min_div_recip_mul_sf.  */
1274  2,	/* min_div_recip_mul_df.  */
1275  17,	/* max_case_values.  */
1276  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1277  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
1278  &xgene1_prefetch_tune
1279};
1280
1281static const struct tune_params qdf24xx_tunings =
1282{
1283  &qdf24xx_extra_costs,
1284  &qdf24xx_addrcost_table,
1285  &qdf24xx_regmove_cost,
1286  &qdf24xx_vector_cost,
1287  &generic_branch_cost,
1288  &generic_approx_modes,
1289  SVE_NOT_IMPLEMENTED, /* sve_width  */
1290  4, /* memmov_cost  */
1291  4, /* issue_rate  */
1292  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1293   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1294  "16",	/* function_align.  */
1295  "8",	/* jump_align.  */
1296  "16",	/* loop_align.  */
1297  2,	/* int_reassoc_width.  */
1298  4,	/* fp_reassoc_width.  */
1299  1,	/* vec_reassoc_width.  */
1300  2,	/* min_div_recip_mul_sf.  */
1301  2,	/* min_div_recip_mul_df.  */
1302  0,	/* max_case_values.  */
1303  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1304  AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
1305  &qdf24xx_prefetch_tune
1306};
1307
1308/* Tuning structure for the Qualcomm Saphira core.  Default to generic values
1309   for now.  */
1310static const struct tune_params saphira_tunings =
1311{
1312  &generic_extra_costs,
1313  &generic_addrcost_table,
1314  &generic_regmove_cost,
1315  &generic_vector_cost,
1316  &generic_branch_cost,
1317  &generic_approx_modes,
1318  SVE_NOT_IMPLEMENTED, /* sve_width  */
1319  4, /* memmov_cost  */
1320  4, /* issue_rate  */
1321  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1322   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1323  "16",	/* function_align.  */
1324  "8",	/* jump_align.  */
1325  "16",	/* loop_align.  */
1326  2,	/* int_reassoc_width.  */
1327  4,	/* fp_reassoc_width.  */
1328  1,	/* vec_reassoc_width.  */
1329  2,	/* min_div_recip_mul_sf.  */
1330  2,	/* min_div_recip_mul_df.  */
1331  0,	/* max_case_values.  */
1332  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1333  (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
1334  &generic_prefetch_tune
1335};
1336
1337static const struct tune_params thunderx2t99_tunings =
1338{
1339  &thunderx2t99_extra_costs,
1340  &thunderx2t99_addrcost_table,
1341  &thunderx2t99_regmove_cost,
1342  &thunderx2t99_vector_cost,
1343  &generic_branch_cost,
1344  &generic_approx_modes,
1345  SVE_NOT_IMPLEMENTED, /* sve_width  */
1346  4, /* memmov_cost.  */
1347  4, /* issue_rate.  */
1348  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1349   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
1350  "16",	/* function_align.  */
1351  "8",	/* jump_align.  */
1352  "16",	/* loop_align.  */
1353  3,	/* int_reassoc_width.  */
1354  2,	/* fp_reassoc_width.  */
1355  2,	/* vec_reassoc_width.  */
1356  2,	/* min_div_recip_mul_sf.  */
1357  2,	/* min_div_recip_mul_df.  */
1358  0,	/* max_case_values.  */
1359  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1360  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1361  &thunderx2t99_prefetch_tune
1362};
1363
1364static const struct tune_params thunderx3t110_tunings =
1365{
1366  &thunderx3t110_extra_costs,
1367  &thunderx3t110_addrcost_table,
1368  &thunderx3t110_regmove_cost,
1369  &thunderx3t110_vector_cost,
1370  &generic_branch_cost,
1371  &generic_approx_modes,
1372  SVE_NOT_IMPLEMENTED, /* sve_width  */
1373  4, /* memmov_cost.  */
1374  6, /* issue_rate.  */
1375  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1376   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
1377  "16",	/* function_align.  */
1378  "8",	/* jump_align.  */
1379  "16",	/* loop_align.  */
1380  3,	/* int_reassoc_width.  */
1381  2,	/* fp_reassoc_width.  */
1382  2,	/* vec_reassoc_width.  */
1383  2,	/* min_div_recip_mul_sf.  */
1384  2,	/* min_div_recip_mul_df.  */
1385  0,	/* max_case_values.  */
1386  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1387  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1388  &thunderx3t110_prefetch_tune
1389};
1390
1391static const struct tune_params neoversen1_tunings =
1392{
1393  &cortexa57_extra_costs,
1394  &generic_addrcost_table,
1395  &generic_regmove_cost,
1396  &cortexa57_vector_cost,
1397  &generic_branch_cost,
1398  &generic_approx_modes,
1399  SVE_NOT_IMPLEMENTED, /* sve_width  */
1400  4, /* memmov_cost  */
1401  3, /* issue_rate  */
1402  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
1403  "32:16",	/* function_align.  */
1404  "4",		/* jump_align.  */
1405  "32:16",	/* loop_align.  */
1406  2,	/* int_reassoc_width.  */
1407  4,	/* fp_reassoc_width.  */
1408  2,	/* vec_reassoc_width.  */
1409  2,	/* min_div_recip_mul_sf.  */
1410  2,	/* min_div_recip_mul_df.  */
1411  0,	/* max_case_values.  */
1412  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1413  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1414  &generic_prefetch_tune
1415};
1416
1417static const struct tune_params ampere1_tunings =
1418{
1419  &ampere1_extra_costs,
1420  &generic_addrcost_table,
1421  &generic_regmove_cost,
1422  &ampere1_vector_cost,
1423  &generic_branch_cost,
1424  &generic_approx_modes,
1425  SVE_NOT_IMPLEMENTED, /* sve_width  */
1426  4, /* memmov_cost  */
1427  4, /* issue_rate  */
1428  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1429   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1430   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1431   AARCH64_FUSE_CMP_BRANCH),
1432  /* fusible_ops  */
1433  "32",		/* function_align.  */
1434  "4",		/* jump_align.  */
1435  "32:16",	/* loop_align.  */
1436  2,	/* int_reassoc_width.  */
1437  4,	/* fp_reassoc_width.  */
1438  2,	/* vec_reassoc_width.  */
1439  2,	/* min_div_recip_mul_sf.  */
1440  2,	/* min_div_recip_mul_df.  */
1441  0,	/* max_case_values.  */
1442  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1443  (AARCH64_EXTRA_TUNE_NO_LDP_COMBINE),	/* tune_flags.  */
1444  &ampere1_prefetch_tune
1445};
1446
1447static const struct tune_params ampere1a_tunings =
1448{
1449  &ampere1a_extra_costs,
1450  &generic_addrcost_table,
1451  &generic_regmove_cost,
1452  &ampere1_vector_cost,
1453  &generic_branch_cost,
1454  &generic_approx_modes,
1455  SVE_NOT_IMPLEMENTED, /* sve_width  */
1456  4, /* memmov_cost  */
1457  4, /* issue_rate  */
1458  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1459   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1460   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1461   AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
1462   AARCH64_FUSE_ADDSUB_2REG_CONST1),
1463  /* fusible_ops  */
1464  "32",		/* function_align.  */
1465  "4",		/* jump_align.  */
1466  "32:16",	/* loop_align.  */
1467  2,	/* int_reassoc_width.  */
1468  4,	/* fp_reassoc_width.  */
1469  2,	/* vec_reassoc_width.  */
1470  2,	/* min_div_recip_mul_sf.  */
1471  2,	/* min_div_recip_mul_df.  */
1472  0,	/* max_case_values.  */
1473  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1474  (AARCH64_EXTRA_TUNE_NO_LDP_COMBINE),	/* tune_flags.  */
1475  &ampere1_prefetch_tune
1476};
1477
1478static const struct tune_params neoversev1_tunings =
1479{
1480  &cortexa57_extra_costs,
1481  &generic_addrcost_table,
1482  &generic_regmove_cost,
1483  &cortexa57_vector_cost,
1484  &generic_branch_cost,
1485  &generic_approx_modes,
1486  SVE_256, /* sve_width  */
1487  4, /* memmov_cost  */
1488  3, /* issue_rate  */
1489  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
1490  "32:16",	/* function_align.  */
1491  "4",		/* jump_align.  */
1492  "32:16",	/* loop_align.  */
1493  2,	/* int_reassoc_width.  */
1494  4,	/* fp_reassoc_width.  */
1495  2,	/* vec_reassoc_width.  */
1496  2,	/* min_div_recip_mul_sf.  */
1497  2,	/* min_div_recip_mul_df.  */
1498  0,	/* max_case_values.  */
1499  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1500  (AARCH64_EXTRA_TUNE_PREFER_ADVSIMD_AUTOVEC
1501   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS),	/* tune_flags.  */
1502  &generic_prefetch_tune
1503};
1504
1505static const struct tune_params neoversen2_tunings =
1506{
1507  &cortexa57_extra_costs,
1508  &generic_addrcost_table,
1509  &generic_regmove_cost,
1510  &cortexa57_vector_cost,
1511  &generic_branch_cost,
1512  &generic_approx_modes,
1513  SVE_128, /* sve_width  */
1514  4, /* memmov_cost  */
1515  3, /* issue_rate  */
1516  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
1517  "32:16",	/* function_align.  */
1518  "4",		/* jump_align.  */
1519  "32:16",	/* loop_align.  */
1520  2,	/* int_reassoc_width.  */
1521  4,	/* fp_reassoc_width.  */
1522  2,	/* vec_reassoc_width.  */
1523  2,	/* min_div_recip_mul_sf.  */
1524  2,	/* min_div_recip_mul_df.  */
1525  0,	/* max_case_values.  */
1526  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1527  (AARCH64_EXTRA_TUNE_PREFER_ADVSIMD_AUTOVEC),	/* tune_flags.  */
1528  &generic_prefetch_tune
1529};
1530
1531static const struct tune_params a64fx_tunings =
1532{
1533  &a64fx_extra_costs,
1534  &a64fx_addrcost_table,
1535  &a64fx_regmove_cost,
1536  &a64fx_vector_cost,
1537  &generic_branch_cost,
1538  &generic_approx_modes,
1539  SVE_512, /* sve_width  */
1540  4, /* memmov_cost  */
1541  7, /* issue_rate  */
1542  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
1543  "32",	/* function_align.  */
1544  "16",	/* jump_align.  */
1545  "32",	/* loop_align.  */
1546  4,	/* int_reassoc_width.  */
1547  2,	/* fp_reassoc_width.  */
1548  2,	/* vec_reassoc_width.  */
1549  2,	/* min_div_recip_mul_sf.  */
1550  2,	/* min_div_recip_mul_df.  */
1551  0,	/* max_case_values.  */
1552  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1553  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1554  &a64fx_prefetch_tune
1555};
1556
1557/* Support for fine-grained override of the tuning structures.  */
1558struct aarch64_tuning_override_function
1559{
1560  const char* name;
1561  void (*parse_override)(const char*, struct tune_params*);
1562};
1563
1564static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1565static void aarch64_parse_tune_string (const char*, struct tune_params*);
1566static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1567
1568static const struct aarch64_tuning_override_function
1569aarch64_tuning_override_functions[] =
1570{
1571  { "fuse", aarch64_parse_fuse_string },
1572  { "tune", aarch64_parse_tune_string },
1573  { "sve_width", aarch64_parse_sve_width_string },
1574  { NULL, NULL }
1575};
1576
1577/* A processor implementing AArch64.  */
1578struct processor
1579{
1580  const char *const name;
1581  enum aarch64_processor ident;
1582  enum aarch64_processor sched_core;
1583  enum aarch64_arch arch;
1584  unsigned architecture_version;
1585  const uint64_t flags;
1586  const struct tune_params *const tune;
1587};
1588
1589/* Architectures implementing AArch64.  */
1590static const struct processor all_architectures[] =
1591{
1592#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1593  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1594#include "aarch64-arches.def"
1595  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1596};
1597
1598/* Processor cores implementing AArch64.  */
1599static const struct processor all_cores[] =
1600{
1601#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1602  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,				\
1603  all_architectures[AARCH64_ARCH_##ARCH].architecture_version,	\
1604  FLAGS, &COSTS##_tunings},
1605#include "aarch64-cores.def"
1606  {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1607    AARCH64_FL_FOR_ARCH8, &generic_tunings},
1608  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1609};
1610
1611
1612/* Target specification.  These are populated by the -march, -mtune, -mcpu
1613   handling code or by target attributes.  */
1614static const struct processor *selected_arch;
1615static const struct processor *selected_cpu;
1616static const struct processor *selected_tune;
1617
1618enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1619
1620/* The current tuning set.  */
1621struct tune_params aarch64_tune_params = generic_tunings;
1622
1623/* Check whether an 'aarch64_vector_pcs' attribute is valid.  */
1624
1625static tree
1626handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
1627				     int, bool *no_add_attrs)
1628{
1629  /* Since we set fn_type_req to true, the caller should have checked
1630     this for us.  */
1631  gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
1632  switch ((arm_pcs) fntype_abi (*node).id ())
1633    {
1634    case ARM_PCS_AAPCS64:
1635    case ARM_PCS_SIMD:
1636      return NULL_TREE;
1637
1638    case ARM_PCS_SVE:
1639      error ("the %qE attribute cannot be applied to an SVE function type",
1640	     name);
1641      *no_add_attrs = true;
1642      return NULL_TREE;
1643
1644    case ARM_PCS_TLSDESC:
1645    case ARM_PCS_UNKNOWN:
1646      break;
1647    }
1648  gcc_unreachable ();
1649}
1650
1651/* Table of machine attributes.  */
1652static const struct attribute_spec aarch64_attribute_table[] =
1653{
1654  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1655       affects_type_identity, handler, exclude } */
1656  { "aarch64_vector_pcs", 0, 0, false, true,  true,  true,
1657			  handle_aarch64_vector_pcs_attribute, NULL },
1658  { "arm_sve_vector_bits", 1, 1, false, true,  false, true,
1659			  aarch64_sve::handle_arm_sve_vector_bits_attribute,
1660			  NULL },
1661  { "Advanced SIMD type", 0, 0, false, true,  false, true,  NULL, NULL },
1662  { "SVE type",		  3, 3, false, true,  false, true,  NULL, NULL },
1663  { "SVE sizeless type",  0, 0, false, true,  false, true,  NULL, NULL },
1664  { NULL,                 0, 0, false, false, false, false, NULL, NULL }
1665};
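
/* For example (illustrative only), a declaration such as

       void f (float *x) __attribute__ ((aarch64_vector_pcs));

   matches the first entry above: the attribute takes no arguments, applies
   to the function type, and is validated by
   handle_aarch64_vector_pcs_attribute, which rejects SVE function types.  */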
1666
1667#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1668
1669/* An ISA extension in the co-processor and main instruction set space.  */
1670struct aarch64_option_extension
1671{
1672  const char *const name;
1673  const uint64_t flags_on;
1674  const uint64_t flags_off;
1675};
1676
1677typedef enum aarch64_cond_code
1678{
1679  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1680  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1681  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1682}
1683aarch64_cc;
1684
1685#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
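
/* For example, because the codes above are laid out in complementary pairs,
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is AARCH64_NE and
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is AARCH64_LT: flipping the
   low bit maps each condition to its logical inverse.  */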
1686
1687struct aarch64_branch_protect_type
1688{
1689  /* The type's name that the user passes to the branch-protection option
1690    string.  */
1691  const char* name;
1692  /* Function to handle the protection type and set global variables.
1693    The first argument is the string token corresponding to this type and
1694    the second argument is the next token in the option string.
1695    Return values:
1696    * AARCH64_PARSE_OK: Handling was successful.
1697    * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1698      caller should print an error.
1699    * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1700      prints its own error.  */
1701  enum aarch64_parse_opt_result (*handler)(char*, char*);
1702  /* A list of types that can follow this type in the option string.  */
1703  const aarch64_branch_protect_type* subtypes;
1704  unsigned int num_subtypes;
1705};
1706
1707static enum aarch64_parse_opt_result
1708aarch64_handle_no_branch_protection (char* str, char* rest)
1709{
1710  aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1711  aarch64_enable_bti = 0;
1712  if (rest)
1713    {
1714      error ("unexpected %<%s%> after %<%s%>", rest, str);
1715      return AARCH64_PARSE_INVALID_FEATURE;
1716    }
1717  return AARCH64_PARSE_OK;
1718}
1719
1720static enum aarch64_parse_opt_result
1721aarch64_handle_standard_branch_protection (char* str, char* rest)
1722{
1723  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1724  aarch64_ra_sign_key = AARCH64_KEY_A;
1725  aarch64_enable_bti = 1;
1726  if (rest)
1727    {
1728      error ("unexpected %<%s%> after %<%s%>", rest, str);
1729      return AARCH64_PARSE_INVALID_FEATURE;
1730    }
1731  return AARCH64_PARSE_OK;
1732}
1733
1734static enum aarch64_parse_opt_result
1735aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1736				    char* rest ATTRIBUTE_UNUSED)
1737{
1738  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1739  aarch64_ra_sign_key = AARCH64_KEY_A;
1740  return AARCH64_PARSE_OK;
1741}
1742
1743static enum aarch64_parse_opt_result
1744aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1745			      char* rest ATTRIBUTE_UNUSED)
1746{
1747  aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1748  return AARCH64_PARSE_OK;
1749}
1750
1751static enum aarch64_parse_opt_result
1752aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1753			      char* rest ATTRIBUTE_UNUSED)
1754{
1755  aarch64_ra_sign_key = AARCH64_KEY_B;
1756  return AARCH64_PARSE_OK;
1757}
1758
1759static enum aarch64_parse_opt_result
1760aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1761				    char* rest ATTRIBUTE_UNUSED)
1762{
1763  aarch64_enable_bti = 1;
1764  return AARCH64_PARSE_OK;
1765}
1766
1767static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1768  { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1769  { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1770  { NULL, NULL, NULL, 0 }
1771};
1772
1773static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1774  { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1775  { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1776  { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1777    ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1778  { "bti", aarch64_handle_bti_protection, NULL, 0 },
1779  { NULL, NULL, NULL, 0 }
1780};
1781
1782/* The condition codes of the processor, and the inverse function.  */
1783static const char * const aarch64_condition_codes[] =
1784{
1785  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1786  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1787};
1788
1789/* The preferred condition codes for SVE conditions.  */
1790static const char *const aarch64_sve_condition_codes[] =
1791{
1792  "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1793  "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1794};
1795
1796/* Return the assembly token for svpattern value VALUE.  */
1797
1798static const char *
1799svpattern_token (enum aarch64_svpattern pattern)
1800{
1801  switch (pattern)
1802    {
1803#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1804    AARCH64_FOR_SVPATTERN (CASE)
1805#undef CASE
1806    case AARCH64_NUM_SVPATTERNS:
1807      break;
1808    }
1809  gcc_unreachable ();
1810}
1811
1812/* Return the location of a piece that is known to be passed or returned
1813   in registers.  FIRST_ZR is the first unused vector argument register
1814   and FIRST_PR is the first unused predicate argument register.  */
1815
1816rtx
1817pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
1818					 unsigned int first_pr) const
1819{
1820  gcc_assert (VECTOR_MODE_P (mode)
1821	      && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
1822	      && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
1823
1824  if (num_zr > 0 && num_pr == 0)
1825    return gen_rtx_REG (mode, first_zr);
1826
1827  if (num_zr == 0 && num_pr == 1)
1828    return gen_rtx_REG (mode, first_pr);
1829
1830  gcc_unreachable ();
1831}
1832
1833/* Return the total number of vector registers required by the PST.  */
1834
1835unsigned int
1836pure_scalable_type_info::num_zr () const
1837{
1838  unsigned int res = 0;
1839  for (unsigned int i = 0; i < pieces.length (); ++i)
1840    res += pieces[i].num_zr;
1841  return res;
1842}
1843
1844/* Return the total number of predicate registers required by the PST.  */
1845
1846unsigned int
1847pure_scalable_type_info::num_pr () const
1848{
1849  unsigned int res = 0;
1850  for (unsigned int i = 0; i < pieces.length (); ++i)
1851    res += pieces[i].num_pr;
1852  return res;
1853}
1854
1855/* Return the location of a PST that is known to be passed or returned
1856   in registers.  FIRST_ZR is the first unused vector argument register
1857   and FIRST_PR is the first unused predicate argument register.  */
1858
1859rtx
1860pure_scalable_type_info::get_rtx (machine_mode mode,
1861				  unsigned int first_zr,
1862				  unsigned int first_pr) const
1863{
1864  /* Try to return a single REG if possible.  This leads to better
1865     code generation; it isn't required for correctness.  */
1866  if (mode == pieces[0].mode)
1867    {
1868      gcc_assert (pieces.length () == 1);
1869      return pieces[0].get_rtx (first_zr, first_pr);
1870    }
1871
1872  /* Build up a PARALLEL that contains the individual pieces.  */
1873  rtvec rtxes = rtvec_alloc (pieces.length ());
1874  for (unsigned int i = 0; i < pieces.length (); ++i)
1875    {
1876      rtx reg = pieces[i].get_rtx (first_zr, first_pr);
1877      rtx offset = gen_int_mode (pieces[i].offset, Pmode);
1878      RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
1879      first_zr += pieces[i].num_zr;
1880      first_pr += pieces[i].num_pr;
1881    }
1882  return gen_rtx_PARALLEL (mode, rtxes);
1883}
1884
1885/* Analyze whether TYPE is a Pure Scalable Type according to the rules
1886   in the AAPCS64.  */
1887
1888pure_scalable_type_info::analysis_result
1889pure_scalable_type_info::analyze (const_tree type)
1890{
1891  /* Prevent accidental reuse.  */
1892  gcc_assert (pieces.is_empty ());
1893
1894  /* No code will be generated for erroneous types, so we won't establish
1895     an ABI mapping.  */
1896  if (type == error_mark_node)
1897    return NO_ABI_IDENTITY;
1898
1899  /* Zero-sized types disappear in the language->ABI mapping.  */
1900  if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1901    return NO_ABI_IDENTITY;
1902
1903  /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs.  */
1904  piece p = {};
1905  if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
1906    {
1907      machine_mode mode = TYPE_MODE_RAW (type);
1908      gcc_assert (VECTOR_MODE_P (mode)
1909		  && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
1910
1911      p.mode = p.orig_mode = mode;
1912      add_piece (p);
1913      return IS_PST;
1914    }
1915
1916  /* Check for user-defined PSTs.  */
1917  if (TREE_CODE (type) == ARRAY_TYPE)
1918    return analyze_array (type);
1919  if (TREE_CODE (type) == RECORD_TYPE)
1920    return analyze_record (type);
1921
1922  return ISNT_PST;
1923}
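
/* Illustrative examples of the rules above: an SVE vector type such as
   svfloat32_t is itself a PST occupying one Z register; a record with two
   svfloat32_t fields and one svbool_t field is a PST occupying two Z
   registers and one P register; adding an int field (which has an ABI
   identity but is not a PST) makes the whole record ISNT_PST.  */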
1924
1925/* Analyze a type that is known not to be passed or returned in memory.
1926   Return true if it has an ABI identity and is a Pure Scalable Type.  */
1927
1928bool
1929pure_scalable_type_info::analyze_registers (const_tree type)
1930{
1931  analysis_result result = analyze (type);
1932  gcc_assert (result != DOESNT_MATTER);
1933  return result == IS_PST;
1934}
1935
1936/* Subroutine of analyze for handling ARRAY_TYPEs.  */
1937
1938pure_scalable_type_info::analysis_result
1939pure_scalable_type_info::analyze_array (const_tree type)
1940{
1941  /* Analyze the element type.  */
1942  pure_scalable_type_info element_info;
1943  analysis_result result = element_info.analyze (TREE_TYPE (type));
1944  if (result != IS_PST)
1945    return result;
1946
1947  /* An array of unknown, flexible or variable length will be passed and
1948     returned by reference whatever we do.  */
1949  tree nelts_minus_one = array_type_nelts (type);
1950  if (!tree_fits_uhwi_p (nelts_minus_one))
1951    return DOESNT_MATTER;
1952
1953  /* Likewise if the array is constant-sized but too big to be interesting.
1954     The double checks against MAX_PIECES are to protect against overflow.  */
1955  unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
1956  if (count > MAX_PIECES)
1957    return DOESNT_MATTER;
1958  count += 1;
1959  if (count * element_info.pieces.length () > MAX_PIECES)
1960    return DOESNT_MATTER;
1961
1962  /* The above checks should have weeded out elements of unknown size.  */
1963  poly_uint64 element_bytes;
1964  if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
1965    gcc_unreachable ();
1966
1967  /* Build up the list of individual vectors and predicates.  */
1968  gcc_assert (!element_info.pieces.is_empty ());
1969  for (unsigned int i = 0; i < count; ++i)
1970    for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
1971      {
1972	piece p = element_info.pieces[j];
1973	p.offset += i * element_bytes;
1974	add_piece (p);
1975      }
1976  return IS_PST;
1977}
1978
1979/* Subroutine of analyze for handling RECORD_TYPEs.  */
1980
1981pure_scalable_type_info::analysis_result
1982pure_scalable_type_info::analyze_record (const_tree type)
1983{
1984  for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1985    {
1986      if (TREE_CODE (field) != FIELD_DECL)
1987	continue;
1988
1989      /* Zero-sized fields disappear in the language->ABI mapping.  */
1990      if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
1991	continue;
1992
1993      /* All fields with an ABI identity must be PSTs for the record as
1994	 a whole to be a PST.  If any individual field is too big to be
1995	 interesting then the record is too.  */
1996      pure_scalable_type_info field_info;
1997      analysis_result subresult = field_info.analyze (TREE_TYPE (field));
1998      if (subresult == NO_ABI_IDENTITY)
1999	continue;
2000      if (subresult != IS_PST)
2001	return subresult;
2002
2003      /* Since all previous fields are PSTs, we ought to be able to track
2004	 the field offset using poly_ints.  */
2005      tree bitpos = bit_position (field);
2006      gcc_assert (poly_int_tree_p (bitpos));
2007
2008      /* For the same reason, it shouldn't be possible to create a PST field
2009	 whose offset isn't byte-aligned.  */
2010      poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
2011						BITS_PER_UNIT);
2012
2013      /* Punt if the record is too big to be interesting.  */
2014      poly_uint64 bytepos;
2015      if (!wide_bytepos.to_uhwi (&bytepos)
2016	  || pieces.length () + field_info.pieces.length () > MAX_PIECES)
2017	return DOESNT_MATTER;
2018
2019      /* Add the individual vectors and predicates in the field to the
2020	 record's list.  */
2021      gcc_assert (!field_info.pieces.is_empty ());
2022      for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
2023	{
2024	  piece p = field_info.pieces[i];
2025	  p.offset += bytepos;
2026	  add_piece (p);
2027	}
2028    }
2029  /* Empty structures disappear in the language->ABI mapping.  */
2030  return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
2031}
2032
2033/* Add P to the list of pieces in the type.  */
2034
2035void
2036pure_scalable_type_info::add_piece (const piece &p)
2037{
2038  /* Try to fold the new piece into the previous one to form a
2039     single-mode PST.  For example, if we see three consecutive vectors
2040     of the same mode, we can represent them using the corresponding
2041     3-tuple mode.
2042
2043     This is purely an optimization.  */
2044  if (!pieces.is_empty ())
2045    {
2046      piece &prev = pieces.last ();
2047      gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
2048      unsigned int nelems1, nelems2;
2049      if (prev.orig_mode == p.orig_mode
2050	  && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
2051	  && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
2052				  GET_MODE_NUNITS (p.orig_mode), &nelems1)
2053	  && constant_multiple_p (GET_MODE_NUNITS (p.mode),
2054				  GET_MODE_NUNITS (p.orig_mode), &nelems2)
2055	  && targetm.array_mode (p.orig_mode,
2056				 nelems1 + nelems2).exists (&prev.mode))
2057	{
2058	  prev.num_zr += p.num_zr;
2059	  prev.num_pr += p.num_pr;
2060	  return;
2061	}
2062    }
2063  pieces.quick_push (p);
2064}
2065
2066/* Return true if at least one possible value of type TYPE includes at
2067   least one object of Pure Scalable Type, in the sense of the AAPCS64.
2068
2069   This is a relatively expensive test for some types, so it should
2070   generally be made as late as possible.  */
2071
2072static bool
2073aarch64_some_values_include_pst_objects_p (const_tree type)
2074{
2075  if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
2076    return false;
2077
2078  if (aarch64_sve::builtin_type_p (type))
2079    return true;
2080
2081  if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
2082    return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
2083
2084  if (RECORD_OR_UNION_TYPE_P (type))
2085    for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
2086      if (TREE_CODE (field) == FIELD_DECL
2087	  && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
2088	return true;
2089
2090  return false;
2091}
2092
2093/* Return the descriptor of the SIMD ABI.  */
2094
2095static const predefined_function_abi &
2096aarch64_simd_abi (void)
2097{
2098  predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
2099  if (!simd_abi.initialized_p ())
2100    {
2101      HARD_REG_SET full_reg_clobbers
2102	= default_function_abi.full_reg_clobbers ();
2103      for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
2104	if (FP_SIMD_SAVED_REGNUM_P (regno))
2105	  CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2106      simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
2107    }
2108  return simd_abi;
2109}
2110
2111/* Return the descriptor of the SVE PCS.  */
2112
2113static const predefined_function_abi &
2114aarch64_sve_abi (void)
2115{
2116  predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
2117  if (!sve_abi.initialized_p ())
2118    {
2119      HARD_REG_SET full_reg_clobbers
2120	= default_function_abi.full_reg_clobbers ();
2121      for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
2122	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2123      for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
2124	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2125      sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
2126    }
2127  return sve_abi;
2128}
2129
2130/* If X is an UNSPEC_SALT_ADDR expression, return the address that it
2131   wraps, otherwise return X itself.  */
2132
2133static rtx
2134strip_salt (rtx x)
2135{
2136  rtx search = x;
2137  if (GET_CODE (search) == CONST)
2138    search = XEXP (search, 0);
2139  if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
2140    x = XVECEXP (search, 0, 0);
2141  return x;
2142}
2143
2144/* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
2145   expression.  */
2146
2147static rtx
2148strip_offset_and_salt (rtx addr, poly_int64 *offset)
2149{
2150  return strip_salt (strip_offset (addr, offset));
2151}
2152
2153/* Generate code to enable conditional branches in functions over 1 MiB.  */
2154const char *
2155aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
2156			const char * branch_format)
2157{
  rtx_code_label *tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[128];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
  return "";
2174}
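
/* For example, if a branch such as "tbz x0, #0, target" cannot reach its
   target, the caller passes the inverted mnemonic in BRANCH_FORMAT and the
   code above emits roughly (label name illustrative):

     tbnz x0, #0, .Ltmp
     b    target
   .Ltmp:

   so that only the unconditional branch needs the long range.  */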
2175
2176void
2177aarch64_err_no_fpadvsimd (machine_mode mode)
2178{
2179  if (TARGET_GENERAL_REGS_ONLY)
2180    if (FLOAT_MODE_P (mode))
2181      error ("%qs is incompatible with the use of floating-point types",
2182	     "-mgeneral-regs-only");
2183    else
2184      error ("%qs is incompatible with the use of vector types",
2185	     "-mgeneral-regs-only");
2186  else
2187    if (FLOAT_MODE_P (mode))
2188      error ("%qs feature modifier is incompatible with the use of"
2189	     " floating-point types", "+nofp");
2190    else
2191      error ("%qs feature modifier is incompatible with the use of"
	     " vector types", "+nosimd");
2193}
2194
2195/* Report when we try to do something that requires SVE when SVE is disabled.
2196   This is an error of last resort and isn't very high-quality.  It usually
2197   involves attempts to measure the vector length in some way.  */
2198static void
2199aarch64_report_sve_required (void)
2200{
2201  static bool reported_p = false;
2202
2203  /* Avoid reporting a slew of messages for a single oversight.  */
2204  if (reported_p)
2205    return;
2206
2207  error ("this operation requires the SVE ISA extension");
2208  inform (input_location, "you can enable SVE using the command-line"
2209	  " option %<-march%>, or by using the %<target%>"
2210	  " attribute or pragma");
2211  reported_p = true;
2212}
2213
2214/* Return true if REGNO is P0-P15 or one of the special FFR-related
2215   registers.  */
2216inline bool
2217pr_or_ffr_regnum_p (unsigned int regno)
2218{
2219  return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
2220}
2221
2222/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
2223   The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
2224   GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
2225   higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
2226   and GENERAL_REGS is lower than the memory cost (in this case the best class
   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
2228   cost results in bad allocations with many redundant int<->FP moves which
2229   are expensive on various cores.
2230   To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
2231   force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
2232   if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
2233   POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.  */
2237
2238static reg_class_t
2239aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
2240					 reg_class_t best_class)
2241{
2242  machine_mode mode;
2243
2244  if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
2245      || !reg_class_subset_p (FP_REGS, allocno_class))
2246    return allocno_class;
2247
2248  if (!reg_class_subset_p (GENERAL_REGS, best_class)
2249      || !reg_class_subset_p (FP_REGS, best_class))
2250    return best_class;
2251
2252  mode = PSEUDO_REGNO_MODE (regno);
2253  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
2254}
2255
2256static unsigned int
2257aarch64_min_divisions_for_recip_mul (machine_mode mode)
2258{
2259  if (GET_MODE_UNIT_SIZE (mode) == 4)
2260    return aarch64_tune_params.min_div_recip_mul_sf;
2261  return aarch64_tune_params.min_div_recip_mul_df;
2262}
2263
2264/* Return the reassociation width of treeop OPC with mode MODE.  */
2265static int
2266aarch64_reassociation_width (unsigned opc, machine_mode mode)
2267{
2268  if (VECTOR_MODE_P (mode))
2269    return aarch64_tune_params.vec_reassoc_width;
2270  if (INTEGRAL_MODE_P (mode))
2271    return aarch64_tune_params.int_reassoc_width;
2272  /* Avoid reassociating floating point addition so we emit more FMAs.  */
2273  if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
2274    return aarch64_tune_params.fp_reassoc_width;
2275  return 1;
2276}
2277
/* Provide a mapping from GCC register numbers to DWARF register numbers.  */
2279unsigned
2280aarch64_dbx_register_number (unsigned regno)
2281{
2282   if (GP_REGNUM_P (regno))
2283     return AARCH64_DWARF_R0 + regno - R0_REGNUM;
2284   else if (regno == SP_REGNUM)
2285     return AARCH64_DWARF_SP;
2286   else if (FP_REGNUM_P (regno))
2287     return AARCH64_DWARF_V0 + regno - V0_REGNUM;
2288   else if (PR_REGNUM_P (regno))
2289     return AARCH64_DWARF_P0 + regno - P0_REGNUM;
2290   else if (regno == VG_REGNUM)
2291     return AARCH64_DWARF_VG;
2292
2293   /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
2294      equivalent DWARF register.  */
2295   return DWARF_FRAME_REGISTERS;
2296}
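
/* For example, a core register such as x5 maps to AARCH64_DWARF_R0 + 5 and
   a vector register such as v3 maps to AARCH64_DWARF_V0 + 3, while registers
   with no DWARF equivalent (such as the FFR) map to DWARF_FRAME_REGISTERS.  */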
2297
2298/* If X is a CONST_DOUBLE, return its bit representation as a constant
2299   integer, otherwise return X unmodified.  */
2300static rtx
2301aarch64_bit_representation (rtx x)
2302{
2303  if (CONST_DOUBLE_P (x))
2304    x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
2305  return x;
2306}
2307
2308/* Return true if MODE is any of the Advanced SIMD structure modes.  */
2309static bool
2310aarch64_advsimd_struct_mode_p (machine_mode mode)
2311{
2312  return (TARGET_SIMD
2313	  && (mode == OImode || mode == CImode || mode == XImode));
2314}
2315
2316/* Return true if MODE is an SVE predicate mode.  */
2317static bool
2318aarch64_sve_pred_mode_p (machine_mode mode)
2319{
2320  return (TARGET_SVE
2321	  && (mode == VNx16BImode
2322	      || mode == VNx8BImode
2323	      || mode == VNx4BImode
2324	      || mode == VNx2BImode));
2325}
2326
2327/* Three mutually-exclusive flags describing a vector or predicate type.  */
2328const unsigned int VEC_ADVSIMD  = 1;
2329const unsigned int VEC_SVE_DATA = 2;
2330const unsigned int VEC_SVE_PRED = 4;
2331/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
2332   a structure of 2, 3 or 4 vectors.  */
2333const unsigned int VEC_STRUCT   = 8;
2334/* Can be used in combination with VEC_SVE_DATA to indicate that the
2335   vector has fewer significant bytes than a full SVE vector.  */
2336const unsigned int VEC_PARTIAL  = 16;
2337/* Useful combinations of the above.  */
2338const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
2339const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
2340
2341/* Return a set of flags describing the vector properties of mode MODE.
2342   Ignore modes that are not supported by the current target.  */
2343static unsigned int
2344aarch64_classify_vector_mode (machine_mode mode)
2345{
2346  if (aarch64_advsimd_struct_mode_p (mode))
2347    return VEC_ADVSIMD | VEC_STRUCT;
2348
2349  if (aarch64_sve_pred_mode_p (mode))
2350    return VEC_SVE_PRED;
2351
2352  /* Make the decision based on the mode's enum value rather than its
2353     properties, so that we keep the correct classification regardless
2354     of -msve-vector-bits.  */
2355  switch (mode)
2356    {
2357    /* Partial SVE QI vectors.  */
2358    case E_VNx2QImode:
2359    case E_VNx4QImode:
2360    case E_VNx8QImode:
2361    /* Partial SVE HI vectors.  */
2362    case E_VNx2HImode:
2363    case E_VNx4HImode:
2364    /* Partial SVE SI vector.  */
2365    case E_VNx2SImode:
2366    /* Partial SVE HF vectors.  */
2367    case E_VNx2HFmode:
2368    case E_VNx4HFmode:
2369    /* Partial SVE SF vector.  */
2370    case E_VNx2SFmode:
2371      return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
2372
2373    case E_VNx16QImode:
2374    case E_VNx8HImode:
2375    case E_VNx4SImode:
2376    case E_VNx2DImode:
2377    case E_VNx8BFmode:
2378    case E_VNx8HFmode:
2379    case E_VNx4SFmode:
2380    case E_VNx2DFmode:
2381      return TARGET_SVE ? VEC_SVE_DATA : 0;
2382
2383    /* x2 SVE vectors.  */
2384    case E_VNx32QImode:
2385    case E_VNx16HImode:
2386    case E_VNx8SImode:
2387    case E_VNx4DImode:
2388    case E_VNx16BFmode:
2389    case E_VNx16HFmode:
2390    case E_VNx8SFmode:
2391    case E_VNx4DFmode:
2392    /* x3 SVE vectors.  */
2393    case E_VNx48QImode:
2394    case E_VNx24HImode:
2395    case E_VNx12SImode:
2396    case E_VNx6DImode:
2397    case E_VNx24BFmode:
2398    case E_VNx24HFmode:
2399    case E_VNx12SFmode:
2400    case E_VNx6DFmode:
2401    /* x4 SVE vectors.  */
2402    case E_VNx64QImode:
2403    case E_VNx32HImode:
2404    case E_VNx16SImode:
2405    case E_VNx8DImode:
2406    case E_VNx32BFmode:
2407    case E_VNx32HFmode:
2408    case E_VNx16SFmode:
2409    case E_VNx8DFmode:
2410      return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
2411
2412    /* 64-bit Advanced SIMD vectors.  */
2413    case E_V8QImode:
2414    case E_V4HImode:
2415    case E_V2SImode:
2416    /* ...E_V1DImode doesn't exist.  */
2417    case E_V4HFmode:
2418    case E_V4BFmode:
2419    case E_V2SFmode:
2420    case E_V1DFmode:
2421    /* 128-bit Advanced SIMD vectors.  */
2422    case E_V16QImode:
2423    case E_V8HImode:
2424    case E_V4SImode:
2425    case E_V2DImode:
2426    case E_V8HFmode:
2427    case E_V8BFmode:
2428    case E_V4SFmode:
2429    case E_V2DFmode:
2430      return TARGET_SIMD ? VEC_ADVSIMD : 0;
2431
2432    default:
2433      return 0;
2434    }
2435}
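
/* Some examples of the classification above, assuming the relevant ISA
   support is enabled: VNx4SImode is VEC_SVE_DATA, VNx2SImode is
   VEC_SVE_DATA | VEC_PARTIAL, VNx16SImode is VEC_SVE_DATA | VEC_STRUCT,
   V4SImode is VEC_ADVSIMD, OImode is VEC_ADVSIMD | VEC_STRUCT, and any
   mode not supported by the current target is 0.  */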
2436
2437/* Return true if MODE is any of the data vector modes, including
2438   structure modes.  */
2439static bool
2440aarch64_vector_data_mode_p (machine_mode mode)
2441{
2442  return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
2443}
2444
2445/* Return true if MODE is any form of SVE mode, including predicates,
2446   vectors and structures.  */
2447bool
2448aarch64_sve_mode_p (machine_mode mode)
2449{
2450  return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
2451}
2452
2453/* Return true if MODE is an SVE data vector mode; either a single vector
2454   or a structure of vectors.  */
2455static bool
2456aarch64_sve_data_mode_p (machine_mode mode)
2457{
2458  return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
2459}
2460
2461/* Return the number of defined bytes in one constituent vector of
2462   SVE mode MODE, which has vector flags VEC_FLAGS.  */
2463static poly_int64
2464aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
2465{
2466  if (vec_flags & VEC_PARTIAL)
2467    /* A single partial vector.  */
2468    return GET_MODE_SIZE (mode);
2469
2470  if (vec_flags & VEC_SVE_DATA)
2471    /* A single vector or a tuple.  */
2472    return BYTES_PER_SVE_VECTOR;
2473
2474  /* A single predicate.  */
2475  gcc_assert (vec_flags & VEC_SVE_PRED);
2476  return BYTES_PER_SVE_PRED;
2477}
2478
2479/* Implement target hook TARGET_ARRAY_MODE.  */
2480static opt_machine_mode
2481aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
2482{
2483  if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
2484      && IN_RANGE (nelems, 2, 4))
2485    return mode_for_vector (GET_MODE_INNER (mode),
2486			    GET_MODE_NUNITS (mode) * nelems);
2487
2488  return opt_machine_mode ();
2489}
2490
2491/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
2492static bool
2493aarch64_array_mode_supported_p (machine_mode mode,
2494				unsigned HOST_WIDE_INT nelems)
2495{
2496  if (TARGET_SIMD
2497      && (AARCH64_VALID_SIMD_QREG_MODE (mode)
2498	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
2499      && (nelems >= 2 && nelems <= 4))
2500    return true;
2501
2502  return false;
2503}
2504
2505/* MODE is some form of SVE vector mode.  For data modes, return the number
2506   of vector register bits that each element of MODE occupies, such as 64
2507   for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
2508   in a 64-bit container).  For predicate modes, return the number of
2509   data bits controlled by each significant predicate bit.  */
2510
2511static unsigned int
2512aarch64_sve_container_bits (machine_mode mode)
2513{
2514  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2515  poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
2516			     ? BITS_PER_SVE_VECTOR
2517			     : GET_MODE_BITSIZE (mode));
2518  return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
2519}
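
/* For example, aarch64_sve_container_bits (VNx4SImode) is 32, since each
   SI element occupies a 32-bit container, whereas for the partial vector
   mode VNx2SImode it is 64, since each 32-bit element lives in a 64-bit
   container.  For the predicate mode VNx4BImode it is 32, since each
   significant predicate bit controls 32 data bits.  */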
2520
2521/* Return the SVE predicate mode to use for elements that have
2522   ELEM_NBYTES bytes, if such a mode exists.  */
2523
2524opt_machine_mode
2525aarch64_sve_pred_mode (unsigned int elem_nbytes)
2526{
2527  if (TARGET_SVE)
2528    {
2529      if (elem_nbytes == 1)
2530	return VNx16BImode;
2531      if (elem_nbytes == 2)
2532	return VNx8BImode;
2533      if (elem_nbytes == 4)
2534	return VNx4BImode;
2535      if (elem_nbytes == 8)
2536	return VNx2BImode;
2537    }
2538  return opt_machine_mode ();
2539}
2540
2541/* Return the SVE predicate mode that should be used to control
2542   SVE mode MODE.  */
2543
2544machine_mode
2545aarch64_sve_pred_mode (machine_mode mode)
2546{
2547  unsigned int bits = aarch64_sve_container_bits (mode);
2548  return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
2549}
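
/* For example, the predicate mode for VNx4SImode is VNx4BImode (32-bit
   containers), while for the partial vector mode VNx2SImode it is
   VNx2BImode, since its elements live in 64-bit containers.  */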
2550
2551/* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
2552
2553static opt_machine_mode
2554aarch64_get_mask_mode (machine_mode mode)
2555{
2556  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2557  if (vec_flags & VEC_SVE_DATA)
2558    return aarch64_sve_pred_mode (mode);
2559
2560  return default_get_mask_mode (mode);
2561}
2562
2563/* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE.  */
2564
2565opt_machine_mode
2566aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
2567{
2568  enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
2569			    ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
2570  machine_mode mode;
2571  FOR_EACH_MODE_IN_CLASS (mode, mclass)
2572    if (inner_mode == GET_MODE_INNER (mode)
2573	&& known_eq (nunits, GET_MODE_NUNITS (mode))
2574	&& aarch64_sve_data_mode_p (mode))
2575      return mode;
2576  return opt_machine_mode ();
2577}
2578
2579/* Return the integer element mode associated with SVE mode MODE.  */
2580
2581static scalar_int_mode
2582aarch64_sve_element_int_mode (machine_mode mode)
2583{
2584  poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2585			     ? BITS_PER_SVE_VECTOR
2586			     : GET_MODE_BITSIZE (mode));
2587  unsigned int elt_bits = vector_element_size (vector_bits,
2588					       GET_MODE_NUNITS (mode));
2589  return int_mode_for_size (elt_bits, 0).require ();
2590}
2591
2592/* Return an integer element mode that contains exactly
2593   aarch64_sve_container_bits (MODE) bits.  This is wider than
2594   aarch64_sve_element_int_mode if MODE is a partial vector,
2595   otherwise it's the same.  */
2596
2597static scalar_int_mode
2598aarch64_sve_container_int_mode (machine_mode mode)
2599{
2600  return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
2601}
2602
2603/* Return the integer vector mode associated with SVE mode MODE.
2604   Unlike related_int_vector_mode, this can handle the case in which
2605   MODE is a predicate (and thus has a different total size).  */
2606
2607machine_mode
2608aarch64_sve_int_mode (machine_mode mode)
2609{
2610  scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
2611  return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
2612}
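
/* For example, aarch64_sve_int_mode (VNx4SFmode) is VNx4SImode, and for
   the predicate mode VNx4BImode it is also VNx4SImode, since each
   significant predicate bit there controls a 32-bit element.  */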
2613
2614/* Implement TARGET_VECTORIZE_RELATED_MODE.  */
2615
2616static opt_machine_mode
2617aarch64_vectorize_related_mode (machine_mode vector_mode,
2618				scalar_mode element_mode,
2619				poly_uint64 nunits)
2620{
2621  unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
2622
2623  /* If we're operating on SVE vectors, try to return an SVE mode.  */
2624  poly_uint64 sve_nunits;
2625  if ((vec_flags & VEC_SVE_DATA)
2626      && multiple_p (BYTES_PER_SVE_VECTOR,
2627		     GET_MODE_SIZE (element_mode), &sve_nunits))
2628    {
2629      machine_mode sve_mode;
2630      if (maybe_ne (nunits, 0U))
2631	{
2632	  /* Try to find a full or partial SVE mode with exactly
2633	     NUNITS units.  */
2634	  if (multiple_p (sve_nunits, nunits)
2635	      && aarch64_sve_data_mode (element_mode,
2636					nunits).exists (&sve_mode))
2637	    return sve_mode;
2638	}
2639      else
2640	{
2641	  /* Take the preferred number of units from the number of bytes
2642	     that fit in VECTOR_MODE.  We always start by "autodetecting"
2643	     a full vector mode with preferred_simd_mode, so vectors
2644	     chosen here will also be full vector modes.  Then
2645	     autovectorize_vector_modes tries smaller starting modes
2646	     and thus smaller preferred numbers of units.  */
2647	  sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
2648	  if (aarch64_sve_data_mode (element_mode,
2649				     sve_nunits).exists (&sve_mode))
2650	    return sve_mode;
2651	}
2652    }
2653
2654  /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors.  */
2655  if ((vec_flags & VEC_ADVSIMD)
2656      && known_eq (nunits, 0U)
2657      && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
2658      && maybe_ge (GET_MODE_BITSIZE (element_mode)
2659		   * GET_MODE_NUNITS (vector_mode), 128U))
2660    {
2661      machine_mode res = aarch64_simd_container_mode (element_mode, 128);
2662      if (VECTOR_MODE_P (res))
2663	return res;
2664    }
2665
2666  return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2667}
2668
2669/* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
2670   prefer to use the first arithmetic operand as the else value if
2671   the else value doesn't matter, since that exactly matches the SVE
2672   destructive merging form.  For ternary operations we could either
2673   pick the first operand and use FMAD-like instructions or the last
2674   operand and use FMLA-like instructions; the latter seems more
2675   natural.  */
2676
2677static tree
2678aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
2679{
2680  return nops == 3 ? ops[2] : ops[0];
2681}
2682
2683/* Implement TARGET_HARD_REGNO_NREGS.  */
2684
2685static unsigned int
2686aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
2687{
2688  /* ??? Logically we should only need to provide a value when
2689     HARD_REGNO_MODE_OK says that the combination is valid,
2690     but at the moment we need to handle all modes.  Just ignore
2691     any runtime parts for registers that can't store them.  */
2692  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
2693  switch (aarch64_regno_regclass (regno))
2694    {
2695    case FP_REGS:
2696    case FP_LO_REGS:
2697    case FP_LO8_REGS:
2698      {
2699	unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2700	if (vec_flags & VEC_SVE_DATA)
2701	  return exact_div (GET_MODE_SIZE (mode),
2702			    aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2703	return CEIL (lowest_size, UNITS_PER_VREG);
2704      }
2705    case PR_REGS:
2706    case PR_LO_REGS:
2707    case PR_HI_REGS:
2708    case FFR_REGS:
2709    case PR_AND_FFR_REGS:
2710      return 1;
2711    default:
2712      return CEIL (lowest_size, UNITS_PER_WORD);
2713    }
2714  gcc_unreachable ();
2715}
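
/* Under the rules above, for example, a single SVE data vector such as
   VNx4SImode occupies one FP register, the four-vector tuple mode
   VNx16SImode occupies four, V4SImode occupies one, and TImode occupies
   two registers when allocated to the general register file.  */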
2716
2717/* Implement TARGET_HARD_REGNO_MODE_OK.  */
2718
2719static bool
2720aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2721{
2722  if (GET_MODE_CLASS (mode) == MODE_CC)
2723    return regno == CC_REGNUM;
2724
2725  if (regno == VG_REGNUM)
2726    /* This must have the same size as _Unwind_Word.  */
2727    return mode == DImode;
2728
2729  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2730  if (vec_flags & VEC_SVE_PRED)
2731    return pr_or_ffr_regnum_p (regno);
2732
2733  if (pr_or_ffr_regnum_p (regno))
2734    return false;
2735
2736  if (regno == SP_REGNUM)
    /* The comparison with ptr_mode is there to support global register
       variables associated with the stack pointer register via the
       syntax asm ("wsp") in ILP32.  */
2740    return mode == Pmode || mode == ptr_mode;
2741
2742  if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2743    return mode == Pmode;
2744
2745  if (GP_REGNUM_P (regno))
2746    {
2747      if (vec_flags & VEC_ANY_SVE)
2748	return false;
2749      if (known_le (GET_MODE_SIZE (mode), 8))
2750	return true;
2751      if (known_le (GET_MODE_SIZE (mode), 16))
2752	return (regno & 1) == 0;
2753    }
2754  else if (FP_REGNUM_P (regno))
2755    {
2756      if (vec_flags & VEC_STRUCT)
2757	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2758      else
2759	return !VECTOR_MODE_P (mode) || vec_flags != 0;
2760    }
2761
2762  return false;
2763}
2764
2765/* Return true if a function with type FNTYPE returns its value in
2766   SVE vector or predicate registers.  */
2767
2768static bool
2769aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2770{
2771  tree return_type = TREE_TYPE (fntype);
2772
2773  pure_scalable_type_info pst_info;
2774  switch (pst_info.analyze (return_type))
2775    {
2776    case pure_scalable_type_info::IS_PST:
2777      return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2778	      && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2779
2780    case pure_scalable_type_info::DOESNT_MATTER:
2781      gcc_assert (aarch64_return_in_memory_1 (return_type));
2782      return false;
2783
2784    case pure_scalable_type_info::NO_ABI_IDENTITY:
2785    case pure_scalable_type_info::ISNT_PST:
2786      return false;
2787    }
2788  gcc_unreachable ();
2789}
2790
2791/* Return true if a function with type FNTYPE takes arguments in
2792   SVE vector or predicate registers.  */
2793
2794static bool
2795aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2796{
2797  CUMULATIVE_ARGS args_so_far_v;
2798  aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2799				NULL_TREE, 0, true);
2800  cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2801
2802  for (tree chain = TYPE_ARG_TYPES (fntype);
2803       chain && chain != void_list_node;
2804       chain = TREE_CHAIN (chain))
2805    {
2806      tree arg_type = TREE_VALUE (chain);
2807      if (arg_type == error_mark_node)
2808	return false;
2809
2810      function_arg_info arg (arg_type, /*named=*/true);
2811      apply_pass_by_reference_rules (&args_so_far_v, arg);
2812      pure_scalable_type_info pst_info;
2813      if (pst_info.analyze_registers (arg.type))
2814	{
2815	  unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2816	  unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2817	  gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2818	  return true;
2819	}
2820
2821      targetm.calls.function_arg_advance (args_so_far, arg);
2822    }
2823  return false;
2824}
2825
2826/* Implement TARGET_FNTYPE_ABI.  */
2827
2828static const predefined_function_abi &
2829aarch64_fntype_abi (const_tree fntype)
2830{
2831  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2832    return aarch64_simd_abi ();
2833
2834  if (aarch64_returns_value_in_sve_regs_p (fntype)
2835      || aarch64_takes_arguments_in_sve_regs_p (fntype))
2836    return aarch64_sve_abi ();
2837
2838  return default_function_abi;
2839}
2840
2841/* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P.  */
2842
2843static bool
2844aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2845{
2846  return (aarch64_sve::builtin_type_p (type1)
2847	  == aarch64_sve::builtin_type_p (type2));
2848}
2849
2850/* Return true if we should emit CFI for register REGNO.  */
2851
2852static bool
2853aarch64_emit_cfi_for_reg_p (unsigned int regno)
2854{
2855  return (GP_REGNUM_P (regno)
2856	  || !default_function_abi.clobbers_full_reg_p (regno));
2857}
2858
2859/* Return the mode we should use to save and restore register REGNO.  */
2860
2861static machine_mode
2862aarch64_reg_save_mode (unsigned int regno)
2863{
2864  if (GP_REGNUM_P (regno))
2865    return DImode;
2866
2867  if (FP_REGNUM_P (regno))
2868    switch (crtl->abi->id ())
2869      {
2870      case ARM_PCS_AAPCS64:
2871	/* Only the low 64 bits are saved by the base PCS.  */
2872	return DFmode;
2873
2874      case ARM_PCS_SIMD:
2875	/* The vector PCS saves the low 128 bits (which is the full
2876	   register on non-SVE targets).  */
2877	return TFmode;
2878
2879      case ARM_PCS_SVE:
2880	/* Use vectors of DImode for registers that need frame
	   information, so that the first 64 bits of the save slot
2882	   are always the equivalent of what storing D<n> would give.  */
2883	if (aarch64_emit_cfi_for_reg_p (regno))
2884	  return VNx2DImode;
2885
2886	/* Use vectors of bytes otherwise, so that the layout is
2887	   endian-agnostic, and so that we can use LDR and STR for
2888	   big-endian targets.  */
2889	return VNx16QImode;
2890
2891      case ARM_PCS_TLSDESC:
2892      case ARM_PCS_UNKNOWN:
2893	break;
2894      }
2895
2896  if (PR_REGNUM_P (regno))
2897    /* Save the full predicate register.  */
2898    return VNx16BImode;
2899
2900  gcc_unreachable ();
2901}
2902
2903/* Implement TARGET_INSN_CALLEE_ABI.  */
2904
2905const predefined_function_abi &
2906aarch64_insn_callee_abi (const rtx_insn *insn)
2907{
2908  rtx pat = PATTERN (insn);
2909  gcc_assert (GET_CODE (pat) == PARALLEL);
2910  rtx unspec = XVECEXP (pat, 0, 1);
2911  gcc_assert (GET_CODE (unspec) == UNSPEC
2912	      && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2913  return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
2914}
2915
2916/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
2917   the lower 64 bits of a 128-bit register.  Tell the compiler the callee
2918   clobbers the top 64 bits when restoring the bottom 64 bits.  */
2919
2920static bool
2921aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2922					unsigned int regno,
2923					machine_mode mode)
2924{
2925  if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2926    {
2927      poly_int64 per_register_size = GET_MODE_SIZE (mode);
2928      unsigned int nregs = hard_regno_nregs (regno, mode);
2929      if (nregs > 1)
2930	per_register_size = exact_div (per_register_size, nregs);
2931      if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2932	return maybe_gt (per_register_size, 16);
2933      return maybe_gt (per_register_size, 8);
2934    }
2935  return false;
2936}
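
/* For example, with the base PCS a DFmode value in v8 is not reported as
   part-clobbered (the low 64 bits are preserved), whereas a TFmode value
   is, because only its low 64 bits survive the call.  With the vector PCS
   the threshold is 16 bytes, so TFmode is safe but full SVE data modes are
   still part-clobbered.  The SVE PCS never reports a partial clobber for
   FP registers here.  */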
2937
2938/* Implement REGMODE_NATURAL_SIZE.  */
2939poly_uint64
2940aarch64_regmode_natural_size (machine_mode mode)
2941{
2942  /* The natural size for SVE data modes is one SVE data vector,
2943     and similarly for predicates.  We can't independently modify
2944     anything smaller than that.  */
2945  /* ??? For now, only do this for variable-width SVE registers.
2946     Doing it for constant-sized registers breaks lower-subreg.c.  */
2947  /* ??? And once that's fixed, we should probably have similar
2948     code for Advanced SIMD.  */
2949  if (!aarch64_sve_vg.is_constant ())
2950    {
2951      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2952      if (vec_flags & VEC_SVE_PRED)
2953	return BYTES_PER_SVE_PRED;
2954      if (vec_flags & VEC_SVE_DATA)
2955	return BYTES_PER_SVE_VECTOR;
2956    }
2957  return UNITS_PER_WORD;
2958}
2959
2960/* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
2961machine_mode
2962aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2963				     machine_mode mode)
2964{
2965  /* The predicate mode determines which bits are significant and
2966     which are "don't care".  Decreasing the number of lanes would
2967     lose data while increasing the number of lanes would make bits
2968     unnecessarily significant.  */
2969  if (PR_REGNUM_P (regno))
2970    return mode;
2971  if (known_ge (GET_MODE_SIZE (mode), 4))
2972    return mode;
2973  else
2974    return SImode;
2975}
2976
2977/* Return true if I's bits are consecutive ones from the MSB.  */
2978bool
2979aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2980{
2981  return exact_log2 (-i) != HOST_WIDE_INT_M1;
2982}
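
/* For example, the value 0xffffffffffff0000 (i.e. -0x10000) satisfies the
   test above, since its negation is a power of two, whereas
   0x00000000ffff0000 does not.  */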
2983
2984/* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
2985   that strcpy from constants will be faster.  */
2986
2987static HOST_WIDE_INT
2988aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2989{
2990  if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2991    return MAX (align, BITS_PER_WORD);
2992  return align;
2993}
2994
/* Return true if calls to DECL should be treated as
   long-calls (i.e. called via a register).  */
2997static bool
2998aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2999{
3000  return false;
3001}
3002
/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (i.e. called via a register).  */
3005bool
3006aarch64_is_long_call_p (rtx sym)
3007{
3008  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
3009}
3010
3011/* Return true if calls to symbol-ref SYM should not go through
3012   plt stubs.  */
3013
3014bool
3015aarch64_is_noplt_call_p (rtx sym)
3016{
3017  const_tree decl = SYMBOL_REF_DECL (sym);
3018
3019  if (flag_pic
3020      && decl
3021      && (!flag_plt
3022	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
3023      && !targetm.binds_local_p (decl))
3024    return true;
3025
3026  return false;
3027}
3028
3029/* Return true if the offsets to a zero/sign-extract operation
3030   represent an expression that matches an extend operation.  The
3031   operands represent the parameters from
3032
3033   (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
3034bool
3035aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
3036				rtx extract_imm)
3037{
3038  HOST_WIDE_INT mult_val, extract_val;
3039
3040  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
3041    return false;
3042
3043  mult_val = INTVAL (mult_imm);
3044  extract_val = INTVAL (extract_imm);
3045
3046  if (extract_val > 8
3047      && extract_val < GET_MODE_BITSIZE (mode)
3048      && exact_log2 (extract_val & ~7) > 0
3049      && (extract_val & 7) <= 4
3050      && mult_val == (1 << (extract_val & 7)))
3051    return true;
3052
3053  return false;
3054}
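
/* For example (with MODE == DImode), EXTRACT_IMM == 34 and MULT_IMM == 4
   satisfy the conditions above; the extract is then equivalent to an
   extend of the low 32 bits of the register combined with a left shift
   by 2, as used in extended-register address arithmetic.  EXTRACT_IMM == 8
   is rejected because the extracted width must be greater than 8.  */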
3055
3056/* Emit an insn that's a simple single-set.  Both the operands must be
3057   known to be valid.  */
3058inline static rtx_insn *
3059emit_set_insn (rtx x, rtx y)
3060{
3061  return emit_insn (gen_rtx_SET (x, y));
3062}
3063
3064/* X and Y are two things to compare using CODE.  Emit the compare insn and
3065   return the rtx for register 0 in the proper mode.  */
3066rtx
3067aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
3068{
3069  machine_mode cmp_mode = GET_MODE (x);
3070  machine_mode cc_mode;
3071  rtx cc_reg;
3072
3073  if (cmp_mode == TImode)
3074    {
3075      gcc_assert (code == NE);
3076
3077      cc_mode = CCmode;
3078      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3079
3080      rtx x_lo = operand_subword (x, 0, 0, TImode);
3081      rtx y_lo = operand_subword (y, 0, 0, TImode);
3082      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
3083
3084      rtx x_hi = operand_subword (x, 1, 0, TImode);
3085      rtx y_hi = operand_subword (y, 1, 0, TImode);
3086      emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
3087			       gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
3088			       GEN_INT (AARCH64_EQ)));
3089    }
3090  else
3091    {
3092      cc_mode = SELECT_CC_MODE (code, x, y);
3093      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3094      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
3095    }
3096  return cc_reg;
3097}
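
/* For the TImode case above, the emitted sequence is roughly (register
   numbers illustrative):

     cmp  x0, x2
     ccmp x1, x3, #0, eq

   i.e. compare the low halves, and compare the high halves only if the low
   halves were equal; otherwise force a "not equal" result.  */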
3098
3099/* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
3100
3101static rtx
3102aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
3103                                  machine_mode y_mode)
3104{
3105  if (y_mode == E_QImode || y_mode == E_HImode)
3106    {
3107      if (CONST_INT_P (y))
3108	{
3109	  y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
3110	  y_mode = SImode;
3111	}
3112      else
3113	{
3114	  rtx t, cc_reg;
3115	  machine_mode cc_mode;
3116
3117	  t = gen_rtx_ZERO_EXTEND (SImode, y);
3118	  t = gen_rtx_COMPARE (CC_SWPmode, t, x);
3119	  cc_mode = CC_SWPmode;
3120	  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3121	  emit_set_insn (cc_reg, t);
3122	  return cc_reg;
3123	}
3124    }
3125
3126  if (!aarch64_plus_operand (y, y_mode))
3127    y = force_reg (y_mode, y);
3128
3129  return aarch64_gen_compare_reg (code, x, y);
3130}
3131
3132/* Build the SYMBOL_REF for __tls_get_addr.  */
3133
3134static GTY(()) rtx tls_get_addr_libfunc;
3135
3136rtx
3137aarch64_tls_get_addr (void)
3138{
3139  if (!tls_get_addr_libfunc)
3140    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
3141  return tls_get_addr_libfunc;
3142}
3143
3144/* Return the TLS model to use for ADDR.  */
3145
3146static enum tls_model
3147tls_symbolic_operand_type (rtx addr)
3148{
3149  enum tls_model tls_kind = TLS_MODEL_NONE;
3150  poly_int64 offset;
3151  addr = strip_offset_and_salt (addr, &offset);
3152  if (GET_CODE (addr) == SYMBOL_REF)
3153    tls_kind = SYMBOL_REF_TLS_MODEL (addr);
3154
3155  return tls_kind;
3156}
3157
/* We allow LO_SUMs in our legitimate addresses so that combine can
   take care of combining addresses where necessary, but for generation
   purposes we generate the address as:
3162   RTL                               Absolute
3163   tmp = hi (symbol_ref);            adrp  x1, foo
3164   dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
3165                                     nop
3166
3167   PIC                               TLS
3168   adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
3169   ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
3170                                     bl   __tls_get_addr
3171                                     nop
3172
3173   Load TLS symbol, depending on TLS mechanism and TLS access model.
3174
3175   Global Dynamic - Traditional TLS:
3176   adrp tmp, :tlsgd:imm
3177   add  dest, tmp, #:tlsgd_lo12:imm
3178   bl   __tls_get_addr
3179
3180   Global Dynamic - TLS Descriptors:
3181   adrp dest, :tlsdesc:imm
3182   ldr  tmp, [dest, #:tlsdesc_lo12:imm]
3183   add  dest, dest, #:tlsdesc_lo12:imm
3184   blr  tmp
3185   mrs  tp, tpidr_el0
3186   add  dest, dest, tp
3187
3188   Initial Exec:
3189   mrs  tp, tpidr_el0
3190   adrp tmp, :gottprel:imm
3191   ldr  dest, [tmp, #:gottprel_lo12:imm]
3192   add  dest, dest, tp
3193
3194   Local Exec:
3195   mrs  tp, tpidr_el0
3196   add  t0, tp, #:tprel_hi12:imm, lsl #12
3197   add  t0, t0, #:tprel_lo12_nc:imm
3198*/
3199
3200static void
3201aarch64_load_symref_appropriately (rtx dest, rtx imm,
3202				   enum aarch64_symbol_type type)
3203{
3204  switch (type)
3205    {
3206    case SYMBOL_SMALL_ABSOLUTE:
3207      {
3208	/* In ILP32, the mode of dest can be either SImode or DImode.  */
3209	rtx tmp_reg = dest;
3210	machine_mode mode = GET_MODE (dest);
3211
3212	gcc_assert (mode == Pmode || mode == ptr_mode);
3213
3214	if (can_create_pseudo_p ())
3215	  tmp_reg = gen_reg_rtx (mode);
3216
3217	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
3218	emit_insn (gen_add_losym (dest, tmp_reg, imm));
3219	return;
3220      }
3221
3222    case SYMBOL_TINY_ABSOLUTE:
3223      emit_insn (gen_rtx_SET (dest, imm));
3224      return;
3225
3226    case SYMBOL_SMALL_GOT_28K:
3227      {
3228	machine_mode mode = GET_MODE (dest);
3229	rtx gp_rtx = pic_offset_table_rtx;
3230	rtx insn;
3231	rtx mem;
3232
	/* NOTE: pic_offset_table_rtx can be NULL_RTX because we can reach
	   here before RTL expansion.  Tree IVOPTS generates RTL patterns to
	   estimate rtx costs, in which case pic_offset_table_rtx is not
	   initialized.  In that case there is no need to generate the first
	   adrp instruction, as the final cost of a global variable access
	   is one instruction.  */
3239	if (gp_rtx != NULL)
3240	  {
	    /* -fpic for -mcmodel=small allows a 32K GOT table size (but since
	       we use the page base as the GOT base, the first page may be
	       wasted; in the worst case only 28K of GOT space is left).
3244
	       The generated instruction sequence for accessing a global
	       variable is:
3247
3248		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
3249
	       Only one instruction is needed, but we must initialize
	       pic_offset_table_rtx properly.  We generate an initialization
	       insn for every global access and let CSE remove the redundant
	       copies.
3253
	       The final instruction sequence for multiple global variable
	       accesses will look like the following:
3256
3257		 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
3258
3259		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
3260		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
3261		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
3262		 ...  */
3263
3264	    rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
3265	    crtl->uses_pic_offset_table = 1;
3266	    emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
3267
	    if (mode != GET_MODE (gp_rtx))
	      gp_rtx = gen_lowpart (mode, gp_rtx);
3270
3271	  }
3272
3273	if (mode == ptr_mode)
3274	  {
3275	    if (mode == DImode)
3276	      insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
3277	    else
3278	      insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
3279
3280	    mem = XVECEXP (SET_SRC (insn), 0, 0);
3281	  }
3282	else
3283	  {
3284	    gcc_assert (mode == Pmode);
3285
3286	    insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
3287	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3288	  }
3289
	/* The operand is expected to be a MEM.  Whenever the related insn
	   pattern changes, the code above that computes MEM must be
	   updated too.  */
3293	gcc_assert (GET_CODE (mem) == MEM);
3294	MEM_READONLY_P (mem) = 1;
3295	MEM_NOTRAP_P (mem) = 1;
3296	emit_insn (insn);
3297	return;
3298      }
3299
3300    case SYMBOL_SMALL_GOT_4G:
3301      {
3302	/* In ILP32, the mode of dest can be either SImode or DImode,
3303	   while the got entry is always of SImode size.  The mode of
3304	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. stored in memory), it has SImode; it may have
	   DImode if dest is dereferenced to access memory.
3307	   This is why we have to handle three different ldr_got_small
3308	   patterns here (two patterns for ILP32).  */
3309
3310	rtx insn;
3311	rtx mem;
3312	rtx tmp_reg = dest;
3313	machine_mode mode = GET_MODE (dest);
3314
3315	if (can_create_pseudo_p ())
3316	  tmp_reg = gen_reg_rtx (mode);
3317
3318	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
3319	if (mode == ptr_mode)
3320	  {
3321	    if (mode == DImode)
3322	      insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
3323	    else
3324	      insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
3325
3326	    mem = XVECEXP (SET_SRC (insn), 0, 0);
3327	  }
3328	else
3329	  {
3330	    gcc_assert (mode == Pmode);
3331
3332	    insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
3333	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3334	  }
3335
3336	gcc_assert (GET_CODE (mem) == MEM);
3337	MEM_READONLY_P (mem) = 1;
3338	MEM_NOTRAP_P (mem) = 1;
3339	emit_insn (insn);
3340	return;
3341      }
3342
3343    case SYMBOL_SMALL_TLSGD:
3344      {
3345	rtx_insn *insns;
3346	/* The return type of __tls_get_addr is the C pointer type
3347	   so use ptr_mode.  */
3348	rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
3349	rtx tmp_reg = dest;
3350
3351	if (GET_MODE (dest) != ptr_mode)
3352	  tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
3353
3354	start_sequence ();
3355	if (ptr_mode == SImode)
3356	  aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
3357	else
3358	  aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
3359	insns = get_insns ();
3360	end_sequence ();
3361
3362	RTL_CONST_CALL_P (insns) = 1;
3363	emit_libcall_block (insns, tmp_reg, result, imm);
	/* Convert back to the mode of dest, adding a zero_extend
	   from SImode (ptr_mode) to DImode (Pmode).  */
3366	if (dest != tmp_reg)
3367	  convert_move (dest, tmp_reg, true);
3368	return;
3369      }
3370
3371    case SYMBOL_SMALL_TLSDESC:
3372      {
3373	machine_mode mode = GET_MODE (dest);
3374	rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
3375	rtx tp;
3376
3377	gcc_assert (mode == Pmode || mode == ptr_mode);
3378
3379	/* In ILP32, the got entry is always of SImode size.  Unlike
3380	   small GOT, the dest is fixed at reg 0.  */
3381	if (TARGET_ILP32)
3382	  emit_insn (gen_tlsdesc_small_si (imm));
3383	else
3384	  emit_insn (gen_tlsdesc_small_di (imm));
3385	tp = aarch64_load_tp (NULL);
3386
3387	if (mode != Pmode)
3388	  tp = gen_lowpart (mode, tp);
3389
3390	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
3391	if (REG_P (dest))
3392	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3393	return;
3394      }
3395
3396    case SYMBOL_SMALL_TLSIE:
3397      {
3398	/* In ILP32, the mode of dest can be either SImode or DImode,
3399	   while the got entry is always of SImode size.  The mode of
3400	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. stored in memory), it has SImode; it may have
	   DImode if dest is dereferenced to access memory.
3403	   This is why we have to handle three different tlsie_small
3404	   patterns here (two patterns for ILP32).  */
3405	machine_mode mode = GET_MODE (dest);
3406	rtx tmp_reg = gen_reg_rtx (mode);
3407	rtx tp = aarch64_load_tp (NULL);
3408
3409	if (mode == ptr_mode)
3410	  {
3411	    if (mode == DImode)
3412	      emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3413	    else
3414	      {
3415		emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3416		tp = gen_lowpart (mode, tp);
3417	      }
3418	  }
3419	else
3420	  {
3421	    gcc_assert (mode == Pmode);
3422	    emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3423	  }
3424
3425	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
3426	if (REG_P (dest))
3427	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3428	return;
3429      }
3430
3431    case SYMBOL_TLSLE12:
3432    case SYMBOL_TLSLE24:
3433    case SYMBOL_TLSLE32:
3434    case SYMBOL_TLSLE48:
3435      {
3436	machine_mode mode = GET_MODE (dest);
3437	rtx tp = aarch64_load_tp (NULL);
3438
3439	if (mode != Pmode)
3440	  tp = gen_lowpart (mode, tp);
3441
3442	switch (type)
3443	  {
3444	  case SYMBOL_TLSLE12:
3445	    emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3446			(dest, tp, imm));
3447	    break;
3448	  case SYMBOL_TLSLE24:
3449	    emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3450			(dest, tp, imm));
3451	    break;
3452	  case SYMBOL_TLSLE32:
3453	    emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3454			(dest, imm));
3455	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3456			(dest, dest, tp));
3457	    break;
3458	  case SYMBOL_TLSLE48:
3459	    emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3460			(dest, imm));
3461	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3462			(dest, dest, tp));
3463	    break;
3464	  default:
3465	    gcc_unreachable ();
3466	  }
3467
3468	if (REG_P (dest))
3469	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3470	return;
3471      }
3472
3473    case SYMBOL_TINY_GOT:
3474      {
3475	rtx insn;
3476	machine_mode mode = GET_MODE (dest);
3477
3478	if (mode == ptr_mode)
3479	  insn = gen_ldr_got_tiny (mode, dest, imm);
3480	else
3481	  {
3482	    gcc_assert (mode == Pmode);
3483	    insn = gen_ldr_got_tiny_sidi (dest, imm);
3484	  }
3485
3486	emit_insn (insn);
3487	return;
3488      }
3489
3490    case SYMBOL_TINY_TLSIE:
3491      {
3492	machine_mode mode = GET_MODE (dest);
3493	rtx tp = aarch64_load_tp (NULL);
3494
3495	if (mode == ptr_mode)
3496	  {
3497	    if (mode == DImode)
3498	      emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3499	    else
3500	      {
3501		tp = gen_lowpart (mode, tp);
3502		emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3503	      }
3504	  }
3505	else
3506	  {
3507	    gcc_assert (mode == Pmode);
3508	    emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3509	  }
3510
3511	if (REG_P (dest))
3512	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3513	return;
3514      }
3515
3516    default:
3517      gcc_unreachable ();
3518    }
3519}
3520
3521/* Emit a move from SRC to DEST.  Assume that the move expanders can
3522   handle all moves if !can_create_pseudo_p ().  The distinction is
3523   important because, unlike emit_move_insn, the move expanders know
3524   how to force Pmode objects into the constant pool even when the
3525   constant pool address is not itself legitimate.  */
3526static rtx
3527aarch64_emit_move (rtx dest, rtx src)
3528{
3529  return (can_create_pseudo_p ()
3530	  ? emit_move_insn (dest, src)
3531	  : emit_move_insn_1 (dest, src));
3532}
3533
3534/* Apply UNOPTAB to OP and store the result in DEST.  */
3535
3536static void
3537aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3538{
3539  rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3540  if (dest != tmp)
3541    emit_move_insn (dest, tmp);
3542}
3543
3544/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */
3545
3546static void
3547aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3548{
3549  rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3550			  OPTAB_DIRECT);
3551  if (dest != tmp)
3552    emit_move_insn (dest, tmp);
3553}
3554
3555/* Split a 128-bit move operation into two 64-bit move operations,
3556   taking care to handle partial overlap of register to register
3557   copies.  Special cases are needed when moving between GP regs and
3558   FP regs.  SRC can be a register, constant or memory; DST a register
3559   or memory.  If either operand is memory it must not have any side
3560   effects.  */
3561void
3562aarch64_split_128bit_move (rtx dst, rtx src)
3563{
3564  rtx dst_lo, dst_hi;
3565  rtx src_lo, src_hi;
3566
3567  machine_mode mode = GET_MODE (dst);
3568
3569  gcc_assert (mode == TImode || mode == TFmode);
3570  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3571  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
3572
3573  if (REG_P (dst) && REG_P (src))
3574    {
3575      int src_regno = REGNO (src);
3576      int dst_regno = REGNO (dst);
3577
3578      /* Handle FP <-> GP regs.  */
3579      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3580	{
3581	  src_lo = gen_lowpart (word_mode, src);
3582	  src_hi = gen_highpart (word_mode, src);
3583
3584	  emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3585	  emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
3586	  return;
3587	}
3588      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3589	{
3590	  dst_lo = gen_lowpart (word_mode, dst);
3591	  dst_hi = gen_highpart (word_mode, dst);
3592
3593	  emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3594	  emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
3595	  return;
3596	}
3597    }
3598
3599  dst_lo = gen_lowpart (word_mode, dst);
3600  dst_hi = gen_highpart (word_mode, dst);
3601  src_lo = gen_lowpart (word_mode, src);
3602  src_hi = gen_highpart_mode (word_mode, mode, src);
3603
3604  /* At most one pairing may overlap.  */
3605  if (reg_overlap_mentioned_p (dst_lo, src_hi))
3606    {
3607      aarch64_emit_move (dst_hi, src_hi);
3608      aarch64_emit_move (dst_lo, src_lo);
3609    }
3610  else
3611    {
3612      aarch64_emit_move (dst_lo, src_lo);
3613      aarch64_emit_move (dst_hi, src_hi);
3614    }
3615}
3616
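/* Return true if a 128-bit move from SRC to DST should be split into
   two 64-bit moves.  A copy between two FP registers can instead be
   handled by a single 128-bit move.  */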
3617bool
3618aarch64_split_128bit_move_p (rtx dst, rtx src)
3619{
3620  return (! REG_P (src)
3621	  || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
3622}
3623
3624/* Split a complex SIMD combine.  */
3625
3626void
3627aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
3628{
3629  machine_mode src_mode = GET_MODE (src1);
3630  machine_mode dst_mode = GET_MODE (dst);
3631
3632  gcc_assert (VECTOR_MODE_P (dst_mode));
3633  gcc_assert (register_operand (dst, dst_mode)
3634	      && register_operand (src1, src_mode)
3635	      && register_operand (src2, src_mode));
3636
3637  emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
3638  return;
3639}
3640
3641/* Split a complex SIMD move.  */
3642
3643void
3644aarch64_split_simd_move (rtx dst, rtx src)
3645{
3646  machine_mode src_mode = GET_MODE (src);
3647  machine_mode dst_mode = GET_MODE (dst);
3648
3649  gcc_assert (VECTOR_MODE_P (dst_mode));
3650
3651  if (REG_P (dst) && REG_P (src))
3652    {
3653      gcc_assert (VECTOR_MODE_P (src_mode));
3654      emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
3655    }
3656}
3657
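/* Return true if X, which has mode XMODE, is equal to the zero
   extension of Y from YMODE to XMODE.  */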
3658bool
3659aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3660			      machine_mode ymode, rtx y)
3661{
3662  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3663  gcc_assert (r != NULL);
3664  return rtx_equal_p (x, r);
3665}
3666
3667/* Return TARGET if it is nonnull and a register of mode MODE.
3668   Otherwise, return a fresh register of mode MODE if we can,
3669   or TARGET reinterpreted as MODE if we can't.  */
3670
3671static rtx
3672aarch64_target_reg (rtx target, machine_mode mode)
3673{
3674  if (target && REG_P (target) && GET_MODE (target) == mode)
3675    return target;
3676  if (!can_create_pseudo_p ())
3677    {
3678      gcc_assert (target);
3679      return gen_lowpart (mode, target);
3680    }
3681  return gen_reg_rtx (mode);
3682}
3683
3684/* Return a register that contains the constant in BUILDER, given that
3685   the constant is a legitimate move operand.  Use TARGET as the register
3686   if it is nonnull and convenient.  */
3687
3688static rtx
3689aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3690{
3691  rtx src = builder.build ();
3692  target = aarch64_target_reg (target, GET_MODE (src));
3693  emit_insn (gen_rtx_SET (target, src));
3694  return target;
3695}
3696
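/* Force VALUE into a register of mode MODE, creating a pseudo if
   possible; otherwise move VALUE into X, which must be nonnull, and
   return X.  */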
3697static rtx
3698aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
3699{
3700  if (can_create_pseudo_p ())
3701    return force_reg (mode, value);
3702  else
3703    {
3704      gcc_assert (x);
3705      aarch64_emit_move (x, value);
3706      return x;
3707    }
3708}
3709
3710/* Return true if predicate value X is a constant in which every element
3711   is a CONST_INT.  When returning true, describe X in BUILDER as a VNx16BI
3712   value, i.e. as a predicate in which all bits are significant.  */
3713
3714static bool
3715aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3716{
3717  if (GET_CODE (x) != CONST_VECTOR)
3718    return false;
3719
3720  unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3721					     GET_MODE_NUNITS (GET_MODE (x)));
3722  unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3723  unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3724  builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3725
3726  unsigned int nelts = const_vector_encoded_nelts (x);
3727  for (unsigned int i = 0; i < nelts; ++i)
3728    {
3729      rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3730      if (!CONST_INT_P (elt))
3731	return false;
3732
3733      builder.quick_push (elt);
3734      for (unsigned int j = 1; j < factor; ++j)
3735	builder.quick_push (const0_rtx);
3736    }
3737  builder.finalize ();
3738  return true;
3739}
3740
3741/* BUILDER contains a predicate constant of mode VNx16BI.  Return the
3742   widest predicate element size it can have (that is, the largest size
3743   for which each element would still be 0 or 1).  */
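/* For example, the VNx16BI encoding of an all-true .H predicate repeats
   the elements { 1, 0 }, so its only set bits are at even indices and
   the result is 2, the .H element size in bytes.  */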
3744
3745unsigned int
3746aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3747{
3748  /* Start with the most optimistic assumption: that we only need
3749     one bit per pattern.  This is what we will use if only the first
3750     bit in each pattern is ever set.  */
3751  unsigned int mask = GET_MODE_SIZE (DImode);
3752  mask |= builder.npatterns ();
3753
3754  /* Look for set bits.  */
3755  unsigned int nelts = builder.encoded_nelts ();
3756  for (unsigned int i = 1; i < nelts; ++i)
3757    if (INTVAL (builder.elt (i)) != 0)
3758      {
3759	if (i & 1)
3760	  return 1;
3761	mask |= i;
3762      }
3763  return mask & -mask;
3764}
3765
3766/* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3767   return that predicate mode, otherwise return opt_machine_mode ().  */
3768
3769opt_machine_mode
3770aarch64_ptrue_all_mode (rtx x)
3771{
3772  gcc_assert (GET_MODE (x) == VNx16BImode);
3773  if (GET_CODE (x) != CONST_VECTOR
3774      || !CONST_VECTOR_DUPLICATE_P (x)
3775      || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3776      || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3777    return opt_machine_mode ();
3778
3779  unsigned int nelts = const_vector_encoded_nelts (x);
3780  for (unsigned int i = 1; i < nelts; ++i)
3781    if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3782      return opt_machine_mode ();
3783
3784  return aarch64_sve_pred_mode (nelts);
3785}
3786
3787/* BUILDER is a predicate constant of mode VNx16BI.  Consider the value
3788   that the constant would have with predicate element size ELT_SIZE
3789   (ignoring the upper bits in each element) and return:
3790
3791   * -1 if all bits are set
3792   * N if the predicate has N leading set bits followed by all clear bits
3793   * 0 if the predicate does not have any of these forms.  */
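/* For example, with an ELT_SIZE of 2, a constant whose .H elements are
   { 1, 1, 1, 0, 0, ... } (with every later element clear) gives 3,
   while an all-true constant gives -1.  */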
3794
3795int
3796aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3797			      unsigned int elt_size)
3798{
3799  /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3800     followed by set bits.  */
3801  if (builder.nelts_per_pattern () == 3)
3802    return 0;
3803
3804  /* Skip over leading set bits.  */
3805  unsigned int nelts = builder.encoded_nelts ();
3806  unsigned int i = 0;
3807  for (; i < nelts; i += elt_size)
3808    if (INTVAL (builder.elt (i)) == 0)
3809      break;
3810  unsigned int vl = i / elt_size;
3811
3812  /* Check for the all-true case.  */
3813  if (i == nelts)
3814    return -1;
3815
3816  /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3817     repeating pattern of set bits followed by clear bits.  */
3818  if (builder.nelts_per_pattern () != 2)
3819    return 0;
3820
3821  /* We have a "foreground" value and a duplicated "background" value.
3822     If the background might repeat and the last set bit belongs to it,
3823     we might have set bits followed by clear bits followed by set bits.  */
3824  if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3825    return 0;
3826
3827  /* Make sure that the rest are all clear.  */
3828  for (; i < nelts; i += elt_size)
3829    if (INTVAL (builder.elt (i)) != 0)
3830      return 0;
3831
3832  return vl;
3833}
3834
3835/* See if there is an svpattern that encodes an SVE predicate of mode
3836   PRED_MODE in which the first VL bits are set and the rest are clear.
3837   Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3838   A VL of -1 indicates an all-true vector.  */
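/* For example, a VL of 7 maps to AARCH64_SV_VL7 and a VL of 32 maps to
   AARCH64_SV_VL32, provided that PRED_MODE has at least that many
   elements.  */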
3839
3840aarch64_svpattern
3841aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3842{
3843  if (vl < 0)
3844    return AARCH64_SV_ALL;
3845
3846  if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3847    return AARCH64_NUM_SVPATTERNS;
3848
3849  if (vl >= 1 && vl <= 8)
3850    return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3851
3852  if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3853    return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3854
3855  int max_vl;
3856  if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3857    {
3858      if (vl == (max_vl / 3) * 3)
3859	return AARCH64_SV_MUL3;
3860      /* These would only trigger for non-power-of-2 lengths.  */
3861      if (vl == (max_vl & -4))
3862	return AARCH64_SV_MUL4;
3863      if (vl == (1 << floor_log2 (max_vl)))
3864	return AARCH64_SV_POW2;
3865      if (vl == max_vl)
3866	return AARCH64_SV_ALL;
3867    }
3868  return AARCH64_NUM_SVPATTERNS;
3869}
3870
3871/* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3872   bits has the lowest bit set and the upper bits clear.  This is the
3873   VNx16BImode equivalent of a PTRUE for controlling elements of
3874   ELT_SIZE bytes.  However, because the constant is VNx16BImode,
3875   all bits are significant, even the upper zeros.  */
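/* For example, aarch64_ptrue_all (2) builds the repeating VNx16BI
   constant { 1, 0, 1, 0, ... }, which acts as a PTRUE for .H elements.  */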
3876
3877rtx
3878aarch64_ptrue_all (unsigned int elt_size)
3879{
3880  rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3881  builder.quick_push (const1_rtx);
3882  for (unsigned int i = 1; i < elt_size; ++i)
3883    builder.quick_push (const0_rtx);
3884  return builder.build ();
3885}
3886
3887/* Return an all-true predicate register of mode MODE.  */
3888
3889rtx
3890aarch64_ptrue_reg (machine_mode mode)
3891{
3892  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3893  rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3894  return gen_lowpart (mode, reg);
3895}
3896
3897/* Return an all-false predicate register of mode MODE.  */
3898
3899rtx
3900aarch64_pfalse_reg (machine_mode mode)
3901{
3902  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3903  rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3904  return gen_lowpart (mode, reg);
3905}
3906
3907/* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3908   for it.  PRED2[0] is the predicate for the instruction whose result
3909   is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3910   for it.  Return true if we can prove that the two predicates are
3911   equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3912   with PRED1[0] without changing behavior.  */
3913
3914bool
3915aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3916{
3917  machine_mode mode = GET_MODE (pred1[0]);
3918  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3919	      && mode == GET_MODE (pred2[0])
3920	      && aarch64_sve_ptrue_flag (pred1[1], SImode)
3921	      && aarch64_sve_ptrue_flag (pred2[1], SImode));
3922
3923  bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3924		   || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3925  bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3926		   || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3927  return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3928}
3929
3930	/* Emit a comparison CMP between OP1 and OP2, both of which have mode
3931   DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3932   Use TARGET as the target register if nonnull and convenient.  */
3933
3934static rtx
3935aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3936			  machine_mode data_mode, rtx op1, rtx op2)
3937{
3938  insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3939  expand_operand ops[5];
3940  create_output_operand (&ops[0], target, pred_mode);
3941  create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3942  create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3943  create_input_operand (&ops[3], op1, data_mode);
3944  create_input_operand (&ops[4], op2, data_mode);
3945  expand_insn (icode, 5, ops);
3946  return ops[0].value;
3947}
3948
3949/* Use a comparison to convert integer vector SRC into MODE, which is
3950   the corresponding SVE predicate mode.  Use TARGET for the result
3951   if it's nonnull and convenient.  */
3952
3953rtx
3954aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3955{
3956  machine_mode src_mode = GET_MODE (src);
3957  return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3958				   src, CONST0_RTX (src_mode));
3959}
3960
3961/* Return the assembly token for svprfop value PRFOP.  */
3962
3963static const char *
3964svprfop_token (enum aarch64_svprfop prfop)
3965{
3966  switch (prfop)
3967    {
3968#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3969    AARCH64_FOR_SVPRFOP (CASE)
3970#undef CASE
3971    case AARCH64_NUM_SVPRFOPS:
3972      break;
3973    }
3974  gcc_unreachable ();
3975}
3976
3977/* Return the assembly string for an SVE prefetch operation with
3978   mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3979   and that SUFFIX is the format for the remaining operands.  */
3980
3981char *
3982aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3983			     const char *suffix)
3984{
3985  static char buffer[128];
3986  aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3987  unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3988				   mnemonic, svprfop_token (prfop), suffix);
3989  gcc_assert (written < sizeof (buffer));
3990  return buffer;
3991}
3992
3993/* Check whether we can calculate the number of elements in PATTERN
3994   at compile time, given that there are NELTS_PER_VQ elements per
3995   128-bit block.  Return the value if so, otherwise return -1.  */
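/* For example, with 2 elements per quadword (.D), AARCH64_SV_VL8 folds
   to 8 if the vector length is known to be 512 bits and to 0 (a PFALSE)
   if it is known to be 256 bits; if the length is not known at compile
   time, the result is -1.  */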
3996
3997HOST_WIDE_INT
3998aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3999{
4000  unsigned int vl, const_vg;
4001  if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
4002    vl = 1 + (pattern - AARCH64_SV_VL1);
4003  else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
4004    vl = 16 << (pattern - AARCH64_SV_VL16);
4005  else if (aarch64_sve_vg.is_constant (&const_vg))
4006    {
4007      /* There are two vector granules per quadword.  */
4008      unsigned int nelts = (const_vg / 2) * nelts_per_vq;
4009      switch (pattern)
4010	{
4011	case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
4012	case AARCH64_SV_MUL4: return nelts & -4;
4013	case AARCH64_SV_MUL3: return (nelts / 3) * 3;
4014	case AARCH64_SV_ALL: return nelts;
4015	default: gcc_unreachable ();
4016	}
4017    }
4018  else
4019    return -1;
4020
4021  /* There are two vector granules per quadword.  */
4022  poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
4023  if (known_le (vl, nelts_all))
4024    return vl;
4025
4026  /* Requesting more elements than are available results in a PFALSE.  */
4027  if (known_gt (vl, nelts_all))
4028    return 0;
4029
4030  return -1;
4031}
4032
4033/* Return true if we can move VALUE into a register using a single
4034   CNT[BHWD] instruction.  */
4035
4036static bool
4037aarch64_sve_cnt_immediate_p (poly_int64 value)
4038{
4039  HOST_WIDE_INT factor = value.coeffs[0];
4040  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
4041  return (value.coeffs[1] == factor
4042	  && IN_RANGE (factor, 2, 16 * 16)
4043	  && (factor & 1) == 0
4044	  && factor <= 16 * (factor & -factor));
4045}
4046
4047/* Likewise for rtx X.  */
4048
4049bool
4050aarch64_sve_cnt_immediate_p (rtx x)
4051{
4052  poly_int64 value;
4053  return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
4054}
4055
4056/* Return the asm string for an instruction with a CNT-like vector size
4057   operand (a vector pattern followed by a multiplier in the range [1, 16]).
4058   PREFIX is the mnemonic without the size suffix and OPERANDS is the
4059   first part of the operands template (the part that comes before the
4060   vector size itself).  PATTERN is the pattern to use.  FACTOR is the
4061	   value to count per 128-bit quadword.  NELTS_PER_VQ, if nonzero, is the
4062	   number of elements in each quadword, or zero to use any element size.  */
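/* For example, with PREFIX "cnt", the ALL pattern, a FACTOR of 32 and a
   zero NELTS_PER_VQ, the result is "cntb\t" followed by OPERANDS and
   ", all, mul #2".  */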
4063
4064static char *
4065aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
4066				  aarch64_svpattern pattern,
4067				  unsigned int factor,
4068				  unsigned int nelts_per_vq)
4069{
4070  static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
4071
4072  if (nelts_per_vq == 0)
4073    /* There is some overlap in the ranges of the four CNT instructions.
4074       Here we always use the smallest possible element size, so that the
4075	       multiplier is 1 wherever possible.  */
4076    nelts_per_vq = factor & -factor;
4077  int shift = std::min (exact_log2 (nelts_per_vq), 4);
4078  gcc_assert (IN_RANGE (shift, 1, 4));
4079  char suffix = "dwhb"[shift - 1];
4080
4081  factor >>= shift;
4082  unsigned int written;
4083  if (pattern == AARCH64_SV_ALL && factor == 1)
4084    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
4085			prefix, suffix, operands);
4086  else if (factor == 1)
4087    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
4088			prefix, suffix, operands, svpattern_token (pattern));
4089  else
4090    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
4091			prefix, suffix, operands, svpattern_token (pattern),
4092			factor);
4093  gcc_assert (written < sizeof (buffer));
4094  return buffer;
4095}
4096
4097/* Return the asm string for an instruction with a CNT-like vector size
4098   operand (a vector pattern followed by a multiplier in the range [1, 16]).
4099   PREFIX is the mnemonic without the size suffix and OPERANDS is the
4100   first part of the operands template (the part that comes before the
4101   vector size itself).  X is the value of the vector size operand,
4102   as a polynomial integer rtx; we need to convert this into an "all"
4103   pattern with a multiplier.  */
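/* For example, if X is the poly_int64 (16, 16) that gives the number of
   bytes in an SVE vector, the result is "cntb\t" followed by OPERANDS.  */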
4104
4105char *
4106aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
4107				  rtx x)
4108{
4109  poly_int64 value = rtx_to_poly_int64 (x);
4110  gcc_assert (aarch64_sve_cnt_immediate_p (value));
4111  return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
4112					   value.coeffs[1], 0);
4113}
4114
4115/* Return the asm string for an instruction with a CNT-like vector size
4116   operand (a vector pattern followed by a multiplier in the range [1, 16]).
4117   PREFIX is the mnemonic without the size suffix and OPERANDS is the
4118   first part of the operands template (the part that comes before the
4119   vector size itself).  CNT_PAT[0..2] are the operands of the
4120   UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details.  */
4121
4122char *
4123aarch64_output_sve_cnt_pat_immediate (const char *prefix,
4124				      const char *operands, rtx *cnt_pat)
4125{
4126  aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
4127  unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
4128  unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
4129  return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
4130					   factor, nelts_per_vq);
4131}
4132
4133/* Return true if we can add X using a single SVE INC or DEC instruction.  */
4134
4135bool
4136aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
4137{
4138  poly_int64 value;
4139  return (poly_int_rtx_p (x, &value)
4140	  && (aarch64_sve_cnt_immediate_p (value)
4141	      || aarch64_sve_cnt_immediate_p (-value)));
4142}
4143
4144/* Return the asm string for adding SVE INC/DEC immediate OFFSET to
4145   operand 0.  */
4146
4147char *
4148aarch64_output_sve_scalar_inc_dec (rtx offset)
4149{
4150  poly_int64 offset_value = rtx_to_poly_int64 (offset);
4151  gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
4152  if (offset_value.coeffs[1] > 0)
4153    return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
4154					     offset_value.coeffs[1], 0);
4155  else
4156    return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
4157					     -offset_value.coeffs[1], 0);
4158}
4159
4160/* Return true if we can add VALUE to a register using a single ADDVL
4161   or ADDPL instruction.  */
4162
4163static bool
4164aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
4165{
4166  HOST_WIDE_INT factor = value.coeffs[0];
4167  if (factor == 0 || value.coeffs[1] != factor)
4168    return false;
4169  /* FACTOR counts VG / 2, so a value of 2 is one predicate width
4170     and a value of 16 is one vector width.  */
4171  return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
4172	  || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
4173}
4174
4175/* Likewise for rtx X.  */
4176
4177bool
4178aarch64_sve_addvl_addpl_immediate_p (rtx x)
4179{
4180  poly_int64 value;
4181  return (poly_int_rtx_p (x, &value)
4182	  && aarch64_sve_addvl_addpl_immediate_p (value));
4183}
4184
4185/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
4186   to operand 1 and storing the result in operand 0.  */
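/* For example, an OFFSET of (16, 16) bytes (one full vector) gives
   "addvl\t%x0, %x1, #1", while an OFFSET of (2, 2) bytes (one predicate
   register) gives "addpl\t%x0, %x1, #1".  */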
4187
4188char *
4189aarch64_output_sve_addvl_addpl (rtx offset)
4190{
4191  static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
4192  poly_int64 offset_value = rtx_to_poly_int64 (offset);
4193  gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
4194
4195  int factor = offset_value.coeffs[1];
4196  if ((factor & 15) == 0)
4197    snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
4198  else
4199    snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
4200  return buffer;
4201}
4202
4203/* Return true if X is a valid immediate for an SVE vector INC or DEC
4204   instruction.  If it is, store the number of elements in each vector
4205   quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
4206   factor in *FACTOR_OUT (if nonnull).  */
4207
4208bool
4209aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
4210					unsigned int *nelts_per_vq_out)
4211{
4212  rtx elt;
4213  poly_int64 value;
4214
4215  if (!const_vec_duplicate_p (x, &elt)
4216      || !poly_int_rtx_p (elt, &value))
4217    return false;
4218
4219  unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
4220  if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
4221    /* There's no vector INCB.  */
4222    return false;
4223
4224  HOST_WIDE_INT factor = value.coeffs[0];
4225  if (value.coeffs[1] != factor)
4226    return false;
4227
4228  /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
4229  if ((factor % nelts_per_vq) != 0
4230      || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
4231    return false;
4232
4233  if (factor_out)
4234    *factor_out = factor;
4235  if (nelts_per_vq_out)
4236    *nelts_per_vq_out = nelts_per_vq;
4237  return true;
4238}
4239
4240/* Return true if X is a valid immediate for an SVE vector INC or DEC
4241   instruction.  */
4242
4243bool
4244aarch64_sve_vector_inc_dec_immediate_p (rtx x)
4245{
4246  return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
4247}
4248
4249/* Return the asm template for an SVE vector INC or DEC instruction.
4250   OPERANDS gives the operands before the vector count and X is the
4251   value of the vector count operand itself.  */
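/* For example, if X is a VNx4SI vector that duplicates the poly_int64
   (4, 4), i.e. the number of .S elements in a vector, the result is
   "incw\t" followed by OPERANDS.  */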
4252
4253char *
4254aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
4255{
4256  int factor;
4257  unsigned int nelts_per_vq;
4258  if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
4259    gcc_unreachable ();
4260  if (factor < 0)
4261    return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
4262					     -factor, nelts_per_vq);
4263  else
4264    return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
4265					     factor, nelts_per_vq);
4266}
4267
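/* Return the number of instructions needed to move immediate IMM, which
   has integer mode MODE, into DEST, emitting the sequence if GENERATE is
   true.  For example, the DImode constant 0x0000123400005678 needs two:
   a MOV of 0x5678 followed by a MOVK of 0x1234 shifted left by 32.  */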
4268static int
4269aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4270				scalar_int_mode mode)
4271{
4272  int i;
4273  unsigned HOST_WIDE_INT val, val2, mask;
4274  int one_match, zero_match;
4275  int num_insns;
4276
4277  val = INTVAL (imm);
4278
4279  if (aarch64_move_imm (val, mode))
4280    {
4281      if (generate)
4282	emit_insn (gen_rtx_SET (dest, imm));
4283      return 1;
4284    }
4285
4286  /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
4287     (with XXXX non-zero). In that case check to see if the move can be done in
4288     a smaller mode.  */
4289  val2 = val & 0xffffffff;
4290  if (mode == DImode
4291      && aarch64_move_imm (val2, SImode)
4292      && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
4293    {
4294      if (generate)
4295	emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4296
4297      /* Check if we have to emit a second instruction by checking to see
4298         if any of the upper 32 bits of the original DI mode value is set.  */
4299      if (val == val2)
4300	return 1;
4301
4302      i = (val >> 48) ? 48 : 32;
4303
4304      if (generate)
4305	 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4306				    GEN_INT ((val >> i) & 0xffff)));
4307
4308      return 2;
4309    }
4310
4311  if ((val >> 32) == 0 || mode == SImode)
4312    {
4313      if (generate)
4314	{
4315	  emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4316	  if (mode == SImode)
4317	    emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4318				       GEN_INT ((val >> 16) & 0xffff)));
4319	  else
4320	    emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4321				       GEN_INT ((val >> 16) & 0xffff)));
4322	}
4323      return 2;
4324    }
4325
4326  /* Remaining cases are all for DImode.  */
4327
4328  mask = 0xffff;
4329  zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4330    ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4331  one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4332    ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4333
4334  if (zero_match != 2 && one_match != 2)
4335    {
4336      /* Try emitting a bitmask immediate with a movk replacing 16 bits.
4337	 For a 64-bit bitmask try whether changing 16 bits to all ones or
4338	 zeroes creates a valid bitmask.  To check any repeated bitmask,
4339	 try using 16 bits from the other 32-bit half of val.  */
4340
4341      for (i = 0; i < 64; i += 16, mask <<= 16)
4342	{
4343	  val2 = val & ~mask;
4344	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
4345	    break;
4346	  val2 = val | mask;
4347	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
4348	    break;
4349	  val2 = val2 & ~mask;
4350	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
4351	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
4352	    break;
4353	}
4354      if (i != 64)
4355	{
4356	  if (generate)
4357	    {
4358	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4359	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4360					 GEN_INT ((val >> i) & 0xffff)));
4361	    }
4362	  return 2;
4363	}
4364    }
4365
4366  /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4367     are emitted by the initial mov.  If one_match > zero_match, skip set bits,
4368     otherwise skip zero bits.  */
4369
4370  num_insns = 1;
4371  mask = 0xffff;
4372  val2 = one_match > zero_match ? ~val : val;
4373  i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4374
4375  if (generate)
4376    emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4377					   ? (val | ~(mask << i))
4378					   : (val & (mask << i)))));
4379  for (i += 16; i < 64; i += 16)
4380    {
4381      if ((val2 & (mask << i)) == 0)
4382	continue;
4383      if (generate)
4384	emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4385				   GEN_INT ((val >> i) & 0xffff)));
4386      num_insns ++;
4387    }
4388
4389  return num_insns;
4390}
4391
4392/* Return whether imm is a 128-bit immediate which is simple enough to
4393   expand inline.  */
4394bool
4395aarch64_mov128_immediate (rtx imm)
4396{
4397  if (GET_CODE (imm) == CONST_INT)
4398    return true;
4399
4400  gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4401
4402  rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4403  rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4404
4405  return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4406	 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4407}
4408
4409
4410/* Return the number of temporary registers that aarch64_add_offset_1
4411   would need to add OFFSET to a register.  */
4412
4413static unsigned int
4414aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4415{
4416  return absu_hwi (offset) < 0x1000000 ? 0 : 1;
4417}
4418
4419/* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
4420   a non-polynomial OFFSET.  MODE is the mode of the addition.
4421   FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4422   be set and CFA adjustments added to the generated instructions.
4423
4424   TEMP1, if nonnull, is a register of mode MODE that can be used as a
4425   temporary if register allocation is already complete.  This temporary
4426   register may overlap DEST but must not overlap SRC.  If TEMP1 is known
4427   to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4428   the immediate again.
4429
4430   Since this function may be used to adjust the stack pointer, we must
4431   ensure that it cannot cause transient stack deallocation (for example
4432   by first incrementing SP and then decrementing when adjusting by a
4433   large immediate).  */
4434
4435static void
4436aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4437		      rtx src, HOST_WIDE_INT offset, rtx temp1,
4438		      bool frame_related_p, bool emit_move_imm)
4439{
4440  gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4441  gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4442
4443  unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4444  rtx_insn *insn;
4445
4446  if (!moffset)
4447    {
4448      if (!rtx_equal_p (dest, src))
4449	{
4450	  insn = emit_insn (gen_rtx_SET (dest, src));
4451	  RTX_FRAME_RELATED_P (insn) = frame_related_p;
4452	}
4453      return;
4454    }
4455
4456  /* Single instruction adjustment.  */
4457  if (aarch64_uimm12_shift (moffset))
4458    {
4459      insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4460      RTX_FRAME_RELATED_P (insn) = frame_related_p;
4461      return;
4462    }
4463
4464  /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4465     and either:
4466
4467     a) the offset cannot be loaded by a 16-bit move or
4468     b) there is no spare register into which we can move it.  */
4469  if (moffset < 0x1000000
4470      && ((!temp1 && !can_create_pseudo_p ())
4471	  || !aarch64_move_imm (moffset, mode)))
4472    {
4473      HOST_WIDE_INT low_off = moffset & 0xfff;
4474
4475      low_off = offset < 0 ? -low_off : low_off;
4476      insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4477      RTX_FRAME_RELATED_P (insn) = frame_related_p;
4478      insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4479      RTX_FRAME_RELATED_P (insn) = frame_related_p;
4480      return;
4481    }
4482
4483  /* Emit a move immediate if required and an addition/subtraction.  */
4484  if (emit_move_imm)
4485    {
4486      gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4487      temp1 = aarch64_force_temporary (mode, temp1,
4488				       gen_int_mode (moffset, mode));
4489    }
4490  insn = emit_insn (offset < 0
4491		    ? gen_sub3_insn (dest, src, temp1)
4492		    : gen_add3_insn (dest, src, temp1));
4493  if (frame_related_p)
4494    {
4495      RTX_FRAME_RELATED_P (insn) = frame_related_p;
4496      rtx adj = plus_constant (mode, src, offset);
4497      add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
4498    }
4499}
4500
4501/* Return the number of temporary registers that aarch64_add_offset
4502   would need to move OFFSET into a register or add OFFSET to a register;
4503   ADD_P is true if we want the latter rather than the former.  */
4504
4505static unsigned int
4506aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4507{
4508  /* This follows the same structure as aarch64_add_offset.  */
4509  if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4510    return 0;
4511
4512  unsigned int count = 0;
4513  HOST_WIDE_INT factor = offset.coeffs[1];
4514  HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4515  poly_int64 poly_offset (factor, factor);
4516  if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4517    /* Need one register for the ADDVL/ADDPL result.  */
4518    count += 1;
4519  else if (factor != 0)
4520    {
4521      factor = abs (factor);
4522      if (factor > 16 * (factor & -factor))
4523	/* Need one register for the CNT result and one for the multiplication
4524	   factor.  If necessary, the second temporary can be reused for the
4525	   constant part of the offset.  */
4526	return 2;
4527      /* Need one register for the CNT result (which might then
4528	 be shifted).  */
4529      count += 1;
4530    }
4531  return count + aarch64_add_offset_1_temporaries (constant);
4532}
4533
4534/* If X can be represented as a poly_int64, return the number
4535   of temporaries that are required to add it to a register.
4536   Return -1 otherwise.  */
4537
4538int
4539aarch64_add_offset_temporaries (rtx x)
4540{
4541  poly_int64 offset;
4542  if (!poly_int_rtx_p (x, &offset))
4543    return -1;
4544  return aarch64_offset_temporaries (true, offset);
4545}
4546
4547/* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
4548   FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4549   be set and CFA adjustments added to the generated instructions.
4550
4551   TEMP1, if nonnull, is a register of mode MODE that can be used as a
4552   temporary if register allocation is already complete.  This temporary
4553   register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4554   If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4555   false to avoid emitting the immediate again.
4556
4557   TEMP2, if nonnull, is a second temporary register that doesn't
4558	   overlap either DEST or SRC.
4559
4560   Since this function may be used to adjust the stack pointer, we must
4561   ensure that it cannot cause transient stack deallocation (for example
4562   by first incrementing SP and then decrementing when adjusting by a
4563   large immediate).  */
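/* For example, adding the poly_int64 (48, 16), i.e. 32 bytes plus one
   SVE vector, is typically emitted as an ADDVL of #1 followed by an ADD
   of #32.  */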
4564
4565static void
4566aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4567		    poly_int64 offset, rtx temp1, rtx temp2,
4568		    bool frame_related_p, bool emit_move_imm = true)
4569{
4570  gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4571  gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4572  gcc_assert (temp1 == NULL_RTX
4573	      || !frame_related_p
4574	      || !reg_overlap_mentioned_p (temp1, dest));
4575  gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4576
4577  /* Try using ADDVL or ADDPL to add the whole value.  */
4578  if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4579    {
4580      rtx offset_rtx = gen_int_mode (offset, mode);
4581      rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4582      RTX_FRAME_RELATED_P (insn) = frame_related_p;
4583      return;
4584    }
4585
4586  /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4587     SVE vector register, over and above the minimum size of 128 bits.
4588     This is equivalent to half the value returned by CNTD with a
4589     vector shape of ALL.  */
4590  HOST_WIDE_INT factor = offset.coeffs[1];
4591  HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4592
4593  /* Try using ADDVL or ADDPL to add the VG-based part.  */
4594  poly_int64 poly_offset (factor, factor);
4595  if (src != const0_rtx
4596      && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4597    {
4598      rtx offset_rtx = gen_int_mode (poly_offset, mode);
4599      if (frame_related_p)
4600	{
4601	  rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4602	  RTX_FRAME_RELATED_P (insn) = true;
4603	  src = dest;
4604	}
4605      else
4606	{
4607	  rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4608	  src = aarch64_force_temporary (mode, temp1, addr);
4609	  temp1 = temp2;
4610	  temp2 = NULL_RTX;
4611	}
4612    }
4613  /* Otherwise use a CNT-based sequence.  */
4614  else if (factor != 0)
4615    {
4616      /* Use a subtraction if we have a negative factor.  */
4617      rtx_code code = PLUS;
4618      if (factor < 0)
4619	{
4620	  factor = -factor;
4621	  code = MINUS;
4622	}
4623
4624      /* Calculate CNTD * FACTOR / 2.  First try to fold the division
4625	 into the multiplication.  */
4626      rtx val;
4627      int shift = 0;
4628      if (factor & 1)
4629	/* Use a right shift by 1.  */
4630	shift = -1;
4631      else
4632	factor /= 2;
4633      HOST_WIDE_INT low_bit = factor & -factor;
4634      if (factor <= 16 * low_bit)
4635	{
4636	  if (factor > 16 * 8)
4637	    {
4638	      /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
4639		 the value with the minimum multiplier and shift it into
4640		 position.  */
4641	      int extra_shift = exact_log2 (low_bit);
4642	      shift += extra_shift;
4643	      factor >>= extra_shift;
4644	    }
4645	  val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
4646	}
4647      else
4648	{
4649	  /* Base the factor on LOW_BIT if we can calculate LOW_BIT
4650	     directly, since that should increase the chances of being
4651	     able to use a shift and add sequence.  If LOW_BIT itself
4652	     is out of range, just use CNTD.  */
4653	  if (low_bit <= 16 * 8)
4654	    factor /= low_bit;
4655	  else
4656	    low_bit = 1;
4657
4658	  val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
4659	  val = aarch64_force_temporary (mode, temp1, val);
4660
4661	  if (can_create_pseudo_p ())
4662	    {
4663	      rtx coeff1 = gen_int_mode (factor, mode);
4664	      val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
4665	    }
4666	  else
4667	    {
4668	      /* Go back to using a negative multiplication factor if we have
4669		 no register from which to subtract.  */
4670	      if (code == MINUS && src == const0_rtx)
4671		{
4672		  factor = -factor;
4673		  code = PLUS;
4674		}
4675	      rtx coeff1 = gen_int_mode (factor, mode);
4676	      coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
4677	      val = gen_rtx_MULT (mode, val, coeff1);
4678	    }
4679	}
4680
4681      if (shift > 0)
4682	{
4683	  /* Multiply by 1 << SHIFT.  */
4684	  val = aarch64_force_temporary (mode, temp1, val);
4685	  val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
4686	}
4687      else if (shift == -1)
4688	{
4689	  /* Divide by 2.  */
4690	  val = aarch64_force_temporary (mode, temp1, val);
4691	  val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
4692	}
4693
4694      /* Calculate SRC +/- CNTD * FACTOR / 2.  */
4695      if (src != const0_rtx)
4696	{
4697	  val = aarch64_force_temporary (mode, temp1, val);
4698	  val = gen_rtx_fmt_ee (code, mode, src, val);
4699	}
4700      else if (code == MINUS)
4701	{
4702	  val = aarch64_force_temporary (mode, temp1, val);
4703	  val = gen_rtx_NEG (mode, val);
4704	}
4705
4706      if (constant == 0 || frame_related_p)
4707	{
4708	  rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
4709	  if (frame_related_p)
4710	    {
4711	      RTX_FRAME_RELATED_P (insn) = true;
4712	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
4713			    gen_rtx_SET (dest, plus_constant (Pmode, src,
4714							      poly_offset)));
4715	    }
4716	  src = dest;
4717	  if (constant == 0)
4718	    return;
4719	}
4720      else
4721	{
4722	  src = aarch64_force_temporary (mode, temp1, val);
4723	  temp1 = temp2;
4724	  temp2 = NULL_RTX;
4725	}
4726
4727      emit_move_imm = true;
4728    }
4729
4730  aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4731			frame_related_p, emit_move_imm);
4732}
4733
4734/* Like aarch64_add_offset, but the offset is given as an rtx rather
4735   than a poly_int64.  */
4736
4737void
4738aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4739			  rtx offset_rtx, rtx temp1, rtx temp2)
4740{
4741  aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4742		      temp1, temp2, false);
4743}
4744
4745/* Add DELTA to the stack pointer, marking the instructions frame-related.
4746   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
4747   if TEMP1 already contains abs (DELTA).  */
4748
4749static inline void
4750aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
4751{
4752  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4753		      temp1, temp2, true, emit_move_imm);
4754}
4755
4756/* Subtract DELTA from the stack pointer, marking the instructions
4757   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
4758   if nonnull.  */
4759
4760static inline void
4761aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
4762		bool emit_move_imm = true)
4763{
4764  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4765		      temp1, temp2, frame_related_p, emit_move_imm);
4766}
4767
4768/* Set DEST to (vec_series BASE STEP).  */
4769
4770static void
4771aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
4772{
4773  machine_mode mode = GET_MODE (dest);
4774  scalar_mode inner = GET_MODE_INNER (mode);
4775
4776  /* Each operand can be a register or an immediate in the range [-16, 15].  */
4777  if (!aarch64_sve_index_immediate_p (base))
4778    base = force_reg (inner, base);
4779  if (!aarch64_sve_index_immediate_p (step))
4780    step = force_reg (inner, step);
4781
4782  emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
4783}
4784
4785/* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4786   register of mode MODE.  Use TARGET for the result if it's nonnull
4787   and convenient.
4788
4789   The two vector modes must have the same element mode.  The behavior
4790   is to duplicate architectural lane N of SRC into architectural lanes
4791   N + I * STEP of the result.  On big-endian targets, architectural
4792   lane 0 of an Advanced SIMD vector is the last element of the vector
4793   in memory layout, so for big-endian targets this operation has the
4794   effect of reversing SRC before duplicating it.  Callers need to
4795   account for this.  */
4796
4797rtx
4798aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
4799{
4800  machine_mode src_mode = GET_MODE (src);
4801  gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
4802  insn_code icode = (BYTES_BIG_ENDIAN
4803		     ? code_for_aarch64_vec_duplicate_vq_be (mode)
4804		     : code_for_aarch64_vec_duplicate_vq_le (mode));
4805
4806  unsigned int i = 0;
4807  expand_operand ops[3];
4808  create_output_operand (&ops[i++], target, mode);
4809  create_output_operand (&ops[i++], src, src_mode);
4810  if (BYTES_BIG_ENDIAN)
4811    {
4812      /* Create a PARALLEL describing the reversal of SRC.  */
4813      unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
4814      rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
4815						  nelts_per_vq - 1, -1);
4816      create_fixed_operand (&ops[i++], sel);
4817    }
4818  expand_insn (icode, i, ops);
4819  return ops[0].value;
4820}
4821
4822/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4823   the memory image into DEST.  Return true on success.  */
4824
4825static bool
4826aarch64_expand_sve_ld1rq (rtx dest, rtx src)
4827{
4828  src = force_const_mem (GET_MODE (src), src);
4829  if (!src)
4830    return false;
4831
4832  /* Make sure that the address is legitimate.  */
4833  if (!aarch64_sve_ld1rq_operand_p (src))
4834    {
4835      rtx addr = force_reg (Pmode, XEXP (src, 0));
4836      src = replace_equiv_address (src, addr);
4837    }
4838
4839  machine_mode mode = GET_MODE (dest);
4840  machine_mode pred_mode = aarch64_sve_pred_mode (mode);
4841  rtx ptrue = aarch64_ptrue_reg (pred_mode);
4842  emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
4843  return true;
4844}
4845
4846/* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
4847   by N "background" values.  Try to move it into TARGET using:
4848
4849      PTRUE PRED.<T>, VL<N>
4850      MOV TRUE.<T>, #<foreground>
4851      MOV FALSE.<T>, #<background>
4852      SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
4853
4854   The PTRUE is always a single instruction but the MOVs might need a
4855   longer sequence.  If the background value is zero (as it often is),
4856   the sequence can sometimes collapse to a PTRUE followed by a
4857   zero-predicated move.
4858
4859   Return the target on success, otherwise return null.  */
4860
4861static rtx
4862aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
4863{
4864  gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
4865
4866  /* Make sure that the PTRUE is valid.  */
4867  machine_mode mode = GET_MODE (src);
4868  machine_mode pred_mode = aarch64_sve_pred_mode (mode);
4869  unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4870  if (aarch64_svpattern_for_vl (pred_mode, npatterns)
4871      == AARCH64_NUM_SVPATTERNS)
4872    return NULL_RTX;
4873
4874  rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
4875  rtx_vector_builder true_builder (mode, npatterns, 1);
4876  rtx_vector_builder false_builder (mode, npatterns, 1);
4877  for (unsigned int i = 0; i < npatterns; ++i)
4878    {
4879      true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
4880      pred_builder.quick_push (CONST1_RTX (BImode));
4881    }
4882  for (unsigned int i = 0; i < npatterns; ++i)
4883    {
4884      false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
4885      pred_builder.quick_push (CONST0_RTX (BImode));
4886    }
4887  expand_operand ops[4];
4888  create_output_operand (&ops[0], target, mode);
4889  create_input_operand (&ops[1], true_builder.build (), mode);
4890  create_input_operand (&ops[2], false_builder.build (), mode);
4891  create_input_operand (&ops[3], pred_builder.build (), pred_mode);
4892  expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
4893  return target;
4894}
4895
4896/* Return a register containing CONST_VECTOR SRC, given that SRC has an
4897   SVE data mode and isn't a legitimate constant.  Use TARGET for the
4898   result if convenient.
4899
4900   The returned register can have whatever mode seems most natural
4901   given the contents of SRC.  */
4902
4903static rtx
4904aarch64_expand_sve_const_vector (rtx target, rtx src)
4905{
4906  machine_mode mode = GET_MODE (src);
4907  unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4908  unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4909  scalar_mode elt_mode = GET_MODE_INNER (mode);
4910  unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
4911  unsigned int container_bits = aarch64_sve_container_bits (mode);
4912  unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
4913
4914  if (nelts_per_pattern == 1
4915      && encoded_bits <= 128
4916      && container_bits != elt_bits)
4917    {
4918      /* We have a partial vector mode and a constant whose full-vector
4919	 equivalent would occupy a repeating 128-bit sequence.  Build that
4920	 full-vector equivalent instead, so that we have the option of
4921	 using LD1RQ and Advanced SIMD operations.  */
4922      unsigned int repeat = container_bits / elt_bits;
4923      machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
4924      rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
4925      for (unsigned int i = 0; i < npatterns; ++i)
4926	for (unsigned int j = 0; j < repeat; ++j)
4927	  builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
4928      target = aarch64_target_reg (target, full_mode);
4929      return aarch64_expand_sve_const_vector (target, builder.build ());
4930    }
4931
4932  if (nelts_per_pattern == 1 && encoded_bits == 128)
4933    {
4934      /* The constant is a duplicated quadword but can't be narrowed
4935	 beyond a quadword.  Get the memory image of the first quadword
4936	 as a 128-bit vector and try using LD1RQ to load it from memory.
4937
4938	 The effect for both endiannesses is to load memory lane N into
4939	 architectural lanes N + I * STEP of the result.  On big-endian
4940	 targets, the layout of the 128-bit vector in an Advanced SIMD
4941	 register would be different from its layout in an SVE register,
4942	 but this 128-bit vector is a memory value only.  */
4943      machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4944      rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4945      if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4946	return target;
4947    }
4948
4949  if (nelts_per_pattern == 1 && encoded_bits < 128)
4950    {
4951      /* The vector is a repeating sequence of 64 bits or fewer.
4952	 See if we can load them using an Advanced SIMD move and then
4953	 duplicate it to fill a vector.  This is better than using a GPR
4954	 move because it keeps everything in the same register file.  */
4955      machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4956      rtx_vector_builder builder (vq_mode, npatterns, 1);
4957      for (unsigned int i = 0; i < npatterns; ++i)
4958	{
4959	  /* We want memory lane N to go into architectural lane N,
4960	     so reverse for big-endian targets.  The DUP .Q pattern
4961	     has a compensating reverse built-in.  */
4962	  unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4963	  builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4964	}
4965      rtx vq_src = builder.build ();
4966      if (aarch64_simd_valid_immediate (vq_src, NULL))
4967	{
4968	  vq_src = force_reg (vq_mode, vq_src);
4969	  return aarch64_expand_sve_dupq (target, mode, vq_src);
4970	}
4971
4972      /* Get an integer representation of the repeating part of Advanced
4973	 SIMD vector VQ_SRC.  This preserves the endianness of VQ_SRC,
4974	 which for big-endian targets is lane-swapped wrt a normal
4975	 Advanced SIMD vector.  This means that for both endiannesses,
4976	 memory lane N of SVE vector SRC corresponds to architectural
4977	 lane N of a register holding VQ_SRC.  This in turn means that
4978	 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4979	 as a single 128-bit value) and thus that memory lane 0 of SRC is
4980	 in the lsb of the integer.  Duplicating the integer therefore
4981	 ensures that memory lane N of SRC goes into architectural lane
4982	 N + I * STEP of the SVE register.  */
4983      scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4984      rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4985      if (elt_value)
4986	{
4987	  /* Pretend that we had a vector of INT_MODE to start with.  */
4988	  elt_mode = int_mode;
4989	  mode = aarch64_full_sve_mode (int_mode).require ();
4990
4991	  /* If the integer can be moved into a general register by a
4992	     single instruction, do that and duplicate the result.  */
4993	  if (CONST_INT_P (elt_value)
4994	      && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4995	    {
4996	      elt_value = force_reg (elt_mode, elt_value);
4997	      return expand_vector_broadcast (mode, elt_value);
4998	    }
4999	}
5000      else if (npatterns == 1)
5001	/* We're duplicating a single value, but can't do better than
5002	   force it to memory and load from there.  This handles things
5003	   like symbolic constants.  */
5004	elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
5005
5006      if (elt_value)
5007	{
5008	  /* Load the element from memory if we can, otherwise move it into
5009	     a register and use a DUP.  */
5010	  rtx op = force_const_mem (elt_mode, elt_value);
5011	  if (!op)
5012	    op = force_reg (elt_mode, elt_value);
5013	  return expand_vector_broadcast (mode, op);
5014	}
5015    }
5016
5017  /* Try using INDEX.  */
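  /* For example, { 1, 3, 5, 7, ... } can be loaded with a single
     INDEX Zd.<T>, #1, #2.  */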
5018  rtx base, step;
5019  if (const_vec_series_p (src, &base, &step))
5020    {
5021      aarch64_expand_vec_series (target, base, step);
5022      return target;
5023    }
5024
5025  /* From here on, it's better to force the whole constant to memory
5026     if we can.  */
5027  if (GET_MODE_NUNITS (mode).is_constant ())
5028    return NULL_RTX;
5029
5030  if (nelts_per_pattern == 2)
5031    if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
5032      return res;
5033
5034  /* Expand each pattern individually.  */
5035  gcc_assert (npatterns > 1);
5036  rtx_vector_builder builder;
5037  auto_vec<rtx, 16> vectors (npatterns);
5038  for (unsigned int i = 0; i < npatterns; ++i)
5039    {
5040      builder.new_vector (mode, 1, nelts_per_pattern);
5041      for (unsigned int j = 0; j < nelts_per_pattern; ++j)
5042	builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
5043      vectors.quick_push (force_reg (mode, builder.build ()));
5044    }
5045
5046  /* Use permutes to interleave the separate vectors.  */
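  /* For example, with four patterns A, B, C and D (so that the constant
     is a0 b0 c0 d0 a1 b1 c1 d1 ...), the first iteration produces
     ZIP1 (A, C) = a0 c0 a1 c1 ... and ZIP1 (B, D) = b0 d0 b1 d1 ...,
     and the second iteration zips those two results into the final
     interleaved vector.  */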
5047  while (npatterns > 1)
5048    {
5049      npatterns /= 2;
5050      for (unsigned int i = 0; i < npatterns; ++i)
5051	{
5052	  rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
5053	  rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
5054	  emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
5055	  vectors[i] = tmp;
5056	}
5057    }
5058  gcc_assert (vectors[0] == target);
5059  return target;
5060}
5061
5062/* Use WHILE to set a predicate register of mode MODE in which the first
5063   VL bits are set and the rest are clear.  Use TARGET for the register
5064   if it's nonnull and convenient.  */
5065
5066static rtx
5067aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
5068				 unsigned int vl)
5069{
5070  rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
5071  target = aarch64_target_reg (target, mode);
5072  emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
5073			target, const0_rtx, limit));
5074  return target;
5075}
5076
5077static rtx
5078aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
5079
5080/* BUILDER is a constant predicate in which the index of every set bit
5081   is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
5082   by inverting every element at a multiple of ELT_SIZE and EORing the
5083   result with an ELT_SIZE PTRUE.
5084
5085   Return a register that contains the constant on success, otherwise
5086   return null.  Use TARGET as the register if it is nonnull and
5087   convenient.  */
5088
5089static rtx
5090aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
5091				   unsigned int elt_size)
5092{
5093  /* Invert every element at a multiple of ELT_SIZE, keeping the
5094     other bits zero.  */
5095  rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
5096				  builder.nelts_per_pattern ());
5097  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5098    if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
5099      inv_builder.quick_push (const1_rtx);
5100    else
5101      inv_builder.quick_push (const0_rtx);
5102  inv_builder.finalize ();
5103
5104  /* See if we can load the constant cheaply.  */
5105  rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
5106  if (!inv)
5107    return NULL_RTX;
5108
5109  /* EOR the result with an ELT_SIZE PTRUE.  */
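  /* INV has the opposite value to BUILDER at every significant bit and
     is zero elsewhere, so XORing it with a predicate that is all-ones
     at ELT_SIZE granularity recovers BUILDER; the predicated (zeroing)
     XOR keeps the insignificant bits clear.  */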
5110  rtx mask = aarch64_ptrue_all (elt_size);
5111  mask = force_reg (VNx16BImode, mask);
5112  inv = gen_lowpart (VNx16BImode, inv);
5113  target = aarch64_target_reg (target, VNx16BImode);
5114  emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
5115  return target;
5116}
5117
5118/* BUILDER is a constant predicate in which the index of every set bit
5119   is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
5120   using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE.  Return the
5121   register on success, otherwise return null.  Use TARGET as the register
5122   if nonnull and convenient.  */
5123
5124static rtx
5125aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
5126				   unsigned int elt_size,
5127				   unsigned int permute_size)
5128{
5129  /* We're going to split the constant into two new constants A and B,
5130     with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
5131     and into B otherwise.  E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
5132
5133     A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
5134     B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
5135
5136     where _ indicates elements that will be discarded by the permute.
5137
5138     First calculate the ELT_SIZEs for A and B.  */
5139  unsigned int a_elt_size = GET_MODE_SIZE (DImode);
5140  unsigned int b_elt_size = GET_MODE_SIZE (DImode);
5141  for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
5142    if (INTVAL (builder.elt (i)) != 0)
5143      {
5144	if (i & permute_size)
5145	  b_elt_size |= i - permute_size;
5146	else
5147	  a_elt_size |= i;
5148      }
5149  a_elt_size &= -a_elt_size;
5150  b_elt_size &= -b_elt_size;
5151
5152  /* Now construct the vectors themselves.  */
5153  rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
5154				builder.nelts_per_pattern ());
5155  rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
5156				builder.nelts_per_pattern ());
5157  unsigned int nelts = builder.encoded_nelts ();
5158  for (unsigned int i = 0; i < nelts; ++i)
5159    if (i & (elt_size - 1))
5160      {
5161	a_builder.quick_push (const0_rtx);
5162	b_builder.quick_push (const0_rtx);
5163      }
5164    else if ((i & permute_size) == 0)
5165      {
5166	/* The A and B elements are significant.  */
5167	a_builder.quick_push (builder.elt (i));
5168	b_builder.quick_push (builder.elt (i + permute_size));
5169      }
5170    else
5171      {
5172	/* The A and B elements are going to be discarded, so pick whatever
5173	   is likely to give a nice constant.  We are targeting element
5174	   sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
5175	   with the aim of each being a sequence of ones followed by
5176	   a sequence of zeros.  So:
5177
5178	   * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
5179	     duplicate the last X_ELT_SIZE element, to extend the
5180	     current sequence of ones or zeros.
5181
5182	   * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
5183	     zero, so that the constant really does have X_ELT_SIZE and
5184	     not a smaller size.  */
5185	if (a_elt_size > permute_size)
5186	  a_builder.quick_push (const0_rtx);
5187	else
5188	  a_builder.quick_push (a_builder.elt (i - a_elt_size));
5189	if (b_elt_size > permute_size)
5190	  b_builder.quick_push (const0_rtx);
5191	else
5192	  b_builder.quick_push (b_builder.elt (i - b_elt_size));
5193      }
5194  a_builder.finalize ();
5195  b_builder.finalize ();
5196
5197  /* Try loading A into a register.  */
5198  rtx_insn *last = get_last_insn ();
5199  rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
5200  if (!a)
5201    return NULL_RTX;
5202
5203  /* Try loading B into a register.  */
5204  rtx b = a;
5205  if (a_builder != b_builder)
5206    {
5207      b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
5208      if (!b)
5209	{
5210	  delete_insns_since (last);
5211	  return NULL_RTX;
5212	}
5213    }
5214
5215  /* Emit the TRN1 itself.  We emit a TRN that operates on VNx16BI
5216     operands but permutes them as though they had mode MODE.  */
5217  machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
5218  target = aarch64_target_reg (target, GET_MODE (a));
5219  rtx type_reg = CONST0_RTX (mode);
5220  emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
5221  return target;
5222}
5223
5224/* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
5225   constant in BUILDER into an SVE predicate register.  Return the register
5226   on success, otherwise return null.  Use TARGET for the register if
5227   nonnull and convenient.
5228
5229   ALLOW_RECURSE_P is true if we can use methods that would call this
5230   function recursively.  */
5231
5232static rtx
5233aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
5234				 bool allow_recurse_p)
5235{
5236  if (builder.encoded_nelts () == 1)
5237    /* A PFALSE or a PTRUE .B ALL.  */
5238    return aarch64_emit_set_immediate (target, builder);
5239
5240  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
5241  if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
5242    {
5243      /* If we can load the constant using PTRUE, use it as-is.  */
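      /* PTRUE can only encode a limited set of element counts (VL1-VL8,
	 VL16, VL32, ..., VL256 and the POW2, MUL3, MUL4 and ALL patterns);
	 aarch64_svpattern_for_vl returns AARCH64_NUM_SVPATTERNS when VL
	 is not one of them.  */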
5244      machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
5245      if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
5246	return aarch64_emit_set_immediate (target, builder);
5247
5248      /* Otherwise use WHILE to set the first VL bits.  */
5249      return aarch64_sve_move_pred_via_while (target, mode, vl);
5250    }
5251
5252  if (!allow_recurse_p)
5253    return NULL_RTX;
5254
5255  /* Try inverting the vector in element size ELT_SIZE and then EORing
5256     the result with an ELT_SIZE PTRUE.  */
5257  if (INTVAL (builder.elt (0)) == 0)
5258    if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
5259						     elt_size))
5260      return res;
5261
5262  /* Try using TRN1 to permute two simpler constants.  */
5263  for (unsigned int i = elt_size; i <= 8; i *= 2)
5264    if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
5265						     elt_size, i))
5266      return res;
5267
5268  return NULL_RTX;
5269}
5270
5271/* Return an SVE predicate register that contains the VNx16BImode
5272   constant in BUILDER, without going through the move expanders.
5273
5274   The returned register can have whatever mode seems most natural
5275   given the contents of BUILDER.  Use TARGET for the result if
5276   convenient.  */
5277
5278static rtx
5279aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
5280{
5281  /* Try loading the constant using pure predicate operations.  */
5282  if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
5283    return res;
5284
5285  /* Try forcing the constant to memory.  */
5286  if (builder.full_nelts ().is_constant ())
5287    if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
5288      {
5289	target = aarch64_target_reg (target, VNx16BImode);
5290	emit_move_insn (target, mem);
5291	return target;
5292      }
5293
5294  /* The last resort is to load the constant as an integer and then
5295     compare it against zero.  Use -1 for set bits in order to increase
     the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
5297  rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
5298				  builder.nelts_per_pattern ());
5299  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5300    int_builder.quick_push (INTVAL (builder.elt (i))
5301			    ? constm1_rtx : const0_rtx);
5302  return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
5303					   int_builder.build ());
5304}
5305
5306/* Set DEST to immediate IMM.  */
5307
5308void
5309aarch64_expand_mov_immediate (rtx dest, rtx imm)
5310{
5311  machine_mode mode = GET_MODE (dest);
5312
5313  /* Check on what type of symbol it is.  */
5314  scalar_int_mode int_mode;
5315  if ((GET_CODE (imm) == SYMBOL_REF
5316       || GET_CODE (imm) == LABEL_REF
5317       || GET_CODE (imm) == CONST
5318       || GET_CODE (imm) == CONST_POLY_INT)
5319      && is_a <scalar_int_mode> (mode, &int_mode))
5320    {
5321      rtx mem;
5322      poly_int64 offset;
5323      HOST_WIDE_INT const_offset;
5324      enum aarch64_symbol_type sty;
5325
5326      /* If we have (const (plus symbol offset)), separate out the offset
5327	 before we start classifying the symbol.  */
5328      rtx base = strip_offset (imm, &offset);
5329
5330      /* We must always add an offset involving VL separately, rather than
5331	 folding it into the relocation.  */
5332      if (!offset.is_constant (&const_offset))
5333	{
5334	  if (!TARGET_SVE)
5335	    {
5336	      aarch64_report_sve_required ();
5337	      return;
5338	    }
5339	  if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
5340	    emit_insn (gen_rtx_SET (dest, imm));
5341	  else
5342	    {
5343	      /* Do arithmetic on 32-bit values if the result is smaller
5344		 than that.  */
5345	      if (partial_subreg_p (int_mode, SImode))
5346		{
5347		  /* It is invalid to do symbol calculations in modes
5348		     narrower than SImode.  */
5349		  gcc_assert (base == const0_rtx);
5350		  dest = gen_lowpart (SImode, dest);
5351		  int_mode = SImode;
5352		}
5353	      if (base != const0_rtx)
5354		{
5355		  base = aarch64_force_temporary (int_mode, dest, base);
5356		  aarch64_add_offset (int_mode, dest, base, offset,
5357				      NULL_RTX, NULL_RTX, false);
5358		}
5359	      else
5360		aarch64_add_offset (int_mode, dest, base, offset,
5361				    dest, NULL_RTX, false);
5362	    }
5363	  return;
5364	}
5365
5366      sty = aarch64_classify_symbol (base, const_offset);
5367      switch (sty)
5368	{
5369	case SYMBOL_FORCE_TO_MEM:
5370	  if (const_offset != 0
5371	      && targetm.cannot_force_const_mem (int_mode, imm))
5372	    {
5373	      gcc_assert (can_create_pseudo_p ());
5374	      base = aarch64_force_temporary (int_mode, dest, base);
5375	      aarch64_add_offset (int_mode, dest, base, const_offset,
5376				  NULL_RTX, NULL_RTX, false);
5377	      return;
5378	    }
5379
5380	  mem = force_const_mem (ptr_mode, imm);
5381	  gcc_assert (mem);
5382
5383	  /* If we aren't generating PC relative literals, then
5384	     we need to expand the literal pool access carefully.
5385	     This is something that needs to be done in a number
5386	     of places, so could well live as a separate function.  */
5387	  if (!aarch64_pcrelative_literal_loads)
5388	    {
5389	      gcc_assert (can_create_pseudo_p ());
5390	      base = gen_reg_rtx (ptr_mode);
5391	      aarch64_expand_mov_immediate (base, XEXP (mem, 0));
5392	      if (ptr_mode != Pmode)
5393		base = convert_memory_address (Pmode, base);
5394	      mem = gen_rtx_MEM (ptr_mode, base);
5395	    }
5396
5397	  if (int_mode != ptr_mode)
5398	    mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
5399
5400	  emit_insn (gen_rtx_SET (dest, mem));
5401
5402	  return;
5403
	case SYMBOL_SMALL_TLSGD:
	case SYMBOL_SMALL_TLSDESC:
5406	case SYMBOL_SMALL_TLSIE:
5407	case SYMBOL_SMALL_GOT_28K:
5408	case SYMBOL_SMALL_GOT_4G:
5409	case SYMBOL_TINY_GOT:
5410	case SYMBOL_TINY_TLSIE:
5411	  if (const_offset != 0)
5412	    {
	      gcc_assert (can_create_pseudo_p ());
5414	      base = aarch64_force_temporary (int_mode, dest, base);
5415	      aarch64_add_offset (int_mode, dest, base, const_offset,
5416				  NULL_RTX, NULL_RTX, false);
5417	      return;
5418	    }
5419	  /* FALLTHRU */
5420
5421	case SYMBOL_SMALL_ABSOLUTE:
5422	case SYMBOL_TINY_ABSOLUTE:
5423	case SYMBOL_TLSLE12:
5424	case SYMBOL_TLSLE24:
5425	case SYMBOL_TLSLE32:
5426	case SYMBOL_TLSLE48:
5427	  aarch64_load_symref_appropriately (dest, imm, sty);
5428	  return;
5429
5430	default:
5431	  gcc_unreachable ();
5432	}
5433    }
5434
5435  if (!CONST_INT_P (imm))
5436    {
5437      if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
5438	{
5439	  /* Only the low bit of each .H, .S and .D element is defined,
5440	     so we can set the upper bits to whatever we like.  If the
5441	     predicate is all-true in MODE, prefer to set all the undefined
5442	     bits as well, so that we can share a single .B predicate for
5443	     all modes.  */
5444	  if (imm == CONSTM1_RTX (mode))
5445	    imm = CONSTM1_RTX (VNx16BImode);
5446
5447	  /* All methods for constructing predicate modes wider than VNx16BI
5448	     will set the upper bits of each element to zero.  Expose this
5449	     by moving such constants as a VNx16BI, so that all bits are
5450	     significant and so that constants for different modes can be
5451	     shared.  The wider constant will still be available as a
5452	     REG_EQUAL note.  */
5453	  rtx_vector_builder builder;
5454	  if (aarch64_get_sve_pred_bits (builder, imm))
5455	    {
5456	      rtx res = aarch64_expand_sve_const_pred (dest, builder);
5457	      if (dest != res)
5458		emit_move_insn (dest, gen_lowpart (mode, res));
5459	      return;
5460	    }
5461	}
5462
5463      if (GET_CODE (imm) == HIGH
5464	  || aarch64_simd_valid_immediate (imm, NULL))
5465	{
5466	  emit_insn (gen_rtx_SET (dest, imm));
5467	  return;
5468	}
5469
5470      if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
5471	if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
5472	  {
5473	    if (dest != res)
5474	      emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
5475	    return;
5476	  }
5477
5478      rtx mem = force_const_mem (mode, imm);
5479      gcc_assert (mem);
5480      emit_move_insn (dest, mem);
5481      return;
5482    }
5483
5484  aarch64_internal_mov_immediate (dest, imm, true,
5485				  as_a <scalar_int_mode> (mode));
5486}
5487
5488/* Return the MEM rtx that provides the canary value that should be used
5489   for stack-smashing protection.  MODE is the mode of the memory.
5490   For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
5491   (__stack_chk_guard), otherwise it has no useful value.  SALT_TYPE
5492   indicates whether the caller is performing a SET or a TEST operation.  */
5493
5494rtx
5495aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
5496				  aarch64_salt_type salt_type)
5497{
5498  rtx addr;
5499  if (aarch64_stack_protector_guard == SSP_GLOBAL)
5500    {
5501      gcc_assert (MEM_P (decl_rtl));
5502      addr = XEXP (decl_rtl, 0);
5503      poly_int64 offset;
5504      rtx base = strip_offset_and_salt (addr, &offset);
5505      if (!SYMBOL_REF_P (base))
5506	return decl_rtl;
5507
5508      rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
5509      addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
5510      addr = gen_rtx_CONST (Pmode, addr);
5511      addr = plus_constant (Pmode, addr, offset);
5512    }
5513  else
5514    {
5515      /* Calculate the address from the system register.  */
5516      rtx salt = GEN_INT (salt_type);
5517      addr = gen_reg_rtx (mode);
5518      if (mode == DImode)
5519	emit_insn (gen_reg_stack_protect_address_di (addr, salt));
5520      else
5521	{
5522	  emit_insn (gen_reg_stack_protect_address_si (addr, salt));
5523	  addr = convert_memory_address (Pmode, addr);
5524	}
5525      addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
5526    }
5527  return gen_rtx_MEM (mode, force_reg (Pmode, addr));
5528}
5529
5530/* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
5531   that is known to contain PTRUE.  */
5532
5533void
5534aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
5535{
5536  expand_operand ops[3];
5537  machine_mode mode = GET_MODE (dest);
5538  create_output_operand (&ops[0], dest, mode);
  create_input_operand (&ops[1], pred, GET_MODE (pred));
5540  create_input_operand (&ops[2], src, mode);
5541  temporary_volatile_ok v (true);
5542  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
5543}
5544
5545/* Expand a pre-RA SVE data move from SRC to DEST in which at least one
5546   operand is in memory.  In this case we need to use the predicated LD1
5547   and ST1 instead of LDR and STR, both for correctness on big-endian
5548   targets and because LD1 and ST1 support a wider range of addressing modes.
5549   PRED_MODE is the mode of the predicate.
5550
5551   See the comment at the head of aarch64-sve.md for details about the
5552   big-endian handling.  */
5553
5554void
5555aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
5556{
5557  machine_mode mode = GET_MODE (dest);
5558  rtx ptrue = aarch64_ptrue_reg (pred_mode);
5559  if (!register_operand (src, mode)
5560      && !register_operand (dest, mode))
5561    {
5562      rtx tmp = gen_reg_rtx (mode);
5563      if (MEM_P (src))
5564	aarch64_emit_sve_pred_move (tmp, ptrue, src);
5565      else
5566	emit_move_insn (tmp, src);
5567      src = tmp;
5568    }
5569  aarch64_emit_sve_pred_move (dest, ptrue, src);
5570}
5571
5572/* Called only on big-endian targets.  See whether an SVE vector move
5573   from SRC to DEST is effectively a REV[BHW] instruction, because at
5574   least one operand is a subreg of an SVE vector that has wider or
5575   narrower elements.  Return true and emit the instruction if so.
5576
5577   For example:
5578
5579     (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
5580
5581   represents a VIEW_CONVERT between the following vectors, viewed
5582   in memory order:
5583
5584     R2: { [0].high, [0].low,  [1].high, [1].low, ... }
5585     R1: { [0],      [1],      [2],      [3],     ... }
5586
5587   The high part of lane X in R2 should therefore correspond to lane X*2
5588   of R1, but the register representations are:
5589
5590         msb                                      lsb
5591     R2: ...... [1].high  [1].low   [0].high  [0].low
5592     R1: ...... [3]       [2]       [1]       [0]
5593
5594   where the low part of lane X in R2 corresponds to lane X*2 in R1.
5595   We therefore need a reverse operation to swap the high and low values
5596   around.
5597
5598   This is purely an optimization.  Without it we would spill the
5599   subreg operand to the stack in one mode and reload it in the
5600   other mode, which has the same effect as the REV.  */
5601
5602bool
5603aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
5604{
5605  gcc_assert (BYTES_BIG_ENDIAN);
5606
5607  /* Do not try to optimize subregs that LRA has created for matched
5608     reloads.  These subregs only exist as a temporary measure to make
5609     the RTL well-formed, but they are exempt from the usual
5610     TARGET_CAN_CHANGE_MODE_CLASS rules.
5611
5612     For example, if we have:
5613
5614       (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
5615
5616     and the constraints require R1 and R2 to be in the same register,
5617     LRA may need to create RTL such as:
5618
5619       (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
5620       (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
5621       (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
5622
5623     which forces both the input and output of the original instruction
5624     to use the same hard register.  But for this to work, the normal
5625     rules have to be suppressed on the subreg input, otherwise LRA
5626     would need to reload that input too, meaning that the process
5627     would never terminate.  To compensate for this, the normal rules
5628     are also suppressed for the subreg output of the first move.
5629     Ignoring the special case and handling the first move normally
5630     would therefore generate wrong code: we would reverse the elements
5631     for the first subreg but not reverse them back for the second subreg.  */
5632  if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
5633    dest = SUBREG_REG (dest);
5634  if (SUBREG_P (src) && !LRA_SUBREG_P (src))
5635    src = SUBREG_REG (src);
5636
5637  /* The optimization handles two single SVE REGs with different element
5638     sizes.  */
5639  if (!REG_P (dest)
5640      || !REG_P (src)
5641      || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
5642      || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
5643      || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
5644	  == GET_MODE_UNIT_SIZE (GET_MODE (src))))
5645    return false;
5646
5647  /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
5648  rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
5649  rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
5650			       UNSPEC_REV_SUBREG);
5651  emit_insn (gen_rtx_SET (dest, unspec));
5652  return true;
5653}
5654
5655/* Return a copy of X with mode MODE, without changing its other
5656   attributes.  Unlike gen_lowpart, this doesn't care whether the
5657   mode change is valid.  */
5658
5659rtx
5660aarch64_replace_reg_mode (rtx x, machine_mode mode)
5661{
5662  if (GET_MODE (x) == mode)
5663    return x;
5664
5665  x = shallow_copy_rtx (x);
5666  set_mode_and_regno (x, mode, REGNO (x));
5667  return x;
5668}
5669
/* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
5671   stored in wider integer containers.  */
5672
5673static unsigned int
5674aarch64_sve_rev_unspec (machine_mode mode)
5675{
5676  switch (GET_MODE_UNIT_SIZE (mode))
5677    {
5678    case 1: return UNSPEC_REVB;
5679    case 2: return UNSPEC_REVH;
5680    case 4: return UNSPEC_REVW;
5681    }
5682  gcc_unreachable ();
5683}
5684
5685/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
5686   operands.  */
5687
5688void
5689aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
5690{
5691  /* Decide which REV operation we need.  The mode with wider elements
5692     determines the mode of the operands and the mode with the narrower
5693     elements determines the reverse width.  */
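  /* For example, a move between VNx8HI and VNx16QI views both operands
     as .H vectors and uses REVB to swap the bytes within each halfword.  */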
5694  machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
5695  machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
5696  if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
5697      < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
5698    std::swap (mode_with_wider_elts, mode_with_narrower_elts);
5699
5700  unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
5701  machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
5702
5703  /* Get the operands in the appropriate modes and emit the instruction.  */
5704  ptrue = gen_lowpart (pred_mode, ptrue);
5705  dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
5706  src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
5707  emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
5708			       dest, ptrue, src));
5709}
5710
5711static bool
5712aarch64_function_ok_for_sibcall (tree, tree exp)
5713{
5714  if (crtl->abi->id () != expr_callee_abi (exp).id ())
5715    return false;
5716
5717  return true;
5718}
5719
5720/* Subroutine of aarch64_pass_by_reference for arguments that are not
5721   passed in SVE registers.  */
5722
5723static bool
5724aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
5725			     const function_arg_info &arg)
5726{
5727  HOST_WIDE_INT size;
5728  machine_mode dummymode;
5729  int nregs;
5730
5731  /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
5732  if (arg.mode == BLKmode && arg.type)
5733    size = int_size_in_bytes (arg.type);
5734  else
5735    /* No frontends can create types with variable-sized modes, so we
5736       shouldn't be asked to pass or return them.  */
5737    size = GET_MODE_SIZE (arg.mode).to_constant ();
5738
5739  /* Aggregates are passed by reference based on their size.  */
5740  if (arg.aggregate_type_p ())
5741    size = int_size_in_bytes (arg.type);
5742
  /* Variable-sized arguments are always passed by reference.  */
5744  if (size < 0)
5745    return true;
5746
5747  /* Can this be a candidate to be passed in fp/simd register(s)?  */
5748  if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
5749					       &dummymode, &nregs, NULL,
5750					       !pcum || pcum->silent_p))
5751    return false;
5752
  /* Arguments that are variable-sized or larger than 2 registers are
     passed by reference unless they are a homogeneous floating-point
     aggregate.  */
5756  return size > 2 * UNITS_PER_WORD;
5757}
5758
5759/* Implement TARGET_PASS_BY_REFERENCE.  */
5760
5761static bool
5762aarch64_pass_by_reference (cumulative_args_t pcum_v,
5763			   const function_arg_info &arg)
5764{
5765  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5766
5767  if (!arg.type)
5768    return aarch64_pass_by_reference_1 (pcum, arg);
5769
5770  pure_scalable_type_info pst_info;
5771  switch (pst_info.analyze (arg.type))
5772    {
5773    case pure_scalable_type_info::IS_PST:
5774      if (pcum && !pcum->silent_p && !TARGET_SVE)
5775	/* We can't gracefully recover at this point, so make this a
5776	   fatal error.  */
5777	fatal_error (input_location, "arguments of type %qT require"
5778		     " the SVE ISA extension", arg.type);
5779
5780      /* Variadic SVE types are passed by reference.  Normal non-variadic
5781	 arguments are too if we've run out of registers.  */
5782      return (!arg.named
5783	      || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
5784	      || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
5785
5786    case pure_scalable_type_info::DOESNT_MATTER:
5787      gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
5788      return true;
5789
5790    case pure_scalable_type_info::NO_ABI_IDENTITY:
5791    case pure_scalable_type_info::ISNT_PST:
5792      return aarch64_pass_by_reference_1 (pcum, arg);
5793    }
5794  gcc_unreachable ();
5795}
5796
5797/* Return TRUE if VALTYPE is padded to its least significant bits.  */
5798static bool
5799aarch64_return_in_msb (const_tree valtype)
5800{
5801  machine_mode dummy_mode;
5802  int dummy_int;
5803
5804  /* Never happens in little-endian mode.  */
5805  if (!BYTES_BIG_ENDIAN)
5806    return false;
5807
5808  /* Only composite types smaller than or equal to 16 bytes can
5809     be potentially returned in registers.  */
5810  if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
5811      || int_size_in_bytes (valtype) <= 0
5812      || int_size_in_bytes (valtype) > 16)
5813    return false;
5814
5815  /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
5816     or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
5817     is always passed/returned in the least significant bits of fp/simd
5818     register(s).  */
5819  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
5820					       &dummy_mode, &dummy_int, NULL,
5821					       false))
5822    return false;
5823
5824  /* Likewise pure scalable types for SVE vector and predicate registers.  */
5825  pure_scalable_type_info pst_info;
5826  if (pst_info.analyze_registers (valtype))
5827    return false;
5828
5829  return true;
5830}
5831
5832/* Implement TARGET_FUNCTION_VALUE.
5833   Define how to find the value returned by a function.  */
5834
5835static rtx
5836aarch64_function_value (const_tree type, const_tree func,
5837			bool outgoing ATTRIBUTE_UNUSED)
5838{
5839  machine_mode mode;
5840  int unsignedp;
5841
5842  mode = TYPE_MODE (type);
5843  if (INTEGRAL_TYPE_P (type))
5844    mode = promote_function_mode (type, mode, &unsignedp, func, 1);
5845
5846  pure_scalable_type_info pst_info;
5847  if (type && pst_info.analyze_registers (type))
5848    return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
5849
5850  /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5851     are returned in memory, not by value.  */
5852  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5853  bool sve_p = (vec_flags & VEC_ANY_SVE);
5854
5855  if (aarch64_return_in_msb (type))
5856    {
5857      HOST_WIDE_INT size = int_size_in_bytes (type);
5858
5859      if (size % UNITS_PER_WORD != 0)
5860	{
5861	  size += UNITS_PER_WORD - size % UNITS_PER_WORD;
5862	  mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
5863	}
5864    }
5865
5866  int count;
5867  machine_mode ag_mode;
5868  if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
5869					       NULL, false))
5870    {
5871      gcc_assert (!sve_p);
5872      if (!aarch64_composite_type_p (type, mode))
5873	{
5874	  gcc_assert (count == 1 && mode == ag_mode);
5875	  return gen_rtx_REG (mode, V0_REGNUM);
5876	}
5877      else
5878	{
5879	  int i;
5880	  rtx par;
5881
5882	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
5883	  for (i = 0; i < count; i++)
5884	    {
5885	      rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
5886	      rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
5887	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5888	      XVECEXP (par, 0, i) = tmp;
5889	    }
5890	  return par;
5891	}
5892    }
5893  else
5894    {
5895      if (sve_p)
5896	{
5897	  /* Vector types can acquire a partial SVE mode using things like
5898	     __attribute__((vector_size(N))), and this is potentially useful.
5899	     However, the choice of mode doesn't affect the type's ABI
5900	     identity, so we should treat the types as though they had
5901	     the associated integer mode, just like they did before SVE
5902	     was introduced.
5903
5904	     We know that the vector must be 128 bits or smaller,
5905	     otherwise we'd have returned it in memory instead.  */
5906	  gcc_assert (type
5907		      && (aarch64_some_values_include_pst_objects_p (type)
5908			  || (vec_flags & VEC_PARTIAL)));
5909
5910	  scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
5911	  rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
5912	  rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5913	  return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
5914	}
5915      return gen_rtx_REG (mode, R0_REGNUM);
5916    }
5917}
5918
5919/* Implements TARGET_FUNCTION_VALUE_REGNO_P.
   Return true if REGNO is the number of a hard register in which the
   values of a called function may come back.  */
5922
5923static bool
5924aarch64_function_value_regno_p (const unsigned int regno)
5925{
  /* A maximum of 16 bytes can be returned in the general registers.
     Examples of 16-byte return values are 128-bit integers and 16-byte
     small structures (excluding homogeneous floating-point aggregates).  */
5929  if (regno == R0_REGNUM || regno == R1_REGNUM)
5930    return true;
5931
5932  /* Up to four fp/simd registers can return a function value, e.g. a
5933     homogeneous floating-point aggregate having four members.  */
5934  if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
5935    return TARGET_FLOAT;
5936
5937  return false;
5938}
5939
5940/* Subroutine for aarch64_return_in_memory for types that are not returned
5941   in SVE registers.  */
5942
5943static bool
5944aarch64_return_in_memory_1 (const_tree type)
5945{
5946  HOST_WIDE_INT size;
5947  machine_mode ag_mode;
5948  int count;
5949
5950  if (!AGGREGATE_TYPE_P (type)
5951      && TREE_CODE (type) != COMPLEX_TYPE
5952      && TREE_CODE (type) != VECTOR_TYPE)
    /* Simple scalar types are always returned in registers.  */
5954    return false;
5955
5956  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5957					       &ag_mode, &count, NULL, false))
5958    return false;
5959
  /* Types larger than 2 registers are returned in memory.  */
5961  size = int_size_in_bytes (type);
5962  return (size < 0 || size > 2 * UNITS_PER_WORD);
5963}
5964
5965/* Implement TARGET_RETURN_IN_MEMORY.
5966
5967   If the type T of the result of a function is such that
5968     void func (T arg)
5969   would require that arg be passed as a value in a register (or set of
5970   registers) according to the parameter passing rules, then the result
5971   is returned in the same registers as would be used for such an
5972   argument.  */
5973
5974static bool
5975aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
5976{
5977  pure_scalable_type_info pst_info;
5978  switch (pst_info.analyze (type))
5979    {
5980    case pure_scalable_type_info::IS_PST:
5981      return (pst_info.num_zr () > NUM_FP_ARG_REGS
5982	      || pst_info.num_pr () > NUM_PR_ARG_REGS);
5983
5984    case pure_scalable_type_info::DOESNT_MATTER:
5985      gcc_assert (aarch64_return_in_memory_1 (type));
5986      return true;
5987
5988    case pure_scalable_type_info::NO_ABI_IDENTITY:
5989    case pure_scalable_type_info::ISNT_PST:
5990      return aarch64_return_in_memory_1 (type);
5991    }
5992  gcc_unreachable ();
5993}
5994
5995static bool
5996aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
5997			       const_tree type, int *nregs)
5998{
5999  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6000  return aarch64_vfp_is_call_or_return_candidate (mode, type,
6001						  &pcum->aapcs_vfp_rmode,
6002						  nregs, NULL, pcum->silent_p);
6003}
6004
6005/* Given MODE and TYPE of a function argument, return the alignment in
6006   bits.  The idea is to suppress any stronger alignment requested by
6007   the user and opt for the natural alignment (specified in AAPCS64 \S
6008   4.1).  ABI_BREAK is set to the old alignment if the alignment was
6009   incorrectly calculated in versions of GCC prior to GCC-9.  This is
6010   a helper function for local use only.  */
6011
6012static unsigned int
6013aarch64_function_arg_alignment (machine_mode mode, const_tree type,
6014				bool *abi_break)
6015{
6016  *abi_break = false;
6017  if (!type)
6018    return GET_MODE_ALIGNMENT (mode);
6019
6020  if (integer_zerop (TYPE_SIZE (type)))
6021    return 0;
6022
6023  gcc_assert (TYPE_MODE (type) == mode);
6024
6025  if (!AGGREGATE_TYPE_P (type))
6026    return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
6027
6028  if (TREE_CODE (type) == ARRAY_TYPE)
6029    return TYPE_ALIGN (TREE_TYPE (type));
6030
6031  unsigned int alignment = 0;
6032  unsigned int bitfield_alignment = 0;
6033  for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6034    if (TREE_CODE (field) == FIELD_DECL)
6035      {
6036	/* Note that we explicitly consider zero-sized fields here,
6037	   even though they don't map to AAPCS64 machine types.
6038	   For example, in:
6039
6040	       struct __attribute__((aligned(8))) empty {};
6041
6042	       struct s {
6043		 [[no_unique_address]] empty e;
6044		 int x;
6045	       };
6046
6047	   "s" contains only one Fundamental Data Type (the int field)
6048	   but gains 8-byte alignment and size thanks to "e".  */
6049	alignment = std::max (alignment, DECL_ALIGN (field));
6050	if (DECL_BIT_FIELD_TYPE (field))
6051	  bitfield_alignment
6052	    = std::max (bitfield_alignment,
6053			TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
6054      }
6055
6056  if (bitfield_alignment > alignment)
6057    {
6058      *abi_break = true;
6059      return bitfield_alignment;
6060    }
6061
6062  return alignment;
6063}
6064
6065/* Layout a function argument according to the AAPCS64 rules.  The rule
6066   numbers refer to the rule numbers in the AAPCS64.  ORIG_MODE is the
6067   mode that was originally given to us by the target hook, whereas the
6068   mode in ARG might be the result of replacing partial SVE modes with
6069   the equivalent integer mode.  */
6070
6071static void
6072aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
6073{
6074  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6075  tree type = arg.type;
6076  machine_mode mode = arg.mode;
6077  int ncrn, nvrn, nregs;
6078  bool allocate_ncrn, allocate_nvrn;
6079  HOST_WIDE_INT size;
6080  bool abi_break;
6081
6082  /* We need to do this once per argument.  */
6083  if (pcum->aapcs_arg_processed)
6084    return;
6085
6086  bool warn_pcs_change
6087    = (warn_psabi
6088       && !pcum->silent_p
6089       && (currently_expanding_function_start
6090	   || currently_expanding_gimple_stmt));
6091
6092  unsigned int alignment
6093    = aarch64_function_arg_alignment (mode, type, &abi_break);
6094  gcc_assert (!alignment || abi_break < alignment);
6095
6096  pcum->aapcs_arg_processed = true;
6097
6098  pure_scalable_type_info pst_info;
6099  if (type && pst_info.analyze_registers (type))
6100    {
6101      /* aarch64_function_arg_alignment has never had an effect on
6102	 this case.  */
6103
6104      /* The PCS says that it is invalid to pass an SVE value to an
6105	 unprototyped function.  There is no ABI-defined location we
6106	 can return in this case, so we have no real choice but to raise
6107	 an error immediately, even though this is only a query function.  */
6108      if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
6109	{
6110	  gcc_assert (!pcum->silent_p);
6111	  error ("SVE type %qT cannot be passed to an unprototyped function",
6112		 arg.type);
6113	  /* Avoid repeating the message, and avoid tripping the assert
6114	     below.  */
6115	  pcum->pcs_variant = ARM_PCS_SVE;
6116	}
6117
6118      /* We would have converted the argument into pass-by-reference
6119	 form if it didn't fit in registers.  */
6120      pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
6121      pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
6122      gcc_assert (arg.named
6123		  && pcum->pcs_variant == ARM_PCS_SVE
6124		  && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
6125		  && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
6126      pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
6127					  P0_REGNUM + pcum->aapcs_nprn);
6128      return;
6129    }
6130
6131  /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6132     are passed by reference, not by value.  */
6133  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6134  bool sve_p = (vec_flags & VEC_ANY_SVE);
6135  if (sve_p)
6136    /* Vector types can acquire a partial SVE mode using things like
6137       __attribute__((vector_size(N))), and this is potentially useful.
6138       However, the choice of mode doesn't affect the type's ABI
6139       identity, so we should treat the types as though they had
6140       the associated integer mode, just like they did before SVE
6141       was introduced.
6142
6143       We know that the vector must be 128 bits or smaller,
6144       otherwise we'd have passed it in memory instead.  */
6145    gcc_assert (type
6146		&& (aarch64_some_values_include_pst_objects_p (type)
6147		    || (vec_flags & VEC_PARTIAL)));
6148
  /* Size in bytes, rounded up to the nearest multiple of 8 bytes.  */
6150  if (type)
6151    size = int_size_in_bytes (type);
6152  else
6153    /* No frontends can create types with variable-sized modes, so we
6154       shouldn't be asked to pass or return them.  */
6155    size = GET_MODE_SIZE (mode).to_constant ();
6156  size = ROUND_UP (size, UNITS_PER_WORD);
6157
6158  allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
6159  allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
6160						 mode,
6161						 type,
6162						 &nregs);
6163  gcc_assert (!sve_p || !allocate_nvrn);
6164
  /* allocate_ncrn may be a false positive, but allocate_nvrn is quite
     reliable.  The following code thus handles passing by SIMD/FP
     registers first.  */
6167
6168  nvrn = pcum->aapcs_nvrn;
6169
  /* C1 - C5 for floating point, homogeneous floating-point aggregates
     (HFA) and homogeneous short-vector aggregates (HVA).  */
6172  if (allocate_nvrn)
6173    {
6174      /* aarch64_function_arg_alignment has never had an effect on
6175	 this case.  */
6176      if (!pcum->silent_p && !TARGET_FLOAT)
6177	aarch64_err_no_fpadvsimd (mode);
6178
6179      if (nvrn + nregs <= NUM_FP_ARG_REGS)
6180	{
6181	  pcum->aapcs_nextnvrn = nvrn + nregs;
6182	  if (!aarch64_composite_type_p (type, mode))
6183	    {
6184	      gcc_assert (nregs == 1);
6185	      pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6186	    }
6187	  else
6188	    {
6189	      rtx par;
6190	      int i;
6191	      par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6192	      for (i = 0; i < nregs; i++)
6193		{
6194		  rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
6195					 V0_REGNUM + nvrn + i);
6196		  rtx offset = gen_int_mode
6197		    (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
6198		  tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6199		  XVECEXP (par, 0, i) = tmp;
6200		}
6201	      pcum->aapcs_reg = par;
6202	    }
6203	  return;
6204	}
6205      else
6206	{
6207	  /* C.3 NSRN is set to 8.  */
6208	  pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
6209	  goto on_stack;
6210	}
6211    }
6212
6213  ncrn = pcum->aapcs_ncrn;
6214  nregs = size / UNITS_PER_WORD;
6215
  /* C6 - C9, though the sign and zero extension semantics are
     handled elsewhere.  This is the case where the argument fits
     entirely in general registers.  */
6219  if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
6220    {
6221      gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
6222
6223      /* C.8 if the argument has an alignment of 16 then the NGRN is
6224	 rounded up to the next even number.  */
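      /* For example, a 16-byte-aligned __int128 that would otherwise
	 start at x1 is instead passed in x2 and x3.  */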
6225      if (nregs == 2
6226	  && ncrn % 2
6227	  /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
6228	     comparison is there because for > 16 * BITS_PER_UNIT
6229	     alignment nregs should be > 2 and therefore it should be
6230	     passed by reference rather than value.  */
6231	  && (aarch64_function_arg_alignment (mode, type, &abi_break)
6232	      == 16 * BITS_PER_UNIT))
6233	{
6234	  if (warn_pcs_change && abi_break)
6235	    inform (input_location, "parameter passing for argument of type "
6236		    "%qT changed in GCC 9.1", type);
6237	  ++ncrn;
6238	  gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
6239	}
6240
6241      /* If an argument with an SVE mode needs to be shifted up to the
6242	 high part of the register, treat it as though it had an integer mode.
6243	 Using the normal (parallel [...]) would suppress the shifting.  */
6244      if (sve_p
6245	  && BYTES_BIG_ENDIAN
6246	  && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
6247	  && aarch64_pad_reg_upward (mode, type, false))
6248	{
6249	  mode = int_mode_for_mode (mode).require ();
6250	  sve_p = false;
6251	}
6252
6253      /* NREGS can be 0 when e.g. an empty structure is to be passed.
6254	 A reg is still generated for it, but the caller should be smart
6255	 enough not to use it.  */
6256      if (nregs == 0
6257	  || (nregs == 1 && !sve_p)
6258	  || GET_MODE_CLASS (mode) == MODE_INT)
6259	pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
6260      else
6261	{
6262	  rtx par;
6263	  int i;
6264
6265	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6266	  for (i = 0; i < nregs; i++)
6267	    {
6268	      scalar_int_mode reg_mode = word_mode;
6269	      if (nregs == 1)
6270		reg_mode = int_mode_for_mode (mode).require ();
6271	      rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
6272	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
6273				       GEN_INT (i * UNITS_PER_WORD));
6274	      XVECEXP (par, 0, i) = tmp;
6275	    }
6276	  pcum->aapcs_reg = par;
6277	}
6278
6279      pcum->aapcs_nextncrn = ncrn + nregs;
6280      return;
6281    }
6282
6283  /* C.11  */
6284  pcum->aapcs_nextncrn = NUM_ARG_REGS;
6285
  /* The argument is passed on the stack; record the number of words
     needed for this argument and align the total size if necessary.  */
6288on_stack:
6289  pcum->aapcs_stack_words = size / UNITS_PER_WORD;
6290
6291  if (aarch64_function_arg_alignment (mode, type, &abi_break)
6292      == 16 * BITS_PER_UNIT)
6293    {
6294      int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
6295      if (pcum->aapcs_stack_size != new_size)
6296	{
6297	  if (warn_pcs_change && abi_break)
6298	    inform (input_location, "parameter passing for argument of type "
6299		    "%qT changed in GCC 9.1", type);
6300	  pcum->aapcs_stack_size = new_size;
6301	}
6302    }
6303  return;
6304}
6305
6306/* Implement TARGET_FUNCTION_ARG.  */
6307
6308static rtx
6309aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
6310{
6311  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6312  gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
6313	      || pcum->pcs_variant == ARM_PCS_SIMD
6314	      || pcum->pcs_variant == ARM_PCS_SVE);
6315
6316  if (arg.end_marker_p ())
6317    return gen_int_mode (pcum->pcs_variant, DImode);
6318
6319  aarch64_layout_arg (pcum_v, arg);
6320  return pcum->aapcs_reg;
6321}
6322
6323void
6324aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
6325			      const_tree fntype,
6326			      rtx libname ATTRIBUTE_UNUSED,
6327			      const_tree fndecl ATTRIBUTE_UNUSED,
6328			      unsigned n_named ATTRIBUTE_UNUSED,
6329			      bool silent_p)
6330{
6331  pcum->aapcs_ncrn = 0;
6332  pcum->aapcs_nvrn = 0;
6333  pcum->aapcs_nprn = 0;
6334  pcum->aapcs_nextncrn = 0;
6335  pcum->aapcs_nextnvrn = 0;
6336  pcum->aapcs_nextnprn = 0;
6337  if (fntype)
6338    pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
6339  else
6340    pcum->pcs_variant = ARM_PCS_AAPCS64;
6341  pcum->aapcs_reg = NULL_RTX;
6342  pcum->aapcs_arg_processed = false;
6343  pcum->aapcs_stack_words = 0;
6344  pcum->aapcs_stack_size = 0;
6345  pcum->silent_p = silent_p;
6346
6347  if (!silent_p
6348      && !TARGET_FLOAT
6349      && fndecl && TREE_PUBLIC (fndecl)
6350      && fntype && fntype != error_mark_node)
6351    {
6352      const_tree type = TREE_TYPE (fntype);
6353      machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
6354      int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
6355      if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
6356						   &mode, &nregs, NULL, false))
6357	aarch64_err_no_fpadvsimd (TYPE_MODE (type));
6358    }
6359
6360  if (!silent_p
6361      && !TARGET_SVE
6362      && pcum->pcs_variant == ARM_PCS_SVE)
6363    {
6364      /* We can't gracefully recover at this point, so make this a
6365	 fatal error.  */
6366      if (fndecl)
6367	fatal_error (input_location, "%qE requires the SVE ISA extension",
6368		     fndecl);
6369      else
6370	fatal_error (input_location, "calls to functions of type %qT require"
6371		     " the SVE ISA extension", fntype);
6372    }
6373}
6374
6375static void
6376aarch64_function_arg_advance (cumulative_args_t pcum_v,
6377			      const function_arg_info &arg)
6378{
6379  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6380  if (pcum->pcs_variant == ARM_PCS_AAPCS64
6381      || pcum->pcs_variant == ARM_PCS_SIMD
6382      || pcum->pcs_variant == ARM_PCS_SVE)
6383    {
6384      aarch64_layout_arg (pcum_v, arg);
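      /* aarch64_layout_arg must have chosen exactly one of a register
	 location and a stack location for this argument.  */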
6385      gcc_assert ((pcum->aapcs_reg != NULL_RTX)
6386		  != (pcum->aapcs_stack_words != 0));
6387      pcum->aapcs_arg_processed = false;
6388      pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
6389      pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
6390      pcum->aapcs_nprn = pcum->aapcs_nextnprn;
6391      pcum->aapcs_stack_size += pcum->aapcs_stack_words;
6392      pcum->aapcs_stack_words = 0;
6393      pcum->aapcs_reg = NULL_RTX;
6394    }
6395}
6396
6397bool
6398aarch64_function_arg_regno_p (unsigned regno)
6399{
6400  return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
6401	  || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
6402}
6403
/* Implement TARGET_FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
6405   PARM_BOUNDARY bits of alignment, but will be given anything up
6406   to STACK_BOUNDARY bits if the type requires it.  This makes sure
6407   that both before and after the layout of each argument, the Next
6408   Stacked Argument Address (NSAA) will have a minimum alignment of
6409   8 bytes.  */
6410
6411static unsigned int
6412aarch64_function_arg_boundary (machine_mode mode, const_tree type)
6413{
6414  bool abi_break;
6415  unsigned int alignment = aarch64_function_arg_alignment (mode, type,
6416							   &abi_break);
6417  if (abi_break && warn_psabi)
6418    inform (input_location, "parameter passing for argument of type "
6419	    "%qT changed in GCC 9.1", type);
6420
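  /* Clamp the alignment to the [PARM_BOUNDARY, STACK_BOUNDARY] range
     (64 and 128 bits respectively).  */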
6421  return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
6422}
6423
6424/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */
6425
6426static fixed_size_mode
6427aarch64_get_reg_raw_mode (int regno)
6428{
6429  if (TARGET_SVE && FP_REGNUM_P (regno))
6430    /* Don't use the SVE part of the register for __builtin_apply and
6431       __builtin_return.  The SVE registers aren't used by the normal PCS,
6432       so using them there would be a waste of time.  The PCS extensions
6433       for SVE types are fundamentally incompatible with the
6434       __builtin_return/__builtin_apply interface.  */
6435    return as_a <fixed_size_mode> (V16QImode);
6436  return default_get_reg_raw_mode (regno);
6437}
6438
6439/* Implement TARGET_FUNCTION_ARG_PADDING.
6440
6441   Small aggregate types are placed in the lowest memory address.
6442
6443   The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
6444
6445static pad_direction
6446aarch64_function_arg_padding (machine_mode mode, const_tree type)
6447{
6448  /* On little-endian targets, the least significant byte of every stack
6449     argument is passed at the lowest byte address of the stack slot.  */
6450  if (!BYTES_BIG_ENDIAN)
6451    return PAD_UPWARD;
6452
6453  /* Otherwise, integral, floating-point and pointer types are padded downward:
6454     the least significant byte of a stack argument is passed at the highest
6455     byte address of the stack slot.  */
6456  if (type
6457      ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
6458	 || POINTER_TYPE_P (type))
6459      : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
6460    return PAD_DOWNWARD;
6461
6462  /* Everything else padded upward, i.e. data in first byte of stack slot.  */
6463  return PAD_UPWARD;
6464}
6465
6466/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
6467
   It specifies the padding for the last (possibly the only)
   element of a block move between registers and memory.  Viewing
   the block as if it were in memory, padding upward means that
   the last element is padded after its most significant byte,
   while padding downward means that the last element is padded
   on its least significant byte side.
6474
6475   Small aggregates and small complex types are always padded
6476   upwards.
6477
6478   We don't need to worry about homogeneous floating-point or
6479   short-vector aggregates; their move is not affected by the
6480   padding direction determined here.  Regardless of endianness,
6481   each element of such an aggregate is put in the least
6482   significant bits of a fp/simd register.
6483
6484   Return !BYTES_BIG_ENDIAN if the least significant byte of the
6485   register has useful data, and return the opposite if the most
6486   significant byte does.  */
6487
6488bool
6489aarch64_pad_reg_upward (machine_mode mode, const_tree type,
6490		     bool first ATTRIBUTE_UNUSED)
6491{
6492
6493  /* Aside from pure scalable types, small composite types are always
6494     padded upward.  */
6495  if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
6496    {
6497      HOST_WIDE_INT size;
6498      if (type)
6499	size = int_size_in_bytes (type);
6500      else
6501	/* No frontends can create types with variable-sized modes, so we
6502	   shouldn't be asked to pass or return them.  */
6503	size = GET_MODE_SIZE (mode).to_constant ();
6504      if (size < 2 * UNITS_PER_WORD)
6505	{
6506	  pure_scalable_type_info pst_info;
6507	  if (pst_info.analyze_registers (type))
6508	    return false;
6509	  return true;
6510	}
6511    }
6512
6513  /* Otherwise, use the default padding.  */
6514  return !BYTES_BIG_ENDIAN;
6515}
6516
6517static scalar_int_mode
6518aarch64_libgcc_cmp_return_mode (void)
6519{
6520  return SImode;
6521}
6522
6523#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
6524
/* We use the 12-bit shifted immediate arithmetic instructions, so values
   must be multiples of (1 << 12), i.e. 4096.  */
6527#define ARITH_FACTOR 4096
6528
6529#if (PROBE_INTERVAL % ARITH_FACTOR) != 0
6530#error Cannot use simple address calculation for stack probing
6531#endif
6532
6533/* The pair of scratch registers used for stack probing.  */
6534#define PROBE_STACK_FIRST_REG  R9_REGNUM
6535#define PROBE_STACK_SECOND_REG R10_REGNUM
6536
6537/* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
6538   inclusive.  These are offsets from the current stack pointer.  */
6539
6540static void
6541aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
6542{
6543  HOST_WIDE_INT size;
6544  if (!poly_size.is_constant (&size))
6545    {
6546      sorry ("stack probes for SVE frames");
6547      return;
6548    }
6549
6550  rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
6551
6552  /* See the same assertion on PROBE_INTERVAL above.  */
6553  gcc_assert ((first % ARITH_FACTOR) == 0);
6554
6555  /* See if we have a constant small number of probes to generate.  If so,
6556     that's the easy case.  */
6557  if (size <= PROBE_INTERVAL)
6558    {
6559      const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
6560
6561      emit_set_insn (reg1,
6562		     plus_constant (Pmode,
6563				    stack_pointer_rtx, -(first + base)));
6564      emit_stack_probe (plus_constant (Pmode, reg1, base - size));
6565    }
6566
  /* The run-time loop is made up of 8 insns in the generic case while the
     compile-time loop is made up of 4+2*(n-2) insns for n intervals.  */
6569  else if (size <= 4 * PROBE_INTERVAL)
6570    {
6571      HOST_WIDE_INT i, rem;
6572
6573      emit_set_insn (reg1,
6574		     plus_constant (Pmode,
6575				    stack_pointer_rtx,
6576				    -(first + PROBE_INTERVAL)));
6577      emit_stack_probe (reg1);
6578
6579      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
6580	 it exceeds SIZE.  If only two probes are needed, this will not
6581	 generate any code.  Then probe at FIRST + SIZE.  */
6582      for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
6583	{
6584	  emit_set_insn (reg1,
6585			 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
6586	  emit_stack_probe (reg1);
6587	}
6588
6589      rem = size - (i - PROBE_INTERVAL);
6590      if (rem > 256)
6591	{
6592	  const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6593
6594	  emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
6595	  emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
6596	}
6597      else
6598	emit_stack_probe (plus_constant (Pmode, reg1, -rem));
6599    }
6600
6601  /* Otherwise, do the same as above, but in a loop.  Note that we must be
6602     extra careful with variables wrapping around because we might be at
6603     the very top (or the very bottom) of the address space and we have
6604     to be able to handle this case properly; in particular, we use an
6605     equality test for the loop condition.  */
6606  else
6607    {
6608      rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
6609
6610      /* Step 1: round SIZE to the previous multiple of the interval.  */
6611
6612      HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
6613
6614
6615      /* Step 2: compute initial and final value of the loop counter.  */
6616
6617      /* TEST_ADDR = SP + FIRST.  */
6618      emit_set_insn (reg1,
6619		     plus_constant (Pmode, stack_pointer_rtx, -first));
6620
6621      /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
6622      HOST_WIDE_INT adjustment = - (first + rounded_size);
6623      if (! aarch64_uimm12_shift (adjustment))
6624	{
6625	  aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
6626					  true, Pmode);
6627	  emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
6628	}
6629      else
6630	emit_set_insn (reg2,
6631		       plus_constant (Pmode, stack_pointer_rtx, adjustment));
6632
6633      /* Step 3: the loop
6634
6635	 do
6636	   {
6637	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
6638	     probe at TEST_ADDR
6639	   }
6640	 while (TEST_ADDR != LAST_ADDR)
6641
6642	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
6643	 until it is equal to ROUNDED_SIZE.  */
6644
6645      emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
6646
6647
6648      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
6649	 that SIZE is equal to ROUNDED_SIZE.  */
6650
6651      if (size != rounded_size)
6652	{
6653	  HOST_WIDE_INT rem = size - rounded_size;
6654
6655	  if (rem > 256)
6656	    {
6657	      const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6658
6659	      emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
6660	      emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
6661	    }
6662	  else
6663	    emit_stack_probe (plus_constant (Pmode, reg2, -rem));
6664	}
6665    }
6666
6667  /* Make sure nothing is scheduled before we are done.  */
6668  emit_insn (gen_blockage ());
6669}
6670
6671/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
6672   absolute addresses.  */
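
/* As an illustration only: when called via aarch64_emit_probe_stack_range
   with the scratch registers defined above (x9 and x10), assuming a probe
   interval of 4096 bytes for concreteness and no stack-clash protection,
   the emitted loop looks roughly like

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0

   where the label name is illustrative.  */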
6673
6674const char *
6675aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
6676{
6677  static int labelno = 0;
6678  char loop_lab[32];
6679  rtx xops[2];
6680
6681  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
6682
6683  /* Loop.  */
6684  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
6685
6686  HOST_WIDE_INT stack_clash_probe_interval
6687    = 1 << param_stack_clash_protection_guard_size;
6688
6689  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
6690  xops[0] = reg1;
6691  HOST_WIDE_INT interval;
6692  if (flag_stack_clash_protection)
6693    interval = stack_clash_probe_interval;
6694  else
6695    interval = PROBE_INTERVAL;
6696
6697  gcc_assert (aarch64_uimm12_shift (interval));
6698  xops[1] = GEN_INT (interval);
6699
6700  output_asm_insn ("sub\t%0, %0, %1", xops);
6701
  /* If doing stack clash protection then we probe up by the ABI-specified
     amount.  We do this because we're dropping full pages at a time in the
     loop.  But if we're doing non-stack-clash probing, probe at offset 0
     from TEST_ADDR.  */
6705  if (flag_stack_clash_protection)
6706    xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
6707  else
6708    xops[1] = CONST0_RTX (GET_MODE (xops[1]));
6709
6710  /* Probe at TEST_ADDR.  If we're inside the loop it is always safe to probe
6711     by this amount for each iteration.  */
6712  output_asm_insn ("str\txzr, [%0, %1]", xops);
6713
6714  /* Test if TEST_ADDR == LAST_ADDR.  */
6715  xops[1] = reg2;
6716  output_asm_insn ("cmp\t%0, %1", xops);
6717
6718  /* Branch.  */
6719  fputs ("\tb.ne\t", asm_out_file);
6720  assemble_name_raw (asm_out_file, loop_lab);
6721  fputc ('\n', asm_out_file);
6722
6723  return "";
6724}
6725
/* Emit the probe loop for doing stack clash probes and stack adjustments for
   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
   of GUARD_SIZE.  When a probe is emitted it is done at most
   MIN_PROBE_THRESHOLD bytes from the current BASE, and successive probes are
   at most MIN_PROBE_THRESHOLD bytes apart.  By the end of this function
   BASE = BASE - ADJUSTMENT.  */
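
/* Schematically (the label names and the <...> operand placeholders are
   illustrative only), the sequence emitted below is

	.SVLPSPL0:
	cmp	<adjustment>, <residual_probe_guard>
	b.lt	.SVLPEND0
	sub	<base>, <base>, <residual_probe_guard>
	str	xzr, [<base>, 0]
	sub	<adjustment>, <adjustment>, <residual_probe_guard>
	b	.SVLPSPL0
	.SVLPEND0:
	sub	<base>, <base>, <adjustment>  */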
6732
6733const char *
6734aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
6735				      rtx min_probe_threshold, rtx guard_size)
6736{
6737  /* This function is not allowed to use any instruction generation function
6738     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
6739     so instead emit the code you want using output_asm_insn.  */
6740  gcc_assert (flag_stack_clash_protection);
6741  gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
6742  gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
6743
6744  /* The minimum required allocation before the residual requires probing.  */
6745  HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
6746
6747  /* Clamp the value down to the nearest value that can be used with a cmp.  */
6748  residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
6749  rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
6750
6751  gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
6752  gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
6753
6754  static int labelno = 0;
6755  char loop_start_lab[32];
6756  char loop_end_lab[32];
6757  rtx xops[2];
6758
6759  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
6760  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
6761
6762  /* Emit loop start label.  */
6763  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
6764
6765  /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
6766  xops[0] = adjustment;
6767  xops[1] = probe_offset_value_rtx;
6768  output_asm_insn ("cmp\t%0, %1", xops);
6769
6770  /* Branch to end if not enough adjustment to probe.  */
6771  fputs ("\tb.lt\t", asm_out_file);
6772  assemble_name_raw (asm_out_file, loop_end_lab);
6773  fputc ('\n', asm_out_file);
6774
6775  /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
6776  xops[0] = base;
6777  xops[1] = probe_offset_value_rtx;
6778  output_asm_insn ("sub\t%0, %0, %1", xops);
6779
6780  /* Probe at BASE.  */
6781  xops[1] = const0_rtx;
6782  output_asm_insn ("str\txzr, [%0, %1]", xops);
6783
6784  /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
6785  xops[0] = adjustment;
6786  xops[1] = probe_offset_value_rtx;
6787  output_asm_insn ("sub\t%0, %0, %1", xops);
6788
6789  /* Branch to start if still more bytes to allocate.  */
6790  fputs ("\tb\t", asm_out_file);
6791  assemble_name_raw (asm_out_file, loop_start_lab);
6792  fputc ('\n', asm_out_file);
6793
  /* Loop exit: the remaining adjustment is too small to need a probe.  */
6795  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
6796
6797  /* BASE = BASE - ADJUSTMENT.  */
6798  xops[0] = base;
6799  xops[1] = adjustment;
6800  output_asm_insn ("sub\t%0, %0, %1", xops);
6801  return "";
6802}
6803
6804/* Determine whether a frame chain needs to be generated.  */
6805static bool
6806aarch64_needs_frame_chain (void)
6807{
6808  /* Force a frame chain for EH returns so the return address is at FP+8.  */
6809  if (frame_pointer_needed || crtl->calls_eh_return)
6810    return true;
6811
6812  /* A leaf function cannot have calls or write LR.  */
6813  bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
6814
6815  /* Don't use a frame chain in leaf functions if leaf frame pointers
6816     are disabled.  */
6817  if (flag_omit_leaf_frame_pointer && is_leaf)
6818    return false;
6819
6820  return aarch64_use_frame_pointer;
6821}
6822
6823/* Mark the registers that need to be saved by the callee and calculate
6824   the size of the callee-saved registers area and frame record (both FP
6825   and LR may be omitted).  */
6826static void
6827aarch64_layout_frame (void)
6828{
6829  poly_int64 offset = 0;
6830  int regno, last_fp_reg = INVALID_REGNUM;
6831  machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
6832  poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
6833  bool frame_related_fp_reg_p = false;
6834  aarch64_frame &frame = cfun->machine->frame;
6835
6836  frame.emit_frame_chain = aarch64_needs_frame_chain ();
6837
6838  /* Adjust the outgoing arguments size if required.  Keep it in sync with what
6839     the mid-end is doing.  */
6840  crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
6841
6842#define SLOT_NOT_REQUIRED (-2)
6843#define SLOT_REQUIRED     (-1)
6844
6845  frame.wb_candidate1 = INVALID_REGNUM;
6846  frame.wb_candidate2 = INVALID_REGNUM;
6847  frame.spare_pred_reg = INVALID_REGNUM;
6848
6849  /* First mark all the registers that really need to be saved...  */
6850  for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6851    frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
6852
6853  /* ... that includes the eh data registers (if needed)...  */
6854  if (crtl->calls_eh_return)
6855    for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
6856      frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
6857
6858  /* ... and any callee saved register that dataflow says is live.  */
6859  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6860    if (df_regs_ever_live_p (regno)
6861	&& !fixed_regs[regno]
6862	&& (regno == R30_REGNUM
6863	    || !crtl->abi->clobbers_full_reg_p (regno)))
6864      frame.reg_offset[regno] = SLOT_REQUIRED;
6865
6866  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6867    if (df_regs_ever_live_p (regno)
6868	&& !fixed_regs[regno]
6869	&& !crtl->abi->clobbers_full_reg_p (regno))
6870      {
6871	frame.reg_offset[regno] = SLOT_REQUIRED;
6872	last_fp_reg = regno;
6873	if (aarch64_emit_cfi_for_reg_p (regno))
6874	  frame_related_fp_reg_p = true;
6875      }
6876
6877  /* Big-endian SVE frames need a spare predicate register in order
6878     to save Z8-Z15.  Decide which register they should use.  Prefer
6879     an unused argument register if possible, so that we don't force P4
6880     to be saved unnecessarily.  */
6881  if (frame_related_fp_reg_p
6882      && crtl->abi->id () == ARM_PCS_SVE
6883      && BYTES_BIG_ENDIAN)
6884    {
6885      bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
6886      bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
6887      for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
6888	if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
6889	  break;
6890      gcc_assert (regno <= P7_REGNUM);
6891      frame.spare_pred_reg = regno;
6892      df_set_regs_ever_live (regno, true);
6893    }
6894
6895  for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6896    if (df_regs_ever_live_p (regno)
6897	&& !fixed_regs[regno]
6898	&& !crtl->abi->clobbers_full_reg_p (regno))
6899      frame.reg_offset[regno] = SLOT_REQUIRED;
6900
6901  /* With stack-clash, LR must be saved in non-leaf functions.  */
6902  gcc_assert (crtl->is_leaf
6903	      || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
6904
6905  /* Now assign stack slots for the registers.  Start with the predicate
6906     registers, since predicate LDR and STR have a relatively small
6907     offset range.  These saves happen below the hard frame pointer.  */
6908  for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6909    if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6910      {
6911	frame.reg_offset[regno] = offset;
6912	offset += BYTES_PER_SVE_PRED;
6913      }
6914
6915  if (maybe_ne (offset, 0))
6916    {
      /* If we have any vector registers to save above the predicate registers,
	 the offset of the vector register save slots needs to be a multiple
	 of the vector size.  This lets us use the immediate forms of LDR/STR
	 (or LD1/ST1 for big-endian).

	 A vector register is 8 times the size of a predicate register,
	 and we need to save a maximum of 12 predicate registers, so the
	 first vector register will be at either #1, MUL VL or #2, MUL VL.

	 If we don't have any vector registers to save, and we know how
	 big the predicate save area is, we can just round it up to the
	 next 16-byte boundary.  */
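
      /* For example, saving all 12 callee-saved predicate registers takes
	 12/8 = 1.5 vectors' worth of space, which the code below rounds up
	 to 2 * vector_save_size (the #2, MUL VL case).  */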
6929      if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
6930	offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6931      else
6932	{
6933	  if (known_le (offset, vector_save_size))
6934	    offset = vector_save_size;
6935	  else if (known_le (offset, vector_save_size * 2))
6936	    offset = vector_save_size * 2;
6937	  else
6938	    gcc_unreachable ();
6939	}
6940    }
6941
6942  /* If we need to save any SVE vector registers, add them next.  */
6943  if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
6944    for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6945      if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6946	{
6947	  frame.reg_offset[regno] = offset;
6948	  offset += vector_save_size;
6949	}
6950
6951  /* OFFSET is now the offset of the hard frame pointer from the bottom
6952     of the callee save area.  */
6953  bool saves_below_hard_fp_p = maybe_ne (offset, 0);
6954  frame.below_hard_fp_saved_regs_size = offset;
6955  if (frame.emit_frame_chain)
6956    {
6957      /* FP and LR are placed in the linkage record.  */
6958      frame.reg_offset[R29_REGNUM] = offset;
6959      frame.wb_candidate1 = R29_REGNUM;
6960      frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
6961      frame.wb_candidate2 = R30_REGNUM;
6962      offset += 2 * UNITS_PER_WORD;
6963    }
6964
6965  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6966    if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6967      {
6968	frame.reg_offset[regno] = offset;
6969	if (frame.wb_candidate1 == INVALID_REGNUM)
6970	  frame.wb_candidate1 = regno;
6971	else if (frame.wb_candidate2 == INVALID_REGNUM)
6972	  frame.wb_candidate2 = regno;
6973	offset += UNITS_PER_WORD;
6974      }
6975
6976  poly_int64 max_int_offset = offset;
6977  offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6978  bool has_align_gap = maybe_ne (offset, max_int_offset);
6979
6980  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6981    if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6982      {
6983	/* If there is an alignment gap between integer and fp callee-saves,
6984	   allocate the last fp register to it if possible.  */
6985	if (regno == last_fp_reg
6986	    && has_align_gap
6987	    && known_eq (vector_save_size, 8)
6988	    && multiple_p (offset, 16))
6989	  {
6990	    frame.reg_offset[regno] = max_int_offset;
6991	    break;
6992	  }
6993
6994	frame.reg_offset[regno] = offset;
6995	if (frame.wb_candidate1 == INVALID_REGNUM)
6996	  frame.wb_candidate1 = regno;
6997	else if (frame.wb_candidate2 == INVALID_REGNUM
6998		 && frame.wb_candidate1 >= V0_REGNUM)
6999	  frame.wb_candidate2 = regno;
7000	offset += vector_save_size;
7001      }
7002
7003  offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7004
7005  frame.saved_regs_size = offset;
7006
7007  poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
7008
7009  poly_int64 above_outgoing_args
7010    = aligned_upper_bound (varargs_and_saved_regs_size
7011			   + get_frame_size (),
7012			   STACK_BOUNDARY / BITS_PER_UNIT);
7013
7014  frame.hard_fp_offset
7015    = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
7016
7017  /* Both these values are already aligned.  */
7018  gcc_assert (multiple_p (crtl->outgoing_args_size,
7019			  STACK_BOUNDARY / BITS_PER_UNIT));
7020  frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
7021
7022  frame.locals_offset = frame.saved_varargs_size;
7023
7024  frame.initial_adjust = 0;
7025  frame.final_adjust = 0;
7026  frame.callee_adjust = 0;
7027  frame.sve_callee_adjust = 0;
7028  frame.callee_offset = 0;
7029
7030  HOST_WIDE_INT max_push_offset = 0;
7031  if (frame.wb_candidate2 != INVALID_REGNUM)
7032    max_push_offset = 512;
7033  else if (frame.wb_candidate1 != INVALID_REGNUM)
7034    max_push_offset = 256;
7035
7036  HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
7037  HOST_WIDE_INT const_saved_regs_size;
7038  if (frame.frame_size.is_constant (&const_size)
7039      && const_size < max_push_offset
7040      && known_eq (frame.hard_fp_offset, const_size))
7041    {
7042      /* Simple, small frame with no outgoing arguments:
7043
7044	 stp reg1, reg2, [sp, -frame_size]!
7045	 stp reg3, reg4, [sp, 16]  */
7046      frame.callee_adjust = const_size;
7047    }
7048  else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
7049	   && frame.saved_regs_size.is_constant (&const_saved_regs_size)
7050	   && const_outgoing_args_size + const_saved_regs_size < 512
7051	   /* We could handle this case even with outgoing args, provided
7052	      that the number of args left us with valid offsets for all
7053	      predicate and vector save slots.  It's such a rare case that
7054	      it hardly seems worth the effort though.  */
7055	   && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
7056	   && !(cfun->calls_alloca
7057		&& frame.hard_fp_offset.is_constant (&const_fp_offset)
7058		&& const_fp_offset < max_push_offset))
7059    {
7060      /* Frame with small outgoing arguments:
7061
7062	 sub sp, sp, frame_size
7063	 stp reg1, reg2, [sp, outgoing_args_size]
7064	 stp reg3, reg4, [sp, outgoing_args_size + 16]  */
7065      frame.initial_adjust = frame.frame_size;
7066      frame.callee_offset = const_outgoing_args_size;
7067    }
7068  else if (saves_below_hard_fp_p
7069	   && known_eq (frame.saved_regs_size,
7070			frame.below_hard_fp_saved_regs_size))
7071    {
7072      /* Frame in which all saves are SVE saves:
7073
7074	 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
7075	 save SVE registers relative to SP
7076	 sub sp, sp, outgoing_args_size  */
7077      frame.initial_adjust = (frame.hard_fp_offset
7078			      + frame.below_hard_fp_saved_regs_size);
7079      frame.final_adjust = crtl->outgoing_args_size;
7080    }
7081  else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
7082	   && const_fp_offset < max_push_offset)
7083    {
7084      /* Frame with large outgoing arguments or SVE saves, but with
7085	 a small local area:
7086
7087	 stp reg1, reg2, [sp, -hard_fp_offset]!
7088	 stp reg3, reg4, [sp, 16]
7089	 [sub sp, sp, below_hard_fp_saved_regs_size]
7090	 [save SVE registers relative to SP]
7091	 sub sp, sp, outgoing_args_size  */
7092      frame.callee_adjust = const_fp_offset;
7093      frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
7094      frame.final_adjust = crtl->outgoing_args_size;
7095    }
7096  else
7097    {
7098      /* Frame with large local area and outgoing arguments or SVE saves,
7099	 using frame pointer:
7100
7101	 sub sp, sp, hard_fp_offset
7102	 stp x29, x30, [sp, 0]
7103	 add x29, sp, 0
7104	 stp reg3, reg4, [sp, 16]
7105	 [sub sp, sp, below_hard_fp_saved_regs_size]
7106	 [save SVE registers relative to SP]
7107	 sub sp, sp, outgoing_args_size  */
7108      frame.initial_adjust = frame.hard_fp_offset;
7109      frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
7110      frame.final_adjust = crtl->outgoing_args_size;
7111    }
7112
7113  /* Make sure the individual adjustments add up to the full frame size.  */
7114  gcc_assert (known_eq (frame.initial_adjust
7115			+ frame.callee_adjust
7116			+ frame.sve_callee_adjust
7117			+ frame.final_adjust, frame.frame_size));
7118
7119  frame.laid_out = true;
7120}
7121
7122/* Return true if the register REGNO is saved on entry to
7123   the current function.  */
7124
7125static bool
7126aarch64_register_saved_on_entry (int regno)
7127{
7128  return known_ge (cfun->machine->frame.reg_offset[regno], 0);
7129}
7130
/* Return the next register, from REGNO up to LIMIT, that the callee
   needs to save.  */
7133
7134static unsigned
7135aarch64_next_callee_save (unsigned regno, unsigned limit)
7136{
7137  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
7138    regno ++;
7139  return regno;
7140}
7141
7142/* Push the register number REGNO of mode MODE to the stack with write-back
7143   adjusting the stack by ADJUSTMENT.  */
7144
7145static void
7146aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
7147			   HOST_WIDE_INT adjustment)
{
7149  rtx base_rtx = stack_pointer_rtx;
7150  rtx insn, reg, mem;
7151
7152  reg = gen_rtx_REG (mode, regno);
7153  mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
7154			    plus_constant (Pmode, base_rtx, -adjustment));
7155  mem = gen_frame_mem (mode, mem);
7156
7157  insn = emit_move_insn (mem, reg);
7158  RTX_FRAME_RELATED_P (insn) = 1;
7159}
7160
7161/* Generate and return an instruction to store the pair of registers
7162   REG and REG2 of mode MODE to location BASE with write-back adjusting
7163   the stack location BASE by ADJUSTMENT.  */
7164
7165static rtx
7166aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
7167			  HOST_WIDE_INT adjustment)
7168{
7169  switch (mode)
7170    {
7171    case E_DImode:
7172      return gen_storewb_pairdi_di (base, base, reg, reg2,
7173				    GEN_INT (-adjustment),
7174				    GEN_INT (UNITS_PER_WORD - adjustment));
7175    case E_DFmode:
7176      return gen_storewb_pairdf_di (base, base, reg, reg2,
7177				    GEN_INT (-adjustment),
7178				    GEN_INT (UNITS_PER_WORD - adjustment));
7179    case E_TFmode:
7180      return gen_storewb_pairtf_di (base, base, reg, reg2,
7181				    GEN_INT (-adjustment),
7182				    GEN_INT (UNITS_PER_VREG - adjustment));
7183    default:
7184      gcc_unreachable ();
7185    }
7186}
7187
7188/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
7189   stack pointer by ADJUSTMENT.  */
7190
7191static void
7192aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
7193{
7194  rtx_insn *insn;
7195  machine_mode mode = aarch64_reg_save_mode (regno1);
7196
7197  if (regno2 == INVALID_REGNUM)
7198    return aarch64_pushwb_single_reg (mode, regno1, adjustment);
7199
7200  rtx reg1 = gen_rtx_REG (mode, regno1);
7201  rtx reg2 = gen_rtx_REG (mode, regno2);
7202
7203  insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
7204					      reg2, adjustment));
7205  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
7206  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7207  RTX_FRAME_RELATED_P (insn) = 1;
7208}
7209
/* Load the pair of registers REG, REG2 of mode MODE from stack location
   BASE, adjusting it by ADJUSTMENT afterwards.  */
7212
7213static rtx
7214aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
7215			 HOST_WIDE_INT adjustment)
7216{
7217  switch (mode)
7218    {
7219    case E_DImode:
7220      return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
7221				   GEN_INT (UNITS_PER_WORD));
7222    case E_DFmode:
7223      return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
7224				   GEN_INT (UNITS_PER_WORD));
7225    case E_TFmode:
7226      return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
7227				   GEN_INT (UNITS_PER_VREG));
7228    default:
7229      gcc_unreachable ();
7230    }
7231}
7232
7233/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
7234   afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
7235   into CFI_OPS.  */
7236
7237static void
7238aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
7239		  rtx *cfi_ops)
7240{
7241  machine_mode mode = aarch64_reg_save_mode (regno1);
7242  rtx reg1 = gen_rtx_REG (mode, regno1);
7243
7244  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
7245
7246  if (regno2 == INVALID_REGNUM)
7247    {
7248      rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
7249      mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
7250      emit_move_insn (reg1, gen_frame_mem (mode, mem));
7251    }
7252  else
7253    {
7254      rtx reg2 = gen_rtx_REG (mode, regno2);
7255      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
7256      emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
7257					  reg2, adjustment));
7258    }
7259}
7260
7261/* Generate and return a store pair instruction of mode MODE to store
7262   register REG1 to MEM1 and register REG2 to MEM2.  */
7263
7264static rtx
7265aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
7266			rtx reg2)
7267{
7268  switch (mode)
7269    {
7270    case E_DImode:
7271      return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
7272
7273    case E_DFmode:
7274      return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
7275
7276    case E_TFmode:
7277      return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
7278
7279    default:
7280      gcc_unreachable ();
7281    }
7282}
7283
/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */
7286
7287static rtx
7288aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
7289		       rtx mem2)
7290{
7291  switch (mode)
7292    {
7293    case E_DImode:
7294      return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
7295
7296    case E_DFmode:
7297      return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
7298
7299    case E_TFmode:
7300      return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
7301
7302    default:
7303      gcc_unreachable ();
7304    }
7305}
7306
7307/* Return TRUE if return address signing should be enabled for the current
7308   function, otherwise return FALSE.  */
7309
7310bool
7311aarch64_return_address_signing_enabled (void)
7312{
  /* This function should only be called after the frame has been
     laid out.  */
7314  gcc_assert (cfun->machine->frame.laid_out);
7315
  /* Turn return address signing off in any function that uses
     __builtin_eh_return.  The address passed to __builtin_eh_return
     is not signed so either it has to be signed (with the original sp)
     or the code path that uses it has to avoid authenticating it.
     Currently eh_return introduces a return-to-anywhere gadget no
     matter what we do here, since it uses RET with a user-provided
     address.  An ideal fix for that would be to use an indirect branch,
     which can be protected with BTI j (to some extent).  */
7324  if (crtl->calls_eh_return)
7325    return false;
7326
7327  /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
7328     if its LR is pushed onto stack.  */
7329  return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
7330	  || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
7331	      && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
7332}
7333
7334/* Return TRUE if Branch Target Identification Mechanism is enabled.  */
7335bool
7336aarch64_bti_enabled (void)
7337{
7338  return (aarch64_enable_bti == 1);
7339}
7340
7341/* The caller is going to use ST1D or LD1D to save or restore an SVE
7342   register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
7343   the range [1, 16] * GET_MODE_SIZE (MODE).  Prepare for this by:
7344
7345     (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
7346	 or LD1D address
7347
7348     (2) setting PRED to a valid predicate register for the ST1D or LD1D,
7349	 if the variable isn't already nonnull
7350
7351   (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
7352   Handle this case using a temporary base register that is suitable for
7353   all offsets in that range.  Use ANCHOR_REG as this base register if it
7354   is nonnull, otherwise create a new register and store it in ANCHOR_REG.  */
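
/* For example, if OFFSET were 9 * GET_MODE_SIZE (MODE), rebasing it on
   ANCHOR_REG = BASE_RTX + 16 * GET_MODE_SIZE (MODE) would turn it into
   -7 * GET_MODE_SIZE (MODE), which is within the signed [-8, 7] multiples
   of the mode size that ST1D and LD1D immediate addressing accept.  */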
7355
7356static inline void
7357aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
7358				     rtx &anchor_reg, poly_int64 &offset,
7359				     rtx &ptrue)
7360{
7361  if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
7362    {
7363      /* This is the maximum valid offset of the anchor from the base.
7364	 Lower values would be valid too.  */
7365      poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
7366      if (!anchor_reg)
7367	{
7368	  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7369	  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7370				    gen_int_mode (anchor_offset, Pmode)));
7371	}
7372      base_rtx = anchor_reg;
7373      offset -= anchor_offset;
7374    }
7375  if (!ptrue)
7376    {
7377      int pred_reg = cfun->machine->frame.spare_pred_reg;
7378      emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
7379		      CONSTM1_RTX (VNx16BImode));
7380      ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
7381    }
7382}
7383
7384/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
7385   is saved at BASE + OFFSET.  */
7386
7387static void
7388aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
7389			    rtx base, poly_int64 offset)
7390{
7391  rtx mem = gen_frame_mem (GET_MODE (reg),
7392			   plus_constant (Pmode, base, offset));
7393  add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
7394}
7395
7396/* Emit code to save the callee-saved registers from register number START
7397   to LIMIT to the stack at the location starting at offset START_OFFSET,
7398   skipping any write-back candidates if SKIP_WB is true.  HARD_FP_VALID_P
7399   is true if the hard frame pointer has been set up.  */
7400
7401static void
7402aarch64_save_callee_saves (poly_int64 start_offset,
7403			   unsigned start, unsigned limit, bool skip_wb,
7404			   bool hard_fp_valid_p)
7405{
7406  rtx_insn *insn;
7407  unsigned regno;
7408  unsigned regno2;
7409  rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7410
7411  for (regno = aarch64_next_callee_save (start, limit);
7412       regno <= limit;
7413       regno = aarch64_next_callee_save (regno + 1, limit))
7414    {
7415      rtx reg, mem;
7416      poly_int64 offset;
7417      bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7418
7419      if (skip_wb
7420	  && (regno == cfun->machine->frame.wb_candidate1
7421	      || regno == cfun->machine->frame.wb_candidate2))
7422	continue;
7423
7424      if (cfun->machine->reg_is_wrapped_separately[regno])
7425	continue;
7426
7427      machine_mode mode = aarch64_reg_save_mode (regno);
7428      reg = gen_rtx_REG (mode, regno);
7429      offset = start_offset + cfun->machine->frame.reg_offset[regno];
7430      rtx base_rtx = stack_pointer_rtx;
7431      poly_int64 sp_offset = offset;
7432
7433      HOST_WIDE_INT const_offset;
7434      if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7435	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7436					     offset, ptrue);
7437      else if (GP_REGNUM_P (regno)
7438	       && (!offset.is_constant (&const_offset) || const_offset >= 512))
7439	{
7440	  gcc_assert (known_eq (start_offset, 0));
7441	  poly_int64 fp_offset
7442	    = cfun->machine->frame.below_hard_fp_saved_regs_size;
7443	  if (hard_fp_valid_p)
7444	    base_rtx = hard_frame_pointer_rtx;
7445	  else
7446	    {
7447	      if (!anchor_reg)
7448		{
7449		  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7450		  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7451					    gen_int_mode (fp_offset, Pmode)));
7452		}
7453	      base_rtx = anchor_reg;
7454	    }
7455	  offset -= fp_offset;
7456	}
7457      mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7458      bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
7459
7460      if (!aarch64_sve_mode_p (mode)
7461	  && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
7462	  && !cfun->machine->reg_is_wrapped_separately[regno2]
7463	  && known_eq (GET_MODE_SIZE (mode),
7464		       cfun->machine->frame.reg_offset[regno2]
7465		       - cfun->machine->frame.reg_offset[regno]))
7466	{
7467	  rtx reg2 = gen_rtx_REG (mode, regno2);
7468	  rtx mem2;
7469
7470	  offset += GET_MODE_SIZE (mode);
7471	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7472	  insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
7473						    reg2));
7474
	  /* The first part of a frame-related parallel insn is
	     always assumed to be relevant to the frame
	     calculations; subsequent parts are only
	     frame-related if explicitly marked.  */
7479	  if (aarch64_emit_cfi_for_reg_p (regno2))
7480	    {
7481	      if (need_cfa_note_p)
7482		aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
7483					    sp_offset + GET_MODE_SIZE (mode));
7484	      else
7485		RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7486	    }
7487
7488	  regno = regno2;
7489	}
7490      else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7491	{
7492	  insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
7493	  need_cfa_note_p = true;
7494	}
7495      else if (aarch64_sve_mode_p (mode))
7496	insn = emit_insn (gen_rtx_SET (mem, reg));
7497      else
7498	insn = emit_move_insn (mem, reg);
7499
7500      RTX_FRAME_RELATED_P (insn) = frame_related_p;
7501      if (frame_related_p && need_cfa_note_p)
7502	aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
7503    }
7504}
7505
7506/* Emit code to restore the callee registers from register number START
7507   up to and including LIMIT.  Restore from the stack offset START_OFFSET,
7508   skipping any write-back candidates if SKIP_WB is true.  Write the
7509   appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
7510
7511static void
7512aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
7513			      unsigned limit, bool skip_wb, rtx *cfi_ops)
7514{
7515  unsigned regno;
7516  unsigned regno2;
7517  poly_int64 offset;
7518  rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7519
7520  for (regno = aarch64_next_callee_save (start, limit);
7521       regno <= limit;
7522       regno = aarch64_next_callee_save (regno + 1, limit))
7523    {
7524      bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7525      if (cfun->machine->reg_is_wrapped_separately[regno])
7526	continue;
7527
7528      rtx reg, mem;
7529
7530      if (skip_wb
7531	  && (regno == cfun->machine->frame.wb_candidate1
7532	      || regno == cfun->machine->frame.wb_candidate2))
7533	continue;
7534
7535      machine_mode mode = aarch64_reg_save_mode (regno);
7536      reg = gen_rtx_REG (mode, regno);
7537      offset = start_offset + cfun->machine->frame.reg_offset[regno];
7538      rtx base_rtx = stack_pointer_rtx;
7539      if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7540	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7541					     offset, ptrue);
7542      mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7543
7544      if (!aarch64_sve_mode_p (mode)
7545	  && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
7546	  && !cfun->machine->reg_is_wrapped_separately[regno2]
7547	  && known_eq (GET_MODE_SIZE (mode),
7548		       cfun->machine->frame.reg_offset[regno2]
7549		       - cfun->machine->frame.reg_offset[regno]))
7550	{
7551	  rtx reg2 = gen_rtx_REG (mode, regno2);
7552	  rtx mem2;
7553
7554	  offset += GET_MODE_SIZE (mode);
7555	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7556	  emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7557
7558	  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
7559	  regno = regno2;
7560	}
7561      else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7562	emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
7563      else if (aarch64_sve_mode_p (mode))
7564	emit_insn (gen_rtx_SET (reg, mem));
7565      else
7566	emit_move_insn (reg, mem);
7567      if (frame_related_p)
7568	*cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
7569    }
7570}
7571
7572/* Return true if OFFSET is a signed 4-bit value multiplied by the size
7573   of MODE.  */
7574
7575static inline bool
7576offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7577{
7578  HOST_WIDE_INT multiple;
7579  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7580	  && IN_RANGE (multiple, -8, 7));
7581}
7582
/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
   of MODE.  */
7585
7586static inline bool
7587offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7588{
7589  HOST_WIDE_INT multiple;
7590  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7591	  && IN_RANGE (multiple, 0, 63));
7592}
7593
7594/* Return true if OFFSET is a signed 7-bit value multiplied by the size
7595   of MODE.  */
7596
7597bool
7598aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7599{
7600  HOST_WIDE_INT multiple;
7601  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7602	  && IN_RANGE (multiple, -64, 63));
7603}
7604
7605/* Return true if OFFSET is a signed 9-bit value.  */
7606
7607bool
7608aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
7609				       poly_int64 offset)
7610{
7611  HOST_WIDE_INT const_offset;
7612  return (offset.is_constant (&const_offset)
7613	  && IN_RANGE (const_offset, -256, 255));
7614}
7615
7616/* Return true if OFFSET is a signed 9-bit value multiplied by the size
7617   of MODE.  */
7618
7619static inline bool
7620offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7621{
7622  HOST_WIDE_INT multiple;
7623  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7624	  && IN_RANGE (multiple, -256, 255));
7625}
7626
7627/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
7628   of MODE.  */
7629
7630static inline bool
7631offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7632{
7633  HOST_WIDE_INT multiple;
7634  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7635	  && IN_RANGE (multiple, 0, 4095));
7636}
7637
7638/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
7639
7640static sbitmap
7641aarch64_get_separate_components (void)
7642{
7643  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7644  bitmap_clear (components);
7645
7646  /* The registers we need saved to the frame.  */
7647  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7648    if (aarch64_register_saved_on_entry (regno))
7649      {
7650	/* Punt on saves and restores that use ST1D and LD1D.  We could
7651	   try to be smarter, but it would involve making sure that the
7652	   spare predicate register itself is safe to use at the save
7653	   and restore points.  Also, when a frame pointer is being used,
7654	   the slots are often out of reach of ST1D and LD1D anyway.  */
7655	machine_mode mode = aarch64_reg_save_mode (regno);
7656	if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7657	  continue;
7658
7659	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7660
7661	/* If the register is saved in the first SVE save slot, we use
7662	   it as a stack probe for -fstack-clash-protection.  */
7663	if (flag_stack_clash_protection
7664	    && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
7665	    && known_eq (offset, 0))
7666	  continue;
7667
7668	/* Get the offset relative to the register we'll use.  */
7669	if (frame_pointer_needed)
7670	  offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7671	else
7672	  offset += crtl->outgoing_args_size;
7673
7674	/* Check that we can access the stack slot of the register with one
7675	   direct load with no adjustments needed.  */
7676	if (aarch64_sve_mode_p (mode)
7677	    ? offset_9bit_signed_scaled_p (mode, offset)
7678	    : offset_12bit_unsigned_scaled_p (mode, offset))
7679	  bitmap_set_bit (components, regno);
7680      }
7681
7682  /* Don't mess with the hard frame pointer.  */
7683  if (frame_pointer_needed)
7684    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
7685
7686  /* If the spare predicate register used by big-endian SVE code
7687     is call-preserved, it must be saved in the main prologue
7688     before any saves that use it.  */
7689  if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
7690    bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
7691
7692  unsigned reg1 = cfun->machine->frame.wb_candidate1;
7693  unsigned reg2 = cfun->machine->frame.wb_candidate2;
7694  /* If registers have been chosen to be stored/restored with
7695     writeback don't interfere with them to avoid having to output explicit
7696     stack adjustment instructions.  */
7697  if (reg2 != INVALID_REGNUM)
7698    bitmap_clear_bit (components, reg2);
7699  if (reg1 != INVALID_REGNUM)
7700    bitmap_clear_bit (components, reg1);
7701
7702  bitmap_clear_bit (components, LR_REGNUM);
7703  bitmap_clear_bit (components, SP_REGNUM);
7704
7705  return components;
7706}
7707
7708/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
7709
7710static sbitmap
7711aarch64_components_for_bb (basic_block bb)
7712{
7713  bitmap in = DF_LIVE_IN (bb);
7714  bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
7715  bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
7716
7717  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7718  bitmap_clear (components);
7719
7720  /* Clobbered registers don't generate values in any meaningful sense,
7721     since nothing after the clobber can rely on their value.  And we can't
7722     say that partially-clobbered registers are unconditionally killed,
7723     because whether they're killed or not depends on the mode of the
7724     value they're holding.  Thus partially call-clobbered registers
7725     appear in neither the kill set nor the gen set.
7726
7727     Check manually for any calls that clobber more of a register than the
7728     current function can.  */
7729  function_abi_aggregator callee_abis;
7730  rtx_insn *insn;
7731  FOR_BB_INSNS (bb, insn)
7732    if (CALL_P (insn))
7733      callee_abis.note_callee_abi (insn_callee_abi (insn));
7734  HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
7735
7736  /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
7737  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7738    if (!fixed_regs[regno]
7739	&& !crtl->abi->clobbers_full_reg_p (regno)
7740	&& (TEST_HARD_REG_BIT (extra_caller_saves, regno)
7741	    || bitmap_bit_p (in, regno)
7742	    || bitmap_bit_p (gen, regno)
7743	    || bitmap_bit_p (kill, regno)))
7744      {
7745	bitmap_set_bit (components, regno);
7746
	/* If there is a callee-save at an adjacent offset, add it as well,
	   to increase the use of LDP/STP.  */
7749	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7750	unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
7751
7752	if (regno2 <= LAST_SAVED_REGNUM)
7753	  {
7754	    poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7755	    if (regno < regno2
7756		? known_eq (offset + 8, offset2)
7757		: multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
7758	      bitmap_set_bit (components, regno2);
7759	  }
7760      }
7761
7762  return components;
7763}
7764
7765/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
7766   Nothing to do for aarch64.  */
7767
7768static void
7769aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
7770{
7771}
7772
7773/* Return the next set bit in BMP from START onwards.  Return the total number
7774   of bits in BMP if no set bit is found at or after START.  */
7775
7776static unsigned int
7777aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
7778{
7779  unsigned int nbits = SBITMAP_SIZE (bmp);
7780  if (start == nbits)
7781    return start;
7782
7783  gcc_assert (start < nbits);
7784  for (unsigned int i = start; i < nbits; i++)
7785    if (bitmap_bit_p (bmp, i))
7786      return i;
7787
7788  return nbits;
7789}
7790
7791/* Do the work for aarch64_emit_prologue_components and
7792   aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
7793   to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
7794   for these components or the epilogue sequence.  That is, it determines
7795   whether we should emit stores or loads and what kind of CFA notes to attach
7796   to the insns.  Otherwise the logic for the two sequences is very
7797   similar.  */
7798
7799static void
7800aarch64_process_components (sbitmap components, bool prologue_p)
7801{
7802  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
7803			     ? HARD_FRAME_POINTER_REGNUM
7804			     : STACK_POINTER_REGNUM);
7805
7806  unsigned last_regno = SBITMAP_SIZE (components);
7807  unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
7808  rtx_insn *insn = NULL;
7809
7810  while (regno != last_regno)
7811    {
7812      bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7813      machine_mode mode = aarch64_reg_save_mode (regno);
7814
7815      rtx reg = gen_rtx_REG (mode, regno);
7816      poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7817      if (frame_pointer_needed)
7818	offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7819      else
7820	offset += crtl->outgoing_args_size;
7821
7822      rtx addr = plus_constant (Pmode, ptr_reg, offset);
7823      rtx mem = gen_frame_mem (mode, addr);
7824
7825      rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
7826      unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
7827      /* No more registers to handle after REGNO.
7828	 Emit a single save/restore and exit.  */
7829      if (regno2 == last_regno)
7830	{
7831	  insn = emit_insn (set);
7832	  if (frame_related_p)
7833	    {
7834	      RTX_FRAME_RELATED_P (insn) = 1;
7835	      if (prologue_p)
7836		add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7837	      else
7838		add_reg_note (insn, REG_CFA_RESTORE, reg);
7839	    }
7840	  break;
7841	}
7842
7843      poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7844      /* The next register is not of the same class or its offset is not
7845	 mergeable with the current one into a pair.  */
7846      if (aarch64_sve_mode_p (mode)
7847	  || !satisfies_constraint_Ump (mem)
7848	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
7849	  || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
7850	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
7851		       GET_MODE_SIZE (mode)))
7852	{
7853	  insn = emit_insn (set);
7854	  if (frame_related_p)
7855	    {
7856	      RTX_FRAME_RELATED_P (insn) = 1;
7857	      if (prologue_p)
7858		add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7859	      else
7860		add_reg_note (insn, REG_CFA_RESTORE, reg);
7861	    }
7862
7863	  regno = regno2;
7864	  continue;
7865	}
7866
7867      bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
7868
7869      /* REGNO2 can be saved/restored in a pair with REGNO.  */
7870      rtx reg2 = gen_rtx_REG (mode, regno2);
7871      if (frame_pointer_needed)
7872	offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7873      else
7874	offset2 += crtl->outgoing_args_size;
7875      rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
7876      rtx mem2 = gen_frame_mem (mode, addr2);
7877      rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
7878			     : gen_rtx_SET (reg2, mem2);
7879
7880      if (prologue_p)
7881	insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
7882      else
7883	insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7884
7885      if (frame_related_p || frame_related2_p)
7886	{
7887	  RTX_FRAME_RELATED_P (insn) = 1;
7888	  if (prologue_p)
7889	    {
7890	      if (frame_related_p)
7891		add_reg_note (insn, REG_CFA_OFFSET, set);
7892	      if (frame_related2_p)
7893		add_reg_note (insn, REG_CFA_OFFSET, set2);
7894	    }
7895	  else
7896	    {
7897	      if (frame_related_p)
7898		add_reg_note (insn, REG_CFA_RESTORE, reg);
7899	      if (frame_related2_p)
7900		add_reg_note (insn, REG_CFA_RESTORE, reg2);
7901	    }
7902	}
7903
7904      regno = aarch64_get_next_set_bit (components, regno2 + 1);
7905    }
7906}
7907
7908/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
7909
7910static void
7911aarch64_emit_prologue_components (sbitmap components)
7912{
7913  aarch64_process_components (components, true);
7914}
7915
7916/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
7917
7918static void
7919aarch64_emit_epilogue_components (sbitmap components)
7920{
7921  aarch64_process_components (components, false);
7922}
7923
7924/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
7925
7926static void
7927aarch64_set_handled_components (sbitmap components)
7928{
7929  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7930    if (bitmap_bit_p (components, regno))
7931      cfun->machine->reg_is_wrapped_separately[regno] = true;
7932}
7933
/* On AArch64 we have an ABI-defined safe buffer.  This constant is used to
   determine the probe offset for alloca.  */
7936
7937static HOST_WIDE_INT
7938aarch64_stack_clash_protection_alloca_probe_range (void)
7939{
7940  return STACK_CLASH_CALLER_GUARD;
7941}
7942
7943
/* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
   registers.  If POLY_SIZE is not large enough to require a probe this
   function will only adjust the stack.  When allocating the stack space,
   FRAME_RELATED_P is used to indicate whether the allocation is frame-related.
   FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
   arguments.  If we are, then we ensure that any allocation larger than the
   ABI-defined buffer needs a probe, so that the invariant of having a 1KB
   buffer is maintained.

   We emit barriers after each stack adjustment to prevent optimizations from
   breaking the invariant that we never drop the stack more than a page.  This
   invariant is needed to make it easier to correctly handle asynchronous
   events, e.g. if we were to allow the stack to be dropped by more than a page
   and then set up multiple probes, and a signal arrived somewhere in between,
   the signal handler would not know the state of the stack and could make no
   assumptions about which pages have been probed.  */
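
/* As a worked example (the 64 KiB guard size is illustrative only; the real
   value is 1 << param_stack_clash_protection_guard_size): with a 64 KiB
   guard and the 1 KiB STACK_CLASH_CALLER_GUARD, a non-final adjustment only
   requires explicit probing once it reaches 64 KiB - 1 KiB = 63 KiB, whereas
   a final adjustment for the outgoing arguments must be probed once it
   reaches 1 KiB (or slightly less, once the LR save offset is taken into
   account).  */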
7960
7961static void
7962aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
7963					poly_int64 poly_size,
7964					bool frame_related_p,
7965					bool final_adjustment_p)
7966{
7967  HOST_WIDE_INT guard_size
7968    = 1 << param_stack_clash_protection_guard_size;
7969  HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7970  HOST_WIDE_INT min_probe_threshold
7971    = (final_adjustment_p
7972       ? guard_used_by_caller
7973       : guard_size - guard_used_by_caller);
7974  /* When doing the final adjustment for the outgoing arguments, take into
7975     account any unprobed space there is above the current SP.  There are
7976     two cases:
7977
7978     - When saving SVE registers below the hard frame pointer, we force
7979       the lowest save to take place in the prologue before doing the final
7980       adjustment (i.e. we don't allow the save to be shrink-wrapped).
7981       This acts as a probe at SP, so there is no unprobed space.
7982
7983     - When there are no SVE register saves, we use the store of the link
7984       register as a probe.  We can't assume that LR was saved at position 0
7985       though, so treat any space below it as unprobed.  */
7986  if (final_adjustment_p
7987      && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
7988    {
7989      poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
7990      if (known_ge (lr_offset, 0))
7991	min_probe_threshold -= lr_offset.to_constant ();
7992      else
7993	gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
7994    }
7995
7996  poly_int64 frame_size = cfun->machine->frame.frame_size;
7997
7998  /* We should always have a positive probe threshold.  */
7999  gcc_assert (min_probe_threshold > 0);
8000
8001  if (flag_stack_clash_protection && !final_adjustment_p)
8002    {
8003      poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8004      poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8005      poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8006
8007      if (known_eq (frame_size, 0))
8008	{
8009	  dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
8010	}
8011      else if (known_lt (initial_adjust + sve_callee_adjust,
8012			 guard_size - guard_used_by_caller)
8013	       && known_lt (final_adjust, guard_used_by_caller))
8014	{
8015	  dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
8016	}
8017    }
8018
8019  /* If SIZE is not large enough to require probing, just adjust the stack and
8020     exit.  */
8021  if (known_lt (poly_size, min_probe_threshold)
8022      || !flag_stack_clash_protection)
8023    {
8024      aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
8025      return;
8026    }
8027
8028  HOST_WIDE_INT size;
8029  /* Handle the SVE non-constant case first.  */
8030  if (!poly_size.is_constant (&size))
8031    {
      if (dump_file)
	{
	  fprintf (dump_file, "Stack clash SVE prologue: ");
	  print_dec (poly_size, dump_file);
	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
	}
8038
8039      /* First calculate the amount of bytes we're actually spilling.  */
8040      aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
8041			  poly_size, temp1, temp2, false, true);
8042
8043      rtx_insn *insn = get_last_insn ();
8044
8045      if (frame_related_p)
8046	{
	  /* This is done to provide unwinding information for the stack
	     adjustments we're about to do.  However, to prevent the optimizers
	     from removing the R11 move and leaving the CFA note (which would be
	     very wrong), we tie the old and new stack pointers together.
	     The tie will expand to nothing, but the optimizers will not touch
	     the instruction.  */
8053	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8054	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
8055	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
8056
8057	  /* We want the CFA independent of the stack pointer for the
8058	     duration of the loop.  */
8059	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
8060	  RTX_FRAME_RELATED_P (insn) = 1;
8061	}
8062
8063      rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
8064      rtx guard_const = gen_int_mode (guard_size, Pmode);
8065
8066      insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
8067						   stack_pointer_rtx, temp1,
8068						   probe_const, guard_const));
8069
8070      /* Now reset the CFA register if needed.  */
8071      if (frame_related_p)
8072	{
8073	  add_reg_note (insn, REG_CFA_DEF_CFA,
8074			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
8075				      gen_int_mode (poly_size, Pmode)));
8076	  RTX_FRAME_RELATED_P (insn) = 1;
8077	}
8078
8079      return;
8080    }
8081
8082  if (dump_file)
8083    fprintf (dump_file,
8084	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
8085	     " bytes, probing will be required.\n", size);
8086
8087  /* Round size to the nearest multiple of guard_size, and calculate the
8088     residual as the difference between the original size and the rounded
8089     size.  */
8090  HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
8091  HOST_WIDE_INT residual = size - rounded_size;
8092
8093  /* We can handle a small number of allocations/probes inline.  Otherwise
8094     punt to a loop.  */
8095  if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
8096    {
8097      for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
8098	{
8099	  aarch64_sub_sp (NULL, temp2, guard_size, true);
8100	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
8101					   guard_used_by_caller));
8102	  emit_insn (gen_blockage ());
8103	}
8104      dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
8105    }
8106  else
8107    {
8108      /* Compute the ending address.  */
8109      aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
8110			  temp1, NULL, false, true);
8111      rtx_insn *insn = get_last_insn ();
8112
8113      /* For the initial allocation, we don't have a frame pointer
8114	 set up, so we always need CFI notes.  If we're doing the
8115	 final allocation, then we may have a frame pointer, in which
8116	 case it is the CFA, otherwise we need CFI notes.
8117
8118	 We can determine which allocation we are doing by looking at
8119	 the value of FRAME_RELATED_P since the final allocations are not
8120	 frame related.  */
8121      if (frame_related_p)
8122	{
8123	  /* We want the CFA independent of the stack pointer for the
8124	     duration of the loop.  */
8125	  add_reg_note (insn, REG_CFA_DEF_CFA,
8126			plus_constant (Pmode, temp1, rounded_size));
8127	  RTX_FRAME_RELATED_P (insn) = 1;
8128	}
8129
      /* This allocates and probes the stack.  Note that this re-uses some of
	 the existing Ada stack protection code.  However, we are guaranteed
	 not to enter the non-loop or residual branches of that code.

	 The non-loop part won't be entered because if our allocation amount
	 doesn't require a loop, the case above would handle it.

	 The residual amount won't be entered because TEMP1 is a multiple of
	 the allocation size.  The residual will always be 0.  As such, the
	 only part we are actually using from that code is the loop setup.
	 The actual probing is done in aarch64_output_probe_stack_range.  */
8141      insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
8142					       stack_pointer_rtx, temp1));
8143
8144      /* Now reset the CFA register if needed.  */
8145      if (frame_related_p)
8146	{
8147	  add_reg_note (insn, REG_CFA_DEF_CFA,
8148			plus_constant (Pmode, stack_pointer_rtx, rounded_size));
8149	  RTX_FRAME_RELATED_P (insn) = 1;
8150	}
8151
8152      emit_insn (gen_blockage ());
8153      dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
8154    }
8155
  /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
     be probed.  This maintains the requirement that each page is probed at
     least once.  For initial probing we probe only if the allocation is
     more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
     if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
     GUARD_SIZE.  This means that for any allocation that is large enough to
     trigger a probe here, we'll have at least one, and if the residuals are
     not large enough for this code to emit anything for them, the page would
     have been probed by the saving of FP/LR either by this function or any
     callees.  If we don't have any callees then we won't have more stack
     adjustments and so are still safe.  */
8167  if (residual)
8168    {
8169      HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
8170      /* If we're doing final adjustments, and we've done any full page
8171	 allocations then any residual needs to be probed.  */
8172      if (final_adjustment_p && rounded_size != 0)
8173	min_probe_threshold = 0;
8174      /* If doing a small final adjustment, we always probe at offset 0.
8175	 This is done to avoid issues when LR is not at position 0 or when
8176	 the final adjustment is smaller than the probing offset.  */
8177      else if (final_adjustment_p && rounded_size == 0)
8178	residual_probe_offset = 0;
8179
8180      aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
8181      if (residual >= min_probe_threshold)
8182	{
8183	  if (dump_file)
8184	    fprintf (dump_file,
8185		     "Stack clash AArch64 prologue residuals: "
8186		     HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
8187		     "\n", residual);
8188
	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
					   residual_probe_offset));
8191	  emit_insn (gen_blockage ());
8192	}
8193    }
8194}
8195
8196/* Return 1 if the register is used by the epilogue.  We need to say the
8197   return register is used, but only after epilogue generation is complete.
8198   Note that in the case of sibcalls, the values "used by the epilogue" are
8199   considered live at the start of the called function.
8200
   For SIMD functions we need to return 1 for FP registers that are saved and
   restored by a function but are not zero in call_used_regs.  If we do not do
   this, optimizations may remove the restore of the register.  */
8204
8205int
8206aarch64_epilogue_uses (int regno)
8207{
8208  if (epilogue_completed)
8209    {
8210      if (regno == LR_REGNUM)
8211	return 1;
8212    }
8213  return 0;
8214}
8215
8216/* AArch64 stack frames generated by this compiler look like:
8217
8218	+-------------------------------+
8219	|                               |
8220	|  incoming stack arguments     |
8221	|                               |
8222	+-------------------------------+
8223	|                               | <-- incoming stack pointer (aligned)
8224	|  callee-allocated save area   |
8225	|  for register varargs         |
8226	|                               |
8227	+-------------------------------+
8228	|  local variables              | <-- frame_pointer_rtx
8229	|                               |
8230	+-------------------------------+
8231	|  padding                      | \
8232	+-------------------------------+  |
8233	|  callee-saved registers       |  | frame.saved_regs_size
8234	+-------------------------------+  |
8235	|  LR'                          |  |
8236	+-------------------------------+  |
8237	|  FP'                          |  |
8238	+-------------------------------+  |<- hard_frame_pointer_rtx (aligned)
8239	|  SVE vector registers         |  | \
8240	+-------------------------------+  |  | below_hard_fp_saved_regs_size
8241	|  SVE predicate registers      | /  /
8242	+-------------------------------+
8243	|  dynamic allocation           |
8244	+-------------------------------+
8245	|  padding                      |
8246	+-------------------------------+
8247	|  outgoing stack arguments     | <-- arg_pointer
8248        |                               |
8249	+-------------------------------+
8250	|                               | <-- stack_pointer_rtx (aligned)
8251
8252   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
8253   but leave frame_pointer_rtx and hard_frame_pointer_rtx
8254   unchanged.
8255
8256   By default for stack-clash we assume the guard is at least 64KB, but this
8257   value is configurable to either 4KB or 64KB.  We also force the guard size to
8258   be the same as the probing interval and both values are kept in sync.
8259
8260   With those assumptions the callee can allocate up to 63KB (or 3KB depending
8261   on the guard size) of stack space without probing.
8262
8263   When probing is needed, we emit a probe at the start of the prologue
8264   and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
8265
   We have to track how much space has been allocated; the only stores to
   the stack that we track as implicit probes are the FP/LR stores.

   For outgoing arguments we probe if the size is larger than 1KB, such that
   the ABI-specified buffer is maintained for the next callee.
8271
8272   The following registers are reserved during frame layout and should not be
8273   used for any other purpose:
8274
8275   - r11: Used by stack clash protection when SVE is enabled, and also
8276	  as an anchor register when saving and restoring registers
8277   - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
8278   - r14 and r15: Used for speculation tracking.
8279   - r16(IP0), r17(IP1): Used by indirect tailcalls.
8280   - r30(LR), r29(FP): Used by standard frame layout.
8281
8282   These registers must be avoided in frame layout related code unless the
8283   explicit intention is to interact with one of the features listed above.  */
8284
8285/* Generate the prologue instructions for entry into a function.
8286   Establish the stack frame by decreasing the stack pointer with a
8287   properly calculated size and, if necessary, create a frame record
8288   filled with the values of LR and previous frame pointer.  The
8289   current FP is also set up if it is in use.  */
8290
8291void
8292aarch64_expand_prologue (void)
8293{
8294  poly_int64 frame_size = cfun->machine->frame.frame_size;
8295  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8296  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
8297  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8298  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
8299  poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8300  poly_int64 below_hard_fp_saved_regs_size
8301    = cfun->machine->frame.below_hard_fp_saved_regs_size;
8302  unsigned reg1 = cfun->machine->frame.wb_candidate1;
8303  unsigned reg2 = cfun->machine->frame.wb_candidate2;
8304  bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
8305  rtx_insn *insn;
8306
8307  if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
8308    {
      /* Fold the SVE allocation into the initial allocation.
	 We don't do this in aarch64_layout_frame to avoid pessimizing
	 the epilogue code.  */
8312      initial_adjust += sve_callee_adjust;
8313      sve_callee_adjust = 0;
8314    }
8315
8316  /* Sign return address for functions.  */
8317  if (aarch64_return_address_signing_enabled ())
8318    {
8319      switch (aarch64_ra_sign_key)
8320	{
8321	  case AARCH64_KEY_A:
8322	    insn = emit_insn (gen_paciasp ());
8323	    break;
8324	  case AARCH64_KEY_B:
8325	    insn = emit_insn (gen_pacibsp ());
8326	    break;
8327	  default:
8328	    gcc_unreachable ();
8329	}
8330      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8331      RTX_FRAME_RELATED_P (insn) = 1;
8332    }
8333
8334  if (flag_stack_usage_info)
8335    current_function_static_stack_size = constant_lower_bound (frame_size);
8336
8337  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
8338    {
8339      if (crtl->is_leaf && !cfun->calls_alloca)
8340	{
8341	  if (maybe_gt (frame_size, PROBE_INTERVAL)
8342	      && maybe_gt (frame_size, get_stack_check_protect ()))
8343	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
8344					    (frame_size
8345					     - get_stack_check_protect ()));
8346	}
8347      else if (maybe_gt (frame_size, 0))
8348	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
8349    }
8350
8351  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
8352  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
8353
  /* In theory we should never have both an initial adjustment
     and a callee save adjustment.  Verify that this is the case since the
     code below does not handle it for -fstack-clash-protection.  */
8357  gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
8358
8359  /* Will only probe if the initial adjustment is larger than the guard
8360     less the amount of the guard reserved for use by the caller's
8361     outgoing args.  */
8362  aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
8363					  true, false);
8364
8365  if (callee_adjust != 0)
8366    aarch64_push_regs (reg1, reg2, callee_adjust);
8367
8368  /* The offset of the frame chain record (if any) from the current SP.  */
8369  poly_int64 chain_offset = (initial_adjust + callee_adjust
8370			     - cfun->machine->frame.hard_fp_offset);
8371  gcc_assert (known_ge (chain_offset, 0));
8372
8373  /* The offset of the bottom of the save area from the current SP.  */
8374  poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
8375
8376  if (emit_frame_chain)
8377    {
8378      if (callee_adjust == 0)
8379	{
8380	  reg1 = R29_REGNUM;
8381	  reg2 = R30_REGNUM;
8382	  aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
8383				     false, false);
8384	}
8385      else
8386	gcc_assert (known_eq (chain_offset, 0));
8387      aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
8388			  stack_pointer_rtx, chain_offset,
8389			  tmp1_rtx, tmp0_rtx, frame_pointer_needed);
8390      if (frame_pointer_needed && !frame_size.is_constant ())
8391	{
8392	  /* Variable-sized frames need to describe the save slot
8393	     address using DW_CFA_expression rather than DW_CFA_offset.
8394	     This means that, without taking further action, the
8395	     locations of the registers that we've already saved would
8396	     remain based on the stack pointer even after we redefine
8397	     the CFA based on the frame pointer.  We therefore need new
8398	     DW_CFA_expressions to re-express the save slots with addresses
8399	     based on the frame pointer.  */
8400	  rtx_insn *insn = get_last_insn ();
8401	  gcc_assert (RTX_FRAME_RELATED_P (insn));
8402
8403	  /* Add an explicit CFA definition if this was previously
8404	     implicit.  */
8405	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
8406	    {
8407	      rtx src = plus_constant (Pmode, stack_pointer_rtx,
8408				       callee_offset);
8409	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
8410			    gen_rtx_SET (hard_frame_pointer_rtx, src));
8411	    }
8412
8413	  /* Change the save slot expressions for the registers that
8414	     we've already saved.  */
8415	  aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
8416				      hard_frame_pointer_rtx, UNITS_PER_WORD);
8417	  aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
8418				      hard_frame_pointer_rtx, 0);
8419	}
8420      emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
8421    }
8422
8423  aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
8424			     callee_adjust != 0 || emit_frame_chain,
8425			     emit_frame_chain);
8426  if (maybe_ne (sve_callee_adjust, 0))
8427    {
8428      gcc_assert (!flag_stack_clash_protection
8429		  || known_eq (initial_adjust, 0));
8430      aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
8431					      sve_callee_adjust,
8432					      !frame_pointer_needed, false);
8433      saved_regs_offset += sve_callee_adjust;
8434    }
8435  aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
8436			     false, emit_frame_chain);
8437  aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
8438			     callee_adjust != 0 || emit_frame_chain,
8439			     emit_frame_chain);
8440
  /* We may need to probe the final adjustment if it is larger than the guard
     that is assumed by the callee.  */
8443  aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
8444					  !frame_pointer_needed, true);
8445}
8446
/* Return TRUE if we can use a simple_return insn.

   This function checks whether the callee-saved stack is empty, which
   means no restore actions are needed.  The pro_and_epilogue pass will use
   this to check whether shrink-wrapping is feasible.  */
8452
8453bool
8454aarch64_use_return_insn_p (void)
8455{
8456  if (!reload_completed)
8457    return false;
8458
8459  if (crtl->profile)
8460    return false;
8461
8462  return known_eq (cfun->machine->frame.frame_size, 0);
8463}
8464
8465/* Generate the epilogue instructions for returning from a function.
8466   This is almost exactly the reverse of the prolog sequence, except
8467   that we need to insert barriers to avoid scheduling loads that read
8468   from a deallocated stack, and we optimize the unwind records by
8469   emitting them all together if possible.  */
8470void
8471aarch64_expand_epilogue (bool for_sibcall)
8472{
8473  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8474  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
8475  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8476  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
8477  poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8478  poly_int64 below_hard_fp_saved_regs_size
8479    = cfun->machine->frame.below_hard_fp_saved_regs_size;
8480  unsigned reg1 = cfun->machine->frame.wb_candidate1;
8481  unsigned reg2 = cfun->machine->frame.wb_candidate2;
8482  rtx cfi_ops = NULL;
8483  rtx_insn *insn;
8484  /* A stack clash protection prologue may not have left EP0_REGNUM or
8485     EP1_REGNUM in a usable state.  The same is true for allocations
8486     with an SVE component, since we then need both temporary registers
8487     for each allocation.  For stack clash we are in a usable state if
8488     the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
8489  HOST_WIDE_INT guard_size
8490    = 1 << param_stack_clash_protection_guard_size;
8491  HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
8492
8493  /* We can re-use the registers when:
8494
8495     (a) the deallocation amount is the same as the corresponding
8496	 allocation amount (which is false if we combine the initial
8497	 and SVE callee save allocations in the prologue); and
8498
8499     (b) the allocation amount doesn't need a probe (which is false
8500	 if the amount is guard_size - guard_used_by_caller or greater).
8501
8502     In such situations the register should remain live with the correct
8503     value.  */
8504  bool can_inherit_p = (initial_adjust.is_constant ()
8505			&& final_adjust.is_constant ()
8506			&& (!flag_stack_clash_protection
8507			    || (known_lt (initial_adjust,
8508					  guard_size - guard_used_by_caller)
8509				&& known_eq (sve_callee_adjust, 0))));
8510
8511  /* We need to add memory barrier to prevent read from deallocated stack.  */
8512  bool need_barrier_p
8513    = maybe_ne (get_frame_size ()
8514		+ cfun->machine->frame.saved_varargs_size, 0);
8515
8516  /* Emit a barrier to prevent loads from a deallocated stack.  */
8517  if (maybe_gt (final_adjust, crtl->outgoing_args_size)
8518      || cfun->calls_alloca
8519      || crtl->calls_eh_return)
8520    {
8521      emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8522      need_barrier_p = false;
8523    }
8524
8525  /* Restore the stack pointer from the frame pointer if it may not
8526     be the same as the stack pointer.  */
8527  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
8528  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
8529  if (frame_pointer_needed
8530      && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
8531    /* If writeback is used when restoring callee-saves, the CFA
8532       is restored on the instruction doing the writeback.  */
8533    aarch64_add_offset (Pmode, stack_pointer_rtx,
8534			hard_frame_pointer_rtx,
8535			-callee_offset - below_hard_fp_saved_regs_size,
8536			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
8537  else
8538     /* The case where we need to re-use the register here is very rare, so
8539	avoid the complicated condition and just always emit a move if the
8540	immediate doesn't fit.  */
8541     aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
8542
8543  /* Restore the vector registers before the predicate registers,
8544     so that we can use P4 as a temporary for big-endian SVE frames.  */
8545  aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
8546				callee_adjust != 0, &cfi_ops);
8547  aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
8548				false, &cfi_ops);
8549  if (maybe_ne (sve_callee_adjust, 0))
8550    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
8551  aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
8552				R0_REGNUM, R30_REGNUM,
8553				callee_adjust != 0, &cfi_ops);
8554
8555  if (need_barrier_p)
8556    emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8557
8558  if (callee_adjust != 0)
8559    aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
8560
8561  if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
8562    {
8563      /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
8564      insn = get_last_insn ();
8565      rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
8566      REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
8567      RTX_FRAME_RELATED_P (insn) = 1;
8568      cfi_ops = NULL;
8569    }
8570
  /* Liveness of EP0_REGNUM cannot be trusted across function calls, so
     restrict the emit_move optimization to leaf functions.  */
8573  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
8574		  (!can_inherit_p || !crtl->is_leaf
8575		   || df_regs_ever_live_p (EP0_REGNUM)));
8576
8577  if (cfi_ops)
8578    {
8579      /* Emit delayed restores and reset the CFA to be SP.  */
8580      insn = get_last_insn ();
8581      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
8582      REG_NOTES (insn) = cfi_ops;
8583      RTX_FRAME_RELATED_P (insn) = 1;
8584    }
8585
  /* We prefer to emit the combined return/authenticate instruction RETAA;
     however, there are three cases in which we must instead emit an explicit
     authentication instruction.
8589
8590	1) Sibcalls don't return in a normal way, so if we're about to call one
8591	   we must authenticate.
8592
8593	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
8594	   generating code for !TARGET_ARMV8_3 we can't use it and must
8595	   explicitly authenticate.
8596
8597	3) On an eh_return path we make extra stack adjustments to update the
8598	   canonical frame address to be the exception handler's CFA.  We want
8599	   to authenticate using the CFA of the function which calls eh_return.
8600    */
8601  if (aarch64_return_address_signing_enabled ()
8602      && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
8603    {
8604      switch (aarch64_ra_sign_key)
8605	{
8606	  case AARCH64_KEY_A:
8607	    insn = emit_insn (gen_autiasp ());
8608	    break;
8609	  case AARCH64_KEY_B:
8610	    insn = emit_insn (gen_autibsp ());
8611	    break;
8612	  default:
8613	    gcc_unreachable ();
8614	}
8615      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8616      RTX_FRAME_RELATED_P (insn) = 1;
8617    }
8618
8619  /* Stack adjustment for exception handler.  */
8620  if (crtl->calls_eh_return && !for_sibcall)
8621    {
8622      /* We need to unwind the stack by the offset computed by
8623	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
8624	 to be SP; letting the CFA move during this adjustment
8625	 is just as correct as retaining the CFA from the body
8626	 of the function.  Therefore, do nothing special.  */
8627      emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
8628    }
8629
8630  emit_use (gen_rtx_REG (DImode, LR_REGNUM));
8631  if (!for_sibcall)
8632    emit_jump_insn (ret_rtx);
8633}
8634
8635/* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
8636   normally or return to a previous frame after unwinding.
8637
8638   An EH return uses a single shared return sequence.  The epilogue is
8639   exactly like a normal epilogue except that it has an extra input
8640   register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
8641   that must be applied after the frame has been destroyed.  An extra label
8642   is inserted before the epilogue which initializes this register to zero,
8643   and this is the entry point for a normal return.
8644
8645   An actual EH return updates the return address, initializes the stack
8646   adjustment and jumps directly into the epilogue (bypassing the zeroing
8647   of the adjustment).  Since the return address is typically saved on the
8648   stack when a function makes a call, the saved LR must be updated outside
8649   the epilogue.
8650
   This poses problems as the store is generated well before the epilogue,
   so the offset of LR is not known yet.  Also, optimizations will remove the
   store as it appears dead, even after the epilogue is generated (as the
   base or offset for loading LR is different in many cases).
8655
8656   To avoid these problems this implementation forces the frame pointer
8657   in eh_return functions so that the location of LR is fixed and known early.
8658   It also marks the store volatile, so no optimization is permitted to
8659   remove the store.  */
8660rtx
8661aarch64_eh_return_handler_rtx (void)
8662{
8663  rtx tmp = gen_frame_mem (Pmode,
8664    plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
8665
8666  /* Mark the store volatile, so no optimization is permitted to remove it.  */
8667  MEM_VOLATILE_P (tmp) = true;
8668  return tmp;
8669}
8670
8671/* Output code to add DELTA to the first argument, and then jump
8672   to FUNCTION.  Used for C++ multiple inheritance.  */
8673static void
8674aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
8675			 HOST_WIDE_INT delta,
8676			 HOST_WIDE_INT vcall_offset,
8677			 tree function)
8678{
  /* The this pointer is always in x0.  Note that this differs from
     Arm where the this pointer may be bumped to r1 if r0 is required
     to return a pointer to an aggregate.  On AArch64 a result value
     pointer will be in x8.  */
8683  int this_regno = R0_REGNUM;
8684  rtx this_rtx, temp0, temp1, addr, funexp;
8685  rtx_insn *insn;
8686  const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
8687
8688  if (aarch64_bti_enabled ())
    emit_insn (gen_bti_c ());
8690
8691  reload_completed = 1;
8692  emit_note (NOTE_INSN_PROLOGUE_END);
8693
8694  this_rtx = gen_rtx_REG (Pmode, this_regno);
8695  temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
8696  temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
8697
8698  if (vcall_offset == 0)
8699    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
8700  else
8701    {
8702      gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
8703
8704      addr = this_rtx;
8705      if (delta != 0)
8706	{
8707	  if (delta >= -256 && delta < 256)
8708	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
8709				       plus_constant (Pmode, this_rtx, delta));
8710	  else
8711	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
8712				temp1, temp0, false);
8713	}
8714
8715      if (Pmode == ptr_mode)
8716	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
8717      else
8718	aarch64_emit_move (temp0,
8719			   gen_rtx_ZERO_EXTEND (Pmode,
8720						gen_rtx_MEM (ptr_mode, addr)));
8721
8722      if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
	addr = plus_constant (Pmode, temp0, vcall_offset);
8724      else
8725	{
8726	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
8727					  Pmode);
8728	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
8729	}
8730
8731      if (Pmode == ptr_mode)
	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
8733      else
8734	aarch64_emit_move (temp1,
8735			   gen_rtx_SIGN_EXTEND (Pmode,
8736						gen_rtx_MEM (ptr_mode, addr)));
8737
8738      emit_insn (gen_add2_insn (this_rtx, temp1));
8739    }
8740
8741  /* Generate a tail call to the target function.  */
8742  if (!TREE_USED (function))
8743    {
8744      assemble_external (function);
8745      TREE_USED (function) = 1;
8746    }
8747  funexp = XEXP (DECL_RTL (function), 0);
8748  funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
8749  rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
8750  insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
8751  SIBLING_CALL_P (insn) = 1;
8752
8753  insn = get_insns ();
8754  shorten_branches (insn);
8755
8756  assemble_start_function (thunk, fnname);
8757  final_start_function (insn, file, 1);
8758  final (insn, file, 1);
8759  final_end_function ();
8760  assemble_end_function (thunk, fnname);
8761
8762  /* Stop pretending to be a post-reload pass.  */
8763  reload_completed = 0;
8764}
8765
8766static bool
8767aarch64_tls_referenced_p (rtx x)
8768{
8769  if (!TARGET_HAVE_TLS)
8770    return false;
8771  subrtx_iterator::array_type array;
8772  FOR_EACH_SUBRTX (iter, array, x, ALL)
8773    {
8774      const_rtx x = *iter;
8775      if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
8776	return true;
8777      /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
8778	 TLS offsets, not real symbol references.  */
8779      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8780	iter.skip_subrtxes ();
8781    }
8782  return false;
8783}
8784
8785
8786/* Return true if val can be encoded as a 12-bit unsigned immediate with
8787   a left shift of 0 or 12 bits.  */
8788bool
8789aarch64_uimm12_shift (HOST_WIDE_INT val)
8790{
8791  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
8792	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
8793	  );
8794}
8795
/* Return the largest value that does not exceed VAL and can be encoded as a
   12-bit unsigned immediate with a left shift of 0 or 12 bits.  */
8798static HOST_WIDE_INT
8799aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
8800{
8801  /* Check to see if the value fits in 24 bits, as that is the maximum we can
8802     handle correctly.  */
8803  gcc_assert ((val & 0xffffff) == val);
8804
8805  if (((val & 0xfff) << 0) == val)
8806    return val;
8807
8808  return val & (0xfff << 12);
8809}
8810
8811/* Return true if val is an immediate that can be loaded into a
8812   register by a MOVZ instruction.  */
8813static bool
8814aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
8815{
8816  if (GET_MODE_SIZE (mode) > 4)
8817    {
8818      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
8819	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
	return true;
8821    }
8822  else
8823    {
8824      /* Ignore sign extension.  */
8825      val &= (HOST_WIDE_INT) 0xffffffff;
8826    }
8827  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
8828	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
8829}
8830
8831/* Test whether:
8832
8833     X = (X & AND_VAL) | IOR_VAL;
8834
8835   can be implemented using:
8836
8837     MOVK X, #(IOR_VAL >> shift), LSL #shift
8838
8839   Return the shift if so, otherwise return -1.  */
8840int
8841aarch64_movk_shift (const wide_int_ref &and_val,
8842		    const wide_int_ref &ior_val)
8843{
8844  unsigned int precision = and_val.get_precision ();
8845  unsigned HOST_WIDE_INT mask = 0xffff;
8846  for (unsigned int shift = 0; shift < precision; shift += 16)
8847    {
8848      if (and_val == ~mask && (ior_val & mask) == ior_val)
8849	return shift;
8850      mask <<= 16;
8851    }
8852  return -1;
8853}
8854
8855/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
8856   64-bit (DImode) integer.  */
8857
8858static unsigned HOST_WIDE_INT
8859aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
8860{
8861  unsigned int size = GET_MODE_UNIT_PRECISION (mode);
8862  while (size < 64)
8863    {
8864      val &= (HOST_WIDE_INT_1U << size) - 1;
8865      val |= val << size;
8866      size *= 2;
8867    }
8868  return val;
8869}
8870
8871/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
8872
8873static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
8874  {
8875    0x0000000100000001ull,
8876    0x0001000100010001ull,
8877    0x0101010101010101ull,
8878    0x1111111111111111ull,
8879    0x5555555555555555ull,
8880  };
8881
8882
8883/* Return true if val is a valid bitmask immediate.  */
8884
8885bool
8886aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
8887{
8888  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
8889  int bits;
8890
  /* Check for a single sequence of one bits and return quickly if so.
     The special cases of all ones and all zeroes return false.  */
8893  val = aarch64_replicate_bitmask_imm (val_in, mode);
8894  tmp = val + (val & -val);
8895
8896  if (tmp == (tmp & -tmp))
8897    return (val + 1) > 1;
8898
8899  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
8900  if (mode == SImode)
8901    val = (val << 32) | (val & 0xffffffff);
8902
8903  /* Invert if the immediate doesn't start with a zero bit - this means we
8904     only need to search for sequences of one bits.  */
8905  if (val & 1)
8906    val = ~val;
8907
8908  /* Find the first set bit and set tmp to val with the first sequence of one
8909     bits removed.  Return success if there is a single sequence of ones.  */
8910  first_one = val & -val;
8911  tmp = val & (val + first_one);
8912
8913  if (tmp == 0)
8914    return true;
8915
8916  /* Find the next set bit and compute the difference in bit position.  */
8917  next_one = tmp & -tmp;
8918  bits = clz_hwi (first_one) - clz_hwi (next_one);
8919  mask = val ^ tmp;
8920
8921  /* Check the bit position difference is a power of 2, and that the first
8922     sequence of one bits fits within 'bits' bits.  */
8923  if ((mask >> bits) != 0 || bits != (bits & -bits))
8924    return false;
8925
8926  /* Check the sequence of one bits is repeated 64/bits times.  */
8927  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
8928}
8929
/* Create a mask of ones covering the lowest to highest bits set in VAL_IN.
   Assumed precondition: VAL_IN is not zero.  */
8932
8933unsigned HOST_WIDE_INT
8934aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
8935{
8936  int lowest_bit_set = ctz_hwi (val_in);
8937  int highest_bit_set = floor_log2 (val_in);
8938  gcc_assert (val_in != 0);
8939
8940  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
8941	  (HOST_WIDE_INT_1U << lowest_bit_set));
8942}
8943
8944/* Create constant where bits outside of lowest bit set to highest bit set
8945   are set to 1.  */
8946
8947unsigned HOST_WIDE_INT
8948aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
8949{
8950  return val_in | ~aarch64_and_split_imm1 (val_in);
8951}
8952
8953/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
8954
8955bool
8956aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
8957{
8958  scalar_int_mode int_mode;
8959  if (!is_a <scalar_int_mode> (mode, &int_mode))
8960    return false;
8961
8962  if (aarch64_bitmask_imm (val_in, int_mode))
8963    return false;
8964
8965  if (aarch64_move_imm (val_in, int_mode))
8966    return false;
8967
8968  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
8969
8970  return aarch64_bitmask_imm (imm2, int_mode);
8971}
8972
8973/* Return true if val is an immediate that can be loaded into a
8974   register in a single instruction.  */
8975bool
8976aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
8977{
8978  scalar_int_mode int_mode;
8979  if (!is_a <scalar_int_mode> (mode, &int_mode))
8980    return false;
8981
8982  if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
    return true;
8984  return aarch64_bitmask_imm (val, int_mode);
8985}
8986
8987static bool
8988aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
8989{
8990  if (GET_CODE (x) == HIGH)
8991    return true;
8992
8993  /* There's no way to calculate VL-based values using relocations.  */
8994  subrtx_iterator::array_type array;
8995  FOR_EACH_SUBRTX (iter, array, x, ALL)
8996    if (GET_CODE (*iter) == CONST_POLY_INT)
8997      return true;
8998
8999  poly_int64 offset;
9000  rtx base = strip_offset_and_salt (x, &offset);
9001  if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
9002    {
9003      /* We checked for POLY_INT_CST offsets above.  */
9004      if (aarch64_classify_symbol (base, offset.to_constant ())
9005	  != SYMBOL_FORCE_TO_MEM)
9006	return true;
9007      else
9008	/* Avoid generating a 64-bit relocation in ILP32; leave
9009	   to aarch64_expand_mov_immediate to handle it properly.  */
9010	return mode != ptr_mode;
9011    }
9012
9013  return aarch64_tls_referenced_p (x);
9014}
9015
/* Implement TARGET_CASE_VALUES_THRESHOLD.
   The expansion for a table switch is quite expensive due to the number
   of instructions, the table lookup and the hard-to-predict indirect jump.
   When optimizing for speed, and -O3 enabled, use the per-core tuning if
   set, otherwise use tables for > 16 cases as a tradeoff between size and
   performance.  When optimizing for size, use the default setting.  */
9022
9023static unsigned int
9024aarch64_case_values_threshold (void)
9025{
9026  /* Use the specified limit for the number of cases before using jump
9027     tables at higher optimization levels.  */
9028  if (optimize > 2
9029      && selected_cpu->tune->max_case_values != 0)
9030    return selected_cpu->tune->max_case_values;
9031  else
9032    return optimize_size ? default_case_values_threshold () : 17;
9033}
9034
9035/* Return true if register REGNO is a valid index register.
9036   STRICT_P is true if REG_OK_STRICT is in effect.  */
9037
9038bool
9039aarch64_regno_ok_for_index_p (int regno, bool strict_p)
9040{
9041  if (!HARD_REGISTER_NUM_P (regno))
9042    {
9043      if (!strict_p)
9044	return true;
9045
9046      if (!reg_renumber)
9047	return false;
9048
9049      regno = reg_renumber[regno];
9050    }
9051  return GP_REGNUM_P (regno);
9052}
9053
/* Return true if register REGNO is a valid base register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */
9056
9057bool
9058aarch64_regno_ok_for_base_p (int regno, bool strict_p)
9059{
9060  if (!HARD_REGISTER_NUM_P (regno))
9061    {
9062      if (!strict_p)
9063	return true;
9064
9065      if (!reg_renumber)
9066	return false;
9067
9068      regno = reg_renumber[regno];
9069    }
9070
9071  /* The fake registers will be eliminated to either the stack or
9072     hard frame pointer, both of which are usually valid base registers.
9073     Reload deals with the cases where the eliminated form isn't valid.  */
9074  return (GP_REGNUM_P (regno)
9075	  || regno == SP_REGNUM
9076	  || regno == FRAME_POINTER_REGNUM
9077	  || regno == ARG_POINTER_REGNUM);
9078}
9079
/* Return true if X is a valid base register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */
9082
9083static bool
9084aarch64_base_register_rtx_p (rtx x, bool strict_p)
9085{
9086  if (!strict_p
9087      && GET_CODE (x) == SUBREG
9088      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
9089    x = SUBREG_REG (x);
9090
9091  return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
9092}
9093
9094/* Return true if address offset is a valid index.  If it is, fill in INFO
9095   appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
9096
9097static bool
9098aarch64_classify_index (struct aarch64_address_info *info, rtx x,
9099			machine_mode mode, bool strict_p)
9100{
9101  enum aarch64_address_type type;
9102  rtx index;
9103  int shift;
9104
9105  /* (reg:P) */
9106  if ((REG_P (x) || GET_CODE (x) == SUBREG)
9107      && GET_MODE (x) == Pmode)
9108    {
9109      type = ADDRESS_REG_REG;
9110      index = x;
9111      shift = 0;
9112    }
9113  /* (sign_extend:DI (reg:SI)) */
9114  else if ((GET_CODE (x) == SIGN_EXTEND
9115	    || GET_CODE (x) == ZERO_EXTEND)
9116	   && GET_MODE (x) == DImode
9117	   && GET_MODE (XEXP (x, 0)) == SImode)
9118    {
9119      type = (GET_CODE (x) == SIGN_EXTEND)
9120	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
9121      index = XEXP (x, 0);
9122      shift = 0;
9123    }
9124  /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
9125  else if (GET_CODE (x) == MULT
9126	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
9127	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
9128	   && GET_MODE (XEXP (x, 0)) == DImode
9129	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
9130	   && CONST_INT_P (XEXP (x, 1)))
9131    {
9132      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
9133	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
9134      index = XEXP (XEXP (x, 0), 0);
9135      shift = exact_log2 (INTVAL (XEXP (x, 1)));
9136    }
9137  /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
9138  else if (GET_CODE (x) == ASHIFT
9139	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
9140	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
9141	   && GET_MODE (XEXP (x, 0)) == DImode
9142	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
9143	   && CONST_INT_P (XEXP (x, 1)))
9144    {
9145      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
9146	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
9147      index = XEXP (XEXP (x, 0), 0);
9148      shift = INTVAL (XEXP (x, 1));
9149    }
9150  /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
9151  else if ((GET_CODE (x) == SIGN_EXTRACT
9152	    || GET_CODE (x) == ZERO_EXTRACT)
9153	   && GET_MODE (x) == DImode
9154	   && GET_CODE (XEXP (x, 0)) == MULT
9155	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
9156	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
9157    {
9158      type = (GET_CODE (x) == SIGN_EXTRACT)
9159	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
9160      index = XEXP (XEXP (x, 0), 0);
9161      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
9162      if (INTVAL (XEXP (x, 1)) != 32 + shift
9163	  || INTVAL (XEXP (x, 2)) != 0)
9164	shift = -1;
9165    }
9166  /* (and:DI (mult:DI (reg:DI) (const_int scale))
9167     (const_int 0xffffffff<<shift)) */
9168  else if (GET_CODE (x) == AND
9169	   && GET_MODE (x) == DImode
9170	   && GET_CODE (XEXP (x, 0)) == MULT
9171	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
9172	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9173	   && CONST_INT_P (XEXP (x, 1)))
9174    {
9175      type = ADDRESS_REG_UXTW;
9176      index = XEXP (XEXP (x, 0), 0);
9177      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
9178      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
9179	shift = -1;
9180    }
9181  /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
9182  else if ((GET_CODE (x) == SIGN_EXTRACT
9183	    || GET_CODE (x) == ZERO_EXTRACT)
9184	   && GET_MODE (x) == DImode
9185	   && GET_CODE (XEXP (x, 0)) == ASHIFT
9186	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
9187	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
9188    {
9189      type = (GET_CODE (x) == SIGN_EXTRACT)
9190	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
9191      index = XEXP (XEXP (x, 0), 0);
9192      shift = INTVAL (XEXP (XEXP (x, 0), 1));
9193      if (INTVAL (XEXP (x, 1)) != 32 + shift
9194	  || INTVAL (XEXP (x, 2)) != 0)
9195	shift = -1;
9196    }
9197  /* (and:DI (ashift:DI (reg:DI) (const_int shift))
9198     (const_int 0xffffffff<<shift)) */
9199  else if (GET_CODE (x) == AND
9200	   && GET_MODE (x) == DImode
9201	   && GET_CODE (XEXP (x, 0)) == ASHIFT
9202	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
9203	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9204	   && CONST_INT_P (XEXP (x, 1)))
9205    {
9206      type = ADDRESS_REG_UXTW;
9207      index = XEXP (XEXP (x, 0), 0);
9208      shift = INTVAL (XEXP (XEXP (x, 0), 1));
9209      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
9210	shift = -1;
9211    }
9212  /* (mult:P (reg:P) (const_int scale)) */
9213  else if (GET_CODE (x) == MULT
9214	   && GET_MODE (x) == Pmode
9215	   && GET_MODE (XEXP (x, 0)) == Pmode
9216	   && CONST_INT_P (XEXP (x, 1)))
9217    {
9218      type = ADDRESS_REG_REG;
9219      index = XEXP (x, 0);
9220      shift = exact_log2 (INTVAL (XEXP (x, 1)));
9221    }
9222  /* (ashift:P (reg:P) (const_int shift)) */
9223  else if (GET_CODE (x) == ASHIFT
9224	   && GET_MODE (x) == Pmode
9225	   && GET_MODE (XEXP (x, 0)) == Pmode
9226	   && CONST_INT_P (XEXP (x, 1)))
9227    {
9228      type = ADDRESS_REG_REG;
9229      index = XEXP (x, 0);
9230      shift = INTVAL (XEXP (x, 1));
9231    }
9232  else
9233    return false;
9234
9235  if (!strict_p
9236      && GET_CODE (index) == SUBREG
9237      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
9238    index = SUBREG_REG (index);
9239
9240  if (aarch64_sve_data_mode_p (mode))
9241    {
9242      if (type != ADDRESS_REG_REG
9243	  || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
9244	return false;
9245    }
9246  else
9247    {
9248      if (shift != 0
9249	  && !(IN_RANGE (shift, 1, 3)
9250	       && known_eq (1 << shift, GET_MODE_SIZE (mode))))
9251	return false;
9252    }
9253
9254  if (REG_P (index)
9255      && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
9256    {
9257      info->type = type;
9258      info->offset = index;
9259      info->shift = shift;
9260      return true;
9261    }
9262
9263  return false;
9264}
9265
9266/* Return true if MODE is one of the modes for which we
9267   support LDP/STP operations.  */
9268
9269static bool
9270aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
9271{
9272  return mode == SImode || mode == DImode
9273	 || mode == SFmode || mode == DFmode
9274	 || (aarch64_vector_mode_supported_p (mode)
9275	     && (known_eq (GET_MODE_SIZE (mode), 8)
9276		 || (known_eq (GET_MODE_SIZE (mode), 16)
9277		    && (aarch64_tune_params.extra_tuning_flags
9278			& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
9279}
9280
9281/* Return true if REGNO is a virtual pointer register, or an eliminable
9282   "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
9283   include stack_pointer or hard_frame_pointer.  */
9284static bool
9285virt_or_elim_regno_p (unsigned regno)
9286{
9287  return ((regno >= FIRST_VIRTUAL_REGISTER
9288	   && regno <= LAST_VIRTUAL_POINTER_REGISTER)
9289	  || regno == FRAME_POINTER_REGNUM
9290	  || regno == ARG_POINTER_REGNUM);
9291}
9292
9293/* Return true if X is a valid address of type TYPE for machine mode MODE.
9294   If it is, fill in INFO appropriately.  STRICT_P is true if
9295   REG_OK_STRICT is in effect.  */
9296
9297bool
9298aarch64_classify_address (struct aarch64_address_info *info,
9299			  rtx x, machine_mode mode, bool strict_p,
9300			  aarch64_addr_query_type type)
9301{
9302  enum rtx_code code = GET_CODE (x);
9303  rtx op0, op1;
9304  poly_int64 offset;
9305
9306  HOST_WIDE_INT const_size;
9307
9308  /* Whether a vector mode is partial doesn't affect address legitimacy.
9309     Partial vectors like VNx8QImode allow the same indexed addressing
9310     mode and MUL VL addressing mode as full vectors like VNx16QImode;
9311     in both cases, MUL VL counts multiples of GET_MODE_SIZE.  */
9312  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
9313  vec_flags &= ~VEC_PARTIAL;
9314
9315  /* On BE, we use load/store pair for all large int mode load/stores.
9316     TI/TFmode may also use a load/store pair.  */
9317  bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
9318  bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
9319			    || type == ADDR_QUERY_LDP_STP_N
9320			    || mode == TImode
9321			    || mode == TFmode
9322			    || (BYTES_BIG_ENDIAN && advsimd_struct_p));
9323
  /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
     corresponds to the actual size of the memory being loaded/stored and the
     mode used for the addressing-mode check is half of that size.  */
9327  if (type == ADDR_QUERY_LDP_STP_N
9328      && known_eq (GET_MODE_SIZE (mode), 16))
9329    mode = DFmode;
9330
9331  bool allow_reg_index_p = (!load_store_pair_p
9332			    && (known_lt (GET_MODE_SIZE (mode), 16)
9333				|| vec_flags == VEC_ADVSIMD
9334				|| vec_flags & VEC_SVE_DATA));
9335
9336  /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
9337     [Rn, #offset, MUL VL].  */
9338  if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
9339      && (code != REG && code != PLUS))
9340    return false;
9341
9342  /* On LE, for AdvSIMD, don't support anything other than POST_INC or
9343     REG addressing.  */
9344  if (advsimd_struct_p
9345      && !BYTES_BIG_ENDIAN
9346      && (code != POST_INC && code != REG))
9347    return false;
9348
9349  gcc_checking_assert (GET_MODE (x) == VOIDmode
9350		       || SCALAR_INT_MODE_P (GET_MODE (x)));
9351
9352  switch (code)
9353    {
9354    case REG:
9355    case SUBREG:
9356      info->type = ADDRESS_REG_IMM;
9357      info->base = x;
9358      info->offset = const0_rtx;
9359      info->const_offset = 0;
9360      return aarch64_base_register_rtx_p (x, strict_p);
9361
9362    case PLUS:
9363      op0 = XEXP (x, 0);
9364      op1 = XEXP (x, 1);
9365
9366      if (! strict_p
9367	  && REG_P (op0)
9368	  && virt_or_elim_regno_p (REGNO (op0))
9369	  && poly_int_rtx_p (op1, &offset))
9370	{
9371	  info->type = ADDRESS_REG_IMM;
9372	  info->base = op0;
9373	  info->offset = op1;
9374	  info->const_offset = offset;
9375
9376	  return true;
9377	}
9378
9379      if (maybe_ne (GET_MODE_SIZE (mode), 0)
9380	  && aarch64_base_register_rtx_p (op0, strict_p)
9381	  && poly_int_rtx_p (op1, &offset))
9382	{
9383	  info->type = ADDRESS_REG_IMM;
9384	  info->base = op0;
9385	  info->offset = op1;
9386	  info->const_offset = offset;
9387
9388	  /* TImode and TFmode values are allowed in both pairs of X
9389	     registers and individual Q registers.  The available
9390	     address modes are:
9391	     X,X: 7-bit signed scaled offset
9392	     Q:   9-bit signed offset
9393	     We conservatively require an offset representable in either mode.
9394	     When performing the check for pairs of X registers i.e.  LDP/STP
9395	     pass down DImode since that is the natural size of the LDP/STP
9396	     instruction memory accesses.  */
9397	  if (mode == TImode || mode == TFmode)
9398	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
9399		    && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9400			|| offset_12bit_unsigned_scaled_p (mode, offset)));
9401
	  /* A 7-bit offset check because OImode will emit an ldp/stp
	     instruction (only big endian will get here).
	     For ldp/stp instructions, the offset is scaled for the size of a
	     single element of the pair.  */
9406	  if (mode == OImode)
9407	    return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
9408
	  /* Three 9/12-bit offset checks because CImode will emit three
	     ldr/str instructions (only big endian will get here).  */
9411	  if (mode == CImode)
9412	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9413		    && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
9414							       offset + 32)
9415			|| offset_12bit_unsigned_scaled_p (V16QImode,
9416							   offset + 32)));
9417
	  /* Two 7-bit offset checks because XImode will emit two ldp/stp
	     instructions (only big endian will get here).  */
9420	  if (mode == XImode)
9421	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9422		    && aarch64_offset_7bit_signed_scaled_p (TImode,
9423							    offset + 32));
9424
9425	  /* Make "m" use the LD1 offset range for SVE data modes, so
9426	     that pre-RTL optimizers like ivopts will work to that
9427	     instead of the wider LDR/STR range.  */
9428	  if (vec_flags == VEC_SVE_DATA)
9429	    return (type == ADDR_QUERY_M
9430		    ? offset_4bit_signed_scaled_p (mode, offset)
9431		    : offset_9bit_signed_scaled_p (mode, offset));
9432
9433	  if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
9434	    {
9435	      poly_int64 end_offset = (offset
9436				       + GET_MODE_SIZE (mode)
9437				       - BYTES_PER_SVE_VECTOR);
9438	      return (type == ADDR_QUERY_M
9439		      ? offset_4bit_signed_scaled_p (mode, offset)
9440		      : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
9441			 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
9442							 end_offset)));
9443	    }
9444
9445	  if (vec_flags == VEC_SVE_PRED)
9446	    return offset_9bit_signed_scaled_p (mode, offset);
9447
9448	  if (load_store_pair_p)
9449	    return ((known_eq (GET_MODE_SIZE (mode), 4)
9450		     || known_eq (GET_MODE_SIZE (mode), 8)
9451		     || known_eq (GET_MODE_SIZE (mode), 16))
9452		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9453	  else
9454	    return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9455		    || offset_12bit_unsigned_scaled_p (mode, offset));
9456	}
9457
9458      if (allow_reg_index_p)
9459	{
9460	  /* Look for base + (scaled/extended) index register.  */
9461	  if (aarch64_base_register_rtx_p (op0, strict_p)
9462	      && aarch64_classify_index (info, op1, mode, strict_p))
9463	    {
9464	      info->base = op0;
9465	      return true;
9466	    }
9467	  if (aarch64_base_register_rtx_p (op1, strict_p)
9468	      && aarch64_classify_index (info, op0, mode, strict_p))
9469	    {
9470	      info->base = op1;
9471	      return true;
9472	    }
9473	}
9474
9475      return false;
9476
9477    case POST_INC:
9478    case POST_DEC:
9479    case PRE_INC:
9480    case PRE_DEC:
9481      info->type = ADDRESS_REG_WB;
9482      info->base = XEXP (x, 0);
9483      info->offset = NULL_RTX;
9484      return aarch64_base_register_rtx_p (info->base, strict_p);
9485
9486    case POST_MODIFY:
9487    case PRE_MODIFY:
9488      info->type = ADDRESS_REG_WB;
9489      info->base = XEXP (x, 0);
9490      if (GET_CODE (XEXP (x, 1)) == PLUS
9491	  && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
9492	  && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
9493	  && aarch64_base_register_rtx_p (info->base, strict_p))
9494	{
9495	  info->offset = XEXP (XEXP (x, 1), 1);
9496	  info->const_offset = offset;
9497
9498	  /* TImode and TFmode values are allowed in both pairs of X
9499	     registers and individual Q registers.  The available
9500	     address modes are:
9501	     X,X: 7-bit signed scaled offset
9502	     Q:   9-bit signed offset
9503	     We conservatively require an offset representable in either mode.
9504	   */
9505	  if (mode == TImode || mode == TFmode)
9506	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
9507		    && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
9508
9509	  if (load_store_pair_p)
9510	    return ((known_eq (GET_MODE_SIZE (mode), 4)
9511		     || known_eq (GET_MODE_SIZE (mode), 8)
9512		     || known_eq (GET_MODE_SIZE (mode), 16))
9513		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9514	  else
9515	    return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
9516	}
9517      return false;
9518
9519    case CONST:
9520    case SYMBOL_REF:
9521    case LABEL_REF:
      /* Load literal: PC-relative constant pool entry.  Only supported
	 for SImode or larger.  */
9524      info->type = ADDRESS_SYMBOLIC;
9525
9526      if (!load_store_pair_p
9527	  && GET_MODE_SIZE (mode).is_constant (&const_size)
9528	  && const_size >= 4)
9529	{
9530	  poly_int64 offset;
9531	  rtx sym = strip_offset_and_salt (x, &offset);
9532	  return ((GET_CODE (sym) == LABEL_REF
9533		   || (GET_CODE (sym) == SYMBOL_REF
9534		       && CONSTANT_POOL_ADDRESS_P (sym)
9535		       && aarch64_pcrelative_literal_loads)));
9536	}
9537      return false;
9538
9539    case LO_SUM:
9540      info->type = ADDRESS_LO_SUM;
9541      info->base = XEXP (x, 0);
9542      info->offset = XEXP (x, 1);
9543      if (allow_reg_index_p
9544	  && aarch64_base_register_rtx_p (info->base, strict_p))
9545	{
9546	  poly_int64 offset;
9547	  HOST_WIDE_INT const_offset;
9548	  rtx sym = strip_offset_and_salt (info->offset, &offset);
9549	  if (GET_CODE (sym) == SYMBOL_REF
9550	      && offset.is_constant (&const_offset)
9551	      && (aarch64_classify_symbol (sym, const_offset)
9552		  == SYMBOL_SMALL_ABSOLUTE))
9553	    {
9554	      /* The symbol and offset must be aligned to the access size.  */
9555	      unsigned int align;
9556
9557	      if (CONSTANT_POOL_ADDRESS_P (sym))
9558		align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
9559	      else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
9560		{
9561		  tree exp = SYMBOL_REF_DECL (sym);
9562		  align = TYPE_ALIGN (TREE_TYPE (exp));
9563		  align = aarch64_constant_alignment (exp, align);
9564		}
9565	      else if (SYMBOL_REF_DECL (sym))
9566		align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
9567	      else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
9568		       && SYMBOL_REF_BLOCK (sym) != NULL)
9569		align = SYMBOL_REF_BLOCK (sym)->alignment;
9570	      else
9571		align = BITS_PER_UNIT;
9572
9573	      poly_int64 ref_size = GET_MODE_SIZE (mode);
9574	      if (known_eq (ref_size, 0))
9575		ref_size = GET_MODE_SIZE (DImode);
9576
9577	      return (multiple_p (const_offset, ref_size)
9578		      && multiple_p (align / BITS_PER_UNIT, ref_size));
9579	    }
9580	}
9581      return false;
9582
9583    default:
9584      return false;
9585    }
9586}
9587
9588/* Return true if the address X is valid for a PRFM instruction.
9589   STRICT_P is true if we should do strict checking with
9590   aarch64_classify_address.  */
9591
9592bool
9593aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
9594{
9595  struct aarch64_address_info addr;
9596
9597  /* PRFM accepts the same addresses as DImode...  */
9598  bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
9599  if (!res)
9600    return false;
9601
9602  /* ... except writeback forms.  */
9603  return addr.type != ADDRESS_REG_WB;
9604}
9605
9606bool
9607aarch64_symbolic_address_p (rtx x)
9608{
9609  poly_int64 offset;
9610  x = strip_offset_and_salt (x, &offset);
9611  return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
9612}
9613
9614/* Classify the base of symbolic expression X.  */
9615
9616enum aarch64_symbol_type
9617aarch64_classify_symbolic_expression (rtx x)
9618{
9619  rtx offset;
9620
9621  split_const (x, &x, &offset);
9622  return aarch64_classify_symbol (x, INTVAL (offset));
9623}
9624
9625
9626/* Return TRUE if X is a legitimate address for accessing memory in
9627   mode MODE.  */
9628static bool
9629aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
9630{
9631  struct aarch64_address_info addr;
9632
9633  return aarch64_classify_address (&addr, x, mode, strict_p);
9634}
9635
9636/* Return TRUE if X is a legitimate address of type TYPE for accessing
9637   memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
9638bool
9639aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
9640			      aarch64_addr_query_type type)
9641{
9642  struct aarch64_address_info addr;
9643
9644  return aarch64_classify_address (&addr, x, mode, strict_p, type);
9645}
9646
9647/* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */
9648
9649static bool
9650aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
9651					 poly_int64 orig_offset,
9652					 machine_mode mode)
9653{
9654  HOST_WIDE_INT size;
9655  if (GET_MODE_SIZE (mode).is_constant (&size))
9656    {
9657      HOST_WIDE_INT const_offset, second_offset;
9658
9659      /* A general SVE offset is A * VQ + B.  Remove the A component from
9660	 coefficient 0 in order to get the constant B.  */
9661      const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
9662
      /* Split an out-of-range address displacement into a base and
	 offset.  Use a 4KB range for 1- and 2-byte accesses and a 16KB
	 range otherwise, to increase opportunities for sharing the base
	 address between accesses of different sizes.  Unaligned accesses
	 use the signed 9-bit range; TImode/TFmode use the intersection of
	 the signed scaled 7-bit and signed 9-bit ranges.  */
9669      if (mode == TImode || mode == TFmode)
9670	second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
9671      else if ((const_offset & (size - 1)) != 0)
9672	second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
9673      else
9674	second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
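      /* For example, with an aligned 4-byte access at constant offset
	 0x5004, the rule above gives second_offset = 0x1004, so the
	 address is rebased to offset 0x4000 plus a residual 0x1004 that
	 fits the unsigned scaled 12-bit range.  */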
9675
9676      if (second_offset == 0 || known_eq (orig_offset, second_offset))
9677	return false;
9678
9679      /* Split the offset into second_offset and the rest.  */
9680      *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9681      *offset2 = gen_int_mode (second_offset, Pmode);
9682      return true;
9683    }
9684  else
9685    {
9686      /* Get the mode we should use as the basis of the range.  For structure
9687	 modes this is the mode of one vector.  */
9688      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
9689      machine_mode step_mode
9690	= (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
9691
9692      /* Get the "mul vl" multiplier we'd like to use.  */
9693      HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
9694      HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
9695      if (vec_flags & VEC_SVE_DATA)
9696	/* LDR supports a 9-bit range, but the move patterns for
9697	   structure modes require all vectors to be in range of the
	   same base.  The simplest way of accommodating that while still
9699	   promoting reuse of anchor points between different modes is
9700	   to use an 8-bit range unconditionally.  */
9701	vnum = ((vnum + 128) & 255) - 128;
9702      else
9703	/* Predicates are only handled singly, so we might as well use
9704	   the full range.  */
9705	vnum = ((vnum + 256) & 511) - 256;
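      /* For example, under the 8-bit rule above a requested multiplier of
	 200 wraps to -56, leaving 256 vector lengths to be folded into
	 the base address.  */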
9706      if (vnum == 0)
9707	return false;
9708
9709      /* Convert the "mul vl" multiplier into a byte offset.  */
9710      poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
9711      if (known_eq (second_offset, orig_offset))
9712	return false;
9713
9714      /* Split the offset into second_offset and the rest.  */
9715      *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9716      *offset2 = gen_int_mode (second_offset, Pmode);
9717      return true;
9718    }
9719}
9720
/* Return the binary representation of floating point constant VALUE in INTVAL.
   If the value cannot be converted, return false without setting INTVAL.
   The conversion is done using the mode of VALUE.  */
9724bool
9725aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
9726{
9727
9728  /* We make a general exception for 0.  */
9729  if (aarch64_float_const_zero_rtx_p (value))
9730    {
9731      *intval = 0;
9732      return true;
9733    }
9734
9735  scalar_float_mode mode;
9736  if (GET_CODE (value) != CONST_DOUBLE
9737      || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
9738      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
9739      /* Only support up to DF mode.  */
9740      || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
9741    return false;
9742
9743  unsigned HOST_WIDE_INT ival = 0;
9744
9745  long res[2];
9746  real_to_target (res,
9747		  CONST_DOUBLE_REAL_VALUE (value),
9748		  REAL_MODE_FORMAT (mode));
9749
9750  if (mode == DFmode)
9751    {
9752      int order = BYTES_BIG_ENDIAN ? 1 : 0;
9753      ival = zext_hwi (res[order], 32);
9754      ival |= (zext_hwi (res[1 - order], 32) << 32);
9755    }
  else
    ival = zext_hwi (res[0], 32);
9758
9759  *intval = ival;
9760  return true;
9761}
9762
9763/* Return TRUE if rtx X is an immediate constant that can be moved using a
9764   single MOV(+MOVK) followed by an FMOV.  */
9765bool
9766aarch64_float_const_rtx_p (rtx x)
9767{
9768  machine_mode mode = GET_MODE (x);
9769  if (mode == VOIDmode)
9770    return false;
9771
  /* Determine whether it's cheaper to write float constants as
     mov/movk pairs than as ldr/adrp pairs.  */
9774  unsigned HOST_WIDE_INT ival;
9775
9776  if (GET_CODE (x) == CONST_DOUBLE
9777      && SCALAR_FLOAT_MODE_P (mode)
9778      && aarch64_reinterpret_float_as_int (x, &ival))
9779    {
9780      scalar_int_mode imode = (mode == HFmode
9781			       ? SImode
9782			       : int_mode_for_mode (mode).require ());
9783      int num_instr = aarch64_internal_mov_immediate
9784			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
9785      return num_instr < 3;
9786    }
9787
9788  return false;
9789}
9790
/* Return TRUE if rtx X is the immediate constant 0.0.  */
9792bool
9793aarch64_float_const_zero_rtx_p (rtx x)
9794{
9795  if (GET_MODE (x) == VOIDmode)
9796    return false;
9797
9798  if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
9799    return !HONOR_SIGNED_ZEROS (GET_MODE (x));
9800  return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
9801}
9802
/* Return TRUE if rtx X is an immediate constant that fits in a single
   MOVI immediate operation.  */
9805bool
9806aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
9807{
  if (!TARGET_SIMD)
    return false;
9810
9811  machine_mode vmode;
9812  scalar_int_mode imode;
9813  unsigned HOST_WIDE_INT ival;
9814
9815  if (GET_CODE (x) == CONST_DOUBLE
9816      && SCALAR_FLOAT_MODE_P (mode))
9817    {
9818      if (!aarch64_reinterpret_float_as_int (x, &ival))
9819	return false;
9820
9821      /* We make a general exception for 0.  */
9822      if (aarch64_float_const_zero_rtx_p (x))
9823	return true;
9824
9825      imode = int_mode_for_mode (mode).require ();
9826    }
9827  else if (GET_CODE (x) == CONST_INT
9828	   && is_a <scalar_int_mode> (mode, &imode))
9829    ival = INTVAL (x);
9830  else
9831    return false;
9832
  /* Use a 64-bit vector mode for everything except DImode/DFmode,
     where we use a 128-bit vector mode.  */
9835  int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
9836
9837  vmode = aarch64_simd_container_mode (imode, width);
9838  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
9839
9840  return aarch64_simd_valid_immediate (v_op, NULL);
9841}
9842
9843
9844/* Return the fixed registers used for condition codes.  */
9845
9846static bool
9847aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
9848{
9849  *p1 = CC_REGNUM;
9850  *p2 = INVALID_REGNUM;
9851  return true;
9852}
9853
9854/* This function is used by the call expanders of the machine description.
9855   RESULT is the register in which the result is returned.  It's NULL for
9856   "call" and "sibcall".
9857   MEM is the location of the function call.
9858   CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
   SIBCALL indicates whether this function call is a normal call or a
   sibling call; a different pattern is generated accordingly.  */
9861
9862void
9863aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
9864{
9865  rtx call, callee, tmp;
9866  rtvec vec;
9867  machine_mode mode;
9868
9869  gcc_assert (MEM_P (mem));
9870  callee = XEXP (mem, 0);
9871  mode = GET_MODE (callee);
9872  gcc_assert (mode == Pmode);
9873
9874  /* Decide if we should generate indirect calls by loading the
9875     address of the callee into a register before performing
9876     the branch-and-link.  */
9877  if (SYMBOL_REF_P (callee)
9878      ? (aarch64_is_long_call_p (callee)
9879	 || aarch64_is_noplt_call_p (callee))
9880      : !REG_P (callee))
9881    XEXP (mem, 0) = force_reg (mode, callee);
9882
9883  call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
9884
9885  if (result != NULL_RTX)
9886    call = gen_rtx_SET (result, call);
9887
9888  if (sibcall)
9889    tmp = ret_rtx;
9890  else
9891    tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
9892
9893  gcc_assert (CONST_INT_P (callee_abi));
9894  callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
9895			       UNSPEC_CALLEE_ABI);
9896
9897  vec = gen_rtvec (3, call, callee_abi, tmp);
9898  call = gen_rtx_PARALLEL (VOIDmode, vec);
9899
9900  aarch64_emit_call_insn (call);
9901}
9902
9903/* Emit call insn with PAT and do aarch64-specific handling.  */
9904
9905void
9906aarch64_emit_call_insn (rtx pat)
9907{
9908  rtx insn = emit_call_insn (pat);
9909
9910  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
9911  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
9912  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
9913}
9914
9915machine_mode
9916aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
9917{
9918  machine_mode mode_x = GET_MODE (x);
9919  rtx_code code_x = GET_CODE (x);
9920
9921  /* All floating point compares return CCFP if it is an equality
9922     comparison, and CCFPE otherwise.  */
9923  if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
9924    {
9925      switch (code)
9926	{
9927	case EQ:
9928	case NE:
9929	case UNORDERED:
9930	case ORDERED:
9931	case UNLT:
9932	case UNLE:
9933	case UNGT:
9934	case UNGE:
9935	case UNEQ:
9936	  return CCFPmode;
9937
9938	case LT:
9939	case LE:
9940	case GT:
9941	case GE:
9942	case LTGT:
9943	  return CCFPEmode;
9944
9945	default:
9946	  gcc_unreachable ();
9947	}
9948    }
9949
9950  /* Equality comparisons of short modes against zero can be performed
9951     using the TST instruction with the appropriate bitmask.  */
9952  if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
9953      && (code == EQ || code == NE)
9954      && (mode_x == HImode || mode_x == QImode))
9955    return CC_NZmode;
9956
9957  /* Similarly, comparisons of zero_extends from shorter modes can
9958     be performed using an ANDS with an immediate mask.  */
9959  if (y == const0_rtx && code_x == ZERO_EXTEND
9960      && (mode_x == SImode || mode_x == DImode)
9961      && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
9962      && (code == EQ || code == NE))
9963    return CC_NZmode;
9964
9965  if ((mode_x == SImode || mode_x == DImode)
9966      && y == const0_rtx
9967      && (code == EQ || code == NE || code == LT || code == GE)
9968      && (code_x == PLUS || code_x == MINUS || code_x == AND
9969	  || code_x == NEG
9970	  || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
9971	      && CONST_INT_P (XEXP (x, 2)))))
9972    return CC_NZmode;
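  /* For example, an EQ or NE comparison of (plus x y) against zero selects
     CC_NZmode here, so the comparison can be implemented with ADDS rather
     than a separate ADD and CMP.  */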
9973
9974  /* A compare with a shifted operand.  Because of canonicalization,
9975     the comparison will have to be swapped when we emit the assembly
9976     code.  */
9977  if ((mode_x == SImode || mode_x == DImode)
9978      && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
9979      && (code_x == ASHIFT || code_x == ASHIFTRT
9980	  || code_x == LSHIFTRT
9981	  || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
9982    return CC_SWPmode;
9983
9984  /* Similarly for a negated operand, but we can only do this for
9985     equalities.  */
9986  if ((mode_x == SImode || mode_x == DImode)
9987      && (REG_P (y) || GET_CODE (y) == SUBREG)
9988      && (code == EQ || code == NE)
9989      && code_x == NEG)
9990    return CC_Zmode;
9991
9992  /* A test for unsigned overflow from an addition.  */
9993  if ((mode_x == DImode || mode_x == TImode)
9994      && (code == LTU || code == GEU)
9995      && code_x == PLUS
9996      && rtx_equal_p (XEXP (x, 0), y))
9997    return CC_Cmode;
9998
9999  /* A test for unsigned overflow from an add with carry.  */
10000  if ((mode_x == DImode || mode_x == TImode)
10001      && (code == LTU || code == GEU)
10002      && code_x == PLUS
10003      && CONST_SCALAR_INT_P (y)
10004      && (rtx_mode_t (y, mode_x)
10005	  == (wi::shwi (1, mode_x)
10006	      << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
10007    return CC_ADCmode;
10008
10009  /* A test for signed overflow.  */
10010  if ((mode_x == DImode || mode_x == TImode)
10011      && code == NE
10012      && code_x == PLUS
10013      && GET_CODE (y) == SIGN_EXTEND)
10014    return CC_Vmode;
10015
10016  /* For everything else, return CCmode.  */
10017  return CCmode;
10018}
10019
10020static int
10021aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
10022
10023int
10024aarch64_get_condition_code (rtx x)
10025{
10026  machine_mode mode = GET_MODE (XEXP (x, 0));
10027  enum rtx_code comp_code = GET_CODE (x);
10028
10029  if (GET_MODE_CLASS (mode) != MODE_CC)
10030    mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
10031  return aarch64_get_condition_code_1 (mode, comp_code);
10032}
10033
10034static int
10035aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
10036{
10037  switch (mode)
10038    {
10039    case E_CCFPmode:
10040    case E_CCFPEmode:
10041      switch (comp_code)
10042	{
10043	case GE: return AARCH64_GE;
10044	case GT: return AARCH64_GT;
10045	case LE: return AARCH64_LS;
10046	case LT: return AARCH64_MI;
10047	case NE: return AARCH64_NE;
10048	case EQ: return AARCH64_EQ;
10049	case ORDERED: return AARCH64_VC;
10050	case UNORDERED: return AARCH64_VS;
10051	case UNLT: return AARCH64_LT;
10052	case UNLE: return AARCH64_LE;
10053	case UNGT: return AARCH64_HI;
10054	case UNGE: return AARCH64_PL;
10055	default: return -1;
10056	}
10057      break;
10058
10059    case E_CCmode:
10060      switch (comp_code)
10061	{
10062	case NE: return AARCH64_NE;
10063	case EQ: return AARCH64_EQ;
10064	case GE: return AARCH64_GE;
10065	case GT: return AARCH64_GT;
10066	case LE: return AARCH64_LE;
10067	case LT: return AARCH64_LT;
10068	case GEU: return AARCH64_CS;
10069	case GTU: return AARCH64_HI;
10070	case LEU: return AARCH64_LS;
10071	case LTU: return AARCH64_CC;
10072	default: return -1;
10073	}
10074      break;
10075
10076    case E_CC_SWPmode:
10077      switch (comp_code)
10078	{
10079	case NE: return AARCH64_NE;
10080	case EQ: return AARCH64_EQ;
10081	case GE: return AARCH64_LE;
10082	case GT: return AARCH64_LT;
10083	case LE: return AARCH64_GE;
10084	case LT: return AARCH64_GT;
10085	case GEU: return AARCH64_LS;
10086	case GTU: return AARCH64_CC;
10087	case LEU: return AARCH64_CS;
10088	case LTU: return AARCH64_HI;
10089	default: return -1;
10090	}
10091      break;
10092
10093    case E_CC_NZCmode:
10094      switch (comp_code)
10095	{
10096	case NE: return AARCH64_NE; /* = any */
10097	case EQ: return AARCH64_EQ; /* = none */
10098	case GE: return AARCH64_PL; /* = nfrst */
10099	case LT: return AARCH64_MI; /* = first */
10100	case GEU: return AARCH64_CS; /* = nlast */
10101	case GTU: return AARCH64_HI; /* = pmore */
10102	case LEU: return AARCH64_LS; /* = plast */
10103	case LTU: return AARCH64_CC; /* = last */
10104	default: return -1;
10105	}
10106      break;
10107
10108    case E_CC_NZmode:
10109      switch (comp_code)
10110	{
10111	case NE: return AARCH64_NE;
10112	case EQ: return AARCH64_EQ;
10113	case GE: return AARCH64_PL;
10114	case LT: return AARCH64_MI;
10115	default: return -1;
10116	}
10117      break;
10118
10119    case E_CC_Zmode:
10120      switch (comp_code)
10121	{
10122	case NE: return AARCH64_NE;
10123	case EQ: return AARCH64_EQ;
10124	default: return -1;
10125	}
10126      break;
10127
10128    case E_CC_Cmode:
10129      switch (comp_code)
10130	{
10131	case LTU: return AARCH64_CS;
10132	case GEU: return AARCH64_CC;
10133	default: return -1;
10134	}
10135      break;
10136
10137    case E_CC_ADCmode:
10138      switch (comp_code)
10139	{
10140	case GEU: return AARCH64_CS;
10141	case LTU: return AARCH64_CC;
10142	default: return -1;
10143	}
10144      break;
10145
10146    case E_CC_Vmode:
10147      switch (comp_code)
10148	{
10149	case NE: return AARCH64_VS;
10150	case EQ: return AARCH64_VC;
10151	default: return -1;
10152	}
10153      break;
10154
10155    default:
10156      return -1;
10157    }
10158
10159  return -1;
10160}
10161
10162bool
10163aarch64_const_vec_all_same_in_range_p (rtx x,
10164				       HOST_WIDE_INT minval,
10165				       HOST_WIDE_INT maxval)
10166{
10167  rtx elt;
10168  return (const_vec_duplicate_p (x, &elt)
10169	  && CONST_INT_P (elt)
10170	  && IN_RANGE (INTVAL (elt), minval, maxval));
10171}
10172
10173bool
10174aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
10175{
10176  return aarch64_const_vec_all_same_in_range_p (x, val, val);
10177}
10178
10179/* Return true if VEC is a constant in which every element is in the range
10180   [MINVAL, MAXVAL].  The elements do not need to have the same value.  */
10181
10182static bool
10183aarch64_const_vec_all_in_range_p (rtx vec,
10184				  HOST_WIDE_INT minval,
10185				  HOST_WIDE_INT maxval)
10186{
10187  if (GET_CODE (vec) != CONST_VECTOR
10188      || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
10189    return false;
10190
10191  int nunits;
10192  if (!CONST_VECTOR_STEPPED_P (vec))
10193    nunits = const_vector_encoded_nelts (vec);
10194  else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
10195    return false;
10196
10197  for (int i = 0; i < nunits; i++)
10198    {
10199      rtx vec_elem = CONST_VECTOR_ELT (vec, i);
10200      if (!CONST_INT_P (vec_elem)
10201	  || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
10202	return false;
10203    }
10204  return true;
10205}
10206
10207/* N Z C V.  */
10208#define AARCH64_CC_V 1
10209#define AARCH64_CC_C (1 << 1)
10210#define AARCH64_CC_Z (1 << 2)
10211#define AARCH64_CC_N (1 << 3)
10212
10213/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
10214static const int aarch64_nzcv_codes[] =
10215{
10216  0,		/* EQ, Z == 1.  */
10217  AARCH64_CC_Z,	/* NE, Z == 0.  */
10218  0,		/* CS, C == 1.  */
10219  AARCH64_CC_C,	/* CC, C == 0.  */
10220  0,		/* MI, N == 1.  */
10221  AARCH64_CC_N, /* PL, N == 0.  */
10222  0,		/* VS, V == 1.  */
10223  AARCH64_CC_V, /* VC, V == 0.  */
10224  0,		/* HI, C ==1 && Z == 0.  */
10225  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
10226  AARCH64_CC_V,	/* GE, N == V.  */
10227  0,		/* LT, N != V.  */
10228  AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
10229  0,		/* LE, !(Z == 0 && N == V).  */
10230  0,		/* AL, Any.  */
10231  0		/* NV, Any.  */
10232};
10233
10234/* Print floating-point vector immediate operand X to F, negating it
10235   first if NEGATE is true.  Return true on success, false if it isn't
10236   a constant we can handle.  */
10237
10238static bool
10239aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
10240{
10241  rtx elt;
10242
10243  if (!const_vec_duplicate_p (x, &elt))
10244    return false;
10245
10246  REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
10247  if (negate)
10248    r = real_value_negate (&r);
10249
10250  /* Handle the SVE single-bit immediates specially, since they have a
10251     fixed form in the assembly syntax.  */
10252  if (real_equal (&r, &dconst0))
10253    asm_fprintf (f, "0.0");
10254  else if (real_equal (&r, &dconst2))
10255    asm_fprintf (f, "2.0");
10256  else if (real_equal (&r, &dconst1))
10257    asm_fprintf (f, "1.0");
10258  else if (real_equal (&r, &dconsthalf))
10259    asm_fprintf (f, "0.5");
10260  else
10261    {
10262      const int buf_size = 20;
10263      char float_buf[buf_size] = {'\0'};
10264      real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
10265				1, GET_MODE (elt));
10266      asm_fprintf (f, "%s", float_buf);
10267    }
10268
10269  return true;
10270}
10271
10272/* Return the equivalent letter for size.  */
10273static char
10274sizetochar (int size)
10275{
10276  switch (size)
10277    {
10278    case 64: return 'd';
10279    case 32: return 's';
10280    case 16: return 'h';
10281    case 8 : return 'b';
10282    default: gcc_unreachable ();
10283    }
10284}
10285
10286/* Print operand X to file F in a target specific manner according to CODE.
10287   The acceptable formatting commands given by CODE are:
10288     'c':		An integer or symbol address without a preceding #
10289			sign.
10290     'C':		Take the duplicated element in a vector constant
10291			and print it in hex.
10292     'D':		Take the duplicated element in a vector constant
10293			and print it as an unsigned integer, in decimal.
10294     'e':		Print the sign/zero-extend size as a character 8->b,
10295			16->h, 32->w.  Can also be used for masks:
10296			0xff->b, 0xffff->h, 0xffffffff->w.
10297     'I':		If the operand is a duplicated vector constant,
10298			replace it with the duplicated scalar.  If the
10299			operand is then a floating-point constant, replace
10300			it with the integer bit representation.  Print the
10301			transformed constant as a signed decimal number.
10302     'p':		Prints N such that 2^N == X (X must be power of 2 and
10303			const int).
10304     'P':		Print the number of non-zero bits in X (a const_int).
10305     'H':		Print the higher numbered register of a pair (TImode)
10306			of regs.
10307     'm':		Print a condition (eq, ne, etc).
10308     'M':		Same as 'm', but invert condition.
10309     'N':		Take the duplicated element in a vector constant
10310			and print the negative of it in decimal.
10311     'b/h/s/d/q':	Print a scalar FP/SIMD register name.
10312     'S/T/U/V':		Print a FP/SIMD register name for a register list.
10313			The register printed is the FP/SIMD register name
10314			of X + 0/1/2/3 for S/T/U/V.
10315     'R':		Print a scalar Integer/FP/SIMD register name + 1.
10316     'X':		Print bottom 16 bits of integer constant in hex.
10317     'w/x':		Print a general register name or the zero register
10318			(32-bit or 64-bit).
10319     '0':		Print a normal operand, if it's a general register,
10320			then we assume DImode.
10321     'k':		Print NZCV for conditional compare instructions.
10322     'A':		Output address constant representing the first
10323			argument of X, specifying a relocation offset
10324			if appropriate.
10325     'L':		Output constant address specified by X
10326			with a relocation offset if appropriate.
10327     'G':		Prints address of X, specifying a PC relative
10328			relocation mode if appropriate.
10329     'y':		Output address of LDP or STP - this is used for
10330			some LDP/STPs which don't use a PARALLEL in their
10331			pattern (so the mode needs to be adjusted).
10332     'z':		Output address of a typical LDP or STP.  */
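/* As an illustration of the common modifiers: for a general register
   operand in x3, "%x0" prints "x3" and "%w0" prints "w3", while a
   const0_rtx operand prints as "xzr" or "wzr" respectively.  */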
10333
10334static void
10335aarch64_print_operand (FILE *f, rtx x, int code)
10336{
10337  rtx elt;
10338  switch (code)
10339    {
10340    case 'c':
10341      if (CONST_INT_P (x))
10342	fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
10343      else
10344	{
10345	  poly_int64 offset;
10346	  rtx base = strip_offset_and_salt (x, &offset);
10347	  if (SYMBOL_REF_P (base))
10348	    output_addr_const (f, x);
10349	  else
10350	    output_operand_lossage ("unsupported operand for code '%c'", code);
10351	}
10352      break;
10353
10354    case 'e':
10355      {
10356	x = unwrap_const_vec_duplicate (x);
10357	if (!CONST_INT_P (x))
10358	  {
10359	    output_operand_lossage ("invalid operand for '%%%c'", code);
10360	    return;
10361	  }
10362
10363	HOST_WIDE_INT val = INTVAL (x);
10364	if ((val & ~7) == 8 || val == 0xff)
10365	  fputc ('b', f);
10366	else if ((val & ~7) == 16 || val == 0xffff)
10367	  fputc ('h', f);
10368	else if ((val & ~7) == 32 || val == 0xffffffff)
10369	  fputc ('w', f);
10370	else
10371	  {
10372	    output_operand_lossage ("invalid operand for '%%%c'", code);
10373	    return;
10374	  }
10375      }
10376      break;
10377
10378    case 'p':
10379      {
10380	int n;
10381
10382	if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
10383	  {
10384	    output_operand_lossage ("invalid operand for '%%%c'", code);
10385	    return;
10386	  }
10387
10388	asm_fprintf (f, "%d", n);
10389      }
10390      break;
10391
10392    case 'P':
10393      if (!CONST_INT_P (x))
10394	{
10395	  output_operand_lossage ("invalid operand for '%%%c'", code);
10396	  return;
10397	}
10398
10399      asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
10400      break;
10401
10402    case 'H':
10403      if (x == const0_rtx)
10404	{
10405	  asm_fprintf (f, "xzr");
10406	  break;
10407	}
10408
10409      if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
10410	{
10411	  output_operand_lossage ("invalid operand for '%%%c'", code);
10412	  return;
10413	}
10414
10415      asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
10416      break;
10417
10418    case 'I':
10419      {
10420	x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
10421	if (CONST_INT_P (x))
10422	  asm_fprintf (f, "%wd", INTVAL (x));
10423	else
10424	  {
10425	    output_operand_lossage ("invalid operand for '%%%c'", code);
10426	    return;
10427	  }
10428	break;
10429      }
10430
10431    case 'M':
10432    case 'm':
10433      {
10434        int cond_code;
10435	/* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
10436	if (x == const_true_rtx)
10437	  {
10438	    if (code == 'M')
10439	      fputs ("nv", f);
10440	    return;
10441	  }
10442
10443        if (!COMPARISON_P (x))
10444	  {
10445	    output_operand_lossage ("invalid operand for '%%%c'", code);
10446	    return;
10447	  }
10448
10449        cond_code = aarch64_get_condition_code (x);
10450        gcc_assert (cond_code >= 0);
10451	if (code == 'M')
10452	  cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
10453	if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
10454	  fputs (aarch64_sve_condition_codes[cond_code], f);
10455	else
10456	  fputs (aarch64_condition_codes[cond_code], f);
10457      }
10458      break;
10459
10460    case 'N':
10461      if (!const_vec_duplicate_p (x, &elt))
10462	{
10463	  output_operand_lossage ("invalid vector constant");
10464	  return;
10465	}
10466
10467      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10468	asm_fprintf (f, "%wd", -INTVAL (elt));
10469      else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10470	       && aarch64_print_vector_float_operand (f, x, true))
10471	;
10472      else
10473	{
10474	  output_operand_lossage ("invalid vector constant");
10475	  return;
10476	}
10477      break;
10478
10479    case 'b':
10480    case 'h':
10481    case 's':
10482    case 'd':
10483    case 'q':
10484      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10485	{
10486	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10487	  return;
10488	}
10489      asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
10490      break;
10491
10492    case 'S':
10493    case 'T':
10494    case 'U':
10495    case 'V':
10496      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10497	{
10498	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10499	  return;
10500	}
10501      asm_fprintf (f, "%c%d",
10502		   aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
10503		   REGNO (x) - V0_REGNUM + (code - 'S'));
10504      break;
10505
10506    case 'R':
10507      if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
10508	asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
10509      else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10510	asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
10511      else
10512	output_operand_lossage ("incompatible register operand for '%%%c'",
10513				code);
10514      break;
10515
10516    case 'X':
10517      if (!CONST_INT_P (x))
10518	{
10519	  output_operand_lossage ("invalid operand for '%%%c'", code);
10520	  return;
10521	}
10522      asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
10523      break;
10524
10525    case 'C':
10526      {
10527	/* Print a replicated constant in hex.  */
10528	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10529	  {
10530	    output_operand_lossage ("invalid operand for '%%%c'", code);
10531	    return;
10532	  }
10533	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10534	asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10535      }
10536      break;
10537
10538    case 'D':
10539      {
10540	/* Print a replicated constant in decimal, treating it as
10541	   unsigned.  */
10542	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10543	  {
10544	    output_operand_lossage ("invalid operand for '%%%c'", code);
10545	    return;
10546	  }
10547	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10548	asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10549      }
10550      break;
10551
10552    case 'w':
10553    case 'x':
10554      if (x == const0_rtx
10555	  || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
10556	{
10557	  asm_fprintf (f, "%czr", code);
10558	  break;
10559	}
10560
10561      if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10562	{
10563	  asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
10564	  break;
10565	}
10566
10567      if (REG_P (x) && REGNO (x) == SP_REGNUM)
10568	{
10569	  asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
10570	  break;
10571	}
10572
10573      /* Fall through */
10574
10575    case 0:
10576      if (x == NULL)
10577	{
10578	  output_operand_lossage ("missing operand");
10579	  return;
10580	}
10581
10582      switch (GET_CODE (x))
10583	{
10584	case REG:
10585	  if (aarch64_sve_data_mode_p (GET_MODE (x)))
10586	    {
10587	      if (REG_NREGS (x) == 1)
10588		asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
10589	      else
10590		{
10591		  char suffix
10592		    = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
10593		  asm_fprintf (f, "{z%d.%c - z%d.%c}",
10594			       REGNO (x) - V0_REGNUM, suffix,
10595			       END_REGNO (x) - V0_REGNUM - 1, suffix);
10596		}
10597	    }
10598	  else
10599	    asm_fprintf (f, "%s", reg_names [REGNO (x)]);
10600	  break;
10601
10602	case MEM:
10603	  output_address (GET_MODE (x), XEXP (x, 0));
10604	  break;
10605
10606	case LABEL_REF:
10607	case SYMBOL_REF:
10608	  output_addr_const (asm_out_file, x);
10609	  break;
10610
10611	case CONST_INT:
10612	  asm_fprintf (f, "%wd", INTVAL (x));
10613	  break;
10614
10615	case CONST:
10616	  if (!VECTOR_MODE_P (GET_MODE (x)))
10617	    {
10618	      output_addr_const (asm_out_file, x);
10619	      break;
10620	    }
10621	  /* fall through */
10622
10623	case CONST_VECTOR:
10624	  if (!const_vec_duplicate_p (x, &elt))
10625	    {
10626	      output_operand_lossage ("invalid vector constant");
10627	      return;
10628	    }
10629
10630	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10631	    asm_fprintf (f, "%wd", INTVAL (elt));
10632	  else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10633		   && aarch64_print_vector_float_operand (f, x, false))
10634	    ;
10635	  else
10636	    {
10637	      output_operand_lossage ("invalid vector constant");
10638	      return;
10639	    }
10640	  break;
10641
10642	case CONST_DOUBLE:
10643	  /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
10644	     be getting CONST_DOUBLEs holding integers.  */
10645	  gcc_assert (GET_MODE (x) != VOIDmode);
10646	  if (aarch64_float_const_zero_rtx_p (x))
10647	    {
10648	      fputc ('0', f);
10649	      break;
10650	    }
10651	  else if (aarch64_float_const_representable_p (x))
10652	    {
10653#define buf_size 20
10654	      char float_buf[buf_size] = {'\0'};
10655	      real_to_decimal_for_mode (float_buf,
10656					CONST_DOUBLE_REAL_VALUE (x),
10657					buf_size, buf_size,
10658					1, GET_MODE (x));
10659	      asm_fprintf (asm_out_file, "%s", float_buf);
10660	      break;
10661#undef buf_size
10662	    }
10663	  output_operand_lossage ("invalid constant");
10664	  return;
10665	default:
10666	  output_operand_lossage ("invalid operand");
10667	  return;
10668	}
10669      break;
10670
10671    case 'A':
10672      if (GET_CODE (x) == HIGH)
10673	x = XEXP (x, 0);
10674
10675      switch (aarch64_classify_symbolic_expression (x))
10676	{
10677	case SYMBOL_SMALL_GOT_4G:
10678	  asm_fprintf (asm_out_file, ":got:");
10679	  break;
10680
10681	case SYMBOL_SMALL_TLSGD:
10682	  asm_fprintf (asm_out_file, ":tlsgd:");
10683	  break;
10684
10685	case SYMBOL_SMALL_TLSDESC:
10686	  asm_fprintf (asm_out_file, ":tlsdesc:");
10687	  break;
10688
10689	case SYMBOL_SMALL_TLSIE:
10690	  asm_fprintf (asm_out_file, ":gottprel:");
10691	  break;
10692
10693	case SYMBOL_TLSLE24:
10694	  asm_fprintf (asm_out_file, ":tprel:");
10695	  break;
10696
10697	case SYMBOL_TINY_GOT:
10698	  gcc_unreachable ();
10699	  break;
10700
10701	default:
10702	  break;
10703	}
10704      output_addr_const (asm_out_file, x);
10705      break;
10706
10707    case 'L':
10708      switch (aarch64_classify_symbolic_expression (x))
10709	{
10710	case SYMBOL_SMALL_GOT_4G:
10711	  asm_fprintf (asm_out_file, ":lo12:");
10712	  break;
10713
10714	case SYMBOL_SMALL_TLSGD:
10715	  asm_fprintf (asm_out_file, ":tlsgd_lo12:");
10716	  break;
10717
10718	case SYMBOL_SMALL_TLSDESC:
10719	  asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
10720	  break;
10721
10722	case SYMBOL_SMALL_TLSIE:
10723	  asm_fprintf (asm_out_file, ":gottprel_lo12:");
10724	  break;
10725
10726	case SYMBOL_TLSLE12:
10727	  asm_fprintf (asm_out_file, ":tprel_lo12:");
10728	  break;
10729
10730	case SYMBOL_TLSLE24:
10731	  asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
10732	  break;
10733
10734	case SYMBOL_TINY_GOT:
10735	  asm_fprintf (asm_out_file, ":got:");
10736	  break;
10737
10738	case SYMBOL_TINY_TLSIE:
10739	  asm_fprintf (asm_out_file, ":gottprel:");
10740	  break;
10741
10742	default:
10743	  break;
10744	}
10745      output_addr_const (asm_out_file, x);
10746      break;
10747
10748    case 'G':
10749      switch (aarch64_classify_symbolic_expression (x))
10750	{
10751	case SYMBOL_TLSLE24:
10752	  asm_fprintf (asm_out_file, ":tprel_hi12:");
10753	  break;
10754	default:
10755	  break;
10756	}
10757      output_addr_const (asm_out_file, x);
10758      break;
10759
10760    case 'k':
10761      {
10762	HOST_WIDE_INT cond_code;
10763
10764	if (!CONST_INT_P (x))
10765	  {
10766	    output_operand_lossage ("invalid operand for '%%%c'", code);
10767	    return;
10768	  }
10769
10770	cond_code = INTVAL (x);
10771	gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
10772	asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
10773      }
10774      break;
10775
10776    case 'y':
10777    case 'z':
10778      {
10779	machine_mode mode = GET_MODE (x);
10780
10781	if (GET_CODE (x) != MEM
10782	    || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
10783	  {
10784	    output_operand_lossage ("invalid operand for '%%%c'", code);
10785	    return;
10786	  }
10787
10788	if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
10789					    code == 'y'
10790					    ? ADDR_QUERY_LDP_STP_N
10791					    : ADDR_QUERY_LDP_STP))
10792	  output_operand_lossage ("invalid operand prefix '%%%c'", code);
10793      }
10794      break;
10795
10796    default:
10797      output_operand_lossage ("invalid operand prefix '%%%c'", code);
10798      return;
10799    }
10800}
10801
/* Print address 'x' of a memory access with mode 'mode'.
   'type' gives the context required by aarch64_classify_address; see
   aarch64_addr_query_type for the possible values.  */
10805static bool
10806aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
10807				aarch64_addr_query_type type)
10808{
10809  struct aarch64_address_info addr;
10810  unsigned int size, vec_flags;
10811
10812  /* Check all addresses are Pmode - including ILP32.  */
10813  if (GET_MODE (x) != Pmode
10814      && (!CONST_INT_P (x)
10815	  || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
10816    {
10817      output_operand_lossage ("invalid address mode");
10818      return false;
10819    }
10820
10821  if (aarch64_classify_address (&addr, x, mode, true, type))
10822    switch (addr.type)
10823      {
10824      case ADDRESS_REG_IMM:
10825	if (known_eq (addr.const_offset, 0))
10826	  {
10827	    asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
10828	    return true;
10829	  }
10830
10831	vec_flags = aarch64_classify_vector_mode (mode);
10832	if (vec_flags & VEC_ANY_SVE)
10833	  {
10834	    HOST_WIDE_INT vnum
10835	      = exact_div (addr.const_offset,
10836			   aarch64_vl_bytes (mode, vec_flags)).to_constant ();
10837	    asm_fprintf (f, "[%s, #%wd, mul vl]",
10838			 reg_names[REGNO (addr.base)], vnum);
10839	    return true;
10840	  }
10841
10842	asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
10843		     INTVAL (addr.offset));
10844	return true;
10845
10846      case ADDRESS_REG_REG:
10847	if (addr.shift == 0)
10848	  asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
10849		       reg_names [REGNO (addr.offset)]);
10850	else
10851	  asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
10852		       reg_names [REGNO (addr.offset)], addr.shift);
10853	return true;
10854
10855      case ADDRESS_REG_UXTW:
10856	if (addr.shift == 0)
10857	  asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
10858		       REGNO (addr.offset) - R0_REGNUM);
10859	else
10860	  asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
10861		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
10862	return true;
10863
10864      case ADDRESS_REG_SXTW:
10865	if (addr.shift == 0)
10866	  asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
10867		       REGNO (addr.offset) - R0_REGNUM);
10868	else
10869	  asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
10870		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
10871	return true;
10872
10873      case ADDRESS_REG_WB:
10874	/* Writeback is only supported for fixed-width modes.  */
10875	size = GET_MODE_SIZE (mode).to_constant ();
10876	switch (GET_CODE (x))
10877	  {
10878	  case PRE_INC:
10879	    asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
10880	    return true;
10881	  case POST_INC:
10882	    asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
10883	    return true;
10884	  case PRE_DEC:
10885	    asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
10886	    return true;
10887	  case POST_DEC:
10888	    asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
10889	    return true;
10890	  case PRE_MODIFY:
10891	    asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
10892			 INTVAL (addr.offset));
10893	    return true;
10894	  case POST_MODIFY:
10895	    asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
10896			 INTVAL (addr.offset));
10897	    return true;
10898	  default:
10899	    break;
10900	  }
10901	break;
10902
10903      case ADDRESS_LO_SUM:
10904	asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
10905	output_addr_const (f, addr.offset);
10906	asm_fprintf (f, "]");
10907	return true;
10908
10909      case ADDRESS_SYMBOLIC:
10910	output_addr_const (f, x);
10911	return true;
10912      }
10913
10914  return false;
10915}
10916
10917/* Print address 'x' of a memory access with mode 'mode'.  */
10918static void
10919aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
10920{
10921  if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
10922    output_addr_const (f, x);
10923}
10924
10925/* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA.  */
10926
10927static bool
10928aarch64_output_addr_const_extra (FILE *file, rtx x)
10929{
10930  if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
10931    {
10932      output_addr_const (file, XVECEXP (x, 0, 0));
10933      return true;
10934   }
10935  return false;
10936}
10937
10938bool
10939aarch64_label_mentioned_p (rtx x)
10940{
10941  const char *fmt;
10942  int i;
10943
10944  if (GET_CODE (x) == LABEL_REF)
10945    return true;
10946
10947  /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
10948     referencing instruction, but they are constant offsets, not
10949     symbols.  */
10950  if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10951    return false;
10952
10953  fmt = GET_RTX_FORMAT (GET_CODE (x));
10954  for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
10955    {
10956      if (fmt[i] == 'E')
10957	{
10958	  int j;
10959
10960	  for (j = XVECLEN (x, i) - 1; j >= 0; j--)
10961	    if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
	      return true;
10963	}
10964      else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
	return true;
10966    }
10967
  return false;
10969}
10970
10971/* Implement REGNO_REG_CLASS.  */
10972
10973enum reg_class
10974aarch64_regno_regclass (unsigned regno)
10975{
10976  if (STUB_REGNUM_P (regno))
10977    return STUB_REGS;
10978
10979  if (GP_REGNUM_P (regno))
10980    return GENERAL_REGS;
10981
10982  if (regno == SP_REGNUM)
10983    return STACK_REG;
10984
10985  if (regno == FRAME_POINTER_REGNUM
10986      || regno == ARG_POINTER_REGNUM)
10987    return POINTER_REGS;
10988
10989  if (FP_REGNUM_P (regno))
10990    return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
10991	    : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
10992
10993  if (PR_REGNUM_P (regno))
10994    return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
10995
10996  if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
10997    return FFR_REGS;
10998
10999  return NO_REGS;
11000}
11001
11002/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
11003   If OFFSET is out of range, return an offset of an anchor point
11004   that is in range.  Return 0 otherwise.  */
11005
11006static HOST_WIDE_INT
11007aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
11008		       machine_mode mode)
11009{
11010  /* Does it look like we'll need a 16-byte load/store-pair operation?  */
11011  if (size > 16)
11012    return (offset + 0x400) & ~0x7f0;
11013
11014  /* For offsets that aren't a multiple of the access size, the limit is
11015     -256...255.  */
11016  if (offset & (size - 1))
11017    {
11018      /* BLKmode typically uses LDP of X-registers.  */
11019      if (mode == BLKmode)
11020	return (offset + 512) & ~0x3ff;
11021      return (offset + 0x100) & ~0x1ff;
11022    }
11023
11024  /* Small negative offsets are supported.  */
11025  if (IN_RANGE (offset, -256, 0))
11026    return 0;
11027
11028  if (mode == TImode || mode == TFmode)
11029    return (offset + 0x100) & ~0x1ff;
11030
  /* Use a 12-bit offset scaled by the access size.  */
11032  return offset & (~0xfff * size);
11033}
11034
11035static rtx
11036aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
11037{
11038  /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
11039     where mask is selected by alignment and size of the offset.
11040     We try to pick as large a range for the offset as possible to
11041     maximize the chance of a CSE.  However, for aligned addresses
11042     we limit the range to 4k so that structures with different sized
11043     elements are likely to use the same base.  We need to be careful
11044     not to split a CONST for some forms of address expression, otherwise
11045     it will generate sub-optimal code.  */
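  /* For example, (plus (reg X) (const_int 0x4008)) accessed in SImode is
     rewritten here as a new base of X + 0x4000 plus a residual offset of 8,
     following aarch64_anchor_offset above.  */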
11046
11047  if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
11048    {
11049      rtx base = XEXP (x, 0);
11050      rtx offset_rtx = XEXP (x, 1);
11051      HOST_WIDE_INT offset = INTVAL (offset_rtx);
11052
11053      if (GET_CODE (base) == PLUS)
11054	{
11055	  rtx op0 = XEXP (base, 0);
11056	  rtx op1 = XEXP (base, 1);
11057
11058	  /* Force any scaling into a temp for CSE.  */
11059	  op0 = force_reg (Pmode, op0);
11060	  op1 = force_reg (Pmode, op1);
11061
11062	  /* Let the pointer register be in op0.  */
11063	  if (REG_POINTER (op1))
11064	    std::swap (op0, op1);
11065
11066	  /* If the pointer is virtual or frame related, then we know that
11067	     virtual register instantiation or register elimination is going
11068	     to apply a second constant.  We want the two constants folded
11069	     together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
11070	  if (virt_or_elim_regno_p (REGNO (op0)))
11071	    {
11072	      base = expand_binop (Pmode, add_optab, op0, offset_rtx,
11073				   NULL_RTX, true, OPTAB_DIRECT);
11074	      return gen_rtx_PLUS (Pmode, base, op1);
11075	    }
11076
11077	  /* Otherwise, in order to encourage CSE (and thence loop strength
11078	     reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
11079	  base = expand_binop (Pmode, add_optab, op0, op1,
11080			       NULL_RTX, true, OPTAB_DIRECT);
11081	  x = gen_rtx_PLUS (Pmode, base, offset_rtx);
11082	}
11083
11084      HOST_WIDE_INT size;
11085      if (GET_MODE_SIZE (mode).is_constant (&size))
11086	{
11087	  HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
11088							     mode);
11089	  if (base_offset != 0)
11090	    {
11091	      base = plus_constant (Pmode, base, base_offset);
11092	      base = force_operand (base, NULL_RTX);
11093	      return plus_constant (Pmode, base, offset - base_offset);
11094	    }
11095	}
11096    }
11097
11098  return x;
11099}
11100
11101static reg_class_t
11102aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
11103			  reg_class_t rclass,
11104			  machine_mode mode,
11105			  secondary_reload_info *sri)
11106{
11107  /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
11108     LDR and STR.  See the comment at the head of aarch64-sve.md for
11109     more details about the big-endian handling.  */
11110  if (reg_class_subset_p (rclass, FP_REGS)
11111      && !((REG_P (x) && HARD_REGISTER_P (x))
11112	   || aarch64_simd_valid_immediate (x, NULL))
11113      && mode != VNx16QImode)
11114    {
11115      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11116      if ((vec_flags & VEC_SVE_DATA)
11117	  && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
11118	{
11119	  sri->icode = CODE_FOR_aarch64_sve_reload_mem;
11120	  return NO_REGS;
11121	}
11122    }
11123
11124  /* If we have to disable direct literal pool loads and stores because the
11125     function is too big, then we need a scratch register.  */
11126  if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
11127      && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
11128	  || targetm.vector_mode_supported_p (GET_MODE (x)))
11129      && !aarch64_pcrelative_literal_loads)
11130    {
11131      sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
11132      return NO_REGS;
11133    }
11134
11135  /* Without the TARGET_SIMD instructions we cannot move a Q register
11136     to a Q register directly.  We need a scratch.  */
11137  if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
11138      && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
11139      && reg_class_subset_p (rclass, FP_REGS))
11140    {
11141      sri->icode = code_for_aarch64_reload_mov (mode);
11142      return NO_REGS;
11143    }
11144
  /* A TFmode or TImode memory access should be handled via FP_REGS
     because AArch64 has richer addressing modes for LDR/STR instructions
     than for LDP/STP instructions.  */
11148  if (TARGET_FLOAT && rclass == GENERAL_REGS
11149      && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
11150    return FP_REGS;
11151
  if (rclass == FP_REGS && (mode == TImode || mode == TFmode)
      && CONSTANT_P (x))
    return GENERAL_REGS;
11154
11155  return NO_REGS;
11156}
11157
11158static bool
11159aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
11160{
11161  gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
11162
11163  /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
11164     can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
11165  if (frame_pointer_needed)
11166    return to == HARD_FRAME_POINTER_REGNUM;
11167  return true;
11168}
11169
11170poly_int64
11171aarch64_initial_elimination_offset (unsigned from, unsigned to)
11172{
11173  if (to == HARD_FRAME_POINTER_REGNUM)
11174    {
11175      if (from == ARG_POINTER_REGNUM)
11176	return cfun->machine->frame.hard_fp_offset;
11177
11178      if (from == FRAME_POINTER_REGNUM)
11179	return cfun->machine->frame.hard_fp_offset
11180	       - cfun->machine->frame.locals_offset;
11181    }
11182
11183  if (to == STACK_POINTER_REGNUM)
11184    {
11185      if (from == FRAME_POINTER_REGNUM)
	return cfun->machine->frame.frame_size
	       - cfun->machine->frame.locals_offset;
11188    }
11189
11190  return cfun->machine->frame.frame_size;
11191}
11192
11193
11194/* Get return address without mangling.  */
11195
11196rtx
11197aarch64_return_addr_rtx (void)
11198{
11199  rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
11200  /* Note: aarch64_return_address_signing_enabled only
11201     works after cfun->machine->frame.laid_out is set,
11202     so here we don't know if the return address will
11203     be signed or not.  */
11204  rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
11205  emit_move_insn (lr, val);
11206  emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
11207  return lr;
11208}
11209
11210
11211/* Implement RETURN_ADDR_RTX.  We do not support moving back to a
11212   previous frame.  */
11213
11214rtx
11215aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
11216{
11217  if (count != 0)
11218    return const0_rtx;
11219  return aarch64_return_addr_rtx ();
11220}
11221
11222
11223static void
11224aarch64_asm_trampoline_template (FILE *f)
11225{
11226  int offset1 = 24;
11227  int offset2 = 28;
11228
11229  if (aarch64_bti_enabled ())
11230    {
11231      asm_fprintf (f, "\thint\t34 // bti c\n");
11232      offset1 -= 4;
11233      offset2 -= 4;
11234    }
11235
11236  if (TARGET_ILP32)
11237    {
11238      asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
11239      asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
11240		   offset1);
11241    }
11242  else
11243    {
11244      asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
11245      asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
11246		   offset2);
11247    }
11248  asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
11249
11250  /* We always emit a speculation barrier.
11251     This is because the same trampoline template is used for every nested
11252     function.  Since nested functions are not particularly common or
11253     performant we don't worry too much about the extra instructions to copy
11254     around.
11255     This is not yet a problem, since we have not yet implemented function
11256     specific attributes to choose between hardening against straight line
11257     speculation or not, but such function specific attributes are likely to
11258     happen in the future.  */
11259  asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
11260
  /* The trampoline needs an extra padding instruction.  If BTI is
     enabled, the padding instruction is replaced by the BTI instruction
     at the beginning.  */
11264  if (!aarch64_bti_enabled ())
11265    assemble_aligned_integer (4, const0_rtx);
11266
11267  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
11268  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
11269}
11270
11271static void
11272aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
11273{
11274  rtx fnaddr, mem, a_tramp;
11275  const int tramp_code_sz = 24;
11276
  /* We don't need to copy the trailing D-words; we fill those in below.  */
11278  /* We create our own memory address in Pmode so that `emit_block_move` can
11279     use parts of the backend which expect Pmode addresses.  */
11280  rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
11281  emit_block_move (gen_rtx_MEM (BLKmode, temp),
11282		   assemble_trampoline_template (),
11283		   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
11284  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
11285  fnaddr = XEXP (DECL_RTL (fndecl), 0);
11286  if (GET_MODE (fnaddr) != ptr_mode)
11287    fnaddr = convert_memory_address (ptr_mode, fnaddr);
11288  emit_move_insn (mem, fnaddr);
11289
11290  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
11291  emit_move_insn (mem, chain_value);
11292
11293  /* XXX We should really define a "clear_cache" pattern and use
11294     gen_clear_cache().  */
11295  a_tramp = XEXP (m_tramp, 0);
11296  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
11297		     LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
11298		     plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
11299		     ptr_mode);
11300}
11301
11302static unsigned char
11303aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
11304{
11305  /* ??? Logically we should only need to provide a value when
11306     HARD_REGNO_MODE_OK says that at least one register in REGCLASS
11307     can hold MODE, but at the moment we need to handle all modes.
11308     Just ignore any runtime parts for registers that can't store them.  */
11309  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
11310  unsigned int nregs, vec_flags;
11311  switch (regclass)
11312    {
11313    case STUB_REGS:
11314    case TAILCALL_ADDR_REGS:
11315    case POINTER_REGS:
11316    case GENERAL_REGS:
11317    case ALL_REGS:
11318    case POINTER_AND_FP_REGS:
11319    case FP_REGS:
11320    case FP_LO_REGS:
11321    case FP_LO8_REGS:
11322      vec_flags = aarch64_classify_vector_mode (mode);
11323      if ((vec_flags & VEC_SVE_DATA)
11324	  && constant_multiple_p (GET_MODE_SIZE (mode),
11325				  aarch64_vl_bytes (mode, vec_flags), &nregs))
11326	return nregs;
11327      return (vec_flags & VEC_ADVSIMD
11328	      ? CEIL (lowest_size, UNITS_PER_VREG)
11329	      : CEIL (lowest_size, UNITS_PER_WORD));
11330    case STACK_REG:
11331    case PR_REGS:
11332    case PR_LO_REGS:
11333    case PR_HI_REGS:
11334    case FFR_REGS:
11335    case PR_AND_FFR_REGS:
11336      return 1;
11337
11338    case NO_REGS:
11339      return 0;
11340
11341    default:
11342      break;
11343    }
11344  gcc_unreachable ();
11345}
11346
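/* Implement TARGET_PREFERRED_RELOAD_CLASS.  */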
11347static reg_class_t
11348aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
11349{
11350  if (regclass == POINTER_REGS)
11351    return GENERAL_REGS;
11352
11353  if (regclass == STACK_REG)
11354    {
11355      if (REG_P(x)
11356	  && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
11357	  return regclass;
11358
11359      return NO_REGS;
11360    }
11361
  /* Register elimination can result in a request for
     SP+constant->FP_REGS.  We cannot support such operations, which
     use SP as the source and an FP_REG as the destination, so reject
     them outright.  */
11366  if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
11367    {
11368      rtx lhs = XEXP (x, 0);
11369
11370      /* Look through a possible SUBREG introduced by ILP32.  */
11371      if (GET_CODE (lhs) == SUBREG)
11372	lhs = SUBREG_REG (lhs);
11373
11374      gcc_assert (REG_P (lhs));
11375      gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
11376				      POINTER_REGS));
11377      return NO_REGS;
11378    }
11379
11380  return regclass;
11381}
11382
11383void
11384aarch64_asm_output_labelref (FILE* f, const char *name)
11385{
11386  asm_fprintf (f, "%U%s", name);
11387}
11388
11389static void
11390aarch64_elf_asm_constructor (rtx symbol, int priority)
11391{
11392  if (priority == DEFAULT_INIT_PRIORITY)
11393    default_ctor_section_asm_out_constructor (symbol, priority);
11394  else
11395    {
11396      section *s;
      /* Although priority is known to be in the range [0, 65535], and so
         18 bytes would be enough, the compiler might not know that.  To
         avoid a -Wformat-truncation false positive, use a larger size.  */
11400      char buf[23];
11401      snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
11402      s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
11403      switch_to_section (s);
11404      assemble_align (POINTER_SIZE);
11405      assemble_aligned_integer (POINTER_BYTES, symbol);
11406    }
11407}
11408
11409static void
11410aarch64_elf_asm_destructor (rtx symbol, int priority)
11411{
11412  if (priority == DEFAULT_INIT_PRIORITY)
11413    default_dtor_section_asm_out_destructor (symbol, priority);
11414  else
11415    {
11416      section *s;
      /* Although priority is known to be in the range [0, 65535], and so
         18 bytes would be enough, the compiler might not know that.  To
         avoid a -Wformat-truncation false positive, use a larger size.  */
11420      char buf[23];
11421      snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
11422      s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
11423      switch_to_section (s);
11424      assemble_align (POINTER_SIZE);
11425      assemble_aligned_integer (POINTER_BYTES, symbol);
11426    }
11427}
11428
11429const char*
11430aarch64_output_casesi (rtx *operands)
11431{
11432  char buf[100];
11433  char label[100];
11434  rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
11435  int index;
11436  static const char *const patterns[4][2] =
11437  {
11438    {
11439      "ldrb\t%w3, [%0,%w1,uxtw]",
11440      "add\t%3, %4, %w3, sxtb #2"
11441    },
11442    {
11443      "ldrh\t%w3, [%0,%w1,uxtw #1]",
11444      "add\t%3, %4, %w3, sxth #2"
11445    },
11446    {
11447      "ldr\t%w3, [%0,%w1,uxtw #2]",
11448      "add\t%3, %4, %w3, sxtw #2"
11449    },
11450    /* We assume that DImode is only generated when not optimizing and
11451       that we don't really need 64-bit address offsets.  That would
11452       imply an object file with 8GB of code in a single function!  */
11453    {
11454      "ldr\t%w3, [%0,%w1,uxtw #2]",
11455      "add\t%3, %4, %w3, sxtw #2"
11456    }
11457  };
11458
11459  gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
11460
11461  scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
11462  index = exact_log2 (GET_MODE_SIZE (mode));
11463
11464  gcc_assert (index >= 0 && index <= 3);
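  /* For a HImode dispatch table this emits, for example:
	ldrh	w3, [x0, w1, uxtw #1]
	adr	x4, .Lrtx<N>
	add	x3, x4, w3, sxth #2
	br	x3
     followed by the speculation barrier (if enabled) and the label that
     the ADR refers to.  The register numbers depend on operand allocation
     and are only illustrative.  */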
11465
  /* Need to implement table size reduction, by changing the code below.  */
11467  output_asm_insn (patterns[index][0], operands);
11468  ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
11469  snprintf (buf, sizeof (buf),
11470	    "adr\t%%4, %s", targetm.strip_name_encoding (label));
11471  output_asm_insn (buf, operands);
11472  output_asm_insn (patterns[index][1], operands);
11473  output_asm_insn ("br\t%3", operands);
11474  output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
11475		   operands);
11476  assemble_label (asm_out_file, label);
11477  return "";
11478}
11479
11480
11481/* Return size in bits of an arithmetic operand which is shifted/scaled and
11482   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
11483   operator.  */
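/* For example, aarch64_uxt_size (3, 0x7f8) returns 8, since
   (x << 3) & 0x7f8 selects exactly the low byte of X shifted left by 3,
   i.e. a UXTB-style extend of an operand scaled by 8.  */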
11484
11485int
11486aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
11487{
11488  if (shift >= 0 && shift <= 3)
11489    {
11490      int size;
11491      for (size = 8; size <= 32; size *= 2)
11492	{
11493	  HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
11494	  if (mask == bits << shift)
11495	    return size;
11496	}
11497    }
11498  return 0;
11499}
11500
/* Constant pools are per-function only when PC-relative literal loads
   are enabled or we are using the large memory model.  */
11504
11505static inline bool
11506aarch64_can_use_per_function_literal_pools_p (void)
11507{
11508  return (aarch64_pcrelative_literal_loads
11509	  || aarch64_cmodel == AARCH64_CMODEL_LARGE);
11510}
11511
11512static bool
11513aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
11514{
11515  /* We can't use blocks for constants when we're using a per-function
11516     constant pool.  */
11517  return !aarch64_can_use_per_function_literal_pools_p ();
11518}
11519
11520/* Select appropriate section for constants depending
11521   on where we place literal pools.  */
11522
11523static section *
11524aarch64_select_rtx_section (machine_mode mode,
11525			    rtx x,
11526			    unsigned HOST_WIDE_INT align)
11527{
11528  if (aarch64_can_use_per_function_literal_pools_p ())
11529    return function_section (current_function_decl);
11530
11531  return default_elf_select_rtx_section (mode, x, align);
11532}
11533
11534/* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
11535void
11536aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
11537				  HOST_WIDE_INT offset)
11538{
11539  /* When using per-function literal pools, we must ensure that any code
11540     section is aligned to the minimal instruction length, lest we get
11541     errors from the assembler re "unaligned instructions".  */
11542  if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
11543    ASM_OUTPUT_ALIGN (f, 2);
11544}
11545
11546/* Costs.  */
11547
11548/* Helper function for rtx cost calculation.  Strip a shift expression
11549   from X.  Returns the inner operand if successful, or the original
11550   expression on failure.  */
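/* For example, both (ashift (reg X) (const_int 3)) and the canonical
   multiply form (mult (reg X) (const_int 8)) strip down to (reg X).  */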
11551static rtx
11552aarch64_strip_shift (rtx x)
11553{
11554  rtx op = x;
11555
11556  /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
11557     we can convert both to ROR during final output.  */
11558  if ((GET_CODE (op) == ASHIFT
11559       || GET_CODE (op) == ASHIFTRT
11560       || GET_CODE (op) == LSHIFTRT
11561       || GET_CODE (op) == ROTATERT
11562       || GET_CODE (op) == ROTATE)
11563      && CONST_INT_P (XEXP (op, 1)))
11564    return XEXP (op, 0);
11565
11566  if (GET_CODE (op) == MULT
11567      && CONST_INT_P (XEXP (op, 1))
11568      && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
11569    return XEXP (op, 0);
11570
11571  return x;
11572}
11573
11574/* Helper function for rtx cost calculation.  Strip an extend
11575   expression from X.  Returns the inner operand if successful, or the
11576   original expression on failure.  We deal with a number of possible
11577   canonicalization variations here. If STRIP_SHIFT is true, then
11578   we can strip off a shift also.  */
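/* For example, (zero_extend:DI (reg:SI W)) strips to (reg:SI W); with
   STRIP_SHIFT, (ashift:DI (zero_extend:DI (reg:SI W)) (const_int 2)) also
   strips to the inner register, mirroring the extended-register operand
   forms of ADD/SUB.  */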
11579static rtx
11580aarch64_strip_extend (rtx x, bool strip_shift)
11581{
11582  scalar_int_mode mode;
11583  rtx op = x;
11584
11585  if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
11586    return op;
11587
11588  /* Zero and sign extraction of a widened value.  */
11589  if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
11590      && XEXP (op, 2) == const0_rtx
11591      && GET_CODE (XEXP (op, 0)) == MULT
11592      && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
11593					 XEXP (op, 1)))
11594    return XEXP (XEXP (op, 0), 0);
11595
11596  /* It can also be represented (for zero-extend) as an AND with an
11597     immediate.  */
11598  if (GET_CODE (op) == AND
11599      && GET_CODE (XEXP (op, 0)) == MULT
11600      && CONST_INT_P (XEXP (XEXP (op, 0), 1))
11601      && CONST_INT_P (XEXP (op, 1))
11602      && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
11603			   INTVAL (XEXP (op, 1))) != 0)
11604    return XEXP (XEXP (op, 0), 0);
11605
11606  /* Now handle extended register, as this may also have an optional
11607     left shift by 1..4.  */
11608  if (strip_shift
11609      && GET_CODE (op) == ASHIFT
11610      && CONST_INT_P (XEXP (op, 1))
11611      && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
11612    op = XEXP (op, 0);
11613
11614  if (GET_CODE (op) == ZERO_EXTEND
11615      || GET_CODE (op) == SIGN_EXTEND)
11616    op = XEXP (op, 0);
11617
11618  if (op != x)
11619    return op;
11620
11621  return x;
11622}
11623
11624/* Return true iff CODE is a shift supported in combination
11625   with arithmetic instructions.  */
11626
11627static bool
11628aarch64_shift_p (enum rtx_code code)
11629{
11630  return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
11631}
11632
11633
11634/* Return true iff X is a cheap shift without a sign extend. */
11635
11636static bool
11637aarch64_cheap_mult_shift_p (rtx x)
11638{
11639  rtx op0, op1;
11640
11641  op0 = XEXP (x, 0);
11642  op1 = XEXP (x, 1);
11643
11644  if (!(aarch64_tune_params.extra_tuning_flags
11645                      & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
11646    return false;
11647
11648  if (GET_CODE (op0) == SIGN_EXTEND)
11649    return false;
11650
11651  if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
11652      && UINTVAL (op1) <= 4)
11653    return true;
11654
11655  if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
11656    return false;
11657
11658  HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
11659
11660  if (l2 > 0 && l2 <= 4)
11661    return true;
11662
11663  return false;
11664}
11665
11666/* Helper function for rtx cost calculation.  Calculate the cost of
11667   a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
11668   Return the calculated cost of the expression, recursing manually in to
11669   operands where needed.  */
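/* For example, (plus (mult (reg X) (const_int 4)) (reg Y)) is costed as a
   single ADD with a shifted operand rather than as a separate multiply
   followed by an add.  */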
11670
11671static int
11672aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
11673{
11674  rtx op0, op1;
11675  const struct cpu_cost_table *extra_cost
11676    = aarch64_tune_params.insn_extra_cost;
11677  int cost = 0;
11678  bool compound_p = (outer == PLUS || outer == MINUS);
11679  machine_mode mode = GET_MODE (x);
11680
11681  gcc_checking_assert (code == MULT);
11682
11683  op0 = XEXP (x, 0);
11684  op1 = XEXP (x, 1);
11685
11686  if (VECTOR_MODE_P (mode))
11687    mode = GET_MODE_INNER (mode);
11688
11689  /* Integer multiply/fma.  */
11690  if (GET_MODE_CLASS (mode) == MODE_INT)
11691    {
11692      /* The multiply will be canonicalized as a shift, cost it as such.  */
11693      if (aarch64_shift_p (GET_CODE (x))
11694	  || (CONST_INT_P (op1)
11695	      && exact_log2 (INTVAL (op1)) > 0))
11696	{
11697	  bool is_extend = GET_CODE (op0) == ZERO_EXTEND
11698	                   || GET_CODE (op0) == SIGN_EXTEND;
11699	  if (speed)
11700	    {
11701	      if (compound_p)
11702	        {
11703		  /* If the shift is considered cheap,
11704		     then don't add any cost. */
11705		  if (aarch64_cheap_mult_shift_p (x))
11706		    ;
11707	          else if (REG_P (op1))
11708		    /* ARITH + shift-by-register.  */
11709		    cost += extra_cost->alu.arith_shift_reg;
11710		  else if (is_extend)
11711		    /* ARITH + extended register.  We don't have a cost field
11712		       for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
11713		    cost += extra_cost->alu.extend_arith;
11714		  else
11715		    /* ARITH + shift-by-immediate.  */
11716		    cost += extra_cost->alu.arith_shift;
11717		}
11718	      else
11719		/* LSL (immediate).  */
11720	        cost += extra_cost->alu.shift;
11721
11722	    }
11723	  /* Strip extends as we will have costed them in the case above.  */
11724	  if (is_extend)
11725	    op0 = aarch64_strip_extend (op0, true);
11726
11727	  cost += rtx_cost (op0, VOIDmode, code, 0, speed);
11728
11729	  return cost;
11730	}
11731
      /* MNEG or [US]MNEGL.  Extract the NEG operand, mark this as a
	 compound operation and let the cases below handle it.  After all,
	 MNEG is a special-case alias of MSUB.  */
11735      if (GET_CODE (op0) == NEG)
11736	{
11737	  op0 = XEXP (op0, 0);
11738	  compound_p = true;
11739	}
11740
11741      /* Integer multiplies or FMAs have zero/sign extending variants.  */
11742      if ((GET_CODE (op0) == ZERO_EXTEND
11743	   && GET_CODE (op1) == ZERO_EXTEND)
11744	  || (GET_CODE (op0) == SIGN_EXTEND
11745	      && GET_CODE (op1) == SIGN_EXTEND))
11746	{
11747	  cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
11748	  cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
11749
11750	  if (speed)
11751	    {
11752	      if (compound_p)
11753		/* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
11754		cost += extra_cost->mult[0].extend_add;
11755	      else
11756		/* MUL/SMULL/UMULL.  */
11757		cost += extra_cost->mult[0].extend;
11758	    }
11759
11760	  return cost;
11761	}
11762
11763      /* This is either an integer multiply or a MADD.  In both cases
11764	 we want to recurse and cost the operands.  */
11765      cost += rtx_cost (op0, mode, MULT, 0, speed);
11766      cost += rtx_cost (op1, mode, MULT, 1, speed);
11767
11768      if (speed)
11769	{
11770	  if (compound_p)
11771	    /* MADD/MSUB.  */
11772	    cost += extra_cost->mult[mode == DImode].add;
11773	  else
11774	    /* MUL.  */
11775	    cost += extra_cost->mult[mode == DImode].simple;
11776	}
11777
11778      return cost;
11779    }
11780  else
11781    {
11782      if (speed)
11783	{
	  /* Floating-point FMA/FMUL can also support negations of the
	     operands, unless the rounding mode is upward or downward, in
	     which case FNMUL is different from FMUL with operand negation.  */
11787	  bool neg0 = GET_CODE (op0) == NEG;
11788	  bool neg1 = GET_CODE (op1) == NEG;
11789	  if (compound_p || !flag_rounding_math || (neg0 && neg1))
11790	    {
11791	      if (neg0)
11792		op0 = XEXP (op0, 0);
11793	      if (neg1)
11794		op1 = XEXP (op1, 0);
11795	    }
11796
11797	  if (compound_p)
11798	    /* FMADD/FNMADD/FNMSUB/FMSUB.  */
11799	    cost += extra_cost->fp[mode == DFmode].fma;
11800	  else
11801	    /* FMUL/FNMUL.  */
11802	    cost += extra_cost->fp[mode == DFmode].mult;
11803	}
11804
11805      cost += rtx_cost (op0, mode, MULT, 0, speed);
11806      cost += rtx_cost (op1, mode, MULT, 1, speed);
11807      return cost;
11808    }
11809}
11810
11811static int
11812aarch64_address_cost (rtx x,
11813		      machine_mode mode,
11814		      addr_space_t as ATTRIBUTE_UNUSED,
11815		      bool speed)
11816{
11817  enum rtx_code c = GET_CODE (x);
11818  const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
11819  struct aarch64_address_info info;
11820  int cost = 0;
11821  info.shift = 0;
11822
11823  if (!aarch64_classify_address (&info, x, mode, false))
11824    {
11825      if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
11826	{
11827	  /* This is a CONST or SYMBOL ref which will be split
11828	     in a different way depending on the code model in use.
11829	     Cost it through the generic infrastructure.  */
11830	  int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
11831	  /* Divide through by the cost of one instruction to
11832	     bring it to the same units as the address costs.  */
11833	  cost_symbol_ref /= COSTS_N_INSNS (1);
11834	  /* The cost is then the cost of preparing the address,
11835	     followed by an immediate (possibly 0) offset.  */
11836	  return cost_symbol_ref + addr_cost->imm_offset;
11837	}
11838      else
11839	{
11840	  /* This is most likely a jump table from a case
11841	     statement.  */
11842	  return addr_cost->register_offset;
11843	}
11844    }
11845
11846  switch (info.type)
11847    {
11848      case ADDRESS_LO_SUM:
11849      case ADDRESS_SYMBOLIC:
11850      case ADDRESS_REG_IMM:
11851	cost += addr_cost->imm_offset;
11852	break;
11853
11854      case ADDRESS_REG_WB:
11855	if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
11856	  cost += addr_cost->pre_modify;
11857	else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
11858	  cost += addr_cost->post_modify;
11859	else
11860	  gcc_unreachable ();
11861
11862	break;
11863
11864      case ADDRESS_REG_REG:
11865	cost += addr_cost->register_offset;
11866	break;
11867
11868      case ADDRESS_REG_SXTW:
11869	cost += addr_cost->register_sextend;
11870	break;
11871
11872      case ADDRESS_REG_UXTW:
11873	cost += addr_cost->register_zextend;
11874	break;
11875
11876      default:
11877	gcc_unreachable ();
11878    }
11879
11880
11881  if (info.shift > 0)
11882    {
11883      /* For the sake of calculating the cost of the shifted register
11884	 component, we can treat same sized modes in the same way.  */
11885      if (known_eq (GET_MODE_BITSIZE (mode), 16))
11886	cost += addr_cost->addr_scale_costs.hi;
11887      else if (known_eq (GET_MODE_BITSIZE (mode), 32))
11888	cost += addr_cost->addr_scale_costs.si;
11889      else if (known_eq (GET_MODE_BITSIZE (mode), 64))
11890	cost += addr_cost->addr_scale_costs.di;
11891      else
11892	/* We can't tell, or this is a 128-bit vector.  */
11893	cost += addr_cost->addr_scale_costs.ti;
11894    }
11895
11896  return cost;
11897}
11898
/* Return the cost of a branch.  If SPEED_P is true then the compiler is
   optimizing for speed.  If PREDICTABLE_P is true then the branch is
   expected to be well predicted.  */
11902
11903int
11904aarch64_branch_cost (bool speed_p, bool predictable_p)
11905{
11906  /* When optimizing for speed, use the cost of unpredictable branches.  */
11907  const struct cpu_branch_cost *branch_costs =
11908    aarch64_tune_params.branch_costs;
11909
11910  if (!speed_p || predictable_p)
11911    return branch_costs->predictable;
11912  else
11913    return branch_costs->unpredictable;
11914}
11915
11916/* Return true if the RTX X in mode MODE is a zero or sign extract
11917   usable in an ADD or SUB (extended register) instruction.  */
11918static bool
11919aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
11920{
11921  /* Catch add with a sign extract.
11922     This is add_<optab><mode>_multp2.  */
11923  if (GET_CODE (x) == SIGN_EXTRACT
11924      || GET_CODE (x) == ZERO_EXTRACT)
11925    {
11926      rtx op0 = XEXP (x, 0);
11927      rtx op1 = XEXP (x, 1);
11928      rtx op2 = XEXP (x, 2);
11929
11930      if (GET_CODE (op0) == MULT
11931	  && CONST_INT_P (op1)
11932	  && op2 == const0_rtx
11933	  && CONST_INT_P (XEXP (op0, 1))
11934	  && aarch64_is_extend_from_extract (mode,
11935					     XEXP (op0, 1),
11936					     op1))
11937	{
11938	  return true;
11939	}
11940    }
11941  /* The simple case <ARITH>, XD, XN, XM, [us]xt.
11942     No shift.  */
11943  else if (GET_CODE (x) == SIGN_EXTEND
11944	   || GET_CODE (x) == ZERO_EXTEND)
11945    return REG_P (XEXP (x, 0));
11946
11947  return false;
11948}
11949
11950static bool
11951aarch64_frint_unspec_p (unsigned int u)
11952{
11953  switch (u)
11954    {
11955      case UNSPEC_FRINTZ:
11956      case UNSPEC_FRINTP:
11957      case UNSPEC_FRINTM:
11958      case UNSPEC_FRINTA:
11959      case UNSPEC_FRINTN:
11960      case UNSPEC_FRINTX:
11961      case UNSPEC_FRINTI:
11962        return true;
11963
11964      default:
11965        return false;
11966    }
11967}
11968
11969/* Return true iff X is an rtx that will match an extr instruction
11970   i.e. as described in the *extr<mode>5_insn family of patterns.
11971   OP0 and OP1 will be set to the operands of the shifts involved
11972   on success and will be NULL_RTX otherwise.  */
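/* For example, in DImode (ior (ashift X (const_int 48))
   (lshiftrt Y (const_int 16))) matches EXTR Xd, Xx, Xy, #16, because the
   two shift amounts sum to the mode bitsize.  */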
11973
11974static bool
11975aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
11976{
11977  rtx op0, op1;
11978  scalar_int_mode mode;
11979  if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
11980    return false;
11981
11982  *res_op0 = NULL_RTX;
11983  *res_op1 = NULL_RTX;
11984
11985  if (GET_CODE (x) != IOR)
11986    return false;
11987
11988  op0 = XEXP (x, 0);
11989  op1 = XEXP (x, 1);
11990
11991  if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
11992      || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
11993    {
11994     /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
11995      if (GET_CODE (op1) == ASHIFT)
11996        std::swap (op0, op1);
11997
11998      if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
11999        return false;
12000
12001      unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
12002      unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
12003
12004      if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
12005          && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
12006        {
12007          *res_op0 = XEXP (op0, 0);
12008          *res_op1 = XEXP (op1, 0);
12009          return true;
12010        }
12011    }
12012
12013  return false;
12014}
12015
12016/* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
12017   storing it in *COST.  Result is true if the total cost of the operation
12018   has now been calculated.  */
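/* For example, (if_then_else (ne (reg X) (const_int 0)) (label_ref L) (pc))
   is costed as a CBNZ, whereas a comparison of a CC-mode register models an
   ordinary conditional branch on the flags.  */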
12019static bool
12020aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
12021{
12022  rtx inner;
12023  rtx comparator;
12024  enum rtx_code cmpcode;
12025  const struct cpu_cost_table *extra_cost
12026    = aarch64_tune_params.insn_extra_cost;
12027
12028  if (COMPARISON_P (op0))
12029    {
12030      inner = XEXP (op0, 0);
12031      comparator = XEXP (op0, 1);
12032      cmpcode = GET_CODE (op0);
12033    }
12034  else
12035    {
12036      inner = op0;
12037      comparator = const0_rtx;
12038      cmpcode = NE;
12039    }
12040
12041  if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
12042    {
12043      /* Conditional branch.  */
12044      if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
12045	return true;
12046      else
12047	{
12048	  if (cmpcode == NE || cmpcode == EQ)
12049	    {
12050	      if (comparator == const0_rtx)
12051		{
12052		  /* TBZ/TBNZ/CBZ/CBNZ.  */
12053		  if (GET_CODE (inner) == ZERO_EXTRACT)
12054		    /* TBZ/TBNZ.  */
12055		    *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
12056				       ZERO_EXTRACT, 0, speed);
12057		  else
12058		    /* CBZ/CBNZ.  */
12059		    *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
12060
12061		  return true;
12062		}
12063	      if (register_operand (inner, VOIDmode)
12064		  && aarch64_imm24 (comparator, VOIDmode))
12065		{
12066		  /* SUB and SUBS.  */
12067		  *cost += COSTS_N_INSNS (2);
12068		  if (speed)
12069		    *cost += extra_cost->alu.arith * 2;
12070		  return true;
12071		}
12072	    }
12073	  else if (cmpcode == LT || cmpcode == GE)
12074	    {
12075	      /* TBZ/TBNZ.  */
12076	      if (comparator == const0_rtx)
12077		return true;
12078	    }
12079	}
12080    }
12081  else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
12082    {
12083      /* CCMP.  */
12084      if (GET_CODE (op1) == COMPARE)
12085	{
12086	  /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
12087	  if (XEXP (op1, 1) == const0_rtx)
12088	    *cost += 1;
12089	  if (speed)
12090	    {
12091	      machine_mode mode = GET_MODE (XEXP (op1, 0));
12092	      const struct cpu_cost_table *extra_cost
12093		= aarch64_tune_params.insn_extra_cost;
12094
12095	      if (GET_MODE_CLASS (mode) == MODE_INT)
12096		*cost += extra_cost->alu.arith;
12097	      else
12098		*cost += extra_cost->fp[mode == DFmode].compare;
12099	    }
12100	  return true;
12101	}
12102
12103      /* It's a conditional operation based on the status flags,
12104	 so it must be some flavor of CSEL.  */
12105
12106      /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
12107      if (GET_CODE (op1) == NEG
12108          || GET_CODE (op1) == NOT
12109          || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
12110	op1 = XEXP (op1, 0);
12111      else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
12112	{
12113	  /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
12114	  op1 = XEXP (op1, 0);
12115	  op2 = XEXP (op2, 0);
12116	}
12117
12118      *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
12119      *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
12120      return true;
12121    }
12122
12123  /* We don't know what this is, cost all operands.  */
12124  return false;
12125}
12126
12127/* Check whether X is a bitfield operation of the form shift + extend that
12128   maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
12129   operand to which the bitfield operation is applied.  Otherwise return
12130   NULL_RTX.  */
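/* For example, (zero_extend:SI (lshiftrt:HI (reg X) (const_int 3))) maps to
   a UBFX of (reg X), while (sign_extend:DI (ashift:QI (reg X) (const_int 2)))
   maps to an SBFIZ.  */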
12131
12132static rtx
12133aarch64_extend_bitfield_pattern_p (rtx x)
12134{
12135  rtx_code outer_code = GET_CODE (x);
12136  machine_mode outer_mode = GET_MODE (x);
12137
12138  if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
12139      && outer_mode != SImode && outer_mode != DImode)
12140    return NULL_RTX;
12141
12142  rtx inner = XEXP (x, 0);
12143  rtx_code inner_code = GET_CODE (inner);
12144  machine_mode inner_mode = GET_MODE (inner);
12145  rtx op = NULL_RTX;
12146
12147  switch (inner_code)
12148    {
12149      case ASHIFT:
12150	if (CONST_INT_P (XEXP (inner, 1))
12151	    && (inner_mode == QImode || inner_mode == HImode))
12152	  op = XEXP (inner, 0);
12153	break;
12154      case LSHIFTRT:
12155	if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
12156	    && (inner_mode == QImode || inner_mode == HImode))
12157	  op = XEXP (inner, 0);
12158	break;
12159      case ASHIFTRT:
12160	if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
12161	    && (inner_mode == QImode || inner_mode == HImode))
12162	  op = XEXP (inner, 0);
12163	break;
12164      default:
12165	break;
12166    }
12167
12168  return op;
12169}
12170
12171/* Return true if the mask and a shift amount from an RTX of the form
12172   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
12173   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
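/* For example, with MODE == SImode, MASK == 0xff0 and SHFT_AMNT == 4
   (i.e. (x << 4) & 0xff0), the shifted-down mask 0xff is contiguous and no
   mask bits lie below the shift amount, so the combination is a UBFIZ.  */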
12174
12175bool
12176aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
12177				    rtx shft_amnt)
12178{
12179  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
12180	 && INTVAL (mask) > 0
12181	 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
12182	 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
12183	 && (UINTVAL (mask)
12184	     & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
12185}
12186
12187/* Return true if the masks and a shift amount from an RTX of the form
12188   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
   a BFI instruction of mode MODE.  See the *aarch64_bfi patterns.  */
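/* For example, MASK2 == 0xff00 with SHFT_AMNT == 8 (and MASK1 == ~MASK2)
   describes inserting the low 8 bits of y at bit position 8 of x, i.e.
   BFI x, y, #8, #8.  */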
12190
12191bool
12192aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
12193				   unsigned HOST_WIDE_INT mask1,
12194				   unsigned HOST_WIDE_INT shft_amnt,
12195				   unsigned HOST_WIDE_INT mask2)
12196{
12197  unsigned HOST_WIDE_INT t;
12198
12199  /* Verify that there is no overlap in what bits are set in the two masks.  */
12200  if (mask1 != ~mask2)
12201    return false;
12202
12203  /* Verify that mask2 is not all zeros or ones.  */
12204  if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
12205    return false;
12206
12207  /* The shift amount should always be less than the mode size.  */
12208  gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
12209
12210  /* Verify that the mask being shifted is contiguous and would be in the
12211     least significant bits after shifting by shft_amnt.  */
12212  t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
12213  return (t == (t & -t));
12214}
12215
12216/* Calculate the cost of calculating X, storing it in *COST.  Result
12217   is true if the total cost of the operation has now been calculated.  */
12218static bool
12219aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
12220		   int param ATTRIBUTE_UNUSED, int *cost, bool speed)
12221{
12222  rtx op0, op1, op2;
12223  const struct cpu_cost_table *extra_cost
12224    = aarch64_tune_params.insn_extra_cost;
12225  int code = GET_CODE (x);
12226  scalar_int_mode int_mode;
12227
12228  /* By default, assume that everything has equivalent cost to the
12229     cheapest instruction.  Any additional costs are applied as a delta
12230     above this default.  */
12231  *cost = COSTS_N_INSNS (1);
12232
12233  switch (code)
12234    {
12235    case SET:
12236      /* The cost depends entirely on the operands to SET.  */
12237      *cost = 0;
12238      op0 = SET_DEST (x);
12239      op1 = SET_SRC (x);
12240
12241      switch (GET_CODE (op0))
12242	{
12243	case MEM:
12244	  if (speed)
12245	    {
12246	      rtx address = XEXP (op0, 0);
12247	      if (VECTOR_MODE_P (mode))
12248		*cost += extra_cost->ldst.storev;
12249	      else if (GET_MODE_CLASS (mode) == MODE_INT)
12250		*cost += extra_cost->ldst.store;
12251	      else if (mode == SFmode)
12252		*cost += extra_cost->ldst.storef;
12253	      else if (mode == DFmode)
12254		*cost += extra_cost->ldst.stored;
12255
12256	      *cost +=
12257		COSTS_N_INSNS (aarch64_address_cost (address, mode,
12258						     0, speed));
12259	    }
12260
12261	  *cost += rtx_cost (op1, mode, SET, 1, speed);
12262	  return true;
12263
12264	case SUBREG:
12265	  if (! REG_P (SUBREG_REG (op0)))
12266	    *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
12267
12268	  /* Fall through.  */
12269	case REG:
12270	  /* The cost is one per vector-register copied.  */
12271	  if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
12272	    {
12273	      int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
12274	      *cost = COSTS_N_INSNS (nregs);
12275	    }
12276	  /* const0_rtx is in general free, but we will use an
12277	     instruction to set a register to 0.  */
12278	  else if (REG_P (op1) || op1 == const0_rtx)
12279	    {
12280	      /* The cost is 1 per register copied.  */
12281	      int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
12282	      *cost = COSTS_N_INSNS (nregs);
12283	    }
12284          else
12285	    /* Cost is just the cost of the RHS of the set.  */
12286	    *cost += rtx_cost (op1, mode, SET, 1, speed);
12287	  return true;
12288
12289	case ZERO_EXTRACT:
12290	case SIGN_EXTRACT:
12291	  /* Bit-field insertion.  Strip any redundant widening of
12292	     the RHS to meet the width of the target.  */
12293	  if (GET_CODE (op1) == SUBREG)
12294	    op1 = SUBREG_REG (op1);
12295	  if ((GET_CODE (op1) == ZERO_EXTEND
12296	       || GET_CODE (op1) == SIGN_EXTEND)
12297	      && CONST_INT_P (XEXP (op0, 1))
12298	      && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
12299	      && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
12300	    op1 = XEXP (op1, 0);
12301
12302          if (CONST_INT_P (op1))
12303            {
12304              /* MOV immediate is assumed to always be cheap.  */
12305              *cost = COSTS_N_INSNS (1);
12306            }
12307          else
12308            {
12309              /* BFM.  */
12310	      if (speed)
12311		*cost += extra_cost->alu.bfi;
12312	      *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
12313            }
12314
12315	  return true;
12316
12317	default:
12318	  /* We can't make sense of this, assume default cost.  */
12319          *cost = COSTS_N_INSNS (1);
12320	  return false;
12321	}
12322      return false;
12323
12324    case CONST_INT:
12325      /* If an instruction can incorporate a constant within the
12326	 instruction, the instruction's expression avoids calling
12327	 rtx_cost() on the constant.  If rtx_cost() is called on a
12328	 constant, then it is usually because the constant must be
12329	 moved into a register by one or more instructions.
12330
12331	 The exception is constant 0, which can be expressed
12332	 as XZR/WZR and is therefore free.  The exception to this is
12333	 if we have (set (reg) (const0_rtx)) in which case we must cost
12334	 the move.  However, we can catch that when we cost the SET, so
12335	 we don't need to consider that here.  */
12336      if (x == const0_rtx)
12337	*cost = 0;
12338      else
12339	{
12340	  /* To an approximation, building any other constant is
12341	     proportionally expensive to the number of instructions
12342	     required to build that constant.  This is true whether we
12343	     are compiling for SPEED or otherwise.  */
12344	  if (!is_a <scalar_int_mode> (mode, &int_mode))
12345	    int_mode = word_mode;
12346	  *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
12347				 (NULL_RTX, x, false, int_mode));
12348	}
12349      return true;
12350
12351    case CONST_DOUBLE:
12352
      /* First determine the number of instructions needed to do the
	 move as an integer constant.  */
12355      if (!aarch64_float_const_representable_p (x)
12356	   && !aarch64_can_const_movi_rtx_p (x, mode)
12357	   && aarch64_float_const_rtx_p (x))
12358	{
12359	  unsigned HOST_WIDE_INT ival;
12360	  bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
12361	  gcc_assert (succeed);
12362
12363	  scalar_int_mode imode = (mode == HFmode
12364				   ? SImode
12365				   : int_mode_for_mode (mode).require ());
12366	  int ncost = aarch64_internal_mov_immediate
12367		(NULL_RTX, gen_int_mode (ival, imode), false, imode);
12368	  *cost += COSTS_N_INSNS (ncost);
12369	  return true;
12370	}
12371
12372      if (speed)
12373	{
12374	  /* mov[df,sf]_aarch64.  */
12375	  if (aarch64_float_const_representable_p (x))
12376	    /* FMOV (scalar immediate).  */
12377	    *cost += extra_cost->fp[mode == DFmode].fpconst;
12378	  else if (!aarch64_float_const_zero_rtx_p (x))
12379	    {
12380	      /* This will be a load from memory.  */
12381	      if (mode == DFmode)
12382		*cost += extra_cost->ldst.loadd;
12383	      else
12384		*cost += extra_cost->ldst.loadf;
12385	    }
12386	  else
12387	    /* Otherwise this is +0.0.  We get this using MOVI d0, #0
	       or MOV v0.s[0], wzr - neither of which is modeled by the
12389	       cost tables.  Just use the default cost.  */
12390	    {
12391	    }
12392	}
12393
12394      return true;
12395
12396    case MEM:
12397      if (speed)
12398	{
12399	  /* For loads we want the base cost of a load, plus an
12400	     approximation for the additional cost of the addressing
12401	     mode.  */
12402	  rtx address = XEXP (x, 0);
12403	  if (VECTOR_MODE_P (mode))
12404	    *cost += extra_cost->ldst.loadv;
12405	  else if (GET_MODE_CLASS (mode) == MODE_INT)
12406	    *cost += extra_cost->ldst.load;
12407	  else if (mode == SFmode)
12408	    *cost += extra_cost->ldst.loadf;
12409	  else if (mode == DFmode)
12410	    *cost += extra_cost->ldst.loadd;
12411
12412	  *cost +=
12413		COSTS_N_INSNS (aarch64_address_cost (address, mode,
12414						     0, speed));
12415	}
12416
12417      return true;
12418
12419    case NEG:
12420      op0 = XEXP (x, 0);
12421
12422      if (VECTOR_MODE_P (mode))
12423	{
12424	  if (speed)
12425	    {
12426	      /* FNEG.  */
12427	      *cost += extra_cost->vect.alu;
12428	    }
12429	  return false;
12430	}
12431
12432      if (GET_MODE_CLASS (mode) == MODE_INT)
12433	{
12434          if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12435              || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12436            {
12437              /* CSETM.  */
12438	      *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
12439              return true;
12440            }
12441
12442	  /* Cost this as SUB wzr, X.  */
12443          op0 = CONST0_RTX (mode);
12444          op1 = XEXP (x, 0);
12445          goto cost_minus;
12446        }
12447
12448      if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12449        {
12450          /* Support (neg(fma...)) as a single instruction only if
12451             sign of zeros is unimportant.  This matches the decision
12452             making in aarch64.md.  */
12453          if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
12454            {
12455	      /* FNMADD.  */
12456	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
12457              return true;
12458            }
12459	  if (GET_CODE (op0) == MULT)
12460	    {
12461	      /* FNMUL.  */
12462	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
12463	      return true;
12464	    }
12465	  if (speed)
12466	    /* FNEG.  */
12467	    *cost += extra_cost->fp[mode == DFmode].neg;
12468          return false;
12469        }
12470
12471      return false;
12472
12473    case CLRSB:
12474    case CLZ:
12475      if (speed)
12476	{
12477	  if (VECTOR_MODE_P (mode))
12478	    *cost += extra_cost->vect.alu;
12479	  else
12480	    *cost += extra_cost->alu.clz;
12481	}
12482
12483      return false;
12484
12485    case CTZ:
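      /* CTZ is synthesized as RBIT followed by CLZ, hence two instructions;
	 the cost below reuses the CLZ and REV table entries as an
	 approximation.  */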
12486      *cost = COSTS_N_INSNS (2);
12487
12488      if (speed)
12489	*cost += extra_cost->alu.clz + extra_cost->alu.rev;
12490      return false;
12491
12492    case COMPARE:
12493      op0 = XEXP (x, 0);
12494      op1 = XEXP (x, 1);
12495
12496      if (op1 == const0_rtx
12497	  && GET_CODE (op0) == AND)
12498	{
12499	  x = op0;
12500	  mode = GET_MODE (op0);
12501	  goto cost_logic;
12502	}
12503
12504      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
12505        {
          /* TODO: A write to the CC flags possibly costs extra; this
	     needs encoding in the cost tables.  */
12508
12509	  mode = GET_MODE (op0);
12510          /* ANDS.  */
12511          if (GET_CODE (op0) == AND)
12512            {
12513              x = op0;
12514              goto cost_logic;
12515            }
12516
12517          if (GET_CODE (op0) == PLUS)
12518            {
12519	      /* ADDS (and CMN alias).  */
12520              x = op0;
12521              goto cost_plus;
12522            }
12523
12524          if (GET_CODE (op0) == MINUS)
12525            {
12526	      /* SUBS.  */
12527              x = op0;
12528              goto cost_minus;
12529            }
12530
12531	  if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
12532	      && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
12533	      && CONST_INT_P (XEXP (op0, 2)))
12534	    {
12535	      /* COMPARE of ZERO_EXTRACT form of TST-immediate.
12536		 Handle it here directly rather than going to cost_logic
12537		 since we know the immediate generated for the TST is valid
12538		 so we can avoid creating an intermediate rtx for it only
12539		 for costing purposes.  */
12540	      if (speed)
12541		*cost += extra_cost->alu.logical;
12542
12543	      *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
12544				 ZERO_EXTRACT, 0, speed);
12545	      return true;
12546	    }
12547
12548          if (GET_CODE (op1) == NEG)
12549            {
12550	      /* CMN.  */
12551	      if (speed)
12552		*cost += extra_cost->alu.arith;
12553
12554	      *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
12555	      *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
12556              return true;
12557            }
12558
12559          /* CMP.
12560
12561	     Compare can freely swap the order of operands, and
12562             canonicalization puts the more complex operation first.
12563             But the integer MINUS logic expects the shift/extend
12564             operation in op1.  */
12565          if (! (REG_P (op0)
12566                 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
12567          {
12568            op0 = XEXP (x, 1);
12569            op1 = XEXP (x, 0);
12570          }
12571          goto cost_minus;
12572        }
12573
12574      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
12575        {
12576	  /* FCMP.  */
12577	  if (speed)
12578	    *cost += extra_cost->fp[mode == DFmode].compare;
12579
12580          if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
12581            {
12582	      *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
12583              /* FCMP supports constant 0.0 for no extra cost. */
12584              return true;
12585            }
12586          return false;
12587        }
12588
12589      if (VECTOR_MODE_P (mode))
12590	{
12591	  /* Vector compare.  */
12592	  if (speed)
12593	    *cost += extra_cost->vect.alu;
12594
12595	  if (aarch64_float_const_zero_rtx_p (op1))
12596	    {
12597	      /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
12598		 cost.  */
12599	      return true;
12600	    }
12601	  return false;
12602	}
12603      return false;
12604
12605    case MINUS:
12606      {
12607	op0 = XEXP (x, 0);
12608	op1 = XEXP (x, 1);
12609
12610cost_minus:
12611	*cost += rtx_cost (op0, mode, MINUS, 0, speed);
12612
12613	/* Detect valid immediates.  */
12614	if ((GET_MODE_CLASS (mode) == MODE_INT
12615	     || (GET_MODE_CLASS (mode) == MODE_CC
12616		 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
12617	    && CONST_INT_P (op1)
12618	    && aarch64_uimm12_shift (INTVAL (op1)))
12619	  {
12620	    if (speed)
12621	      /* SUB(S) (immediate).  */
12622	      *cost += extra_cost->alu.arith;
12623	    return true;
12624	  }
12625
12626	/* Look for SUB (extended register).  */
12627	if (is_a <scalar_int_mode> (mode, &int_mode)
12628	    && aarch64_rtx_arith_op_extract_p (op1, int_mode))
12629	  {
12630	    if (speed)
12631	      *cost += extra_cost->alu.extend_arith;
12632
12633	    op1 = aarch64_strip_extend (op1, true);
12634	    *cost += rtx_cost (op1, VOIDmode,
12635			       (enum rtx_code) GET_CODE (op1), 0, speed);
12636	    return true;
12637	  }
12638
12639	rtx new_op1 = aarch64_strip_extend (op1, false);
12640
12641	/* Cost this as an FMA-alike operation.  */
12642	if ((GET_CODE (new_op1) == MULT
12643	     || aarch64_shift_p (GET_CODE (new_op1)))
12644	    && code != COMPARE)
12645	  {
12646	    *cost += aarch64_rtx_mult_cost (new_op1, MULT,
12647					    (enum rtx_code) code,
12648					    speed);
12649	    return true;
12650	  }
12651
12652	*cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
12653
12654	if (speed)
12655	  {
12656	    if (VECTOR_MODE_P (mode))
12657	      {
12658		/* Vector SUB.  */
12659		*cost += extra_cost->vect.alu;
12660	      }
12661	    else if (GET_MODE_CLASS (mode) == MODE_INT)
12662	      {
12663		/* SUB(S).  */
12664		*cost += extra_cost->alu.arith;
12665	      }
12666	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12667	      {
12668		/* FSUB.  */
12669		*cost += extra_cost->fp[mode == DFmode].addsub;
12670	      }
12671	  }
12672	return true;
12673      }
12674
12675    case PLUS:
12676      {
12677	rtx new_op0;
12678
12679	op0 = XEXP (x, 0);
12680	op1 = XEXP (x, 1);
12681
12682cost_plus:
12683	if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12684	    || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12685	  {
12686	    /* CSINC.  */
12687	    *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
12688	    *cost += rtx_cost (op1, mode, PLUS, 1, speed);
12689	    return true;
12690	  }
12691
12692	if (GET_MODE_CLASS (mode) == MODE_INT
12693	    && (aarch64_plus_immediate (op1, mode)
12694		|| aarch64_sve_addvl_addpl_immediate (op1, mode)))
12695	  {
12696	    *cost += rtx_cost (op0, mode, PLUS, 0, speed);
12697
12698	    if (speed)
12699	      {
12700		/* ADD (immediate).  */
12701		*cost += extra_cost->alu.arith;
12702
12703		/* Some tunings prefer to not use the VL-based scalar ops.
12704		   Increase the cost of the poly immediate to prevent their
12705		   formation.  */
12706		if (GET_CODE (op1) == CONST_POLY_INT
12707		    && (aarch64_tune_params.extra_tuning_flags
12708			& AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
12709		  *cost += COSTS_N_INSNS (1);
12710	      }
12711	    return true;
12712	  }
12713
12714	*cost += rtx_cost (op1, mode, PLUS, 1, speed);
12715
12716	/* Look for ADD (extended register).  */
12717	if (is_a <scalar_int_mode> (mode, &int_mode)
12718	    && aarch64_rtx_arith_op_extract_p (op0, int_mode))
12719	  {
12720	    if (speed)
12721	      *cost += extra_cost->alu.extend_arith;
12722
12723	    op0 = aarch64_strip_extend (op0, true);
12724	    *cost += rtx_cost (op0, VOIDmode,
12725			       (enum rtx_code) GET_CODE (op0), 0, speed);
12726	    return true;
12727	  }
12728
12729	/* Strip any extend, leave shifts behind as we will
12730	   cost them through mult_cost.  */
12731	new_op0 = aarch64_strip_extend (op0, false);
12732
12733	if (GET_CODE (new_op0) == MULT
12734	    || aarch64_shift_p (GET_CODE (new_op0)))
12735	  {
12736	    *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
12737					    speed);
12738	    return true;
12739	  }
12740
12741	*cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
12742
12743	if (speed)
12744	  {
12745	    if (VECTOR_MODE_P (mode))
12746	      {
12747		/* Vector ADD.  */
12748		*cost += extra_cost->vect.alu;
12749	      }
12750	    else if (GET_MODE_CLASS (mode) == MODE_INT)
12751	      {
12752		/* ADD.  */
12753		*cost += extra_cost->alu.arith;
12754	      }
12755	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12756	      {
12757		/* FADD.  */
12758		*cost += extra_cost->fp[mode == DFmode].addsub;
12759	      }
12760	  }
12761	return true;
12762      }
12763
12764    case BSWAP:
12765      *cost = COSTS_N_INSNS (1);
12766
12767      if (speed)
12768	{
12769	  if (VECTOR_MODE_P (mode))
12770	    *cost += extra_cost->vect.alu;
12771	  else
12772	    *cost += extra_cost->alu.rev;
12773	}
12774      return false;
12775
12776    case IOR:
12777      if (aarch_rev16_p (x))
12778        {
12779          *cost = COSTS_N_INSNS (1);
12780
12781	  if (speed)
12782	    {
12783	      if (VECTOR_MODE_P (mode))
12784		*cost += extra_cost->vect.alu;
12785	      else
12786		*cost += extra_cost->alu.rev;
12787	    }
12788	  return true;
12789        }
12790
12791      if (aarch64_extr_rtx_p (x, &op0, &op1))
12792        {
12793	  *cost += rtx_cost (op0, mode, IOR, 0, speed);
12794	  *cost += rtx_cost (op1, mode, IOR, 1, speed);
12795          if (speed)
12796            *cost += extra_cost->alu.shift;
12797
12798          return true;
12799        }
12800    /* Fall through.  */
12801    case XOR:
12802    case AND:
12803    cost_logic:
12804      op0 = XEXP (x, 0);
12805      op1 = XEXP (x, 1);
12806
12807      if (VECTOR_MODE_P (mode))
12808	{
12809	  if (speed)
12810	    *cost += extra_cost->vect.alu;
12811	  return true;
12812	}
12813
12814      if (code == AND
12815          && GET_CODE (op0) == MULT
12816          && CONST_INT_P (XEXP (op0, 1))
12817          && CONST_INT_P (op1)
12818          && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
12819                               INTVAL (op1)) != 0)
12820        {
12821          /* This is a UBFM/SBFM.  */
12822	  *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
12823	  if (speed)
12824	    *cost += extra_cost->alu.bfx;
12825          return true;
12826        }
12827
12828      if (is_int_mode (mode, &int_mode))
12829	{
12830	  if (CONST_INT_P (op1))
12831	    {
12832	      /* We have a mask + shift version of a UBFIZ
12833		 i.e. the *andim_ashift<mode>_bfiz pattern.  */
12834	      if (GET_CODE (op0) == ASHIFT
12835		  && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
12836							 XEXP (op0, 1)))
12837		{
12838		  *cost += rtx_cost (XEXP (op0, 0), int_mode,
12839				     (enum rtx_code) code, 0, speed);
12840		  if (speed)
12841		    *cost += extra_cost->alu.bfx;
12842
12843		  return true;
12844		}
12845	      else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
12846		{
		  /* We may get the immediate for free; this is not
		     modelled.  */
12849		  *cost += rtx_cost (op0, int_mode,
12850				     (enum rtx_code) code, 0, speed);
12851		  if (speed)
12852		    *cost += extra_cost->alu.logical;
12853
12854		  return true;
12855		}
12856	    }
12857	  else
12858	    {
12859	      rtx new_op0 = op0;
12860
12861	      /* Handle ORN, EON, or BIC.  */
12862	      if (GET_CODE (op0) == NOT)
12863		op0 = XEXP (op0, 0);
12864
12865	      new_op0 = aarch64_strip_shift (op0);
12866
12867	      /* If we had a shift on op0 then this is a logical-shift-
12868		 by-register/immediate operation.  Otherwise, this is just
12869		 a logical operation.  */
12870	      if (speed)
12871		{
12872		  if (new_op0 != op0)
12873		    {
12874		      /* Shift by immediate.  */
12875		      if (CONST_INT_P (XEXP (op0, 1)))
12876			*cost += extra_cost->alu.log_shift;
12877		      else
12878			*cost += extra_cost->alu.log_shift_reg;
12879		    }
12880		  else
12881		    *cost += extra_cost->alu.logical;
12882		}
12883
12884	      /* In both cases we want to cost both operands.  */
12885	      *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
12886				 0, speed);
12887	      *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
12888				 1, speed);
12889
12890	      return true;
12891	    }
12892	}
12893      return false;
12894
12895    case NOT:
12896      x = XEXP (x, 0);
12897      op0 = aarch64_strip_shift (x);
12898
12899      if (VECTOR_MODE_P (mode))
12900	{
12901	  /* Vector NOT.  */
12902	  *cost += extra_cost->vect.alu;
12903	  return false;
12904	}
12905
12906      /* MVN-shifted-reg.  */
12907      if (op0 != x)
12908        {
12909	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12910
12911          if (speed)
12912            *cost += extra_cost->alu.log_shift;
12913
12914          return true;
12915        }
12916      /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
12917         Handle the second form here taking care that 'a' in the above can
12918         be a shift.  */
12919      else if (GET_CODE (op0) == XOR)
12920        {
12921          rtx newop0 = XEXP (op0, 0);
12922          rtx newop1 = XEXP (op0, 1);
12923          rtx op0_stripped = aarch64_strip_shift (newop0);
12924
12925	  *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
12926	  *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
12927
12928          if (speed)
12929            {
12930              if (op0_stripped != newop0)
12931                *cost += extra_cost->alu.log_shift;
12932              else
12933                *cost += extra_cost->alu.logical;
12934            }
12935
12936          return true;
12937        }
12938      /* MVN.  */
12939      if (speed)
12940	*cost += extra_cost->alu.logical;
12941
12942      return false;
12943
12944    case ZERO_EXTEND:
12945
12946      op0 = XEXP (x, 0);
12947      /* If a value is written in SI mode, then zero extended to DI
12948	 mode, the operation will in general be free as a write to
12949	 a 'w' register implicitly zeroes the upper bits of an 'x'
12950	 register.  However, if this is
12951
12952	   (set (reg) (zero_extend (reg)))
12953
12954	 we must cost the explicit register move.  */
12955      if (mode == DImode
12956	  && GET_MODE (op0) == SImode
12957	  && outer == SET)
12958	{
12959	  int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
12960
	  /* If OP_COST is non-zero, then the cost of the zero extend
	     is effectively the cost of the inner operation.  Otherwise
	     we have a MOV instruction and we take the cost from the MOV
	     itself.  This is true independently of whether we are
	     optimizing for space or time.  */
12966	  if (op_cost)
12967	    *cost = op_cost;
12968
12969	  return true;
12970	}
12971      else if (MEM_P (op0))
12972	{
12973	  /* All loads can zero extend to any size for free.  */
12974	  *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
12975	  return true;
12976	}
12977
12978      op0 = aarch64_extend_bitfield_pattern_p (x);
12979      if (op0)
12980	{
12981	  *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
12982	  if (speed)
12983	    *cost += extra_cost->alu.bfx;
12984	  return true;
12985	}
12986
12987      if (speed)
12988	{
12989	  if (VECTOR_MODE_P (mode))
12990	    {
12991	      /* UMOV.  */
12992	      *cost += extra_cost->vect.alu;
12993	    }
12994	  else
12995	    {
12996	      /* We generate an AND instead of UXTB/UXTH.  */
12997	      *cost += extra_cost->alu.logical;
12998	    }
12999	}
13000      return false;
13001
13002    case SIGN_EXTEND:
13003      if (MEM_P (XEXP (x, 0)))
13004	{
13005	  /* LDRSH.  */
13006	  if (speed)
13007	    {
13008	      rtx address = XEXP (XEXP (x, 0), 0);
13009	      *cost += extra_cost->ldst.load_sign_extend;
13010
13011	      *cost +=
13012		COSTS_N_INSNS (aarch64_address_cost (address, mode,
13013						     0, speed));
13014	    }
13015	  return true;
13016	}
13017
13018      op0 = aarch64_extend_bitfield_pattern_p (x);
13019      if (op0)
13020	{
13021	  *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
13022	  if (speed)
13023	    *cost += extra_cost->alu.bfx;
13024	  return true;
13025	}
13026
13027      if (speed)
13028	{
13029	  if (VECTOR_MODE_P (mode))
13030	    *cost += extra_cost->vect.alu;
13031	  else
13032	    *cost += extra_cost->alu.extend;
13033	}
13034      return false;
13035
13036    case ASHIFT:
13037      op0 = XEXP (x, 0);
13038      op1 = XEXP (x, 1);
13039
13040      if (CONST_INT_P (op1))
13041        {
13042	  if (speed)
13043	    {
13044	      if (VECTOR_MODE_P (mode))
13045		{
13046		  /* Vector shift (immediate).  */
13047		  *cost += extra_cost->vect.alu;
13048		}
13049	      else
13050		{
		  /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
		     aliases.  */
13053		  *cost += extra_cost->alu.shift;
13054		}
13055	    }
13056
13057          /* We can incorporate zero/sign extend for free.  */
13058          if (GET_CODE (op0) == ZERO_EXTEND
13059              || GET_CODE (op0) == SIGN_EXTEND)
13060            op0 = XEXP (op0, 0);
13061
13062	  *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
13063          return true;
13064        }
13065      else
13066        {
13067	  if (VECTOR_MODE_P (mode))
13068	    {
13069	      if (speed)
13070		/* Vector shift (register).  */
13071		*cost += extra_cost->vect.alu;
13072	    }
13073	  else
13074	    {
13075	      if (speed)
13076		/* LSLV.  */
13077		*cost += extra_cost->alu.shift_reg;
13078
13079	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
13080		  && CONST_INT_P (XEXP (op1, 1))
13081		  && known_eq (INTVAL (XEXP (op1, 1)),
13082			       GET_MODE_BITSIZE (mode) - 1))
13083		{
13084		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
13085		  /* We already demanded XEXP (op1, 0) to be REG_P, so
13086		     don't recurse into it.  */
13087		  return true;
13088		}
13089	    }
13090	  return false;  /* All arguments need to be in registers.  */
13091        }
13092
13093    case ROTATE:
13094    case ROTATERT:
13095    case LSHIFTRT:
13096    case ASHIFTRT:
13097      op0 = XEXP (x, 0);
13098      op1 = XEXP (x, 1);
13099
13100      if (CONST_INT_P (op1))
13101	{
13102	  /* ASR (immediate) and friends.  */
13103	  if (speed)
13104	    {
13105	      if (VECTOR_MODE_P (mode))
13106		*cost += extra_cost->vect.alu;
13107	      else
13108		*cost += extra_cost->alu.shift;
13109	    }
13110
13111	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
13112	  return true;
13113	}
13114      else
13115	{
13116	  if (VECTOR_MODE_P (mode))
13117	    {
13118	      if (speed)
13119		/* Vector shift (register).  */
13120		*cost += extra_cost->vect.alu;
13121	    }
13122	  else
13123	    {
13124	      if (speed)
13125		/* ASR (register) and friends.  */
13126		*cost += extra_cost->alu.shift_reg;
13127
13128	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
13129		  && CONST_INT_P (XEXP (op1, 1))
13130		  && known_eq (INTVAL (XEXP (op1, 1)),
13131			       GET_MODE_BITSIZE (mode) - 1))
13132		{
13133		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
13134		  /* We already demanded XEXP (op1, 0) to be REG_P, so
13135		     don't recurse into it.  */
13136		  return true;
13137		}
13138	    }
13139	  return false;  /* All arguments need to be in registers.  */
13140	}
13141
13142    case SYMBOL_REF:
13143
13144      if (aarch64_cmodel == AARCH64_CMODEL_LARGE
13145	  || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
13146	{
13147	  /* LDR.  */
13148	  if (speed)
13149	    *cost += extra_cost->ldst.load;
13150	}
13151      else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
13152	       || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
13153	{
13154	  /* ADRP, followed by ADD.  */
13155	  *cost += COSTS_N_INSNS (1);
13156	  if (speed)
13157	    *cost += 2 * extra_cost->alu.arith;
13158	}
13159      else if (aarch64_cmodel == AARCH64_CMODEL_TINY
13160	       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
13161	{
13162	  /* ADR.  */
13163	  if (speed)
13164	    *cost += extra_cost->alu.arith;
13165	}
13166
13167      if (flag_pic)
13168	{
13169	  /* One extra load instruction, after accessing the GOT.  */
13170	  *cost += COSTS_N_INSNS (1);
13171	  if (speed)
13172	    *cost += extra_cost->ldst.load;
13173	}
13174      return true;
13175
13176    case HIGH:
13177    case LO_SUM:
13178      /* ADRP/ADD (immediate).  */
13179      if (speed)
13180	*cost += extra_cost->alu.arith;
13181      return true;
13182
13183    case ZERO_EXTRACT:
13184    case SIGN_EXTRACT:
13185      /* UBFX/SBFX.  */
13186      if (speed)
13187	{
13188	  if (VECTOR_MODE_P (mode))
13189	    *cost += extra_cost->vect.alu;
13190	  else
13191	    *cost += extra_cost->alu.bfx;
13192	}
13193
13194      /* We can trust that the immediates used will be correct (there
13195	 are no by-register forms), so we need only cost op0.  */
13196      *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
13197      return true;
13198
13199    case MULT:
13200      *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
13201      /* aarch64_rtx_mult_cost always handles recursion to its
13202	 operands.  */
13203      return true;
13204
13205    case MOD:
13206    /* We can expand signed mod by a power of 2 using a NEGS, two parallel
13207       ANDs and a CSNEG.  Assume here that CSNEG has the same cost as an
13208       unconditional negate.  This case should only ever be reached through
13209       the set_smod_pow2_cheap check in expmed.c.  */
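    /* A rough sketch of that expansion (illustrative only; the exact
       RTL/assembly GCC emits may differ): for signed x, x % 4 becomes

	 t = -x;              NEGS (also sets the flags from x)
	 p = x & 3;           AND
	 n = t & 3;           AND
	 r = x < 0 ? -n : p;  CSNEG

       i.e. four instructions, matching the COSTS_N_INSNS (4) below.  */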
13210      if (CONST_INT_P (XEXP (x, 1))
13211	  && exact_log2 (INTVAL (XEXP (x, 1))) > 0
13212	  && (mode == SImode || mode == DImode))
13213	{
13214	  /* We expand to 4 instructions.  Reset the baseline.  */
13215	  *cost = COSTS_N_INSNS (4);
13216
13217	  if (speed)
13218	    *cost += 2 * extra_cost->alu.logical
13219		     + 2 * extra_cost->alu.arith;
13220
13221	  return true;
13222	}
13223
13224    /* Fall-through.  */
13225    case UMOD:
13226      if (speed)
13227	{
13228	  /* Slightly prefer UMOD over SMOD.  */
13229	  if (VECTOR_MODE_P (mode))
13230	    *cost += extra_cost->vect.alu;
13231	  else if (GET_MODE_CLASS (mode) == MODE_INT)
13232	    *cost += (extra_cost->mult[mode == DImode].add
13233		      + extra_cost->mult[mode == DImode].idiv
13234		      + (code == MOD ? 1 : 0));
13235	}
13236      return false;  /* All arguments need to be in registers.  */
13237
13238    case DIV:
13239    case UDIV:
13240    case SQRT:
13241      if (speed)
13242	{
13243	  if (VECTOR_MODE_P (mode))
13244	    *cost += extra_cost->vect.alu;
13245	  else if (GET_MODE_CLASS (mode) == MODE_INT)
13246	    /* There is no integer SQRT, so only DIV and UDIV can get
13247	       here.  */
13248	    *cost += (extra_cost->mult[mode == DImode].idiv
13249		     /* Slightly prefer UDIV over SDIV.  */
13250		     + (code == DIV ? 1 : 0));
13251	  else
13252	    *cost += extra_cost->fp[mode == DFmode].div;
13253	}
13254      return false;  /* All arguments need to be in registers.  */
13255
13256    case IF_THEN_ELSE:
13257      return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
13258					 XEXP (x, 2), cost, speed);
13259
13260    case EQ:
13261    case NE:
13262    case GT:
13263    case GTU:
13264    case LT:
13265    case LTU:
13266    case GE:
13267    case GEU:
13268    case LE:
13269    case LEU:
13270
13271      return false; /* All arguments must be in registers.  */
13272
13273    case FMA:
13274      op0 = XEXP (x, 0);
13275      op1 = XEXP (x, 1);
13276      op2 = XEXP (x, 2);
13277
13278      if (speed)
13279	{
13280	  if (VECTOR_MODE_P (mode))
13281	    *cost += extra_cost->vect.alu;
13282	  else
13283	    *cost += extra_cost->fp[mode == DFmode].fma;
13284	}
13285
13286      /* FMSUB, FNMADD, and FNMSUB are free.  */
13287      if (GET_CODE (op0) == NEG)
13288        op0 = XEXP (op0, 0);
13289
13290      if (GET_CODE (op2) == NEG)
13291        op2 = XEXP (op2, 0);
13292
13293      /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
13294	 and the by-element operand as operand 0.  */
13295      if (GET_CODE (op1) == NEG)
13296        op1 = XEXP (op1, 0);
13297
13298      /* Catch vector-by-element operations.  The by-element operand can
13299	 either be (vec_duplicate (vec_select (x))) or just
13300	 (vec_select (x)), depending on whether we are multiplying by
13301	 a vector or a scalar.
13302
13303	 Canonicalization is not very good in these cases: FMA4 will put the
13304	 by-element operand as operand 0, FNMA4 will have it as operand 1.  */
13305      if (GET_CODE (op0) == VEC_DUPLICATE)
13306	op0 = XEXP (op0, 0);
13307      else if (GET_CODE (op1) == VEC_DUPLICATE)
13308	op1 = XEXP (op1, 0);
13309
13310      if (GET_CODE (op0) == VEC_SELECT)
13311	op0 = XEXP (op0, 0);
13312      else if (GET_CODE (op1) == VEC_SELECT)
13313	op1 = XEXP (op1, 0);
13314
13315      /* If the remaining parameters are not registers,
13316         get the cost to put them into registers.  */
13317      *cost += rtx_cost (op0, mode, FMA, 0, speed);
13318      *cost += rtx_cost (op1, mode, FMA, 1, speed);
13319      *cost += rtx_cost (op2, mode, FMA, 2, speed);
13320      return true;
13321
13322    case FLOAT:
13323    case UNSIGNED_FLOAT:
13324      if (speed)
13325	*cost += extra_cost->fp[mode == DFmode].fromint;
13326      return false;
13327
13328    case FLOAT_EXTEND:
13329      if (speed)
13330	{
13331	  if (VECTOR_MODE_P (mode))
13332	    {
13333	      /* Vector conversion (widen).  */
13334	      *cost += extra_cost->vect.alu;
13335	    }
13336	  else
13337	    *cost += extra_cost->fp[mode == DFmode].widen;
13338	}
13339      return false;
13340
13341    case FLOAT_TRUNCATE:
13342      if (speed)
13343	{
13344	  if (VECTOR_MODE_P (mode))
13345	    {
13346	      /* Vector conversion (narrow).  */
13347	      *cost += extra_cost->vect.alu;
13348	    }
13349	  else
13350	    *cost += extra_cost->fp[mode == DFmode].narrow;
13351	}
13352      return false;
13353
13354    case FIX:
13355    case UNSIGNED_FIX:
13356      x = XEXP (x, 0);
13357      /* Strip the rounding part.  They will all be implemented
13358         by the fcvt* family of instructions anyway.  */
13359      if (GET_CODE (x) == UNSPEC)
13360        {
13361          unsigned int uns_code = XINT (x, 1);
13362
13363          if (uns_code == UNSPEC_FRINTA
13364              || uns_code == UNSPEC_FRINTM
13365              || uns_code == UNSPEC_FRINTN
13366              || uns_code == UNSPEC_FRINTP
13367              || uns_code == UNSPEC_FRINTZ)
13368            x = XVECEXP (x, 0, 0);
13369        }
13370
13371      if (speed)
13372	{
13373	  if (VECTOR_MODE_P (mode))
13374	    *cost += extra_cost->vect.alu;
13375	  else
13376	    *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
13377	}
13378
13379      /* We can combine fmul by a power of 2 followed by a fcvt into a single
13380	 fixed-point fcvt.  */
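      /* For example (illustrative), (int) (f * 16.0f) can become a single
	 "fcvtzs w0, s0, #4", so below we cost only the inner multiplicand.  */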
13381      if (GET_CODE (x) == MULT
13382	  && ((VECTOR_MODE_P (mode)
13383	       && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
13384	      || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
13385	{
13386	  *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
13387			     0, speed);
13388	  return true;
13389	}
13390
13391      *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
13392      return true;
13393
13394    case ABS:
13395      if (VECTOR_MODE_P (mode))
13396	{
13397	  /* ABS (vector).  */
13398	  if (speed)
13399	    *cost += extra_cost->vect.alu;
13400	}
13401      else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13402	{
13403	  op0 = XEXP (x, 0);
13404
13405	  /* FABD, which is analogous to FADD.  */
13406	  if (GET_CODE (op0) == MINUS)
13407	    {
13408	      *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
13409	      *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
13410	      if (speed)
13411		*cost += extra_cost->fp[mode == DFmode].addsub;
13412
13413	      return true;
13414	    }
13415	  /* Simple FABS is analogous to FNEG.  */
13416	  if (speed)
13417	    *cost += extra_cost->fp[mode == DFmode].neg;
13418	}
13419      else
13420	{
13421	  /* Integer ABS will either be split into
13422	     two arithmetic instructions, or will be an ABS
13423	     (scalar), which we don't model.  */
13424	  *cost = COSTS_N_INSNS (2);
13425	  if (speed)
13426	    *cost += 2 * extra_cost->alu.arith;
13427	}
13428      return false;
13429
13430    case SMAX:
13431    case SMIN:
13432      if (speed)
13433	{
13434	  if (VECTOR_MODE_P (mode))
13435	    *cost += extra_cost->vect.alu;
13436	  else
13437	    {
13438	      /* FMAXNM/FMINNM/FMAX/FMIN.
13439	         TODO: This may not be accurate for all implementations, but
13440	         we do not model this in the cost tables.  */
13441	      *cost += extra_cost->fp[mode == DFmode].addsub;
13442	    }
13443	}
13444      return false;
13445
13446    case UNSPEC:
13447      /* The floating point round to integer frint* instructions.  */
13448      if (aarch64_frint_unspec_p (XINT (x, 1)))
13449        {
13450          if (speed)
13451            *cost += extra_cost->fp[mode == DFmode].roundint;
13452
13453          return false;
13454        }
13455
13456      if (XINT (x, 1) == UNSPEC_RBIT)
13457        {
13458          if (speed)
13459            *cost += extra_cost->alu.rev;
13460
13461          return false;
13462        }
13463      break;
13464
13465    case TRUNCATE:
13466
13467      /* Decompose <su>muldi3_highpart.  */
13468      if (/* (truncate:DI  */
13469	  mode == DImode
13470	  /*   (lshiftrt:TI  */
13471          && GET_MODE (XEXP (x, 0)) == TImode
13472          && GET_CODE (XEXP (x, 0)) == LSHIFTRT
13473	  /*      (mult:TI  */
13474          && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13475	  /*        (ANY_EXTEND:TI (reg:DI))
13476	            (ANY_EXTEND:TI (reg:DI)))  */
13477          && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
13478               && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
13479              || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
13480                  && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
13481          && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
13482          && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
13483	  /*     (const_int 64)  */
13484          && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13485          && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
13486        {
13487          /* UMULH/SMULH.  */
13488	  if (speed)
13489	    *cost += extra_cost->mult[mode == DImode].extend;
13490	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
13491			     mode, MULT, 0, speed);
13492	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
13493			     mode, MULT, 1, speed);
13494          return true;
13495        }
13496
13497      /* Fall through.  */
13498    default:
13499      break;
13500    }
13501
13502  if (dump_file
13503      && flag_aarch64_verbose_cost)
13504    fprintf (dump_file,
13505      "\nFailed to cost RTX.  Assuming default cost.\n");
13506
13507  return true;
13508}
13509
13510/* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
13511   calculated for X.  This cost is stored in *COST.  Returns true
13512   if the total cost of X was calculated.  */
13513static bool
13514aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
13515		   int param, int *cost, bool speed)
13516{
13517  bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
13518
13519  if (dump_file
13520      && flag_aarch64_verbose_cost)
13521    {
13522      print_rtl_single (dump_file, x);
13523      fprintf (dump_file, "\n%s cost: %d (%s)\n",
13524	       speed ? "Hot" : "Cold",
13525	       *cost, result ? "final" : "partial");
13526    }
13527
13528  return result;
13529}
13530
13531static int
13532aarch64_register_move_cost (machine_mode mode,
13533			    reg_class_t from_i, reg_class_t to_i)
13534{
13535  enum reg_class from = (enum reg_class) from_i;
13536  enum reg_class to = (enum reg_class) to_i;
13537  const struct cpu_regmove_cost *regmove_cost
13538    = aarch64_tune_params.regmove_cost;
13539
13540  /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
13541  if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
13542      || to == STUB_REGS)
13543    to = GENERAL_REGS;
13544
13545  if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
13546      || from == STUB_REGS)
13547    from = GENERAL_REGS;
13548
13549  /* Make RDFFR very expensive.  In particular, if we know that the FFR
13550     contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
13551     as a way of obtaining a PTRUE.  */
13552  if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
13553      && hard_reg_set_subset_p (reg_class_contents[from_i],
13554				reg_class_contents[FFR_REGS]))
13555    return 80;
13556
13557  /* Moving between a GPR and the stack register costs the same as GP2GP.  */
13558  if ((from == GENERAL_REGS && to == STACK_REG)
13559      || (to == GENERAL_REGS && from == STACK_REG))
13560    return regmove_cost->GP2GP;
13561
13562  /* To/From the stack register, we move via the gprs.  */
13563  if (to == STACK_REG || from == STACK_REG)
13564    return aarch64_register_move_cost (mode, from, GENERAL_REGS)
13565            + aarch64_register_move_cost (mode, GENERAL_REGS, to);
13566
13567  if (known_eq (GET_MODE_SIZE (mode), 16))
13568    {
13569      /* 128-bit operations on general registers require 2 instructions.  */
13570      if (from == GENERAL_REGS && to == GENERAL_REGS)
13571	return regmove_cost->GP2GP * 2;
13572      else if (from == GENERAL_REGS)
13573	return regmove_cost->GP2FP * 2;
13574      else if (to == GENERAL_REGS)
13575	return regmove_cost->FP2GP * 2;
13576
13577      /* When AdvSIMD instructions are disabled it is not possible to move
13578	 a 128-bit value directly between Q registers.  This is handled in
13579	 secondary reload.  A general register is used as a scratch to move
13580	 the upper DI value and the lower DI value is moved directly,
13581	 hence the cost is the sum of three moves. */
13582      if (! TARGET_SIMD)
13583	return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
13584
13585      return regmove_cost->FP2FP;
13586    }
13587
13588  if (from == GENERAL_REGS && to == GENERAL_REGS)
13589    return regmove_cost->GP2GP;
13590  else if (from == GENERAL_REGS)
13591    return regmove_cost->GP2FP;
13592  else if (to == GENERAL_REGS)
13593    return regmove_cost->FP2GP;
13594
13595  return regmove_cost->FP2FP;
13596}
13597
13598static int
13599aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
13600			  reg_class_t rclass ATTRIBUTE_UNUSED,
13601			  bool in ATTRIBUTE_UNUSED)
13602{
13603  return aarch64_tune_params.memmov_cost;
13604}
13605
13606/* Implement TARGET_INIT_BUILTINS.  */
13607static void
13608aarch64_init_builtins ()
13609{
13610  aarch64_general_init_builtins ();
13611  aarch64_sve::init_builtins ();
13612}
13613
13614/* Implement TARGET_FOLD_BUILTIN.  */
13615static tree
13616aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
13617{
13618  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13619  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13620  tree type = TREE_TYPE (TREE_TYPE (fndecl));
13621  switch (code & AARCH64_BUILTIN_CLASS)
13622    {
13623    case AARCH64_BUILTIN_GENERAL:
13624      return aarch64_general_fold_builtin (subcode, type, nargs, args);
13625
13626    case AARCH64_BUILTIN_SVE:
13627      return NULL_TREE;
13628    }
13629  gcc_unreachable ();
13630}
13631
13632/* Implement TARGET_GIMPLE_FOLD_BUILTIN.  */
13633static bool
13634aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
13635{
13636  gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
13637  tree fndecl = gimple_call_fndecl (stmt);
13638  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13639  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13640  gimple *new_stmt = NULL;
13641  switch (code & AARCH64_BUILTIN_CLASS)
13642    {
13643    case AARCH64_BUILTIN_GENERAL:
13644      new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
13645      break;
13646
13647    case AARCH64_BUILTIN_SVE:
13648      new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
13649      break;
13650    }
13651
13652  if (!new_stmt)
13653    return false;
13654
13655  gsi_replace (gsi, new_stmt, true);
13656  return true;
13657}
13658
13659/* Implement TARGET_EXPAND_BUILTIN.  */
13660static rtx
13661aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
13662{
13663  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
13664  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13665  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13666  switch (code & AARCH64_BUILTIN_CLASS)
13667    {
13668    case AARCH64_BUILTIN_GENERAL:
13669      return aarch64_general_expand_builtin (subcode, exp, target, ignore);
13670
13671    case AARCH64_BUILTIN_SVE:
13672      return aarch64_sve::expand_builtin (subcode, exp, target);
13673    }
13674  gcc_unreachable ();
13675}
13676
13677/* Implement TARGET_BUILTIN_DECL.  */
13678static tree
13679aarch64_builtin_decl (unsigned int code, bool initialize_p)
13680{
13681  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13682  switch (code & AARCH64_BUILTIN_CLASS)
13683    {
13684    case AARCH64_BUILTIN_GENERAL:
13685      return aarch64_general_builtin_decl (subcode, initialize_p);
13686
13687    case AARCH64_BUILTIN_SVE:
13688      return aarch64_sve::builtin_decl (subcode, initialize_p);
13689    }
13690  gcc_unreachable ();
13691}
13692
13693/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
13694   to optimize 1.0/sqrt.  */
13695
13696static bool
13697use_rsqrt_p (machine_mode mode)
13698{
13699  return (!flag_trapping_math
13700	  && flag_unsafe_math_optimizations
13701	  && ((aarch64_tune_params.approx_modes->recip_sqrt
13702	       & AARCH64_APPROX_MODE (mode))
13703	      || flag_mrecip_low_precision_sqrt));
13704}
13705
13706/* Function to decide when to use the approximate reciprocal square root
13707   builtin.  */
13708
13709static tree
13710aarch64_builtin_reciprocal (tree fndecl)
13711{
13712  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
13713
13714  if (!use_rsqrt_p (mode))
13715    return NULL_TREE;
13716  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13717  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13718  switch (code & AARCH64_BUILTIN_CLASS)
13719    {
13720    case AARCH64_BUILTIN_GENERAL:
13721      return aarch64_general_builtin_rsqrt (subcode);
13722
13723    case AARCH64_BUILTIN_SVE:
13724      return NULL_TREE;
13725    }
13726  gcc_unreachable ();
13727}
13728
13729/* Emit code to perform the floating-point operation:
13730
13731     DST = SRC1 * SRC2
13732
13733   where all three operands are already known to be registers.
13734   If the operation is an SVE one, PTRUE is a suitable all-true
13735   predicate.  */
13736
13737static void
13738aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
13739{
13740  if (ptrue)
13741    emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
13742				 dst, ptrue, src1, src2,
13743				 gen_int_mode (SVE_RELAXED_GP, SImode)));
13744  else
13745    emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
13746}
13747
13748/* Emit instruction sequence to compute either the approximate square root
13749   or its approximate reciprocal, depending on the flag RECP, and return
13750   whether the sequence was emitted or not.  */
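/* A sketch of the math used below (assuming the usual Newton-Raphson
   refinement and the architectural definition of FRSQRTS as
   (3 - a * b) / 2): starting from an FRSQRTE estimate x0 of 1/sqrt(S),
   each iteration computes

     x_{n+1} = x_n * (3 - S * x_n * x_n) / 2

   which converges quadratically towards 1/sqrt(S).  For the non-reciprocal
   case the refined estimate is finally multiplied by S (with zero inputs
   masked off) to give sqrt(S) while avoiding Inf * 0.  */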
13751
13752bool
13753aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
13754{
13755  machine_mode mode = GET_MODE (dst);
13756
13757  if (GET_MODE_INNER (mode) == HFmode)
13758    {
13759      gcc_assert (!recp);
13760      return false;
13761    }
13762
13763  if (!recp)
13764    {
13765      if (!(flag_mlow_precision_sqrt
13766	    || (aarch64_tune_params.approx_modes->sqrt
13767		& AARCH64_APPROX_MODE (mode))))
13768	return false;
13769
13770      if (!flag_finite_math_only
13771	  || flag_trapping_math
13772	  || !flag_unsafe_math_optimizations
13773	  || optimize_function_for_size_p (cfun))
13774	return false;
13775    }
13776  else
13777    /* Caller assumes we cannot fail.  */
13778    gcc_assert (use_rsqrt_p (mode));
13779
13780  rtx pg = NULL_RTX;
13781  if (aarch64_sve_mode_p (mode))
13782    pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13783  machine_mode mmsk = (VECTOR_MODE_P (mode)
13784		       ? related_int_vector_mode (mode).require ()
13785		       : int_mode_for_mode (mode).require ());
13786  rtx xmsk = NULL_RTX;
13787  if (!recp)
13788    {
13789      /* When calculating the approximate square root, compare the
13790	 argument with 0.0 and create a mask.  */
13791      rtx zero = CONST0_RTX (mode);
13792      if (pg)
13793	{
13794	  xmsk = gen_reg_rtx (GET_MODE (pg));
13795	  rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
13796	  emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
13797					   xmsk, pg, hint, src, zero));
13798	}
13799      else
13800	{
13801	  xmsk = gen_reg_rtx (mmsk);
13802	  emit_insn (gen_rtx_SET (xmsk,
13803				  gen_rtx_NEG (mmsk,
13804					       gen_rtx_EQ (mmsk, src, zero))));
13805	}
13806    }
13807
13808  /* Estimate the approximate reciprocal square root.  */
13809  rtx xdst = gen_reg_rtx (mode);
13810  emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
13811
13812  /* Iterate over the series twice for SF and thrice for DF.  */
13813  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
13814
13815  /* Optionally iterate over the series once less for faster performance,
13816     at the cost of some accuracy.  */
13817  if ((recp && flag_mrecip_low_precision_sqrt)
13818      || (!recp && flag_mlow_precision_sqrt))
13819    iterations--;
13820
13821  /* Iterate over the series to calculate the approximate reciprocal square
13822     root.  */
13823  rtx x1 = gen_reg_rtx (mode);
13824  while (iterations--)
13825    {
13826      rtx x2 = gen_reg_rtx (mode);
13827      aarch64_emit_mult (x2, pg, xdst, xdst);
13828
13829      emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
13830
13831      if (iterations > 0)
13832	aarch64_emit_mult (xdst, pg, xdst, x1);
13833    }
13834
13835  if (!recp)
13836    {
13837      if (pg)
13838	/* Multiply nonzero source values by the corresponding intermediate
13839	   result elements, so that the final calculation is the approximate
13840	   square root rather than its reciprocal.  Select a zero result for
13841	   zero source values, to avoid the Inf * 0 -> NaN that we'd get
13842	   otherwise.  */
13843	emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
13844			     xdst, xmsk, xdst, src, CONST0_RTX (mode)));
13845      else
13846	{
13847	  /* Qualify the approximate reciprocal square root when the
13848	     argument is 0.0 by squashing the intermediate result to 0.0.  */
13849	  rtx xtmp = gen_reg_rtx (mmsk);
13850	  emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
13851					    gen_rtx_SUBREG (mmsk, xdst, 0)));
13852	  emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
13853
13854	  /* Calculate the approximate square root.  */
13855	  aarch64_emit_mult (xdst, pg, xdst, src);
13856	}
13857    }
13858
13859  /* Finalize the approximation.  */
13860  aarch64_emit_mult (dst, pg, xdst, x1);
13861
13862  return true;
13863}
13864
13865/* Emit the instruction sequence to compute the approximation for the division
13866   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
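/* A sketch of the math used below (assuming the architectural definition of
   FRECPS as 2 - a * b): starting from an FRECPE estimate x0 of 1/DEN,
   each iteration computes the Newton-Raphson refinement

     x_{n+1} = x_n * (2 - DEN * x_n)

   and the quotient is then approximated as NUM * x_n.  */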
13867
13868bool
13869aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
13870{
13871  machine_mode mode = GET_MODE (quo);
13872
13873  if (GET_MODE_INNER (mode) == HFmode)
13874    return false;
13875
13876  bool use_approx_division_p = (flag_mlow_precision_div
13877			        || (aarch64_tune_params.approx_modes->division
13878				    & AARCH64_APPROX_MODE (mode)));
13879
13880  if (!flag_finite_math_only
13881      || flag_trapping_math
13882      || !flag_unsafe_math_optimizations
13883      || optimize_function_for_size_p (cfun)
13884      || !use_approx_division_p)
13885    return false;
13886
13887  if (!TARGET_SIMD && VECTOR_MODE_P (mode))
13888    return false;
13889
13890  rtx pg = NULL_RTX;
13891  if (aarch64_sve_mode_p (mode))
13892    pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13893
13894  /* Estimate the approximate reciprocal.  */
13895  rtx xrcp = gen_reg_rtx (mode);
13896  emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
13897
13898  /* Iterate over the series twice for SF and thrice for DF.  */
13899  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
13900
13901  /* Optionally iterate over the series fewer times for faster performance,
13902     at the cost of some accuracy.  The default is 2 for DF and 1 for SF.  */
13903  if (flag_mlow_precision_div)
13904    iterations = (GET_MODE_INNER (mode) == DFmode
13905		  ? aarch64_double_recp_precision
13906		  : aarch64_float_recp_precision);
13907
13908  /* Iterate over the series to calculate the approximate reciprocal.  */
13909  rtx xtmp = gen_reg_rtx (mode);
13910  while (iterations--)
13911    {
13912      emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
13913
13914      if (iterations > 0)
13915	aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
13916    }
13917
13918  if (num != CONST1_RTX (mode))
13919    {
13920      /* As the approximate reciprocal of DEN is already calculated, only
13921	 calculate the approximate division when NUM is not 1.0.  */
13922      rtx xnum = force_reg (mode, num);
13923      aarch64_emit_mult (xrcp, pg, xrcp, xnum);
13924    }
13925
13926  /* Finalize the approximation.  */
13927  aarch64_emit_mult (quo, pg, xrcp, xtmp);
13928  return true;
13929}
13930
13931/* Return the number of instructions that can be issued per cycle.  */
13932static int
13933aarch64_sched_issue_rate (void)
13934{
13935  return aarch64_tune_params.issue_rate;
13936}
13937
13938/* Implement TARGET_SCHED_VARIABLE_ISSUE.  */
13939static int
13940aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
13941{
13942  if (DEBUG_INSN_P (insn))
13943    return more;
13944
13945  rtx_code code = GET_CODE (PATTERN (insn));
13946  if (code == USE || code == CLOBBER)
13947    return more;
13948
13949  if (get_attr_type (insn) == TYPE_NO_INSN)
13950    return more;
13951
13952  return more - 1;
13953}
13954
13955static int
13956aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
13957{
13958  int issue_rate = aarch64_sched_issue_rate ();
13959
13960  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
13961}
13962
13963
13964/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
13965   autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
13966   has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
13967
13968static int
13969aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
13970						    int ready_index)
13971{
13972  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
13973}
13974
13975
13976/* Vectorizer cost model target hooks.  */
13977
13978/* Implement targetm.vectorize.builtin_vectorization_cost.  */
13979static int
13980aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
13981				    tree vectype,
13982				    int misalign ATTRIBUTE_UNUSED)
13983{
13984  unsigned elements;
13985  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
13986  bool fp = false;
13987
13988  if (vectype != NULL)
13989    fp = FLOAT_TYPE_P (vectype);
13990
13991  switch (type_of_cost)
13992    {
13993      case scalar_stmt:
13994	return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
13995
13996      case scalar_load:
13997	return costs->scalar_load_cost;
13998
13999      case scalar_store:
14000	return costs->scalar_store_cost;
14001
14002      case vector_stmt:
14003	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
14004
14005      case vector_load:
14006	return costs->vec_align_load_cost;
14007
14008      case vector_store:
14009	return costs->vec_store_cost;
14010
14011      case vec_to_scalar:
14012	return costs->vec_to_scalar_cost;
14013
14014      case scalar_to_vec:
14015	return costs->scalar_to_vec_cost;
14016
14017      case unaligned_load:
14018      case vector_gather_load:
14019	return costs->vec_unalign_load_cost;
14020
14021      case unaligned_store:
14022      case vector_scatter_store:
14023	return costs->vec_unalign_store_cost;
14024
14025      case cond_branch_taken:
14026	return costs->cond_taken_branch_cost;
14027
14028      case cond_branch_not_taken:
14029	return costs->cond_not_taken_branch_cost;
14030
14031      case vec_perm:
14032	return costs->vec_permute_cost;
14033
14034      case vec_promote_demote:
14035	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
14036
14037      case vec_construct:
14038	elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
14039	return elements / 2 + 1;
14040
14041      default:
14042	gcc_unreachable ();
14043    }
14044}
14045
14046/* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
14047   vectors would produce a series of LDP or STP operations.  KIND is the
14048   kind of statement that STMT_INFO represents.  */
14049static bool
14050aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
14051			   stmt_vec_info stmt_info)
14052{
14053  switch (kind)
14054    {
14055    case vector_load:
14056    case vector_store:
14057    case unaligned_load:
14058    case unaligned_store:
14059      break;
14060
14061    default:
14062      return false;
14063    }
14064
14065  if (aarch64_tune_params.extra_tuning_flags
14066      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
14067    return false;
14068
14069  return is_gimple_assign (stmt_info->stmt);
14070}
14071
14072/* Return true if STMT_INFO extends the result of a load.  */
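/* For example (illustrative GIMPLE), given

     _1 = *ptr_2;         (a load of, say, a short)
     _3 = (int) _1;       (STMT_INFO)

   the conversion extends the result of the load, so this returns true.  */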
14073static bool
14074aarch64_extending_load_p (stmt_vec_info stmt_info)
14075{
14076  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
14077  if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
14078    return false;
14079
14080  tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
14081  tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
14082  tree rhs_type = TREE_TYPE (rhs);
14083  if (!INTEGRAL_TYPE_P (lhs_type)
14084      || !INTEGRAL_TYPE_P (rhs_type)
14085      || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
14086    return false;
14087
14088  stmt_vec_info def_stmt_info = stmt_info->vinfo->lookup_def (rhs);
14089  return (def_stmt_info
14090	  && STMT_VINFO_DATA_REF (def_stmt_info)
14091	  && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
14092}
14093
14094/* Return true if STMT_INFO is an integer truncation.  */
14095static bool
14096aarch64_integer_truncation_p (stmt_vec_info stmt_info)
14097{
14098  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
14099  if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
14100    return false;
14101
14102  tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
14103  tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
14104  return (INTEGRAL_TYPE_P (lhs_type)
14105	  && INTEGRAL_TYPE_P (rhs_type)
14106	  && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
14107}
14108
14109/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
14110   for STMT_INFO, which has cost kind KIND and which when vectorized would
14111   operate on vector type VECTYPE.  Adjust the cost as necessary for SVE
14112   targets.  */
14113static unsigned int
14114aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind,
14115			      stmt_vec_info stmt_info, tree vectype,
14116			      unsigned int stmt_cost)
14117{
14118  /* Unlike vec_promote_demote, vector_stmt conversions do not change the
14119     vector register size or number of units.  Integer promotions of this
14120     type therefore map to SXT[BHW] or UXT[BHW].
14121
14122     Most loads have extending forms that can do the sign or zero extension
14123     on the fly.  Optimistically assume that a load followed by an extension
14124     will fold to this form during combine, and that the extension therefore
14125     comes for free.  */
14126  if (kind == vector_stmt && aarch64_extending_load_p (stmt_info))
14127    stmt_cost = 0;
14128
14129  /* For similar reasons, vector_stmt integer truncations are a no-op,
14130     because we can just ignore the unused upper bits of the source.  */
14131  if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
14132    stmt_cost = 0;
14133
14134  /* Advanced SIMD can load and store pairs of registers using LDP and STP,
14135     but there are no equivalent instructions for SVE.  This means that
14136     (all other things being equal) 128-bit SVE needs twice as many load
14137     and store instructions as Advanced SIMD in order to process vector pairs.
14138
14139     Also, scalar code can often use LDP and STP to access pairs of values,
14140     so it is too simplistic to say that one SVE load or store replaces
14141     VF scalar loads and stores.
14142
14143     Ideally we would account for this in the scalar and Advanced SIMD
14144     costs by making suitable load/store pairs as cheap as a single
14145     load/store.  However, that would be a very invasive change and in
14146     practice it tends to stress other parts of the cost model too much.
14147     E.g. stores of scalar constants currently count just a store,
14148     whereas stores of vector constants count a store and a vec_init.
14149     This is an artificial distinction for AArch64, where stores of
14150     nonzero scalar constants need the same kind of register invariant
14151     as vector stores.
14152
14153     An alternative would be to double the cost of any SVE loads and stores
14154     that could be paired in Advanced SIMD (and possibly also paired in
14155     scalar code).  But this tends to stress other parts of the cost model
14156     in the same way.  It also means that we can fall back to Advanced SIMD
14157     even if full-loop predication would have been useful.
14158
14159     Here we go for a more conservative version: double the costs of SVE
14160     loads and stores if one iteration of the scalar loop processes enough
14161     elements for it to use a whole number of Advanced SIMD LDP or STP
14162     instructions.  This makes it very likely that the VF would be 1 for
14163     Advanced SIMD, and so no epilogue should be needed.  */
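  /* A worked example (illustrative): a grouped access of four 64-bit
     elements per scalar iteration gives count * elt_bits == 256, which is
     exactly one Advanced SIMD LDP/STP of two 128-bit registers, so the
     SVE load/store cost below is doubled.  */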
14164  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
14165    {
14166      stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
14167      unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
14168      unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
14169      if (multiple_p (count * elt_bits, 256)
14170	  && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
14171	stmt_cost *= 2;
14172    }
14173
14174  return stmt_cost;
14175}
14176
14177/* Implement targetm.vectorize.add_stmt_cost.  */
14178static unsigned
14179aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
14180		       struct _stmt_vec_info *stmt_info, int misalign,
14181		       enum vect_cost_model_location where)
14182{
14183  unsigned *cost = (unsigned *) data;
14184  unsigned retval = 0;
14185
14186  if (flag_vect_cost_model)
14187    {
14188      tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
14189      int stmt_cost =
14190	    aarch64_builtin_vectorization_cost (kind, vectype, misalign);
14191
14192      if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
14193	stmt_cost = aarch64_sve_adjust_stmt_cost (kind, stmt_info, vectype,
14194						  stmt_cost);
14195
14196      /* Statements in an inner loop relative to the loop being
14197	 vectorized are weighted more heavily.  The value here is
14198	 arbitrary and could potentially be improved with analysis.  */
14199      if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
14200	count *= 50; /*  FIXME  */
14201
14202      retval = (unsigned) (count * stmt_cost);
14203      cost[where] += retval;
14204    }
14205
14206  return retval;
14207}
14208
14209static void initialize_aarch64_code_model (struct gcc_options *);
14210
14211/* Parse the TO_PARSE string and put the architecture struct that it
14212   selects into RES and the architectural features into ISA_FLAGS.
14213   Return an aarch64_parse_opt_result describing the parse result.
14214   If there is an error parsing, RES and ISA_FLAGS are left unchanged.
14215   When the TO_PARSE string contains an invalid extension,
14216   a copy of the string is created and stored to INVALID_EXTENSION.  */
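/* For example (illustrative), TO_PARSE might be "armv8.2-a+crypto", in which
   case "armv8.2-a" selects the architecture and "+crypto" is handed to
   aarch64_parse_extension.  */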
14217
14218static enum aarch64_parse_opt_result
14219aarch64_parse_arch (const char *to_parse, const struct processor **res,
14220		    uint64_t *isa_flags, std::string *invalid_extension)
14221{
14222  const char *ext;
14223  const struct processor *arch;
14224  size_t len;
14225
14226  ext = strchr (to_parse, '+');
14227
14228  if (ext != NULL)
14229    len = ext - to_parse;
14230  else
14231    len = strlen (to_parse);
14232
14233  if (len == 0)
14234    return AARCH64_PARSE_MISSING_ARG;
14235
14236
14237  /* Loop through the list of supported ARCHes to find a match.  */
14238  for (arch = all_architectures; arch->name != NULL; arch++)
14239    {
14240      if (strlen (arch->name) == len
14241	  && strncmp (arch->name, to_parse, len) == 0)
14242	{
14243	  uint64_t isa_temp = arch->flags;
14244
14245	  if (ext != NULL)
14246	    {
14247	      /* TO_PARSE string contains at least one extension.  */
14248	      enum aarch64_parse_opt_result ext_res
14249		= aarch64_parse_extension (ext, &isa_temp, invalid_extension);
14250
14251	      if (ext_res != AARCH64_PARSE_OK)
14252		return ext_res;
14253	    }
14254	  /* Extension parsing was successful.  Confirm the result
14255	     arch and ISA flags.  */
14256	  *res = arch;
14257	  *isa_flags = isa_temp;
14258	  return AARCH64_PARSE_OK;
14259	}
14260    }
14261
14262  /* ARCH name not found in list.  */
14263  return AARCH64_PARSE_INVALID_ARG;
14264}
14265
14266/* Parse the TO_PARSE string and put the CPU it selects into RES and the
14267   architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
14268   describing the parse result.  If there is an error parsing, RES and
14269   ISA_FLAGS are left unchanged.
14270   When the TO_PARSE string contains an invalid extension,
14271   a copy of the string is created and stored to INVALID_EXTENSION.  */
14272
14273static enum aarch64_parse_opt_result
14274aarch64_parse_cpu (const char *to_parse, const struct processor **res,
14275		   uint64_t *isa_flags, std::string *invalid_extension)
14276{
14277  const char *ext;
14278  const struct processor *cpu;
14279  size_t len;
14280
14281  ext = strchr (to_parse, '+');
14282
14283  if (ext != NULL)
14284    len = ext - to_parse;
14285  else
14286    len = strlen (to_parse);
14287
14288  if (len == 0)
14289    return AARCH64_PARSE_MISSING_ARG;
14290
14291
14292  /* Loop through the list of supported CPUs to find a match.  */
14293  for (cpu = all_cores; cpu->name != NULL; cpu++)
14294    {
14295      if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
14296	{
14297	  uint64_t isa_temp = cpu->flags;
14298
14299
14300	  if (ext != NULL)
14301	    {
14302	      /* TO_PARSE string contains at least one extension.  */
14303	      enum aarch64_parse_opt_result ext_res
14304		= aarch64_parse_extension (ext, &isa_temp, invalid_extension);
14305
14306	      if (ext_res != AARCH64_PARSE_OK)
14307		return ext_res;
14308	    }
14309	  /* Extension parsing was successful.  Confirm the result
14310	     cpu and ISA flags.  */
14311	  *res = cpu;
14312	  *isa_flags = isa_temp;
14313	  return AARCH64_PARSE_OK;
14314	}
14315    }
14316
14317  /* CPU name not found in list.  */
14318  return AARCH64_PARSE_INVALID_ARG;
14319}
14320
14321/* Parse the TO_PARSE string and put the cpu it selects into RES.
14322   Return an aarch64_parse_opt_result describing the parse result.
14323   If the parsing fails, RES does not change.  */
14324
14325static enum aarch64_parse_opt_result
14326aarch64_parse_tune (const char *to_parse, const struct processor **res)
14327{
14328  const struct processor *cpu;
14329
14330  /* Loop through the list of supported CPUs to find a match.  */
14331  for (cpu = all_cores; cpu->name != NULL; cpu++)
14332    {
14333      if (strcmp (cpu->name, to_parse) == 0)
14334	{
14335	  *res = cpu;
14336	  return AARCH64_PARSE_OK;
14337	}
14338    }
14339
14340  /* CPU name not found in list.  */
14341  return AARCH64_PARSE_INVALID_ARG;
14342}
14343
14344/* Parse TOKEN, which has length LENGTH, to see if it is an option
14345   described in FLAG.  If it is, return the index bit for that fusion type.
14346   If not, report an error (printing OPTION_NAME) and return zero.  */
14347
14348static unsigned int
14349aarch64_parse_one_option_token (const char *token,
14350				size_t length,
14351				const struct aarch64_flag_desc *flag,
14352				const char *option_name)
14353{
14354  for (; flag->name != NULL; flag++)
14355    {
14356      if (length == strlen (flag->name)
14357	  && !strncmp (flag->name, token, length))
14358	return flag->flag;
14359    }
14360
14361  error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
14362  return 0;
14363}
14364
14365/* Parse OPTION, which is a '.'-separated list of flags to enable.
14366   FLAGS gives the list of flags we understand, INITIAL_STATE gives any
14367   default state we inherit from the CPU tuning structures.  OPTION_NAME
14368   gives the top-level option we are parsing in the -moverride string,
14369   for use in error messages.  */
14370
14371static unsigned int
14372aarch64_parse_boolean_options (const char *option,
14373			       const struct aarch64_flag_desc *flags,
14374			       unsigned int initial_state,
14375			       const char *option_name)
14376{
14377  const char separator = '.';
14378  const char* specs = option;
14379  const char* ntoken = option;
14380  unsigned int found_flags = initial_state;
14381
14382  while ((ntoken = strchr (specs, separator)))
14383    {
14384      size_t token_length = ntoken - specs;
14385      unsigned token_ops = aarch64_parse_one_option_token (specs,
14386							   token_length,
14387							   flags,
14388							   option_name);
14389      /* If we find "none" (or, for simplicity's sake, an error) anywhere
14390	 in the token stream, reset the supported operations.  So:
14391
14392	   adrp+add.cmp+branch.none.adrp+add
14393
14394	   would have the result of turning on only adrp+add fusion.  */
14395      if (!token_ops)
14396	found_flags = 0;
14397
14398      found_flags |= token_ops;
14399      specs = ++ntoken;
14400    }
14401
14402  /* The string ended with a separator, which is ill-formed; diagnose it.  */
14403  if (!(*specs))
14404    {
14405      error ("%s string ill-formed", option_name);
14406      return 0;
14407    }
14408
14409  /* We still have one more token to parse.  */
14410  size_t token_length = strlen (specs);
14411  unsigned token_ops = aarch64_parse_one_option_token (specs,
14412						       token_length,
14413						       flags,
14414						       option_name);
14415  if (!token_ops)
14416    found_flags = 0;
14417
14418  found_flags |= token_ops;
14419  return found_flags;
14420}
14421
14422/* Support for overriding instruction fusion.  */
14423
14424static void
14425aarch64_parse_fuse_string (const char *fuse_string,
14426			    struct tune_params *tune)
14427{
14428  tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
14429						     aarch64_fusible_pairs,
14430						     tune->fusible_ops,
14431						     "fuse=");
14432}
14433
14434/* Support for overriding other tuning flags.  */
14435
14436static void
14437aarch64_parse_tune_string (const char *tune_string,
14438			    struct tune_params *tune)
14439{
14440  tune->extra_tuning_flags
14441    = aarch64_parse_boolean_options (tune_string,
14442				     aarch64_tuning_flags,
14443				     tune->extra_tuning_flags,
14444				     "tune=");
14445}
14446
14447/* Parse the sve_width tuning moverride string in TUNE_STRING.
14448   Accept the valid SVE vector widths allowed by
14449   aarch64_sve_vector_bits_enum and use it to override sve_width
14450   in TUNE.  */
14451
14452static void
14453aarch64_parse_sve_width_string (const char *tune_string,
14454				struct tune_params *tune)
14455{
14456  int width = -1;
14457
14458  int n = sscanf (tune_string, "%d", &width);
14459  if (n == EOF)
14460    {
14461      error ("invalid format for sve_width");
14462      return;
14463    }
14464  switch (width)
14465    {
14466    case SVE_128:
14467    case SVE_256:
14468    case SVE_512:
14469    case SVE_1024:
14470    case SVE_2048:
14471      break;
14472    default:
14473      error ("invalid sve_width value: %d", width);
14474    }
14475  tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
14476}
14477
14478/* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
14479   we understand.  If it is, extract the option string and hand it off to
14480   the appropriate function.  */
14481
14482void
14483aarch64_parse_one_override_token (const char* token,
14484				  size_t length,
14485				  struct tune_params *tune)
14486{
14487  const struct aarch64_tuning_override_function *fn
14488    = aarch64_tuning_override_functions;
14489
14490  const char *option_part = strchr (token, '=');
14491  if (!option_part)
14492    {
14493      error ("tuning string missing in option (%s)", token);
14494      return;
14495    }
14496
14497  /* Get the length of the option name.  */
14498  length = option_part - token;
14499  /* Skip the '=' to get to the option string.  */
14500  option_part++;
14501
14502  for (; fn->name != NULL; fn++)
14503    {
14504      if (!strncmp (fn->name, token, length))
14505	{
14506	  fn->parse_override (option_part, tune);
14507	  return;
14508	}
14509    }
14510
14511  error ("unknown tuning option (%s)", token);
14512  return;
14513}
14514
14515/* Validate and clamp the TLS size according to the code model in OPTS.  */
14516
14517static void
14518initialize_aarch64_tls_size (struct gcc_options *opts)
14519{
14520  if (aarch64_tls_size == 0)
14521    aarch64_tls_size = 24;
14522
14523  switch (opts->x_aarch64_cmodel_var)
14524    {
14525    case AARCH64_CMODEL_TINY:
14526      /* Both the default and maximum TLS size allowed under tiny are 1M,
14527	 which needs two instructions to address, so we clamp the size to 24.  */
14528      if (aarch64_tls_size > 24)
14529	aarch64_tls_size = 24;
14530      break;
14531    case AARCH64_CMODEL_SMALL:
14532      /* The maximum TLS size allowed under small is 4G.  */
14533      if (aarch64_tls_size > 32)
14534	aarch64_tls_size = 32;
14535      break;
14536    case AARCH64_CMODEL_LARGE:
14537      /* The maximum TLS size allowed under large is 16E.
14538	 FIXME: 16E requires a 64-bit offset; we only support 48 bits now.  */
14539      if (aarch64_tls_size > 48)
14540	aarch64_tls_size = 48;
14541      break;
14542    default:
14543      gcc_unreachable ();
14544    }
14545
14546  return;
14547}
14548
14549/* Parse STRING looking for options in the format:
14550     string	:: option:string
14551     option	:: name=substring
14552     name	:: {a-z}
14553     substring	:: defined by option.  */
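/* For example (illustrative), an override string such as

     -moverride=fuse=adrp+add.cmp+branch:sve_width=256

   is split on ':' into the options "fuse=adrp+add.cmp+branch" and
   "sve_width=256", each of which is handed to its parser above.  */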
14554
14555static void
14556aarch64_parse_override_string (const char* input_string,
14557			       struct tune_params* tune)
14558{
14559  const char separator = ':';
14560  size_t string_length = strlen (input_string) + 1;
14561  char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
14562  char *string = string_root;
14563  strncpy (string, input_string, string_length);
14564  string[string_length - 1] = '\0';
14565
14566  char* ntoken = string;
14567
14568  while ((ntoken = strchr (string, separator)))
14569    {
14570      size_t token_length = ntoken - string;
14571      /* Make this substring look like a string.  */
14572      *ntoken = '\0';
14573      aarch64_parse_one_override_token (string, token_length, tune);
14574      string = ++ntoken;
14575    }
14576
14577  /* One last option to parse.  */
14578  aarch64_parse_one_override_token (string, strlen (string), tune);
14579  free (string_root);
14580}
14581
14582/* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
14583   are best for a generic target with the currently-enabled architecture
14584   extensions.  */
14585static void
14586aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
14587{
14588  /* Neoverse V1 is the only core that is known to benefit from
14589     AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS.  There is therefore no
14590     point enabling it for SVE2 and above.  */
14591  if (TARGET_SVE2)
14592    current_tune.extra_tuning_flags
14593      &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
14594}
14595
14596static void
14597aarch64_override_options_after_change_1 (struct gcc_options *opts)
14598{
14599  if (accepted_branch_protection_string)
14600    {
14601      opts->x_aarch64_branch_protection_string
14602	= xstrdup (accepted_branch_protection_string);
14603    }
14604
14605  /* PR 70044: We have to be careful about being called multiple times for the
14606     same function.  This means all changes should be repeatable.  */
14607
14608  /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
14609     Disable the frame pointer flag so the mid-end will not use a frame
14610     pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
14611     Set x_flag_omit_frame_pointer to the special value 2 to differentiate
14612     between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2).  */
14613  aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
14614  if (opts->x_flag_omit_frame_pointer == 0)
14615    opts->x_flag_omit_frame_pointer = 2;
14616
14617  /* If not optimizing for size, set the default
14618     alignment to what the target wants.  */
14619  if (!opts->x_optimize_size)
14620    {
14621      if (opts->x_flag_align_loops && !opts->x_str_align_loops)
14622	opts->x_str_align_loops = aarch64_tune_params.loop_align;
14623      if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
14624	opts->x_str_align_jumps = aarch64_tune_params.jump_align;
14625      if (opts->x_flag_align_functions && !opts->x_str_align_functions)
14626	opts->x_str_align_functions = aarch64_tune_params.function_align;
14627    }
14628
14629  /* We default to no pc-relative literal loads.  */
14630
14631  aarch64_pcrelative_literal_loads = false;
14632
14633  /* If -mpc-relative-literal-loads is set on the command line, this
14634     implies that the user asked for PC relative literal loads.  */
14635  if (opts->x_pcrelative_literal_loads == 1)
14636    aarch64_pcrelative_literal_loads = true;
14637
14638  /* In the tiny memory model it makes no sense to disallow PC relative
14639     literal pool loads.  */
14640  if (aarch64_cmodel == AARCH64_CMODEL_TINY
14641      || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14642    aarch64_pcrelative_literal_loads = true;
14643
14644  /* When enabling the lower precision Newton series for the square root, also
14645     enable it for the reciprocal square root, since the latter is an
14646     intermediary step for the former.  */
14647  if (flag_mlow_precision_sqrt)
14648    flag_mrecip_low_precision_sqrt = true;
14649}
14650
14651/* 'Unpack' the internal tuning structs and update the options
14652    in OPTS.  The caller must have set up selected_tune and selected_arch
14653    as all the other target-specific codegen decisions are
14654    derived from them.  */
14655
14656void
14657aarch64_override_options_internal (struct gcc_options *opts)
14658{
14659  aarch64_tune_flags = selected_tune->flags;
14660  aarch64_tune = selected_tune->sched_core;
14661  /* Make a copy of the tuning parameters attached to the core, which
14662     we may later overwrite.  */
14663  aarch64_tune_params = *(selected_tune->tune);
14664  aarch64_architecture_version = selected_arch->architecture_version;
14665  if (selected_tune->tune == &generic_tunings)
14666    aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
14667
14668  if (opts->x_aarch64_override_tune_string)
14669    aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
14670				  &aarch64_tune_params);
14671
14672  /* This target defaults to strict volatile bitfields.  */
14673  if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
14674    opts->x_flag_strict_volatile_bitfields = 1;
14675
14676  if (aarch64_stack_protector_guard == SSP_GLOBAL
14677      && opts->x_aarch64_stack_protector_guard_offset_str)
14678    {
14679      error ("incompatible options %<-mstack-protector-guard=global%> and "
14680	     "%<-mstack-protector-guard-offset=%s%>",
14681	     aarch64_stack_protector_guard_offset_str);
14682    }
14683
14684  if (aarch64_stack_protector_guard == SSP_SYSREG
14685      && !(opts->x_aarch64_stack_protector_guard_offset_str
14686	   && opts->x_aarch64_stack_protector_guard_reg_str))
14687    {
14688      error ("both %<-mstack-protector-guard-offset%> and "
14689	     "%<-mstack-protector-guard-reg%> must be used "
14690	     "with %<-mstack-protector-guard=sysreg%>");
14691    }
14692
14693  if (opts->x_aarch64_stack_protector_guard_reg_str)
14694    {
14695      if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
14696	  error ("%qs string is too long", "-mstack-protector-guard-reg=");
14697    }
14698
14699  if (opts->x_aarch64_stack_protector_guard_offset_str)
14700    {
14701      char *end;
14702      const char *str = aarch64_stack_protector_guard_offset_str;
14703      errno = 0;
14704      long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
14705      if (!*str || *end || errno)
14706	error ("%qs is not a valid offset in %qs", str,
14707	       "-mstack-protector-guard-offset=");
14708      aarch64_stack_protector_guard_offset = offs;
14709    }
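  /* For example (illustrative), a kernel-style command line might use

       -mstack-protector-guard=sysreg -mstack-protector-guard-reg=sp_el0
       -mstack-protector-guard-offset=0x28

     in which case the offset parsed above would be 0x28; the register name
     and offset here are purely illustrative.  */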
14710
14711  initialize_aarch64_code_model (opts);
14712  initialize_aarch64_tls_size (opts);
14713
14714  int queue_depth = 0;
14715  switch (aarch64_tune_params.autoprefetcher_model)
14716    {
14717      case tune_params::AUTOPREFETCHER_OFF:
14718	queue_depth = -1;
14719	break;
14720      case tune_params::AUTOPREFETCHER_WEAK:
14721	queue_depth = 0;
14722	break;
14723      case tune_params::AUTOPREFETCHER_STRONG:
14724	queue_depth = max_insn_queue_index + 1;
14725	break;
14726      default:
14727	gcc_unreachable ();
14728    }
14729
14730  /* We don't mind passing in global_options_set here as we don't use
14731     the *options_set structs anyway.  */
14732  SET_OPTION_IF_UNSET (opts, &global_options_set,
14733		       param_sched_autopref_queue_depth, queue_depth);
14734
14735  /* If the core wants only Advanced SIMD autovectorization, do this through
14736     aarch64_autovec_preference.  If the user set it explicitly, they should
14737     know what they want.  */
14738  if (aarch64_tune_params.extra_tuning_flags
14739      & AARCH64_EXTRA_TUNE_PREFER_ADVSIMD_AUTOVEC)
14740    SET_OPTION_IF_UNSET (opts, &global_options_set,
14741			 aarch64_autovec_preference, 1);
14742
14743  /* If using only Advanced SIMD for autovectorization, disable the SVE
14744     vector cost comparison.  */
14745  if (aarch64_autovec_preference == 1)
14746    SET_OPTION_IF_UNSET (opts, &global_options_set,
14747			 aarch64_sve_compare_costs, 0);
14748
14749  /* Set up parameters to be used in prefetching algorithm.  Do not
14750     override the defaults unless we are tuning for a core we have
14751     researched values for.  */
14752  if (aarch64_tune_params.prefetch->num_slots > 0)
14753    SET_OPTION_IF_UNSET (opts, &global_options_set,
14754			 param_simultaneous_prefetches,
14755			 aarch64_tune_params.prefetch->num_slots);
14756  if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
14757    SET_OPTION_IF_UNSET (opts, &global_options_set,
14758			 param_l1_cache_size,
14759			 aarch64_tune_params.prefetch->l1_cache_size);
14760  if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
14761    SET_OPTION_IF_UNSET (opts, &global_options_set,
14762			 param_l1_cache_line_size,
14763			 aarch64_tune_params.prefetch->l1_cache_line_size);
14764  if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
14765    SET_OPTION_IF_UNSET (opts, &global_options_set,
14766			 param_l2_cache_size,
14767			 aarch64_tune_params.prefetch->l2_cache_size);
14768  if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
14769    SET_OPTION_IF_UNSET (opts, &global_options_set,
14770			 param_prefetch_dynamic_strides, 0);
14771  if (aarch64_tune_params.prefetch->minimum_stride >= 0)
14772    SET_OPTION_IF_UNSET (opts, &global_options_set,
14773			 param_prefetch_minimum_stride,
14774			 aarch64_tune_params.prefetch->minimum_stride);
14775
14776  /* Use the alternative scheduling-pressure algorithm by default.  */
14777  SET_OPTION_IF_UNSET (opts, &global_options_set,
14778		       param_sched_pressure_algorithm,
14779		       SCHED_PRESSURE_MODEL);
14780
14781  /* Validate the guard size.  */
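  /* The guard size is given as the base-2 logarithm of the guard region
     in bytes, so 12 selects a 4 KB guard and 16 a 64 KB guard.  */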
14782  int guard_size = param_stack_clash_protection_guard_size;
14783
14784  if (guard_size != 12 && guard_size != 16)
14785    error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
14786	   "size.  Given value %d (%llu KB) is out of range",
14787	   guard_size, (1ULL << guard_size) / 1024ULL);
14788
14789  /* Enforce that interval is the same size as size so the mid-end does the
14790     right thing.  */
14791  SET_OPTION_IF_UNSET (opts, &global_options_set,
14792		       param_stack_clash_protection_probe_interval,
14793		       guard_size);
14794
14795  /* The maybe_set calls won't update the value if the user has explicitly set
14796     one.  Which means we need to validate that probing interval and guard size
14797     are equal.  */
14798  int probe_interval
14799    = param_stack_clash_protection_probe_interval;
14800  if (guard_size != probe_interval)
14801    error ("stack clash guard size %<%d%> must be equal to probing interval "
14802	   "%<%d%>", guard_size, probe_interval);
14803
14804  /* Enable sw prefetching at specified optimization level for
14805     CPUS that have prefetch.  Lower optimization level threshold by 1
14806     when profiling is enabled.  */
14807  if (opts->x_flag_prefetch_loop_arrays < 0
14808      && !opts->x_optimize_size
14809      && aarch64_tune_params.prefetch->default_opt_level >= 0
14810      && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
14811    opts->x_flag_prefetch_loop_arrays = 1;
14812
14813  if (opts->x_aarch64_arch_string == NULL)
14814    opts->x_aarch64_arch_string = selected_arch->name;
14815  if (opts->x_aarch64_cpu_string == NULL)
14816    opts->x_aarch64_cpu_string = selected_cpu->name;
14817  if (opts->x_aarch64_tune_string == NULL)
14818    opts->x_aarch64_tune_string = selected_tune->name;
14819
14820  aarch64_override_options_after_change_1 (opts);
14821}
14822
14823/* Print a hint with a suggestion for a core or architecture name that
14824   most closely resembles what the user passed in STR.  ARCH is true if
14825   the user is asking for an architecture name.  ARCH is false if the user
14826   is asking for a core name.  */
14827
14828static void
14829aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
14830{
14831  auto_vec<const char *> candidates;
14832  const struct processor *entry = arch ? all_architectures : all_cores;
14833  for (; entry->name != NULL; entry++)
14834    candidates.safe_push (entry->name);
14835
14836#ifdef HAVE_LOCAL_CPU_DETECT
  /* Also add "native" as a possible value.  */
14838  if (arch)
14839    candidates.safe_push ("native");
14840#endif
14841
14842  char *s;
14843  const char *hint = candidates_list_and_hint (str, s, candidates);
14844  if (hint)
14845    inform (input_location, "valid arguments are: %s;"
14846			     " did you mean %qs?", s, hint);
14847  else
14848    inform (input_location, "valid arguments are: %s", s);
14849
14850  XDELETEVEC (s);
14851}
14852
14853/* Print a hint with a suggestion for a core name that most closely resembles
14854   what the user passed in STR.  */
14855
14856inline static void
14857aarch64_print_hint_for_core (const char *str)
14858{
14859  aarch64_print_hint_for_core_or_arch (str, false);
14860}
14861
14862/* Print a hint with a suggestion for an architecture name that most closely
14863   resembles what the user passed in STR.  */
14864
14865inline static void
14866aarch64_print_hint_for_arch (const char *str)
14867{
14868  aarch64_print_hint_for_core_or_arch (str, true);
14869}
14870
14871
14872/* Print a hint with a suggestion for an extension name
14873   that most closely resembles what the user passed in STR.  */
14874
14875void
14876aarch64_print_hint_for_extensions (const std::string &str)
14877{
14878  auto_vec<const char *> candidates;
14879  aarch64_get_all_extension_candidates (&candidates);
14880  char *s;
14881  const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
14882  if (hint)
14883    inform (input_location, "valid arguments are: %s;"
14884			     " did you mean %qs?", s, hint);
14885  else
    inform (input_location, "valid arguments are: %s", s);
14887
14888  XDELETEVEC (s);
14889}
14890
14891/* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
14892   specified in STR and throw errors if appropriate.  Put the results if
14893   they are valid in RES and ISA_FLAGS.  Return whether the option is
14894   valid.  */
14895
14896static bool
14897aarch64_validate_mcpu (const char *str, const struct processor **res,
14898		       uint64_t *isa_flags)
14899{
14900  std::string invalid_extension;
14901  enum aarch64_parse_opt_result parse_res
14902    = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
14903
14904  if (parse_res == AARCH64_PARSE_OK)
14905    return true;
14906
14907  switch (parse_res)
14908    {
14909      case AARCH64_PARSE_MISSING_ARG:
14910	error ("missing cpu name in %<-mcpu=%s%>", str);
14911	break;
14912      case AARCH64_PARSE_INVALID_ARG:
14913	error ("unknown value %qs for %<-mcpu%>", str);
14914	aarch64_print_hint_for_core (str);
14915	break;
14916      case AARCH64_PARSE_INVALID_FEATURE:
14917	error ("invalid feature modifier %qs in %<-mcpu=%s%>",
14918	       invalid_extension.c_str (), str);
14919	aarch64_print_hint_for_extensions (invalid_extension);
14920	break;
14921      default:
14922	gcc_unreachable ();
14923    }
14924
14925  return false;
14926}
14927
14928/* Straight line speculation indicators.  */
14929enum aarch64_sls_hardening_type
14930{
14931  SLS_NONE = 0,
14932  SLS_RETBR = 1,
14933  SLS_BLR = 2,
14934  SLS_ALL = 3,
14935};
14936static enum aarch64_sls_hardening_type aarch64_sls_hardening;
14937
/* Return whether we should mitigate Straight Line Speculation for the RET
14939   and BR instructions.  */
14940bool
14941aarch64_harden_sls_retbr_p (void)
14942{
14943  return aarch64_sls_hardening & SLS_RETBR;
14944}
14945
/* Return whether we should mitigate Straight Line Speculation for the BLR
14947   instruction.  */
14948bool
14949aarch64_harden_sls_blr_p (void)
14950{
14951  return aarch64_sls_hardening & SLS_BLR;
14952}
14953
/* As yet we only allow setting these options globally; in the future we may
   allow setting them per function.  */
14956static void
14957aarch64_validate_sls_mitigation (const char *const_str)
14958{
14959  char *token_save = NULL;
14960  char *str = NULL;
14961
14962  if (strcmp (const_str, "none") == 0)
14963    {
14964      aarch64_sls_hardening = SLS_NONE;
14965      return;
14966    }
14967  if (strcmp (const_str, "all") == 0)
14968    {
14969      aarch64_sls_hardening = SLS_ALL;
14970      return;
14971    }
14972
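  /* Otherwise the argument is a comma-separated list of "retbr" and/or
     "blr"; e.g. "retbr,blr" is equivalent to "all".  */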
14973  char *str_root = xstrdup (const_str);
14974  str = strtok_r (str_root, ",", &token_save);
14975  if (!str)
14976    error ("invalid argument given to %<-mharden-sls=%>");
14977
14978  int temp = SLS_NONE;
14979  while (str)
14980    {
14981      if (strcmp (str, "blr") == 0)
14982	temp |= SLS_BLR;
14983      else if (strcmp (str, "retbr") == 0)
14984	temp |= SLS_RETBR;
14985      else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
14986	{
14987	  error ("%<%s%> must be by itself for %<-mharden-sls=%>", str);
14988	  break;
14989	}
14990      else
14991	{
14992	  error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
14993	  break;
14994	}
14995      str = strtok_r (NULL, ",", &token_save);
14996    }
14997  aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
14998  free (str_root);
14999}
15000
/* Parses CONST_STR for branch protection features specified in
   aarch64_branch_protect_types, and sets any global variables required.
   Returns the parsing result and assigns LAST_STR to the last processed
   token from CONST_STR so that it can be used for error reporting.  */
15005
static enum aarch64_parse_opt_result
aarch64_parse_branch_protection (const char *const_str, char **last_str)
15009{
15010  char *str_root = xstrdup (const_str);
15011  char* token_save = NULL;
15012  char *str = strtok_r (str_root, "+", &token_save);
15013  enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
15014  if (!str)
15015    res = AARCH64_PARSE_MISSING_ARG;
15016  else
15017    {
15018      char *next_str = strtok_r (NULL, "+", &token_save);
15019      /* Reset the branch protection features to their defaults.  */
15020      aarch64_handle_no_branch_protection (NULL, NULL);
15021
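      /* Walk the "+"-separated tokens.  Each token must name either a
	 top-level protection type or a subtype of the type that preceded
	 it; e.g. in "pac-ret+leaf+bti", "leaf" is accepted as a subtype of
	 "pac-ret" and "bti" as another top-level type.  */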
15022      while (str && res == AARCH64_PARSE_OK)
15023	{
15024	  const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
15025	  bool found = false;
15026	  /* Search for this type.  */
15027	  while (type && type->name && !found && res == AARCH64_PARSE_OK)
15028	    {
15029	      if (strcmp (str, type->name) == 0)
15030		{
15031		  found = true;
15032		  res = type->handler (str, next_str);
15033		  str = next_str;
15034		  next_str = strtok_r (NULL, "+", &token_save);
15035		}
15036	      else
15037		type++;
15038	    }
15039	  if (found && res == AARCH64_PARSE_OK)
15040	    {
15041	      bool found_subtype = true;
15042	      /* Loop through each token until we find one that isn't a
15043		 subtype.  */
15044	      while (found_subtype)
15045		{
15046		  found_subtype = false;
15047		  const aarch64_branch_protect_type *subtype = type->subtypes;
15048		  /* Search for the subtype.  */
15049		  while (str && subtype && subtype->name && !found_subtype
15050			  && res == AARCH64_PARSE_OK)
15051		    {
15052		      if (strcmp (str, subtype->name) == 0)
15053			{
15054			  found_subtype = true;
15055			  res = subtype->handler (str, next_str);
15056			  str = next_str;
15057			  next_str = strtok_r (NULL, "+", &token_save);
15058			}
15059		      else
15060			subtype++;
15061		    }
15062		}
15063	    }
15064	  else if (!found)
15065	    res = AARCH64_PARSE_INVALID_ARG;
15066	}
15067    }
15068  /* Copy the last processed token into the argument to pass it back.
15069    Used by option and attribute validation to print the offending token.  */
15070  if (last_str)
15071    {
      if (str)
	strcpy (*last_str, str);
      else
	*last_str = NULL;
15074    }
15075  if (res == AARCH64_PARSE_OK)
15076    {
15077      /* If needed, alloc the accepted string then copy in const_str.
15078	Used by override_option_after_change_1.  */
15079      if (!accepted_branch_protection_string)
15080	accepted_branch_protection_string = (char *) xmalloc (
15081						      BRANCH_PROTECT_STR_MAX
15082							+ 1);
15083      strncpy (accepted_branch_protection_string, const_str,
15084		BRANCH_PROTECT_STR_MAX + 1);
15085      /* Forcibly null-terminate.  */
15086      accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
15087    }
15088  return res;
15089}
15090
15091static bool
15092aarch64_validate_mbranch_protection (const char *const_str)
15093{
  char *str = (char *) xmalloc (strlen (const_str) + 1);
15095  enum aarch64_parse_opt_result res =
15096    aarch64_parse_branch_protection (const_str, &str);
15097  if (res == AARCH64_PARSE_INVALID_ARG)
15098    error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
15099  else if (res == AARCH64_PARSE_MISSING_ARG)
15100    error ("missing argument for %<-mbranch-protection=%>");
15101  free (str);
15102  return res == AARCH64_PARSE_OK;
15103}
15104
15105/* Validate a command-line -march option.  Parse the arch and extensions
15106   (if any) specified in STR and throw errors if appropriate.  Put the
15107   results, if they are valid, in RES and ISA_FLAGS.  Return whether the
15108   option is valid.  */
15109
15110static bool
15111aarch64_validate_march (const char *str, const struct processor **res,
15112			 uint64_t *isa_flags)
15113{
15114  std::string invalid_extension;
15115  enum aarch64_parse_opt_result parse_res
15116    = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
15117
15118  if (parse_res == AARCH64_PARSE_OK)
15119    return true;
15120
15121  switch (parse_res)
15122    {
15123      case AARCH64_PARSE_MISSING_ARG:
15124	error ("missing arch name in %<-march=%s%>", str);
15125	break;
15126      case AARCH64_PARSE_INVALID_ARG:
15127	error ("unknown value %qs for %<-march%>", str);
15128	aarch64_print_hint_for_arch (str);
15129	break;
15130      case AARCH64_PARSE_INVALID_FEATURE:
15131	error ("invalid feature modifier %qs in %<-march=%s%>",
15132	       invalid_extension.c_str (), str);
15133	aarch64_print_hint_for_extensions (invalid_extension);
15134	break;
15135      default:
15136	gcc_unreachable ();
15137    }
15138
15139  return false;
15140}
15141
15142/* Validate a command-line -mtune option.  Parse the cpu
15143   specified in STR and throw errors if appropriate.  Put the
15144   result, if it is valid, in RES.  Return whether the option is
15145   valid.  */
15146
15147static bool
15148aarch64_validate_mtune (const char *str, const struct processor **res)
15149{
15150  enum aarch64_parse_opt_result parse_res
15151    = aarch64_parse_tune (str, res);
15152
15153  if (parse_res == AARCH64_PARSE_OK)
15154    return true;
15155
15156  switch (parse_res)
15157    {
15158      case AARCH64_PARSE_MISSING_ARG:
15159	error ("missing cpu name in %<-mtune=%s%>", str);
15160	break;
15161      case AARCH64_PARSE_INVALID_ARG:
15162	error ("unknown value %qs for %<-mtune%>", str);
15163	aarch64_print_hint_for_core (str);
15164	break;
15165      default:
15166	gcc_unreachable ();
15167    }
15168  return false;
15169}
15170
15171/* Return the CPU corresponding to the enum CPU.
15172   If it doesn't specify a cpu, return the default.  */
15173
15174static const struct processor *
15175aarch64_get_tune_cpu (enum aarch64_processor cpu)
15176{
15177  if (cpu != aarch64_none)
15178    return &all_cores[cpu];
15179
15180  /* The & 0x3f is to extract the bottom 6 bits that encode the
15181     default cpu as selected by the --with-cpu GCC configure option
15182     in config.gcc.
15183     ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
15184     flags mechanism should be reworked to make it more sane.  */
15185  return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
15186}
15187
15188/* Return the architecture corresponding to the enum ARCH.
15189   If it doesn't specify a valid architecture, return the default.  */
15190
15191static const struct processor *
15192aarch64_get_arch (enum aarch64_arch arch)
15193{
15194  if (arch != aarch64_no_arch)
15195    return &all_architectures[arch];
15196
15197  const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
15198
15199  return &all_architectures[cpu->arch];
15200}
15201
15202/* Return the VG value associated with -msve-vector-bits= value VALUE.  */
15203
15204static poly_uint16
15205aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
15206{
15207  /* 128-bit SVE and Advanced SIMD modes use different register layouts
15208     on big-endian targets, so we would need to forbid subregs that convert
15209     from one to the other.  By default a reinterpret sequence would then
15210     involve a store to memory in one mode and a load back in the other.
15211     Even if we optimize that sequence using reverse instructions,
15212     it would still be a significant potential overhead.
15213
15214     For now, it seems better to generate length-agnostic code for that
15215     case instead.  */
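  /* SVE_SCALABLE (and the big-endian 128-bit case described above) thus
     maps to the runtime-variable VG of poly_uint16 (2, 2).  Every other
     setting requests a fixed length, for which VG is the bit count divided
     by 64 (the number of 64-bit granules); e.g. -msve-vector-bits=512
     gives a VG of 8.  */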
15216  if (value == SVE_SCALABLE
15217      || (value == SVE_128 && BYTES_BIG_ENDIAN))
15218    return poly_uint16 (2, 2);
15219  else
15220    return (int) value / 64;
15221}
15222
15223/* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
   and is used to parse the -m{cpu,tune,arch} strings and set up the initial
15225   tuning structs.  In particular it must set selected_tune and
15226   aarch64_isa_flags that define the available ISA features and tuning
15227   decisions.  It must also set selected_arch as this will be used to
15228   output the .arch asm tags for each function.  */
15229
15230static void
15231aarch64_override_options (void)
15232{
15233  uint64_t cpu_isa = 0;
15234  uint64_t arch_isa = 0;
15235  aarch64_isa_flags = 0;
15236
15237  bool valid_cpu = true;
15238  bool valid_tune = true;
15239  bool valid_arch = true;
15240
15241  selected_cpu = NULL;
15242  selected_arch = NULL;
15243  selected_tune = NULL;
15244
15245  if (aarch64_harden_sls_string)
15246    aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
15247
15248  if (aarch64_branch_protection_string)
15249    aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
15250
15251  /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
15252     If either of -march or -mtune is given, they override their
15253     respective component of -mcpu.  */
15254  if (aarch64_cpu_string)
15255    valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
15256					&cpu_isa);
15257
15258  if (aarch64_arch_string)
15259    valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
15260					  &arch_isa);
15261
15262  if (aarch64_tune_string)
15263    valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
15264
15265#ifdef SUBTARGET_OVERRIDE_OPTIONS
15266  SUBTARGET_OVERRIDE_OPTIONS;
15267#endif
15268
15269  /* If the user did not specify a processor, choose the default
15270     one for them.  This will be the CPU set during configuration using
15271     --with-cpu, otherwise it is "generic".  */
15272  if (!selected_cpu)
15273    {
15274      if (selected_arch)
15275	{
15276	  selected_cpu = &all_cores[selected_arch->ident];
15277	  aarch64_isa_flags = arch_isa;
15278	  explicit_arch = selected_arch->arch;
15279	}
15280      else
15281	{
15282	  /* Get default configure-time CPU.  */
15283	  selected_cpu = aarch64_get_tune_cpu (aarch64_none);
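	  /* TARGET_CPU_DEFAULT packs the default ISA flags above the 6-bit
	     CPU index used by aarch64_get_tune_cpu, so shifting right by 6
	     recovers those flags.  */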
15284	  aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
15285	}
15286
15287      if (selected_tune)
15288	explicit_tune_core = selected_tune->ident;
15289    }
15290  /* If both -mcpu and -march are specified check that they are architecturally
15291     compatible, warn if they're not and prefer the -march ISA flags.  */
15292  else if (selected_arch)
15293    {
15294      if (selected_arch->arch != selected_cpu->arch)
15295	{
15296	  warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
15297		       aarch64_cpu_string,
15298		       aarch64_arch_string);
15299	}
15300      aarch64_isa_flags = arch_isa;
15301      explicit_arch = selected_arch->arch;
15302      explicit_tune_core = selected_tune ? selected_tune->ident
15303					  : selected_cpu->ident;
15304    }
15305  else
15306    {
15307      /* -mcpu but no -march.  */
15308      aarch64_isa_flags = cpu_isa;
15309      explicit_tune_core = selected_tune ? selected_tune->ident
15310					  : selected_cpu->ident;
15311      gcc_assert (selected_cpu);
15312      selected_arch = &all_architectures[selected_cpu->arch];
15313      explicit_arch = selected_arch->arch;
15314    }
15315
  /* Set the arch as well, as we will need it when outputting
     the .arch directive in assembly.  */
15318  if (!selected_arch)
15319    {
15320      gcc_assert (selected_cpu);
15321      selected_arch = &all_architectures[selected_cpu->arch];
15322    }
15323
15324  if (!selected_tune)
15325    selected_tune = selected_cpu;
15326
15327  if (aarch64_enable_bti == 2)
15328    {
15329#ifdef TARGET_ENABLE_BTI
15330      aarch64_enable_bti = 1;
15331#else
15332      aarch64_enable_bti = 0;
15333#endif
15334    }
15335
15336  /* Return address signing is currently not supported for ILP32 targets.  For
15337     LP64 targets use the configured option in the absence of a command-line
15338     option for -mbranch-protection.  */
15339  if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
15340    {
15341#ifdef TARGET_ENABLE_PAC_RET
15342      aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
15343#else
15344      aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
15345#endif
15346    }
15347
15348#ifndef HAVE_AS_MABI_OPTION
15349  /* The compiler may have been configured with 2.23.* binutils, which does
15350     not have support for ILP32.  */
15351  if (TARGET_ILP32)
15352    error ("assembler does not support %<-mabi=ilp32%>");
15353#endif
15354
15355  /* Convert -msve-vector-bits to a VG count.  */
15356  aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
15357
15358  if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
15359    sorry ("return address signing is only supported for %<-mabi=lp64%>");
15360
15361  /* Make sure we properly set up the explicit options.  */
15362  if ((aarch64_cpu_string && valid_cpu)
15363       || (aarch64_tune_string && valid_tune))
15364    gcc_assert (explicit_tune_core != aarch64_none);
15365
15366  if ((aarch64_cpu_string && valid_cpu)
15367       || (aarch64_arch_string && valid_arch))
15368    gcc_assert (explicit_arch != aarch64_no_arch);
15369
15370  /* The pass to insert speculation tracking runs before
15371     shrink-wrapping and the latter does not know how to update the
15372     tracking status.  So disable it in this case.  */
15373  if (aarch64_track_speculation)
15374    flag_shrink_wrap = 0;
15375
15376  aarch64_override_options_internal (&global_options);
15377
15378  /* Save these options as the default ones in case we push and pop them later
15379     while processing functions with potential target attributes.  */
15380  target_option_default_node = target_option_current_node
15381      = build_target_option_node (&global_options);
15382}
15383
15384/* Implement targetm.override_options_after_change.  */
15385
15386static void
15387aarch64_override_options_after_change (void)
15388{
15389  aarch64_override_options_after_change_1 (&global_options);
15390}
15391
15392static struct machine_function *
15393aarch64_init_machine_status (void)
15394{
15395  struct machine_function *machine;
15396  machine = ggc_cleared_alloc<machine_function> ();
15397  return machine;
15398}
15399
15400void
15401aarch64_init_expanders (void)
15402{
15403  init_machine_status = aarch64_init_machine_status;
15404}
15405
/* Set aarch64_cmodel from the code model selected in OPTS, adjusting it
   for PIC where necessary and diagnosing unsupported combinations.  */
15407static void
15408initialize_aarch64_code_model (struct gcc_options *opts)
15409{
15410  aarch64_cmodel = opts->x_aarch64_cmodel_var;
15411  switch (opts->x_aarch64_cmodel_var)
15412    {
15413    case AARCH64_CMODEL_TINY:
15414      if (opts->x_flag_pic)
15415	aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
15416      break;
15417    case AARCH64_CMODEL_SMALL:
15418      if (opts->x_flag_pic)
15419	{
15420#ifdef HAVE_AS_SMALL_PIC_RELOCS
15421	  aarch64_cmodel = (flag_pic == 2
15422			    ? AARCH64_CMODEL_SMALL_PIC
15423			    : AARCH64_CMODEL_SMALL_SPIC);
15424#else
15425	  aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
15426#endif
15427	}
15428      break;
15429    case AARCH64_CMODEL_LARGE:
15430      if (opts->x_flag_pic)
15431	sorry ("code model %qs with %<-f%s%>", "large",
15432	       opts->x_flag_pic > 1 ? "PIC" : "pic");
15433      if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
15434	sorry ("code model %qs not supported in ilp32 mode", "large");
15435      break;
15436    case AARCH64_CMODEL_TINY_PIC:
15437    case AARCH64_CMODEL_SMALL_PIC:
15438    case AARCH64_CMODEL_SMALL_SPIC:
15439      gcc_unreachable ();
15440    }
15441}
15442
15443/* Implement TARGET_OPTION_SAVE.  */
15444
15445static void
15446aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
15447{
15448  ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
15449  ptr->x_aarch64_branch_protection_string
15450    = opts->x_aarch64_branch_protection_string;
15451}
15452
15453/* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
15454   using the information saved in PTR.  */
15455
15456static void
15457aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
15458{
15459  opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
15460  selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
15461  opts->x_explicit_arch = ptr->x_explicit_arch;
15462  selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
15463  opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
15464  opts->x_aarch64_branch_protection_string
15465    = ptr->x_aarch64_branch_protection_string;
15466  if (opts->x_aarch64_branch_protection_string)
15467    {
15468      aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
15469					NULL);
15470    }
15471
15472  aarch64_override_options_internal (opts);
15473}
15474
15475/* Implement TARGET_OPTION_PRINT.  */
15476
15477static void
15478aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
15479{
15480  const struct processor *cpu
15481    = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
15482  uint64_t isa_flags = ptr->x_aarch64_isa_flags;
15483  const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
15484  std::string extension
15485    = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
15486
15487  fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
15488  fprintf (file, "%*sselected arch = %s%s\n", indent, "",
15489	   arch->name, extension.c_str ());
15490}
15491
15492static GTY(()) tree aarch64_previous_fndecl;
15493
15494void
15495aarch64_reset_previous_fndecl (void)
15496{
15497  aarch64_previous_fndecl = NULL;
15498}
15499
15500/* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
15501   Used by aarch64_set_current_function and aarch64_pragma_target_parse to
15502   make sure optab availability predicates are recomputed when necessary.  */
15503
15504void
15505aarch64_save_restore_target_globals (tree new_tree)
15506{
15507  if (TREE_TARGET_GLOBALS (new_tree))
15508    restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
15509  else if (new_tree == target_option_default_node)
15510    restore_target_globals (&default_target_globals);
15511  else
15512    TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
15513}
15514
15515/* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
15516   like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
15517   of the function, if such exists.  This function may be called multiple
15518   times on a single function so use aarch64_previous_fndecl to avoid
15519   setting up identical state.  */
15520
15521static void
15522aarch64_set_current_function (tree fndecl)
15523{
15524  if (!fndecl || fndecl == aarch64_previous_fndecl)
15525    return;
15526
15527  tree old_tree = (aarch64_previous_fndecl
15528		   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
15529		   : NULL_TREE);
15530
15531  tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15532
15533  /* If current function has no attributes but the previous one did,
15534     use the default node.  */
15535  if (!new_tree && old_tree)
15536    new_tree = target_option_default_node;
15537
15538  /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
15539     the default have been handled by aarch64_save_restore_target_globals from
15540     aarch64_pragma_target_parse.  */
15541  if (old_tree == new_tree)
15542    return;
15543
15544  aarch64_previous_fndecl = fndecl;
15545
15546  /* First set the target options.  */
15547  cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
15548
15549  aarch64_save_restore_target_globals (new_tree);
15550}
15551
15552/* Enum describing the various ways we can handle attributes.
15553   In many cases we can reuse the generic option handling machinery.  */
15554
15555enum aarch64_attr_opt_type
15556{
15557  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
15558  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
15559  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
15560  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
15561};
15562
15563/* All the information needed to handle a target attribute.
15564   NAME is the name of the attribute.
15565   ATTR_TYPE specifies the type of behavior of the attribute as described
15566   in the definition of enum aarch64_attr_opt_type.
15567   ALLOW_NEG is true if the attribute supports a "no-" form.
   HANDLER is the function that takes the attribute string as an argument.
   It is needed only when the ATTR_TYPE is aarch64_attr_custom.
15570   OPT_NUM is the enum specifying the option that the attribute modifies.
15571   This is needed for attributes that mirror the behavior of a command-line
   option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
15573   aarch64_attr_enum.  */
15574
15575struct aarch64_attribute_info
15576{
15577  const char *name;
15578  enum aarch64_attr_opt_type attr_type;
15579  bool allow_neg;
15580  bool (*handler) (const char *);
15581  enum opt_code opt_num;
15582};
15583
15584/* Handle the ARCH_STR argument to the arch= target attribute.  */
15585
15586static bool
15587aarch64_handle_attr_arch (const char *str)
15588{
15589  const struct processor *tmp_arch = NULL;
15590  std::string invalid_extension;
15591  enum aarch64_parse_opt_result parse_res
15592    = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
15593
15594  if (parse_res == AARCH64_PARSE_OK)
15595    {
15596      gcc_assert (tmp_arch);
15597      selected_arch = tmp_arch;
15598      explicit_arch = selected_arch->arch;
15599      return true;
15600    }
15601
15602  switch (parse_res)
15603    {
15604      case AARCH64_PARSE_MISSING_ARG:
15605	error ("missing name in %<target(\"arch=\")%> pragma or attribute");
15606	break;
15607      case AARCH64_PARSE_INVALID_ARG:
15608	error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
15609	aarch64_print_hint_for_arch (str);
15610	break;
15611      case AARCH64_PARSE_INVALID_FEATURE:
15612	error ("invalid feature modifier %s of value (\"%s\") in "
15613	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15614	aarch64_print_hint_for_extensions (invalid_extension);
15615	break;
15616      default:
15617	gcc_unreachable ();
15618    }
15619
15620  return false;
15621}
15622
15623/* Handle the argument CPU_STR to the cpu= target attribute.  */
15624
15625static bool
15626aarch64_handle_attr_cpu (const char *str)
15627{
15628  const struct processor *tmp_cpu = NULL;
15629  std::string invalid_extension;
15630  enum aarch64_parse_opt_result parse_res
15631    = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
15632
15633  if (parse_res == AARCH64_PARSE_OK)
15634    {
15635      gcc_assert (tmp_cpu);
15636      selected_tune = tmp_cpu;
15637      explicit_tune_core = selected_tune->ident;
15638
15639      selected_arch = &all_architectures[tmp_cpu->arch];
15640      explicit_arch = selected_arch->arch;
15641      return true;
15642    }
15643
15644  switch (parse_res)
15645    {
15646      case AARCH64_PARSE_MISSING_ARG:
15647	error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
15648	break;
15649      case AARCH64_PARSE_INVALID_ARG:
15650	error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
15651	aarch64_print_hint_for_core (str);
15652	break;
15653      case AARCH64_PARSE_INVALID_FEATURE:
15654	error ("invalid feature modifier %s of value (\"%s\") in "
15655	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15656	aarch64_print_hint_for_extensions (invalid_extension);
15657	break;
15658      default:
15659	gcc_unreachable ();
15660    }
15661
15662  return false;
15663}
15664
15665/* Handle the argument STR to the branch-protection= attribute.  */
15666
static bool
aarch64_handle_attr_branch_protection (const char *str)
{
15670  char *err_str = (char *) xmalloc (strlen (str) + 1);
15671  enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
15672								      &err_str);
15673  bool success = false;
15674  switch (res)
15675    {
15676     case AARCH64_PARSE_MISSING_ARG:
15677       error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
15678	      " attribute");
15679       break;
15680     case AARCH64_PARSE_INVALID_ARG:
15681       error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
15682	      "=\")%> pragma or attribute", err_str);
15683       break;
15684     case AARCH64_PARSE_OK:
15685       success = true;
15686      /* Fall through.  */
15687     case AARCH64_PARSE_INVALID_FEATURE:
15688       break;
15689     default:
15690       gcc_unreachable ();
15691    }
15692  free (err_str);
15693  return success;
15694 }
15695
15696/* Handle the argument STR to the tune= target attribute.  */
15697
15698static bool
15699aarch64_handle_attr_tune (const char *str)
15700{
15701  const struct processor *tmp_tune = NULL;
15702  enum aarch64_parse_opt_result parse_res
15703    = aarch64_parse_tune (str, &tmp_tune);
15704
15705  if (parse_res == AARCH64_PARSE_OK)
15706    {
15707      gcc_assert (tmp_tune);
15708      selected_tune = tmp_tune;
15709      explicit_tune_core = selected_tune->ident;
15710      return true;
15711    }
15712
15713  switch (parse_res)
15714    {
15715      case AARCH64_PARSE_INVALID_ARG:
15716	error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
15717	aarch64_print_hint_for_core (str);
15718	break;
15719      default:
15720	gcc_unreachable ();
15721    }
15722
15723  return false;
15724}
15725
15726/* Parse an architecture extensions target attribute string specified in STR.
15727   For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
15728   if successful.  Update aarch64_isa_flags to reflect the ISA features
15729   modified.  */
15730
15731static bool
15732aarch64_handle_attr_isa_flags (char *str)
15733{
15734  enum aarch64_parse_opt_result parse_res;
15735  uint64_t isa_flags = aarch64_isa_flags;
15736
15737  /* We allow "+nothing" in the beginning to clear out all architectural
15738     features if the user wants to handpick specific features.  */
15739  if (strncmp ("+nothing", str, 8) == 0)
15740    {
15741      isa_flags = 0;
15742      str += 8;
15743    }
15744
15745  std::string invalid_extension;
15746  parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
15747
15748  if (parse_res == AARCH64_PARSE_OK)
15749    {
15750      aarch64_isa_flags = isa_flags;
15751      return true;
15752    }
15753
15754  switch (parse_res)
15755    {
15756      case AARCH64_PARSE_MISSING_ARG:
15757	error ("missing value in %<target()%> pragma or attribute");
15758	break;
15759
15760      case AARCH64_PARSE_INVALID_FEATURE:
15761	error ("invalid feature modifier %s of value (\"%s\") in "
15762	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15763	break;
15764
15765      default:
15766	gcc_unreachable ();
15767    }
15768
  return false;
15770}
15771
15772/* The target attributes that we support.  On top of these we also support just
15773   ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
15774   handled explicitly in aarch64_process_one_target_attr.  */
15775
15776static const struct aarch64_attribute_info aarch64_attributes[] =
15777{
15778  { "general-regs-only", aarch64_attr_mask, false, NULL,
15779     OPT_mgeneral_regs_only },
15780  { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
15781     OPT_mfix_cortex_a53_835769 },
15782  { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
15783     OPT_mfix_cortex_a53_843419 },
15784  { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
15785  { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
15786  { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
15787     OPT_momit_leaf_frame_pointer },
15788  { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
15789  { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
15790     OPT_march_ },
15791  { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
15792  { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
15793     OPT_mtune_ },
15794  { "branch-protection", aarch64_attr_custom, false,
15795     aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
15796  { "sign-return-address", aarch64_attr_enum, false, NULL,
15797     OPT_msign_return_address_ },
15798  { "outline-atomics", aarch64_attr_bool, true, NULL,
15799     OPT_moutline_atomics},
15800  { NULL, aarch64_attr_custom, false, NULL, OPT____ }
15801};
15802
15803/* Parse ARG_STR which contains the definition of one target attribute.
15804   Show appropriate errors if any or return true if the attribute is valid.  */
15805
15806static bool
15807aarch64_process_one_target_attr (char *arg_str)
15808{
15809  bool invert = false;
15810
15811  size_t len = strlen (arg_str);
15812
15813  if (len == 0)
15814    {
15815      error ("malformed %<target()%> pragma or attribute");
15816      return false;
15817    }
15818
15819  char *str_to_check = (char *) alloca (len + 1);
15820  strcpy (str_to_check, arg_str);
15821
15822  /* We have something like __attribute__ ((target ("+fp+nosimd"))).
15823     It is easier to detect and handle it explicitly here rather than going
15824     through the machinery for the rest of the target attributes in this
15825     function.  */
15826  if (*str_to_check == '+')
15827    return aarch64_handle_attr_isa_flags (str_to_check);
15828
15829  if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
15830    {
15831      invert = true;
15832      str_to_check += 3;
15833    }
15834  char *arg = strchr (str_to_check, '=');
15835
15836  /* If we found opt=foo then terminate STR_TO_CHECK at the '='
15837     and point ARG to "foo".  */
15838  if (arg)
15839    {
15840      *arg = '\0';
15841      arg++;
15842    }
15843  const struct aarch64_attribute_info *p_attr;
15844  bool found = false;
15845  for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
15846    {
15847      /* If the names don't match up, or the user has given an argument
15848	 to an attribute that doesn't accept one, or didn't give an argument
15849	 to an attribute that expects one, fail to match.  */
15850      if (strcmp (str_to_check, p_attr->name) != 0)
15851	continue;
15852
15853      found = true;
15854      bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
15855			      || p_attr->attr_type == aarch64_attr_enum;
15856
15857      if (attr_need_arg_p ^ (arg != NULL))
15858	{
15859	  error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
15860	  return false;
15861	}
15862
15863      /* If the name matches but the attribute does not allow "no-" versions
15864	 then we can't match.  */
15865      if (invert && !p_attr->allow_neg)
15866	{
15867	  error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
15868	  return false;
15869	}
15870
15871      switch (p_attr->attr_type)
15872	{
15873	/* Has a custom handler registered.
15874	   For example, cpu=, arch=, tune=.  */
15875	  case aarch64_attr_custom:
15876	    gcc_assert (p_attr->handler);
15877	    if (!p_attr->handler (arg))
15878	      return false;
15879	    break;
15880
15881	  /* Either set or unset a boolean option.  */
15882	  case aarch64_attr_bool:
15883	    {
15884	      struct cl_decoded_option decoded;
15885
15886	      generate_option (p_attr->opt_num, NULL, !invert,
15887			       CL_TARGET, &decoded);
15888	      aarch64_handle_option (&global_options, &global_options_set,
15889				      &decoded, input_location);
15890	      break;
15891	    }
15892	  /* Set or unset a bit in the target_flags.  aarch64_handle_option
15893	     should know what mask to apply given the option number.  */
15894	  case aarch64_attr_mask:
15895	    {
15896	      struct cl_decoded_option decoded;
15897	      /* We only need to specify the option number.
15898		 aarch64_handle_option will know which mask to apply.  */
15899	      decoded.opt_index = p_attr->opt_num;
15900	      decoded.value = !invert;
15901	      aarch64_handle_option (&global_options, &global_options_set,
15902				      &decoded, input_location);
15903	      break;
15904	    }
15905	  /* Use the option setting machinery to set an option to an enum.  */
15906	  case aarch64_attr_enum:
15907	    {
15908	      gcc_assert (arg);
15909	      bool valid;
15910	      int value;
15911	      valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
15912					      &value, CL_TARGET);
15913	      if (valid)
15914		{
15915		  set_option (&global_options, NULL, p_attr->opt_num, value,
15916			      NULL, DK_UNSPECIFIED, input_location,
15917			      global_dc);
15918		}
15919	      else
15920		{
15921		  error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
15922		}
15923	      break;
15924	    }
15925	  default:
15926	    gcc_unreachable ();
15927	}
15928    }
15929
15930  /* If we reached here we either have found an attribute and validated
15931     it or didn't match any.  If we matched an attribute but its arguments
15932     were malformed we will have returned false already.  */
15933  return found;
15934}
15935
15936/* Count how many times the character C appears in
15937   NULL-terminated string STR.  */
15938
15939static unsigned int
15940num_occurences_in_str (char c, char *str)
15941{
15942  unsigned int res = 0;
15943  while (*str != '\0')
15944    {
15945      if (*str == c)
15946	res++;
15947
15948      str++;
15949    }
15950
15951  return res;
15952}
15953
15954/* Parse the tree in ARGS that contains the target attribute information
15955   and update the global target options space.  */
15956
15957bool
15958aarch64_process_target_attr (tree args)
15959{
15960  if (TREE_CODE (args) == TREE_LIST)
15961    {
15962      do
15963	{
15964	  tree head = TREE_VALUE (args);
15965	  if (head)
15966	    {
15967	      if (!aarch64_process_target_attr (head))
15968		return false;
15969	    }
15970	  args = TREE_CHAIN (args);
15971	} while (args);
15972
15973      return true;
15974    }
15975
15976  if (TREE_CODE (args) != STRING_CST)
15977    {
15978      error ("attribute %<target%> argument not a string");
15979      return false;
15980    }
15981
15982  size_t len = strlen (TREE_STRING_POINTER (args));
15983  char *str_to_check = (char *) alloca (len + 1);
15984  strcpy (str_to_check, TREE_STRING_POINTER (args));
15985
15986  if (len == 0)
15987    {
15988      error ("malformed %<target()%> pragma or attribute");
15989      return false;
15990    }
15991
15992  /* Used to catch empty spaces between commas i.e.
15993     attribute ((target ("attr1,,attr2"))).  */
15994  unsigned int num_commas = num_occurences_in_str (',', str_to_check);
15995
15996  /* Handle multiple target attributes separated by ','.  */
15997  char *token = strtok_r (str_to_check, ",", &str_to_check);
15998
15999  unsigned int num_attrs = 0;
16000  while (token)
16001    {
16002      num_attrs++;
16003      if (!aarch64_process_one_target_attr (token))
16004	{
16005	  error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
16006	  return false;
16007	}
16008
16009      token = strtok_r (NULL, ",", &str_to_check);
16010    }
16011
16012  if (num_attrs != num_commas + 1)
16013    {
16014      error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
16015      return false;
16016    }
16017
16018  return true;
16019}
16020
16021/* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
16022   process attribute ((target ("..."))).  */
16023
16024static bool
16025aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
16026{
16027  struct cl_target_option cur_target;
16028  bool ret;
16029  tree old_optimize;
16030  tree new_target, new_optimize;
16031  tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16032
16033  /* If what we're processing is the current pragma string then the
16034     target option node is already stored in target_option_current_node
16035     by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
16036     having to re-parse the string.  This is especially useful to keep
16037     arm_neon.h compile times down since that header contains a lot
16038     of intrinsics enclosed in pragmas.  */
16039  if (!existing_target && args == current_target_pragma)
16040    {
16041      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
16042      return true;
16043    }
16044  tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
16045
16046  old_optimize = build_optimization_node (&global_options);
16047  func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
16048
16049  /* If the function changed the optimization levels as well as setting
16050     target options, start with the optimizations specified.  */
16051  if (func_optimize && func_optimize != old_optimize)
16052    cl_optimization_restore (&global_options,
16053			     TREE_OPTIMIZATION (func_optimize));
16054
16055  /* Save the current target options to restore at the end.  */
16056  cl_target_option_save (&cur_target, &global_options);
16057
16058  /* If fndecl already has some target attributes applied to it, unpack
16059     them so that we add this attribute on top of them, rather than
16060     overwriting them.  */
16061  if (existing_target)
16062    {
16063      struct cl_target_option *existing_options
16064	= TREE_TARGET_OPTION (existing_target);
16065
16066      if (existing_options)
16067	cl_target_option_restore (&global_options, existing_options);
16068    }
16069  else
16070    cl_target_option_restore (&global_options,
16071			TREE_TARGET_OPTION (target_option_current_node));
16072
16073  ret = aarch64_process_target_attr (args);
16074
16075  /* Set up any additional state.  */
16076  if (ret)
16077    {
16078      aarch64_override_options_internal (&global_options);
16079      /* Initialize SIMD builtins if we haven't already.
16080	 Set current_target_pragma to NULL for the duration so that
16081	 the builtin initialization code doesn't try to tag the functions
16082	 being built with the attributes specified by any current pragma, thus
16083	 going into an infinite recursion.  */
16084      if (TARGET_SIMD)
16085	{
16086	  tree saved_current_target_pragma = current_target_pragma;
16087	  current_target_pragma = NULL;
16088	  aarch64_init_simd_builtins ();
16089	  current_target_pragma = saved_current_target_pragma;
16090	}
16091      new_target = build_target_option_node (&global_options);
16092    }
16093  else
16094    new_target = NULL;
16095
16096  new_optimize = build_optimization_node (&global_options);
16097
16098  if (fndecl && ret)
16099    {
16100      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
16101
16102      if (old_optimize != new_optimize)
16103	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
16104    }
16105
16106  cl_target_option_restore (&global_options, &cur_target);
16107
16108  if (old_optimize != new_optimize)
16109    cl_optimization_restore (&global_options,
16110			     TREE_OPTIMIZATION (old_optimize));
16111  return ret;
16112}
16113
16114/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
16115   tri-bool options (yes, no, don't care) and the default value is
16116   DEF, determine whether to reject inlining.  */
16117
16118static bool
16119aarch64_tribools_ok_for_inlining_p (int caller, int callee,
16120				     int dont_care, int def)
16121{
16122  /* If the callee doesn't care, always allow inlining.  */
16123  if (callee == dont_care)
16124    return true;
16125
16126  /* If the caller doesn't care, always allow inlining.  */
16127  if (caller == dont_care)
16128    return true;
16129
16130  /* Otherwise, allow inlining if either the callee and caller values
16131     agree, or if the callee is using the default value.  */
16132  return (callee == caller || callee == def);
16133}
16134
16135/* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
16136   to inline CALLEE into CALLER based on target-specific info.
16137   Make sure that the caller and callee have compatible architectural
16138   features.  Then go through the other possible target attributes
16139   and see if they can block inlining.  Try not to reject always_inline
16140   callees unless they are incompatible architecturally.  */
16141
16142static bool
16143aarch64_can_inline_p (tree caller, tree callee)
16144{
16145  tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
16146  tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
16147
16148  struct cl_target_option *caller_opts
16149	= TREE_TARGET_OPTION (caller_tree ? caller_tree
16150					   : target_option_default_node);
16151
16152  struct cl_target_option *callee_opts
16153	= TREE_TARGET_OPTION (callee_tree ? callee_tree
16154					   : target_option_default_node);
16155
16156  /* Callee's ISA flags should be a subset of the caller's.  */
16157  if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
16158       != callee_opts->x_aarch64_isa_flags)
16159    return false;
16160
16161  /* Allow non-strict aligned functions inlining into strict
16162     aligned ones.  */
16163  if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
16164       != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
16165      && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
16166	   && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
16167    return false;
16168
16169  bool always_inline = lookup_attribute ("always_inline",
16170					  DECL_ATTRIBUTES (callee));
16171
16172  /* If the architectural features match up and the callee is always_inline
16173     then the other attributes don't matter.  */
16174  if (always_inline)
16175    return true;
16176
16177  if (caller_opts->x_aarch64_cmodel_var
16178      != callee_opts->x_aarch64_cmodel_var)
16179    return false;
16180
16181  if (caller_opts->x_aarch64_tls_dialect
16182      != callee_opts->x_aarch64_tls_dialect)
16183    return false;
16184
16185  /* Honour explicit requests to workaround errata.  */
16186  if (!aarch64_tribools_ok_for_inlining_p (
16187	  caller_opts->x_aarch64_fix_a53_err835769,
16188	  callee_opts->x_aarch64_fix_a53_err835769,
16189	  2, TARGET_FIX_ERR_A53_835769_DEFAULT))
16190    return false;
16191
16192  if (!aarch64_tribools_ok_for_inlining_p (
16193	  caller_opts->x_aarch64_fix_a53_err843419,
16194	  callee_opts->x_aarch64_fix_a53_err843419,
16195	  2, TARGET_FIX_ERR_A53_843419))
16196    return false;
16197
  /* If the user explicitly specified -momit-leaf-frame-pointer for the
     caller and callee and they don't match up, reject inlining.  */
16200  if (!aarch64_tribools_ok_for_inlining_p (
16201	  caller_opts->x_flag_omit_leaf_frame_pointer,
16202	  callee_opts->x_flag_omit_leaf_frame_pointer,
16203	  2, 1))
16204    return false;
16205
16206  /* If the callee has specific tuning overrides, respect them.  */
16207  if (callee_opts->x_aarch64_override_tune_string != NULL
16208      && caller_opts->x_aarch64_override_tune_string == NULL)
16209    return false;
16210
16211  /* If the user specified tuning override strings for the
16212     caller and callee and they don't match up, reject inlining.
16213     We just do a string compare here, we don't analyze the meaning
16214     of the string, as it would be too costly for little gain.  */
16215  if (callee_opts->x_aarch64_override_tune_string
16216      && caller_opts->x_aarch64_override_tune_string
16217      && (strcmp (callee_opts->x_aarch64_override_tune_string,
16218		  caller_opts->x_aarch64_override_tune_string) != 0))
16219    return false;
16220
16221  return true;
16222}
16223
/* Return the ID of the TLSDESC ABI, initializing the descriptor if it
   hasn't been initialized already.  */
16226
16227unsigned int
16228aarch64_tlsdesc_abi_id ()
16229{
16230  predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
16231  if (!tlsdesc_abi.initialized_p ())
16232    {
16233      HARD_REG_SET full_reg_clobbers;
16234      CLEAR_HARD_REG_SET (full_reg_clobbers);
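      /* The TLSDESC call is only allowed to clobber the result register
	 R0, the condition flags and the SVE predicate registers; all other
	 registers are treated as preserved.  */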
16235      SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
16236      SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
16237      for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
16238	SET_HARD_REG_BIT (full_reg_clobbers, regno);
16239      tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
16240    }
16241  return tlsdesc_abi.id ();
16242}
16243
16244/* Return true if SYMBOL_REF X binds locally.  */
16245
16246static bool
16247aarch64_symbol_binds_local_p (const_rtx x)
16248{
16249  return (SYMBOL_REF_DECL (x)
16250	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
16251	  : SYMBOL_REF_LOCAL_P (x));
16252}
16253
/* Return true if SYMBOL_REF X is thread local.  */
16255static bool
16256aarch64_tls_symbol_p (rtx x)
16257{
16258  if (! TARGET_HAVE_TLS)
16259    return false;
16260
16261  x = strip_salt (x);
16262  if (GET_CODE (x) != SYMBOL_REF)
16263    return false;
16264
16265  return SYMBOL_REF_TLS_MODEL (x) != 0;
16266}
16267
16268/* Classify a TLS symbol into one of the TLS kinds.  */
16269enum aarch64_symbol_type
16270aarch64_classify_tls_symbol (rtx x)
16271{
16272  enum tls_model tls_kind = tls_symbolic_operand_type (x);
16273
16274  switch (tls_kind)
16275    {
16276    case TLS_MODEL_GLOBAL_DYNAMIC:
16277    case TLS_MODEL_LOCAL_DYNAMIC:
16278      return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
16279
16280    case TLS_MODEL_INITIAL_EXEC:
16281      switch (aarch64_cmodel)
16282	{
16283	case AARCH64_CMODEL_TINY:
16284	case AARCH64_CMODEL_TINY_PIC:
16285	  return SYMBOL_TINY_TLSIE;
16286	default:
16287	  return SYMBOL_SMALL_TLSIE;
16288	}
16289
16290    case TLS_MODEL_LOCAL_EXEC:
16291      if (aarch64_tls_size == 12)
16292	return SYMBOL_TLSLE12;
16293      else if (aarch64_tls_size == 24)
16294	return SYMBOL_TLSLE24;
16295      else if (aarch64_tls_size == 32)
16296	return SYMBOL_TLSLE32;
16297      else if (aarch64_tls_size == 48)
16298	return SYMBOL_TLSLE48;
16299      else
16300	gcc_unreachable ();
16301
16302    case TLS_MODEL_EMULATED:
16303    case TLS_MODEL_NONE:
16304      return SYMBOL_FORCE_TO_MEM;
16305
16306    default:
16307      gcc_unreachable ();
16308    }
16309}
16310
16311/* Return the correct method for accessing X + OFFSET, where X is either
16312   a SYMBOL_REF or LABEL_REF.  */
16313
16314enum aarch64_symbol_type
16315aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
16316{
16317  x = strip_salt (x);
16318
16319  if (GET_CODE (x) == LABEL_REF)
16320    {
16321      switch (aarch64_cmodel)
16322	{
16323	case AARCH64_CMODEL_LARGE:
16324	  return SYMBOL_FORCE_TO_MEM;
16325
16326	case AARCH64_CMODEL_TINY_PIC:
16327	case AARCH64_CMODEL_TINY:
16328	  return SYMBOL_TINY_ABSOLUTE;
16329
16330	case AARCH64_CMODEL_SMALL_SPIC:
16331	case AARCH64_CMODEL_SMALL_PIC:
16332	case AARCH64_CMODEL_SMALL:
16333	  return SYMBOL_SMALL_ABSOLUTE;
16334
16335	default:
16336	  gcc_unreachable ();
16337	}
16338    }
16339
16340  if (GET_CODE (x) == SYMBOL_REF)
16341    {
16342      if (aarch64_tls_symbol_p (x))
16343	return aarch64_classify_tls_symbol (x);
16344
16345      switch (aarch64_cmodel)
16346	{
16347	case AARCH64_CMODEL_TINY:
	  /* When we retrieve a symbol + offset address, we have to make sure
	     the offset does not cause overflow of the final address.  But we
	     have no way of knowing the address of the symbol at compile time,
	     so we can't accurately say if the distance between the PC and
	     symbol + offset is outside the addressable range of +/-1MB in the
	     TINY code model.  So we limit the maximum offset to +/-64KB and
	     assume the offset to the symbol is not larger than +/-(1MB - 64KB).
	     If offset_within_block_p is true, we allow larger offsets.
	     Furthermore, force to memory if the symbol is a weak reference to
	     something that doesn't resolve to a symbol in this module.  */
16358
16359	  if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
16360	    return SYMBOL_FORCE_TO_MEM;
16361	  if (!(IN_RANGE (offset, -0x10000, 0x10000)
16362		|| offset_within_block_p (x, offset)))
16363	    return SYMBOL_FORCE_TO_MEM;
16364
16365	  return SYMBOL_TINY_ABSOLUTE;
16366
16367	case AARCH64_CMODEL_SMALL:
16368	  /* Same reasoning as the tiny code model, but the offset cap here is
16369	     1MB, allowing +/-3.9GB for the offset to the symbol.  */
16370
16371	  if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
16372	    return SYMBOL_FORCE_TO_MEM;
16373	  if (!(IN_RANGE (offset, -0x100000, 0x100000)
16374		|| offset_within_block_p (x, offset)))
16375	    return SYMBOL_FORCE_TO_MEM;
16376
16377	  return SYMBOL_SMALL_ABSOLUTE;
16378
16379	case AARCH64_CMODEL_TINY_PIC:
16380	  if (!aarch64_symbol_binds_local_p (x))
16381	    return SYMBOL_TINY_GOT;
16382	  return SYMBOL_TINY_ABSOLUTE;
16383
16384	case AARCH64_CMODEL_SMALL_SPIC:
16385	case AARCH64_CMODEL_SMALL_PIC:
16386	  if (!aarch64_symbol_binds_local_p (x))
16387	    return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
16388		    ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
16389	  return SYMBOL_SMALL_ABSOLUTE;
16390
16391	case AARCH64_CMODEL_LARGE:
16392	  /* This is alright even in PIC code as the constant
16393	     pool reference is always PC relative and within
16394	     the same translation unit.  */
16395	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
16396	    return SYMBOL_SMALL_ABSOLUTE;
16397	  else
16398	    return SYMBOL_FORCE_TO_MEM;
16399
16400	default:
16401	  gcc_unreachable ();
16402	}
16403    }
16404
16405  /* By default push everything into the constant pool.  */
16406  return SYMBOL_FORCE_TO_MEM;
16407}
16408
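/* Return true if X is a constant that is also a legitimate DImode
   memory address.  */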
16409bool
16410aarch64_constant_address_p (rtx x)
16411{
16412  return (CONSTANT_P (x) && memory_address_p (DImode, x));
16413}
16414
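/* Return true if X is an acceptable operand in PIC code.  Symbolic
   operands (with or without a constant offset) are rejected so that
   they go through the usual PIC legitimization path.  */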
16415bool
16416aarch64_legitimate_pic_operand_p (rtx x)
16417{
16418  poly_int64 offset;
16419  x = strip_offset_and_salt (x, &offset);
16420  if (GET_CODE (x) == SYMBOL_REF)
16421    return false;
16422
16423  return true;
16424}
16425
16426/* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
16427   that should be rematerialized rather than spilled.  */
16428
16429static bool
16430aarch64_legitimate_constant_p (machine_mode mode, rtx x)
16431{
16432  /* Support CSE and rematerialization of common constants.  */
16433  if (CONST_INT_P (x)
16434      || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT))
16435    return true;
16436
16437  /* Only accept variable-length vector constants if they can be
16438     handled directly.
16439
16440     ??? It would be possible (but complex) to handle rematerialization
16441     of other constants via secondary reloads.  */
16442  if (!GET_MODE_SIZE (mode).is_constant ())
16443    return aarch64_simd_valid_immediate (x, NULL);
16444
16445  /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
16446     least be forced to memory and loaded from there.  */
16447  if (GET_CODE (x) == CONST_VECTOR)
16448    return !targetm.cannot_force_const_mem (mode, x);
16449
16450  /* Do not allow vector struct mode constants for Advanced SIMD.
16451     We could support 0 and -1 easily, but they need support in
16452     aarch64-simd.md.  */
16453  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16454  if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
16455    return false;
16456
16457  if (GET_CODE (x) == HIGH)
16458    x = XEXP (x, 0);
16459
16460  /* Accept polynomial constants that can be calculated by using the
16461     destination of a move as the sole temporary.  Constants that
16462     require a second temporary cannot be rematerialized (they can't be
16463     forced to memory and also aren't legitimate constants).  */
16464  poly_int64 offset;
16465  if (poly_int_rtx_p (x, &offset))
16466    return aarch64_offset_temporaries (false, offset) <= 1;
16467
16468  /* If an offset is being added to something else, we need to allow the
16469     base to be moved into the destination register, meaning that there
16470     are no free temporaries for the offset.  */
16471  x = strip_offset_and_salt (x, &offset);
16472  if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
16473    return false;
16474
16475  /* Do not allow const (plus (anchor_symbol, const_int)).  */
16476  if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
16477    return false;
16478
16479  /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
16480     so spilling them is better than rematerialization.  */
16481  if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
16482    return true;
16483
16484  /* Label references are always constant.  */
16485  if (GET_CODE (x) == LABEL_REF)
16486    return true;
16487
16488  return false;
16489}
16490
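/* Load the thread pointer into TARGET, or into a fresh register if
   TARGET is not a suitable Pmode register, and return the register
   that holds it.  */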
16491rtx
16492aarch64_load_tp (rtx target)
16493{
16494  if (!target
16495      || GET_MODE (target) != Pmode
16496      || !register_operand (target, Pmode))
16497    target = gen_reg_rtx (Pmode);
16498
16499  /* Can return in any reg.  */
16500  emit_insn (gen_aarch64_load_tp_hard (target));
16501  return target;
16502}
16503
16504/* On AAPCS systems, this is the "struct __va_list".  */
16505static GTY(()) tree va_list_type;
16506
16507/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
16508   Return the type to use as __builtin_va_list.
16509
16510   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
16511
16512   struct __va_list
16513   {
16514     void *__stack;
16515     void *__gr_top;
16516     void *__vr_top;
16517     int   __gr_offs;
16518     int   __vr_offs;
16519   };  */
16520
16521static tree
16522aarch64_build_builtin_va_list (void)
16523{
16524  tree va_list_name;
16525  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16526
16527  /* Create the type.  */
16528  va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
16529  /* Give it the required name.  */
16530  va_list_name = build_decl (BUILTINS_LOCATION,
16531			     TYPE_DECL,
16532			     get_identifier ("__va_list"),
16533			     va_list_type);
16534  DECL_ARTIFICIAL (va_list_name) = 1;
16535  TYPE_NAME (va_list_type) = va_list_name;
16536  TYPE_STUB_DECL (va_list_type) = va_list_name;
16537
16538  /* Create the fields.  */
16539  f_stack = build_decl (BUILTINS_LOCATION,
16540			FIELD_DECL, get_identifier ("__stack"),
16541			ptr_type_node);
16542  f_grtop = build_decl (BUILTINS_LOCATION,
16543			FIELD_DECL, get_identifier ("__gr_top"),
16544			ptr_type_node);
16545  f_vrtop = build_decl (BUILTINS_LOCATION,
16546			FIELD_DECL, get_identifier ("__vr_top"),
16547			ptr_type_node);
16548  f_groff = build_decl (BUILTINS_LOCATION,
16549			FIELD_DECL, get_identifier ("__gr_offs"),
16550			integer_type_node);
16551  f_vroff = build_decl (BUILTINS_LOCATION,
16552			FIELD_DECL, get_identifier ("__vr_offs"),
16553			integer_type_node);
16554
  /* Tell the tree-stdarg pass about our internal offset fields.
     NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
     purposes, to identify whether the code updates the va_list internal
     offset fields in an irregular way.  */
16559  va_list_gpr_counter_field = f_groff;
16560  va_list_fpr_counter_field = f_vroff;
16561
16562  DECL_ARTIFICIAL (f_stack) = 1;
16563  DECL_ARTIFICIAL (f_grtop) = 1;
16564  DECL_ARTIFICIAL (f_vrtop) = 1;
16565  DECL_ARTIFICIAL (f_groff) = 1;
16566  DECL_ARTIFICIAL (f_vroff) = 1;
16567
16568  DECL_FIELD_CONTEXT (f_stack) = va_list_type;
16569  DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
16570  DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
16571  DECL_FIELD_CONTEXT (f_groff) = va_list_type;
16572  DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
16573
16574  TYPE_FIELDS (va_list_type) = f_stack;
16575  DECL_CHAIN (f_stack) = f_grtop;
16576  DECL_CHAIN (f_grtop) = f_vrtop;
16577  DECL_CHAIN (f_vrtop) = f_groff;
16578  DECL_CHAIN (f_groff) = f_vroff;
16579
16580  /* Compute its layout.  */
16581  layout_type (va_list_type);
16582
16583  return va_list_type;
16584}
16585
16586/* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
16587static void
16588aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
16589{
16590  const CUMULATIVE_ARGS *cum;
16591  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16592  tree stack, grtop, vrtop, groff, vroff;
16593  tree t;
16594  int gr_save_area_size = cfun->va_list_gpr_size;
16595  int vr_save_area_size = cfun->va_list_fpr_size;
16596  int vr_offset;
16597
16598  cum = &crtl->args.info;
16599  if (cfun->va_list_gpr_size)
16600    gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
16601			     cfun->va_list_gpr_size);
16602  if (cfun->va_list_fpr_size)
16603    vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
16604			     * UNITS_PER_VREG, cfun->va_list_fpr_size);
16605
16606  if (!TARGET_FLOAT)
16607    {
16608      gcc_assert (cum->aapcs_nvrn == 0);
16609      vr_save_area_size = 0;
16610    }
16611
16612  f_stack = TYPE_FIELDS (va_list_type_node);
16613  f_grtop = DECL_CHAIN (f_stack);
16614  f_vrtop = DECL_CHAIN (f_grtop);
16615  f_groff = DECL_CHAIN (f_vrtop);
16616  f_vroff = DECL_CHAIN (f_groff);
16617
16618  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
16619		  NULL_TREE);
16620  grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
16621		  NULL_TREE);
16622  vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
16623		  NULL_TREE);
16624  groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
16625		  NULL_TREE);
16626  vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
16627		  NULL_TREE);
16628
16629  /* Emit code to initialize STACK, which points to the next varargs stack
16630     argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
16631     by named arguments.  STACK is 8-byte aligned.  */
16632  t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
16633  if (cum->aapcs_stack_size > 0)
16634    t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
16635  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
16636  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16637
16638  /* Emit code to initialize GRTOP, the top of the GR save area.
     virtual_incoming_args_rtx should have been 16-byte aligned.  */
16640  t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
16641  t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
16642  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16643
16644  /* Emit code to initialize VRTOP, the top of the VR save area.
16645     This address is gr_save_area_bytes below GRTOP, rounded
16646     down to the next 16-byte boundary.  */
16647  t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
16648  vr_offset = ROUND_UP (gr_save_area_size,
16649			STACK_BOUNDARY / BITS_PER_UNIT);
16650
16651  if (vr_offset)
16652    t = fold_build_pointer_plus_hwi (t, -vr_offset);
16653  t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
16654  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16655
16656  /* Emit code to initialize GROFF, the offset from GRTOP of the
16657     next GPR argument.  */
16658  t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
16659	      build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
16660  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16661
  /* Likewise emit code to initialize VROFF, the offset from VRTOP
     of the next VR argument.  */
16664  t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
16665	      build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
16666  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16667}
16668
16669/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
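/* The generated GIMPLE mirrors the AAPCS64 va_arg algorithm: read the
   relevant __gr_offs/__vr_offs field; a non-negative value means the
   argument lives in the stack overflow area.  Otherwise the offset is
   advanced by the argument's register-file footprint; if that pushes it
   past zero the stack is used after all, and if not, the argument is
   addressed relative to __gr_top/__vr_top in the register save area.  */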
16670
16671static tree
16672aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
16673			      gimple_seq *post_p ATTRIBUTE_UNUSED)
16674{
16675  tree addr;
16676  bool indirect_p;
16677  bool is_ha;		/* is HFA or HVA.  */
16678  bool dw_align;	/* double-word align.  */
16679  machine_mode ag_mode = VOIDmode;
16680  int nregs;
16681  machine_mode mode;
16682
16683  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16684  tree stack, f_top, f_off, off, arg, roundup, on_stack;
16685  HOST_WIDE_INT size, rsize, adjust, align;
16686  tree t, u, cond1, cond2;
16687
16688  indirect_p = pass_va_arg_by_reference (type);
16689  if (indirect_p)
16690    type = build_pointer_type (type);
16691
16692  mode = TYPE_MODE (type);
16693
16694  f_stack = TYPE_FIELDS (va_list_type_node);
16695  f_grtop = DECL_CHAIN (f_stack);
16696  f_vrtop = DECL_CHAIN (f_grtop);
16697  f_groff = DECL_CHAIN (f_vrtop);
16698  f_vroff = DECL_CHAIN (f_groff);
16699
16700  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
16701		  f_stack, NULL_TREE);
16702  size = int_size_in_bytes (type);
16703
16704  bool abi_break;
16705  align
16706    = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
16707
16708  dw_align = false;
16709  adjust = 0;
16710  if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
16711					       &is_ha, false))
16712    {
16713      /* No frontends can create types with variable-sized modes, so we
16714	 shouldn't be asked to pass or return them.  */
16715      unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
16716
16717      /* TYPE passed in fp/simd registers.  */
16718      if (!TARGET_FLOAT)
16719	aarch64_err_no_fpadvsimd (mode);
16720
16721      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
16722		      unshare_expr (valist), f_vrtop, NULL_TREE);
16723      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
16724		      unshare_expr (valist), f_vroff, NULL_TREE);
16725
16726      rsize = nregs * UNITS_PER_VREG;
16727
16728      if (is_ha)
16729	{
16730	  if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
16731	    adjust = UNITS_PER_VREG - ag_size;
16732	}
16733      else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16734	       && size < UNITS_PER_VREG)
16735	{
16736	  adjust = UNITS_PER_VREG - size;
16737	}
16738    }
16739  else
16740    {
16741      /* TYPE passed in general registers.  */
16742      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
16743		      unshare_expr (valist), f_grtop, NULL_TREE);
16744      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
16745		      unshare_expr (valist), f_groff, NULL_TREE);
16746      rsize = ROUND_UP (size, UNITS_PER_WORD);
16747      nregs = rsize / UNITS_PER_WORD;
16748
16749      if (align > 8)
16750	{
16751	  if (abi_break && warn_psabi)
16752	    inform (input_location, "parameter passing for argument of type "
16753		    "%qT changed in GCC 9.1", type);
16754	  dw_align = true;
16755	}
16756
16757      if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16758	  && size < UNITS_PER_WORD)
16759	{
	  adjust = UNITS_PER_WORD - size;
16761	}
16762    }
16763
16764  /* Get a local temporary for the field value.  */
16765  off = get_initialized_tmp_var (f_off, pre_p, NULL);
16766
16767  /* Emit code to branch if off >= 0.  */
16768  t = build2 (GE_EXPR, boolean_type_node, off,
16769	      build_int_cst (TREE_TYPE (off), 0));
16770  cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
16771
16772  if (dw_align)
16773    {
16774      /* Emit: offs = (offs + 15) & -16.  */
16775      t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16776		  build_int_cst (TREE_TYPE (off), 15));
16777      t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
16778		  build_int_cst (TREE_TYPE (off), -16));
16779      roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
16780    }
16781  else
16782    roundup = NULL;
16783
16784  /* Update ap.__[g|v]r_offs  */
16785  t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16786	      build_int_cst (TREE_TYPE (off), rsize));
16787  t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
16788
16789  /* String up.  */
16790  if (roundup)
16791    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16792
16793  /* [cond2] if (ap.__[g|v]r_offs > 0)  */
16794  u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
16795	      build_int_cst (TREE_TYPE (f_off), 0));
16796  cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
16797
16798  /* String up: make sure the assignment happens before the use.  */
16799  t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
16800  COND_EXPR_ELSE (cond1) = t;
16801
  /* Prepare the trees handling the argument that is passed on the stack;
     the top-level node will be stored in ON_STACK.  */
16804  arg = get_initialized_tmp_var (stack, pre_p, NULL);
16805  if (align > 8)
16806    {
16807      /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
16808      t = fold_build_pointer_plus_hwi (arg, 15);
16809      t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16810		  build_int_cst (TREE_TYPE (t), -16));
16811      roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
16812    }
16813  else
16814    roundup = NULL;
16815  /* Advance ap.__stack  */
16816  t = fold_build_pointer_plus_hwi (arg, size + 7);
16817  t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16818	      build_int_cst (TREE_TYPE (t), -8));
16819  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
16820  /* String up roundup and advance.  */
16821  if (roundup)
16822    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16823  /* String up with arg */
16824  on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
16825  /* Big-endianness related address adjustment.  */
16826  if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16827      && size < UNITS_PER_WORD)
16828  {
16829    t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
16830		size_int (UNITS_PER_WORD - size));
16831    on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
16832  }
16833
16834  COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
16835  COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
16836
16837  /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
16838  t = off;
16839  if (adjust)
16840    t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
16841		build_int_cst (TREE_TYPE (off), adjust));
16842
16843  t = fold_convert (sizetype, t);
16844  t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
16845
16846  if (is_ha)
16847    {
16848      /* type ha; // treat as "struct {ftype field[n];}"
16849         ... [computing offs]
         for (i = 0; i < nregs; ++i, offs += 16)
16851	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
16852	 return ha;  */
16853      int i;
16854      tree tmp_ha, field_t, field_ptr_t;
16855
16856      /* Declare a local variable.  */
16857      tmp_ha = create_tmp_var_raw (type, "ha");
16858      gimple_add_tmp_var (tmp_ha);
16859
16860      /* Establish the base type.  */
16861      switch (ag_mode)
16862	{
16863	case E_SFmode:
16864	  field_t = float_type_node;
16865	  field_ptr_t = float_ptr_type_node;
16866	  break;
16867	case E_DFmode:
16868	  field_t = double_type_node;
16869	  field_ptr_t = double_ptr_type_node;
16870	  break;
16871	case E_TFmode:
16872	  field_t = long_double_type_node;
16873	  field_ptr_t = long_double_ptr_type_node;
16874	  break;
16875	case E_HFmode:
16876	  field_t = aarch64_fp16_type_node;
16877	  field_ptr_t = aarch64_fp16_ptr_type_node;
16878	  break;
16879	case E_BFmode:
16880	  field_t = aarch64_bf16_type_node;
16881	  field_ptr_t = aarch64_bf16_ptr_type_node;
16882	  break;
16883	case E_V2SImode:
16884	case E_V4SImode:
16885	    {
16886	      tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
16887	      field_t = build_vector_type_for_mode (innertype, ag_mode);
16888	      field_ptr_t = build_pointer_type (field_t);
16889	    }
16890	  break;
16891	default:
16892	  gcc_assert (0);
16893	}
16894
      /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area)  */
16896      tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
16897      addr = t;
16898      t = fold_convert (field_ptr_t, addr);
16899      t = build2 (MODIFY_EXPR, field_t,
16900		  build1 (INDIRECT_REF, field_t, tmp_ha),
16901		  build1 (INDIRECT_REF, field_t, t));
16902
16903      /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
16904      for (i = 1; i < nregs; ++i)
16905	{
16906	  addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
16907	  u = fold_convert (field_ptr_t, addr);
16908	  u = build2 (MODIFY_EXPR, field_t,
16909		      build2 (MEM_REF, field_t, tmp_ha,
16910			      build_int_cst (field_ptr_t,
16911					     (i *
16912					      int_size_in_bytes (field_t)))),
16913		      build1 (INDIRECT_REF, field_t, u));
16914	  t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
16915	}
16916
16917      u = fold_convert (TREE_TYPE (f_top), tmp_ha);
16918      t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
16919    }
16920
16921  COND_EXPR_ELSE (cond2) = t;
16922  addr = fold_convert (build_pointer_type (type), cond1);
16923  addr = build_va_arg_indirect_ref (addr);
16924
16925  if (indirect_p)
16926    addr = build_va_arg_indirect_ref (addr);
16927
16928  return addr;
16929}
16930
16931/* Implement TARGET_SETUP_INCOMING_VARARGS.  */
16932
16933static void
16934aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
16935				const function_arg_info &arg,
16936				int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
16937{
16938  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
16939  CUMULATIVE_ARGS local_cum;
16940  int gr_saved = cfun->va_list_gpr_size;
16941  int vr_saved = cfun->va_list_fpr_size;
16942
16943  /* The caller has advanced CUM up to, but not beyond, the last named
16944     argument.  Advance a local copy of CUM past the last "real" named
16945     argument, to find out how many registers are left over.  */
16946  local_cum = *cum;
  aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
16948
  /* Find out how many registers we need to save.
     Honor the tree-stdarg analysis results.  */
16951  if (cfun->va_list_gpr_size)
16952    gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
16953		    cfun->va_list_gpr_size / UNITS_PER_WORD);
16954  if (cfun->va_list_fpr_size)
16955    vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
16956		    cfun->va_list_fpr_size / UNITS_PER_VREG);
16957
16958  if (!TARGET_FLOAT)
16959    {
16960      gcc_assert (local_cum.aapcs_nvrn == 0);
16961      vr_saved = 0;
16962    }
16963
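  /* The varargs register save area is laid out immediately below
     virtual_incoming_args_rtx: first the GR save area (gr_saved * 8
     bytes, padded to a 16-byte boundary, with __gr_top pointing at its
     end), then the VR save area (vr_saved * 16 bytes, with __vr_top
     pointing at its end).  */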
16964  if (!no_rtl)
16965    {
16966      if (gr_saved > 0)
16967	{
16968	  rtx ptr, mem;
16969
16970	  /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
16971	  ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
16972			       - gr_saved * UNITS_PER_WORD);
16973	  mem = gen_frame_mem (BLKmode, ptr);
16974	  set_mem_alias_set (mem, get_varargs_alias_set ());
16975
16976	  move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
16977			       mem, gr_saved);
16978	}
16979      if (vr_saved > 0)
16980	{
16981	  /* We can't use move_block_from_reg, because it will use
16982	     the wrong mode, storing D regs only.  */
16983	  machine_mode mode = TImode;
16984	  int off, i, vr_start;
16985
16986	  /* Set OFF to the offset from virtual_incoming_args_rtx of
16987	     the first vector register.  The VR save area lies below
16988	     the GR one, and is aligned to 16 bytes.  */
16989	  off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
16990			   STACK_BOUNDARY / BITS_PER_UNIT);
16991	  off -= vr_saved * UNITS_PER_VREG;
16992
16993	  vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
16994	  for (i = 0; i < vr_saved; ++i)
16995	    {
16996	      rtx ptr, mem;
16997
16998	      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
16999	      mem = gen_frame_mem (mode, ptr);
17000	      set_mem_alias_set (mem, get_varargs_alias_set ());
17001	      aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
17002	      off += UNITS_PER_VREG;
17003	    }
17004	}
17005    }
17006
17007  /* We don't save the size into *PRETEND_SIZE because we want to avoid
17008     any complication of having crtl->args.pretend_args_size changed.  */
17009  cfun->machine->frame.saved_varargs_size
17010    = (ROUND_UP (gr_saved * UNITS_PER_WORD,
17011		 STACK_BOUNDARY / BITS_PER_UNIT)
17012       + vr_saved * UNITS_PER_VREG);
17013}
17014
17015static void
17016aarch64_conditional_register_usage (void)
17017{
17018  int i;
17019  if (!TARGET_FLOAT)
17020    {
17021      for (i = V0_REGNUM; i <= V31_REGNUM; i++)
17022	{
17023	  fixed_regs[i] = 1;
17024	  call_used_regs[i] = 1;
17025	}
17026    }
17027  if (!TARGET_SVE)
17028    for (i = P0_REGNUM; i <= P15_REGNUM; i++)
17029      {
17030	fixed_regs[i] = 1;
17031	call_used_regs[i] = 1;
17032      }
17033
17034  /* Only allow the FFR and FFRT to be accessed via special patterns.  */
17035  CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
17036  CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
17037
17038  /* When tracking speculation, we need a couple of call-clobbered registers
17039     to track the speculation state.  It would be nice to just use
17040     IP0 and IP1, but currently there are numerous places that just
     assume these registers are free for other uses (e.g. pointer
17042     authentication).  */
17043  if (aarch64_track_speculation)
17044    {
17045      fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
17046      call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
17047      fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
17048      call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
17049    }
17050}
17051
17052/* Implement TARGET_MEMBER_TYPE_FORCES_BLK.  */
17053
17054bool
17055aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
17056{
17057  /* For records we're passed a FIELD_DECL, for arrays we're passed
17058     an ARRAY_TYPE.  In both cases we're interested in the TREE_TYPE.  */
17059  const_tree type = TREE_TYPE (field_or_array);
17060
17061  /* Assign BLKmode to anything that contains multiple SVE predicates.
17062     For structures, the "multiple" case is indicated by MODE being
17063     VOIDmode.  */
17064  unsigned int num_zr, num_pr;
17065  if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
17066    {
17067      if (TREE_CODE (field_or_array) == ARRAY_TYPE)
17068	return !simple_cst_equal (TYPE_SIZE (field_or_array),
17069				  TYPE_SIZE (type));
17070      return mode == VOIDmode;
17071    }
17072
17073  return default_member_type_forces_blk (field_or_array, mode);
17074}
17075
17076/* Bitmasks that indicate whether earlier versions of GCC would have
17077   taken a different path through the ABI logic.  This should result in
17078   a -Wpsabi warning if the earlier path led to a different ABI decision.
17079
17080   WARN_PSABI_EMPTY_CXX17_BASE
17081      Indicates that the type includes an artificial empty C++17 base field
17082      that, prior to GCC 10.1, would prevent the type from being treated as
17083      a HFA or HVA.  See PR94383 for details.
17084
17085   WARN_PSABI_NO_UNIQUE_ADDRESS
17086      Indicates that the type includes an empty [[no_unique_address]] field
17087      that, prior to GCC 10.1, would prevent the type from being treated as
17088      a HFA or HVA.  */
17089const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
17090const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
17091
17092/* Walk down the type tree of TYPE counting consecutive base elements.
17093   If *MODEP is VOIDmode, then set it to the first valid floating point
17094   type.  If a non-floating point type is found, or if a floating point
17095   type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
17096   otherwise return the count in the sub-tree.
17097
17098   The WARN_PSABI_FLAGS argument allows the caller to check whether this
17099   function has changed its behavior relative to earlier versions of GCC.
17100   Normally the argument should be nonnull and point to a zero-initialized
17101   variable.  The function then records whether the ABI decision might
17102   be affected by a known fix to the ABI logic, setting the associated
17103   WARN_PSABI_* bits if so.
17104
17105   When the argument is instead a null pointer, the function tries to
17106   simulate the behavior of GCC before all such ABI fixes were made.
17107   This is useful to check whether the function returns something
17108   different after the ABI fixes.  */
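/* For example, under this scheme "struct { double d[3]; }" yields a
   count of 3 with *MODEP set to DFmode, whereas "struct { float f;
   double d; }" yields -1 because its base types differ.  */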
17109static int
17110aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
17111			 unsigned int *warn_psabi_flags)
17112{
17113  machine_mode mode;
17114  HOST_WIDE_INT size;
17115
17116  if (aarch64_sve::builtin_type_p (type))
17117    return -1;
17118
17119  switch (TREE_CODE (type))
17120    {
17121    case REAL_TYPE:
17122      mode = TYPE_MODE (type);
17123      if (mode != DFmode && mode != SFmode
17124	  && mode != TFmode && mode != HFmode)
17125	return -1;
17126
17127      if (*modep == VOIDmode)
17128	*modep = mode;
17129
17130      if (*modep == mode)
17131	return 1;
17132
17133      break;
17134
17135    case COMPLEX_TYPE:
17136      mode = TYPE_MODE (TREE_TYPE (type));
17137      if (mode != DFmode && mode != SFmode
17138	  && mode != TFmode && mode != HFmode)
17139	return -1;
17140
17141      if (*modep == VOIDmode)
17142	*modep = mode;
17143
17144      if (*modep == mode)
17145	return 2;
17146
17147      break;
17148
17149    case VECTOR_TYPE:
17150      /* Use V2SImode and V4SImode as representatives of all 64-bit
17151	 and 128-bit vector types.  */
17152      size = int_size_in_bytes (type);
17153      switch (size)
17154	{
17155	case 8:
17156	  mode = V2SImode;
17157	  break;
17158	case 16:
17159	  mode = V4SImode;
17160	  break;
17161	default:
17162	  return -1;
17163	}
17164
17165      if (*modep == VOIDmode)
17166	*modep = mode;
17167
17168      /* Vector modes are considered to be opaque: two vectors are
17169	 equivalent for the purposes of being homogeneous aggregates
17170	 if they are the same size.  */
17171      if (*modep == mode)
17172	return 1;
17173
17174      break;
17175
17176    case ARRAY_TYPE:
17177      {
17178	int count;
17179	tree index = TYPE_DOMAIN (type);
17180
17181	/* Can't handle incomplete types nor sizes that are not
17182	   fixed.  */
17183	if (!COMPLETE_TYPE_P (type)
17184	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
17185	  return -1;
17186
17187	count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
17188					 warn_psabi_flags);
17189	if (count == -1
17190	    || !index
17191	    || !TYPE_MAX_VALUE (index)
17192	    || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
17193	    || !TYPE_MIN_VALUE (index)
17194	    || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
17195	    || count < 0)
17196	  return -1;
17197
17198	count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
17199		      - tree_to_uhwi (TYPE_MIN_VALUE (index)));
17200
17201	/* There must be no padding.  */
17202	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
17203		      count * GET_MODE_BITSIZE (*modep)))
17204	  return -1;
17205
17206	return count;
17207      }
17208
17209    case RECORD_TYPE:
17210      {
17211	int count = 0;
17212	int sub_count;
17213	tree field;
17214
17215	/* Can't handle incomplete types nor sizes that are not
17216	   fixed.  */
17217	if (!COMPLETE_TYPE_P (type)
17218	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
17219	  return -1;
17220
17221	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
17222	  {
17223	    if (TREE_CODE (field) != FIELD_DECL)
17224	      continue;
17225
17226	    if (DECL_FIELD_ABI_IGNORED (field))
17227	      {
17228		/* See whether this is something that earlier versions of
17229		   GCC failed to ignore.  */
17230		unsigned int flag;
17231		if (lookup_attribute ("no_unique_address",
17232				      DECL_ATTRIBUTES (field)))
17233		  flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
17234		else if (cxx17_empty_base_field_p (field))
17235		  flag = WARN_PSABI_EMPTY_CXX17_BASE;
17236		else
17237		  /* No compatibility problem.  */
17238		  continue;
17239
17240		/* Simulate the old behavior when WARN_PSABI_FLAGS is null.  */
17241		if (warn_psabi_flags)
17242		  {
17243		    *warn_psabi_flags |= flag;
17244		    continue;
17245		  }
17246	      }
17247
17248	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
17249						 warn_psabi_flags);
17250	    if (sub_count < 0)
17251	      return -1;
17252	    count += sub_count;
17253	  }
17254
17255	/* There must be no padding.  */
17256	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
17257		      count * GET_MODE_BITSIZE (*modep)))
17258	  return -1;
17259
17260	return count;
17261      }
17262
17263    case UNION_TYPE:
17264    case QUAL_UNION_TYPE:
17265      {
17266	/* These aren't very interesting except in a degenerate case.  */
17267	int count = 0;
17268	int sub_count;
17269	tree field;
17270
17271	/* Can't handle incomplete types nor sizes that are not
17272	   fixed.  */
17273	if (!COMPLETE_TYPE_P (type)
17274	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
17275	  return -1;
17276
17277	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
17278	  {
17279	    if (TREE_CODE (field) != FIELD_DECL)
17280	      continue;
17281
17282	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
17283						 warn_psabi_flags);
17284	    if (sub_count < 0)
17285	      return -1;
17286	    count = count > sub_count ? count : sub_count;
17287	  }
17288
17289	/* There must be no padding.  */
17290	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
17291		      count * GET_MODE_BITSIZE (*modep)))
17292	  return -1;
17293
17294	return count;
17295      }
17296
17297    default:
17298      break;
17299    }
17300
17301  return -1;
17302}
17303
17304/* Return TRUE if the type, as described by TYPE and MODE, is a short vector
17305   type as described in AAPCS64 \S 4.1.2.
17306
17307   See the comment above aarch64_composite_type_p for the notes on MODE.  */
17308
17309static bool
17310aarch64_short_vector_p (const_tree type,
17311			machine_mode mode)
17312{
17313  poly_int64 size = -1;
17314
17315  if (type && TREE_CODE (type) == VECTOR_TYPE)
17316    {
17317      if (aarch64_sve::builtin_type_p (type))
17318	return false;
17319      size = int_size_in_bytes (type);
17320    }
17321  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
17322	   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
17323    {
17324      /* Rely only on the type, not the mode, when processing SVE types.  */
17325      if (type && aarch64_some_values_include_pst_objects_p (type))
17326	gcc_assert (aarch64_sve_mode_p (mode));
17327      else
17328	size = GET_MODE_SIZE (mode);
17329    }
17330  if (known_eq (size, 8) || known_eq (size, 16))
17331    {
17332      /* 64-bit and 128-bit vectors should only acquire an SVE mode if
17333	 they are being treated as scalable AAPCS64 types.  */
17334      gcc_assert (!aarch64_sve_mode_p (mode));
17335      return true;
17336    }
17337  return false;
17338}
17339
17340/* Return TRUE if the type, as described by TYPE and MODE, is a composite
17341   type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
17342   array types.  The C99 floating-point complex types are also considered
17343   as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
17344   types, which are GCC extensions and out of the scope of AAPCS64, are
17345   treated as composite types here as well.
17346
17347   Note that MODE itself is not sufficient in determining whether a type
17348   is such a composite type or not.  This is because
17349   stor-layout.c:compute_record_mode may have already changed the MODE
17350   (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
17351   structure with only one field may have its MODE set to the mode of the
   field.  Also an integer mode whose size matches the size of the
   RECORD_TYPE may be substituted for the original mode
   (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
17355   solely relied on.  */
17356
17357static bool
17358aarch64_composite_type_p (const_tree type,
17359			  machine_mode mode)
17360{
17361  if (aarch64_short_vector_p (type, mode))
17362    return false;
17363
17364  if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
17365    return true;
17366
17367  if (mode == BLKmode
17368      || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
17369      || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
17370    return true;
17371
17372  return false;
17373}
17374
17375/* Return TRUE if an argument, whose type is described by TYPE and MODE,
17376   shall be passed or returned in simd/fp register(s) (providing these
17377   parameter passing registers are available).
17378
17379   Upon successful return, *COUNT returns the number of needed registers,
   *BASE_MODE returns the mode of the individual register and, when IS_HA
   is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
17382   floating-point aggregate or a homogeneous short-vector aggregate.
17383
17384   SILENT_P is true if the function should refrain from reporting any
17385   diagnostics.  This should only be used if the caller is certain that
17386   any ABI decisions would eventually come through this function with
17387   SILENT_P set to false.  */
17388
17389static bool
17390aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
17391					 const_tree type,
17392					 machine_mode *base_mode,
17393					 int *count,
17394					 bool *is_ha,
17395					 bool silent_p)
17396{
17397  if (is_ha != NULL) *is_ha = false;
17398
17399  machine_mode new_mode = VOIDmode;
17400  bool composite_p = aarch64_composite_type_p (type, mode);
17401
17402  if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
17403      || aarch64_short_vector_p (type, mode))
17404    {
17405      *count = 1;
17406      new_mode = mode;
17407    }
17408  else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
17409    {
17410      if (is_ha != NULL) *is_ha = true;
17411      *count = 2;
17412      new_mode = GET_MODE_INNER (mode);
17413    }
17414  else if (type && composite_p)
17415    {
17416      unsigned int warn_psabi_flags = 0;
17417      int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
17418					      &warn_psabi_flags);
17419      if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
17420	{
17421	  static unsigned last_reported_type_uid;
17422	  unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
17423	  int alt;
17424	  if (!silent_p
17425	      && warn_psabi
17426	      && warn_psabi_flags
17427	      && uid != last_reported_type_uid
17428	      && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
17429		  != ag_count))
17430	    {
17431	      const char *url
17432		= CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
17433	      gcc_assert (alt == -1);
17434	      last_reported_type_uid = uid;
17435	      /* Use TYPE_MAIN_VARIANT to strip any redundant const
17436		 qualification.  */
17437	      if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
17438		inform (input_location, "parameter passing for argument of "
17439			"type %qT with %<[[no_unique_address]]%> members "
17440			"changed %{in GCC 10.1%}",
17441			TYPE_MAIN_VARIANT (type), url);
17442	      else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
17443		inform (input_location, "parameter passing for argument of "
17444			"type %qT when C++17 is enabled changed to match "
17445			"C++14 %{in GCC 10.1%}",
17446			TYPE_MAIN_VARIANT (type), url);
17447	    }
17448
17449	  if (is_ha != NULL) *is_ha = true;
17450	  *count = ag_count;
17451	}
17452      else
17453	return false;
17454    }
17455  else
17456    return false;
17457
17458  gcc_assert (!aarch64_sve_mode_p (new_mode));
17459  *base_mode = new_mode;
17460  return true;
17461}
17462
17463/* Implement TARGET_STRUCT_VALUE_RTX.  */
17464
17465static rtx
17466aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
17467			  int incoming ATTRIBUTE_UNUSED)
17468{
17469  return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
17470}
17471
/* Implement TARGET_VECTOR_MODE_SUPPORTED_P.  */
17473static bool
17474aarch64_vector_mode_supported_p (machine_mode mode)
17475{
17476  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17477  return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
17478}
17479
17480/* Return the full-width SVE vector mode for element mode MODE, if one
17481   exists.  */
17482opt_machine_mode
17483aarch64_full_sve_mode (scalar_mode mode)
17484{
17485  switch (mode)
17486    {
17487    case E_DFmode:
17488      return VNx2DFmode;
17489    case E_SFmode:
17490      return VNx4SFmode;
17491    case E_HFmode:
17492      return VNx8HFmode;
17493    case E_BFmode:
17494      return VNx8BFmode;
17495    case E_DImode:
17496      return VNx2DImode;
17497    case E_SImode:
17498      return VNx4SImode;
17499    case E_HImode:
17500      return VNx8HImode;
17501    case E_QImode:
17502      return VNx16QImode;
17503    default:
17504      return opt_machine_mode ();
17505    }
17506}
17507
17508/* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
17509   if it exists.  */
17510opt_machine_mode
17511aarch64_vq_mode (scalar_mode mode)
17512{
17513  switch (mode)
17514    {
17515    case E_DFmode:
17516      return V2DFmode;
17517    case E_SFmode:
17518      return V4SFmode;
17519    case E_HFmode:
17520      return V8HFmode;
17521    case E_BFmode:
17522      return V8BFmode;
17523    case E_SImode:
17524      return V4SImode;
17525    case E_HImode:
17526      return V8HImode;
17527    case E_QImode:
17528      return V16QImode;
17529    case E_DImode:
17530      return V2DImode;
17531    default:
17532      return opt_machine_mode ();
17533    }
17534}
17535
/* Return the appropriate SIMD container mode
   for MODE within a vector of WIDTH bits.  */
17538static machine_mode
17539aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
17540{
17541  if (TARGET_SVE
17542      && maybe_ne (width, 128)
17543      && known_eq (width, BITS_PER_SVE_VECTOR))
17544    return aarch64_full_sve_mode (mode).else_mode (word_mode);
17545
17546  gcc_assert (known_eq (width, 64) || known_eq (width, 128));
17547  if (TARGET_SIMD)
17548    {
17549      if (known_eq (width, 128))
17550	return aarch64_vq_mode (mode).else_mode (word_mode);
17551      else
17552	switch (mode)
17553	  {
17554	  case E_SFmode:
17555	    return V2SFmode;
17556	  case E_HFmode:
17557	    return V4HFmode;
17558	  case E_BFmode:
17559	    return V4BFmode;
17560	  case E_SImode:
17561	    return V2SImode;
17562	  case E_HImode:
17563	    return V4HImode;
17564	  case E_QImode:
17565	    return V8QImode;
17566	  default:
17567	    break;
17568	  }
17569    }
17570  return word_mode;
17571}
17572
17573static HOST_WIDE_INT aarch64_estimated_poly_value (poly_int64);
17574
17575/* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
17576   and return whether the SVE mode should be preferred over the
17577   Advanced SIMD one in aarch64_autovectorize_vector_modes.  */
17578static bool
17579aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
17580{
17581  /* Take into account the aarch64-autovec-preference param if non-zero.  */
17582  bool only_asimd_p = aarch64_autovec_preference == 1;
17583  bool only_sve_p = aarch64_autovec_preference == 2;
17584
17585  if (only_asimd_p)
17586    return false;
17587  if (only_sve_p)
17588    return true;
17589
17590  /* The preference in case of a tie in costs.  */
17591  bool prefer_asimd = aarch64_autovec_preference == 3;
17592  bool prefer_sve = aarch64_autovec_preference == 4;
17593
17594  aarch64_sve_vector_bits_enum tune_width = aarch64_tune_params.sve_width;
17595
17596  poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
17597  poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
  /* If the CPU information does not have an SVE width registered, use the
     generic poly_int comparison that prefers SVE.  If a preference is
     explicitly requested, avoid this path.  */
17601  if (tune_width == SVE_SCALABLE
17602      && !prefer_asimd
17603      && !prefer_sve)
17604    return maybe_gt (nunits_sve, nunits_asimd);
17605
17606  /* Otherwise estimate the runtime width of the modes involved.  */
17607  HOST_WIDE_INT est_sve = aarch64_estimated_poly_value (nunits_sve);
17608  HOST_WIDE_INT est_asimd = aarch64_estimated_poly_value (nunits_asimd);
17609
17610  /* Preferring SVE means picking it first unless the Advanced SIMD mode
17611     is clearly wider.  */
17612  if (prefer_sve)
17613    return est_sve >= est_asimd;
17614  /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
17615     is clearly wider.  */
17616  if (prefer_asimd)
17617    return est_sve > est_asimd;
17618
17619  /* In the default case prefer Advanced SIMD over SVE in case of a tie.  */
17620  return est_sve > est_asimd;
17621}
17622
17623/* Return 128-bit container as the preferred SIMD mode for MODE.  */
17624static machine_mode
17625aarch64_preferred_simd_mode (scalar_mode mode)
17626{
17627  /* Take into account explicit auto-vectorization ISA preferences through
17628     aarch64_cmp_autovec_modes.  */
17629  if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
17630    return aarch64_full_sve_mode (mode).else_mode (word_mode);
17631  if (TARGET_SIMD)
17632    return aarch64_vq_mode (mode).else_mode (word_mode);
17633  return word_mode;
17634}
17635
17636/* Return a list of possible vector sizes for the vectorizer
17637   to iterate over.  */
17638static unsigned int
17639aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
17640{
17641  static const machine_mode sve_modes[] = {
17642    /* Try using full vectors for all element types.  */
17643    VNx16QImode,
17644
17645    /* Try using 16-bit containers for 8-bit elements and full vectors
17646       for wider elements.  */
17647    VNx8QImode,
17648
17649    /* Try using 32-bit containers for 8-bit and 16-bit elements and
17650       full vectors for wider elements.  */
17651    VNx4QImode,
17652
17653    /* Try using 64-bit containers for all element types.  */
17654    VNx2QImode
17655  };
17656
17657  static const machine_mode advsimd_modes[] = {
17658    /* Try using 128-bit vectors for all element types.  */
17659    V16QImode,
17660
17661    /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
17662       for wider elements.  */
17663    V8QImode,
17664
17665    /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
17666       for wider elements.
17667
17668       TODO: We could support a limited form of V4QImode too, so that
17669       we use 32-bit vectors for 8-bit elements.  */
17670    V4HImode,
17671
17672    /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
17673       for 64-bit elements.
17674
17675       TODO: We could similarly support limited forms of V2QImode and V2HImode
17676       for this case.  */
17677    V2SImode
17678  };
17679
17680  /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
17681     This is because:
17682
17683     - If we can't use N-byte Advanced SIMD vectors then the placement
17684       doesn't matter; we'll just continue as though the Advanced SIMD
17685       entry didn't exist.
17686
17687     - If an SVE main loop with N bytes ends up being cheaper than an
17688       Advanced SIMD main loop with N bytes then by default we'll replace
17689       the Advanced SIMD version with the SVE one.
17690
17691     - If an Advanced SIMD main loop with N bytes ends up being cheaper
17692       than an SVE main loop with N bytes then by default we'll try to
17693       use the SVE loop to vectorize the epilogue instead.  */
17694
17695  bool only_asimd_p = aarch64_autovec_preference == 1;
17696  bool only_sve_p = aarch64_autovec_preference == 2;
17697
17698  unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
17699  unsigned int advsimd_i = 0;
17700
17701  while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
17702    {
17703      if (sve_i < ARRAY_SIZE (sve_modes)
17704	  && aarch64_cmp_autovec_modes (sve_modes[sve_i],
17705					advsimd_modes[advsimd_i]))
17706	modes->safe_push (sve_modes[sve_i++]);
17707      else
17708	modes->safe_push (advsimd_modes[advsimd_i++]);
17709    }
17710  while (sve_i < ARRAY_SIZE (sve_modes))
    modes->safe_push (sve_modes[sve_i++]);
17712
17713  unsigned int flags = 0;
17714  /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
17715     can compare SVE against Advanced SIMD and so that we can compare
17716     multiple SVE vectorization approaches against each other.  There's
17717     not really any point doing this for Advanced SIMD only, since the
17718     first mode that works should always be the best.  */
17719  if (TARGET_SVE && aarch64_sve_compare_costs)
17720    flags |= VECT_COMPARE_COSTS;
17721  return flags;
17722}
17723
17724/* Implement TARGET_MANGLE_TYPE.  */
17725
17726static const char *
17727aarch64_mangle_type (const_tree type)
17728{
17729  /* The AArch64 ABI documents say that "__va_list" has to be
17730     mangled as if it is in the "std" namespace.  */
17731  if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
17732    return "St9__va_list";
17733
17734  /* Half-precision floating point types.  */
17735  if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
17736    {
17737      if (TYPE_MODE (type) == BFmode)
17738	return "u6__bf16";
17739      else
17740	return "Dh";
17741    }
17742
17743  /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
17744     builtin types.  */
17745  if (TYPE_NAME (type) != NULL)
17746    {
17747      const char *res;
17748      if ((res = aarch64_general_mangle_builtin_type (type))
17749	  || (res = aarch64_sve::mangle_builtin_type (type)))
17750	return res;
17751    }
17752
17753  /* Use the default mangling.  */
17754  return NULL;
17755}
17756
17757/* Implement TARGET_VERIFY_TYPE_CONTEXT.  */
17758
17759static bool
17760aarch64_verify_type_context (location_t loc, type_context_kind context,
17761			     const_tree type, bool silent_p)
17762{
17763  return aarch64_sve::verify_type_context (loc, context, type, silent_p);
17764}
17765
17766/* Find the first rtx_insn before insn that will generate an assembly
17767   instruction.  */
17768
17769static rtx_insn *
17770aarch64_prev_real_insn (rtx_insn *insn)
17771{
17772  if (!insn)
17773    return NULL;
17774
17775  do
17776    {
17777      insn = prev_real_insn (insn);
17778    }
17779  while (insn && recog_memoized (insn) < 0);
17780
17781  return insn;
17782}
17783
17784static bool
17785is_madd_op (enum attr_type t1)
17786{
17787  unsigned int i;
17788  /* A number of these may be AArch32 only.  */
17789  enum attr_type mlatypes[] = {
17790    TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
17791    TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
    TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
17793  };
17794
17795  for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
17796    {
17797      if (t1 == mlatypes[i])
17798	return true;
17799    }
17800
17801  return false;
17802}
17803
17804/* Check if there is a register dependency between a load and the insn
17805   for which we hold recog_data.  */
17806
17807static bool
17808dep_between_memop_and_curr (rtx memop)
17809{
17810  rtx load_reg;
17811  int opno;
17812
17813  gcc_assert (GET_CODE (memop) == SET);
17814
17815  if (!REG_P (SET_DEST (memop)))
17816    return false;
17817
17818  load_reg = SET_DEST (memop);
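  /* Operand 0 of the insn we hold recog_data for is its destination,
     so only the remaining (input) operands can consume the loaded
     value.  */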
17819  for (opno = 1; opno < recog_data.n_operands; opno++)
17820    {
17821      rtx operand = recog_data.operand[opno];
17822      if (REG_P (operand)
17823          && reg_overlap_mentioned_p (load_reg, operand))
17824        return true;
17826    }
17827  return false;
17828}
17829
17830
17831/* When working around the Cortex-A53 erratum 835769,
17832   given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
17833   instruction and has a preceding memory instruction such that a NOP
17834   should be inserted between them.  */
17835
17836bool
17837aarch64_madd_needs_nop (rtx_insn* insn)
17838{
17839  enum attr_type attr_type;
17840  rtx_insn *prev;
17841  rtx body;
17842
17843  if (!TARGET_FIX_ERR_A53_835769)
17844    return false;
17845
17846  if (!INSN_P (insn) || recog_memoized (insn) < 0)
17847    return false;
17848
17849  attr_type = get_attr_type (insn);
17850  if (!is_madd_op (attr_type))
17851    return false;
17852
17853  prev = aarch64_prev_real_insn (insn);
17854  /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
17855     Restore recog state to INSN to avoid state corruption.  */
17856  extract_constrain_insn_cached (insn);
17857
17858  if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
17859    return false;
17860
17861  body = single_set (prev);
17862
17863  /* If the previous insn is a memory op and there is no dependency between
17864     it and the DImode madd, emit a NOP between them.  If body is NULL then we
17865     have a complex memory operation, probably a load/store pair.
17866     Be conservative for now and emit a NOP.  */
17867  if (GET_MODE (recog_data.operand[0]) == DImode
17868      && (!body || !dep_between_memop_and_curr (body)))
17869    return true;
17870
17871  return false;
17873}
17874
17875
17876/* Implement FINAL_PRESCAN_INSN.  */
17877
17878void
17879aarch64_final_prescan_insn (rtx_insn *insn)
17880{
17881  if (aarch64_madd_needs_nop (insn))
17882    fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
17883}
17884
17885
17886/* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
17887   instruction.  */
17888
17889bool
17890aarch64_sve_index_immediate_p (rtx base_or_step)
17891{
17892  return (CONST_INT_P (base_or_step)
17893	  && IN_RANGE (INTVAL (base_or_step), -16, 15));
17894}
17895
17896/* Return true if X is a valid immediate for the SVE ADD and SUB instructions
17897   when applied to mode MODE.  Negate X first if NEGATE_P is true.  */
17898
17899bool
17900aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
17901{
17902  rtx elt = unwrap_const_vec_duplicate (x);
17903  if (!CONST_INT_P (elt))
17904    return false;
17905
17906  HOST_WIDE_INT val = INTVAL (elt);
17907  if (negate_p)
17908    val = -val;
17909  val &= GET_MODE_MASK (GET_MODE_INNER (mode));
17910
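  /* The instructions accept an unsigned 8-bit immediate, optionally
     shifted left by 8: either a value in [0, 0xff] or a multiple of
     256 in [0, 0xff00].  */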
17911  if (val & 0xff)
17912    return IN_RANGE (val, 0, 0xff);
17913  return IN_RANGE (val, 0, 0xff00);
17914}
17915
17916/* Return true if X is a valid immediate for the SVE SQADD and SQSUB
17917   instructions when applied to mode MODE.  Negate X first if NEGATE_P
17918   is true.  */
17919
17920bool
17921aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
17922{
17923  if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
17924    return false;
17925
17926  /* After the optional negation, the immediate must be nonnegative.
17927     E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
17928     instead of SQADD Zn.B, Zn.B, #129.  */
17929  rtx elt = unwrap_const_vec_duplicate (x);
17930  return negate_p == (INTVAL (elt) < 0);
17931}
17932
17933/* Return true if X is a valid immediate operand for an SVE logical
17934   instruction such as AND.  */
17935
17936bool
17937aarch64_sve_bitmask_immediate_p (rtx x)
17938{
17939  rtx elt;
17940
17941  return (const_vec_duplicate_p (x, &elt)
17942	  && CONST_INT_P (elt)
17943	  && aarch64_bitmask_imm (INTVAL (elt),
17944				  GET_MODE_INNER (GET_MODE (x))));
17945}
17946
17947/* Return true if X is a valid immediate for the SVE DUP and CPY
17948   instructions.  */
17949
17950bool
17951aarch64_sve_dup_immediate_p (rtx x)
17952{
17953  x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
17954  if (!CONST_INT_P (x))
17955    return false;
17956
17957  HOST_WIDE_INT val = INTVAL (x);
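  /* The immediate must be a signed 8-bit value, optionally shifted
     left by 8.  */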
17958  if (val & 0xff)
17959    return IN_RANGE (val, -0x80, 0x7f);
17960  return IN_RANGE (val, -0x8000, 0x7f00);
17961}
17962
17963/* Return true if X is a valid immediate operand for an SVE CMP instruction.
17964   SIGNED_P says whether the operand is signed rather than unsigned.  */
17965
17966bool
17967aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
17968{
17969  x = unwrap_const_vec_duplicate (x);
17970  return (CONST_INT_P (x)
17971	  && (signed_p
17972	      ? IN_RANGE (INTVAL (x), -16, 15)
17973	      : IN_RANGE (INTVAL (x), 0, 127)));
17974}
17975
17976/* Return true if X is a valid immediate operand for an SVE FADD or FSUB
17977   instruction.  Negate X first if NEGATE_P is true.  */
17978
17979bool
17980aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
17981{
17982  rtx elt;
17983  REAL_VALUE_TYPE r;
17984
17985  if (!const_vec_duplicate_p (x, &elt)
17986      || GET_CODE (elt) != CONST_DOUBLE)
17987    return false;
17988
17989  r = *CONST_DOUBLE_REAL_VALUE (elt);
17990
17991  if (negate_p)
17992    r = real_value_negate (&r);
17993
17994  if (real_equal (&r, &dconst1))
17995    return true;
17996  if (real_equal (&r, &dconsthalf))
17997    return true;
17998  return false;
17999}
18000
18001/* Return true if X is a valid immediate operand for an SVE FMUL
18002   instruction.  */
18003
18004bool
18005aarch64_sve_float_mul_immediate_p (rtx x)
18006{
18007  rtx elt;
18008
18009  return (const_vec_duplicate_p (x, &elt)
18010	  && GET_CODE (elt) == CONST_DOUBLE
18011	  && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
18012	      || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
18013}
18014
18015/* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
18016   for the Advanced SIMD operation described by WHICH and INSN.  If INFO
18017   is nonnull, use it to describe valid immediates.  */
18018static bool
18019aarch64_advsimd_valid_immediate_hs (unsigned int val32,
18020				    simd_immediate_info *info,
18021				    enum simd_immediate_check which,
18022				    simd_immediate_info::insn_type insn)
18023{
18024  /* Try a 4-byte immediate with LSL.  */
18025  for (unsigned int shift = 0; shift < 32; shift += 8)
18026    if ((val32 & (0xff << shift)) == val32)
18027      {
18028	if (info)
18029	  *info = simd_immediate_info (SImode, val32 >> shift, insn,
18030				       simd_immediate_info::LSL, shift);
18031	return true;
18032      }
18033
18034  /* Try a 2-byte immediate with LSL.  */
18035  unsigned int imm16 = val32 & 0xffff;
18036  if (imm16 == (val32 >> 16))
18037    for (unsigned int shift = 0; shift < 16; shift += 8)
18038      if ((imm16 & (0xff << shift)) == imm16)
18039	{
18040	  if (info)
18041	    *info = simd_immediate_info (HImode, imm16 >> shift, insn,
18042					 simd_immediate_info::LSL, shift);
18043	  return true;
18044	}
18045
18046  /* Try a 4-byte immediate with MSL, except for cases that MVN
18047     can handle.  */
18048  if (which == AARCH64_CHECK_MOV)
18049    for (unsigned int shift = 8; shift < 24; shift += 8)
18050      {
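	/* MSL shifts ones in from the bottom, so all bits below the
	   shifted byte must be set and all bits above it must be clear.  */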
18051	unsigned int low = (1 << shift) - 1;
18052	if (((val32 & (0xff << shift)) | low) == val32)
18053	  {
18054	    if (info)
18055	      *info = simd_immediate_info (SImode, val32 >> shift, insn,
18056					   simd_immediate_info::MSL, shift);
18057	    return true;
18058	  }
18059      }
18060
18061  return false;
18062}
18063
18064/* Return true if replicating VAL64 is a valid immediate for the
18065   Advanced SIMD operation described by WHICH.  If INFO is nonnull,
18066   use it to describe valid immediates.  */
18067static bool
18068aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
18069				 simd_immediate_info *info,
18070				 enum simd_immediate_check which)
18071{
18072  unsigned int val32 = val64 & 0xffffffff;
18073  unsigned int val16 = val64 & 0xffff;
18074  unsigned int val8 = val64 & 0xff;
18075
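  /* If the value repeats at 32-bit granularity, try the 32-bit, 16-bit
     and 8-bit immediate forms.  */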
18076  if (val32 == (val64 >> 32))
18077    {
18078      if ((which & AARCH64_CHECK_ORR) != 0
18079	  && aarch64_advsimd_valid_immediate_hs (val32, info, which,
18080						 simd_immediate_info::MOV))
18081	return true;
18082
18083      if ((which & AARCH64_CHECK_BIC) != 0
18084	  && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
18085						 simd_immediate_info::MVN))
18086	return true;
18087
18088      /* Try using a replicated byte.  */
18089      if (which == AARCH64_CHECK_MOV
18090	  && val16 == (val32 >> 16)
18091	  && val8 == (val16 >> 8))
18092	{
18093	  if (info)
18094	    *info = simd_immediate_info (QImode, val8);
18095	  return true;
18096	}
18097    }
18098
  /* Try using a bit-to-bytemask, in which each byte of the 64-bit value
     is either 0x00 or 0xff.  */
18100  if (which == AARCH64_CHECK_MOV)
18101    {
18102      unsigned int i;
18103      for (i = 0; i < 64; i += 8)
18104	{
18105	  unsigned char byte = (val64 >> i) & 0xff;
18106	  if (byte != 0 && byte != 0xff)
18107	    break;
18108	}
18109      if (i == 64)
18110	{
18111	  if (info)
18112	    *info = simd_immediate_info (DImode, val64);
18113	  return true;
18114	}
18115    }
18116  return false;
18117}
18118
18119/* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
18120   instruction.  If INFO is nonnull, use it to describe valid immediates.  */
18121
18122static bool
18123aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
18124			     simd_immediate_info *info)
18125{
18126  scalar_int_mode mode = DImode;
18127  unsigned int val32 = val64 & 0xffffffff;
18128  if (val32 == (val64 >> 32))
18129    {
18130      mode = SImode;
18131      unsigned int val16 = val32 & 0xffff;
18132      if (val16 == (val32 >> 16))
18133	{
18134	  mode = HImode;
18135	  unsigned int val8 = val16 & 0xff;
18136	  if (val8 == (val16 >> 8))
18137	    mode = QImode;
18138	}
18139    }
18140  HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
18141  if (IN_RANGE (val, -0x80, 0x7f))
18142    {
18143      /* DUP with no shift.  */
18144      if (info)
18145	*info = simd_immediate_info (mode, val);
18146      return true;
18147    }
18148  if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
18149    {
18150      /* DUP with LSL #8.  */
18151      if (info)
18152	*info = simd_immediate_info (mode, val);
18153      return true;
18154    }
18155  if (aarch64_bitmask_imm (val64, mode))
18156    {
18157      /* DUPM.  */
18158      if (info)
18159	*info = simd_immediate_info (mode, val);
18160      return true;
18161    }
18162  return false;
18163}
18164
18165/* Return true if X is an UNSPEC_PTRUE constant of the form:
18166
18167       (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
18168
18169   where PATTERN is the svpattern as a CONST_INT and where ZERO
18170   is a zero constant of the required PTRUE mode (which can have
18171   fewer elements than X's mode, if zero bits are significant).
18172
18173   If so, and if INFO is nonnull, describe the immediate in INFO.  */
18174bool
18175aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
18176{
18177  if (GET_CODE (x) != CONST)
18178    return false;
18179
18180  x = XEXP (x, 0);
18181  if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
18182    return false;
18183
18184  if (info)
18185    {
18186      aarch64_svpattern pattern
18187	= (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
18188      machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
18189      scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
18190      *info = simd_immediate_info (int_mode, pattern);
18191    }
18192  return true;
18193}
18194
18195/* Return true if X is a valid SVE predicate.  If INFO is nonnull, use
18196   it to describe valid immediates.  */
18197
18198static bool
18199aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
18200{
18201  if (aarch64_sve_ptrue_svpattern_p (x, info))
18202    return true;
18203
18204  if (x == CONST0_RTX (GET_MODE (x)))
18205    {
18206      if (info)
18207	*info = simd_immediate_info (DImode, 0);
18208      return true;
18209    }
18210
18211  /* Analyze the value as a VNx16BImode.  This should be relatively
18212     efficient, since rtx_vector_builder has enough built-in capacity
18213     to store all VLA predicate constants without needing the heap.  */
18214  rtx_vector_builder builder;
18215  if (!aarch64_get_sve_pred_bits (builder, x))
18216    return false;
18217
18218  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
18219  if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
18220    {
18221      machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
18222      aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
18223      if (pattern != AARCH64_NUM_SVPATTERNS)
18224	{
18225	  if (info)
18226	    {
18227	      scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
18228	      *info = simd_immediate_info (int_mode, pattern);
18229	    }
18230	  return true;
18231	}
18232    }
18233  return false;
18234}
18235
18236/* Return true if OP is a valid SIMD immediate for the operation
18237   described by WHICH.  If INFO is nonnull, use it to describe valid
18238   immediates.  */
18239bool
18240aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
18241			      enum simd_immediate_check which)
18242{
18243  machine_mode mode = GET_MODE (op);
18244  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18245  if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
18246    return false;
18247
18248  if (vec_flags & VEC_SVE_PRED)
18249    return aarch64_sve_pred_valid_immediate (op, info);
18250
18251  scalar_mode elt_mode = GET_MODE_INNER (mode);
18252  rtx base, step;
18253  unsigned int n_elts;
18254  if (GET_CODE (op) == CONST_VECTOR
18255      && CONST_VECTOR_DUPLICATE_P (op))
18256    n_elts = CONST_VECTOR_NPATTERNS (op);
18257  else if ((vec_flags & VEC_SVE_DATA)
18258	   && const_vec_series_p (op, &base, &step))
18259    {
18260      gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
18261      if (!aarch64_sve_index_immediate_p (base)
18262	  || !aarch64_sve_index_immediate_p (step))
18263	return false;
18264
18265      if (info)
18266	{
18267	  /* Get the corresponding container mode.  E.g. an INDEX on V2SI
18268	     should yield two integer values per 128-bit block, meaning
18269	     that we need to treat it in the same way as V2DI and then
18270	     ignore the upper 32 bits of each element.  */
18271	  elt_mode = aarch64_sve_container_int_mode (mode);
18272	  *info = simd_immediate_info (elt_mode, base, step);
18273	}
18274      return true;
18275    }
18276  else if (GET_CODE (op) == CONST_VECTOR
18277	   && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
18278    /* N_ELTS set above.  */;
18279  else
18280    return false;
18281
18282  scalar_float_mode elt_float_mode;
18283  if (n_elts == 1
18284      && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
18285    {
18286      rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
18287      if (aarch64_float_const_zero_rtx_p (elt)
18288	  || aarch64_float_const_representable_p (elt))
18289	{
18290	  if (info)
18291	    *info = simd_immediate_info (elt_float_mode, elt);
18292	  return true;
18293	}
18294    }
18295
18296  /* If all elements in an SVE vector have the same value, we have a free
18297     choice between using the element mode and using the container mode.
18298     Using the element mode means that unused parts of the vector are
18299     duplicates of the used elements, while using the container mode means
18300     that the unused parts are an extension of the used elements.  Using the
18301     element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
18302     for its container mode VNx4SI while 0x00000101 isn't.
18303
18304     If not all elements in an SVE vector have the same value, we need the
18305     transition from one element to the next to occur at container boundaries.
18306     E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
18307     in the same way as a VNx4SI containing { 1, 2, 3, 4 }.  */
18308  scalar_int_mode elt_int_mode;
18309  if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
18310    elt_int_mode = aarch64_sve_container_int_mode (mode);
18311  else
18312    elt_int_mode = int_mode_for_mode (elt_mode).require ();
18313
18314  unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
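  /* All of the immediate forms tried below replicate a value of at most
     64 bits, so wider elements can never match.  */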
18315  if (elt_size > 8)
18316    return false;
18317
18318  /* Expand the vector constant out into a byte vector, with the least
18319     significant byte of the register first.  */
18320  auto_vec<unsigned char, 16> bytes;
18321  bytes.reserve (n_elts * elt_size);
18322  for (unsigned int i = 0; i < n_elts; i++)
18323    {
18324      /* The vector is provided in gcc endian-neutral fashion.
18325	 For aarch64_be Advanced SIMD, it must be laid out in the vector
18326	 register in reverse order.  */
18327      bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
18328      rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
18329
18330      if (elt_mode != elt_int_mode)
18331	elt = gen_lowpart (elt_int_mode, elt);
18332
18333      if (!CONST_INT_P (elt))
18334	return false;
18335
18336      unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
18337      for (unsigned int byte = 0; byte < elt_size; byte++)
18338	{
18339	  bytes.quick_push (elt_val & 0xff);
18340	  elt_val >>= BITS_PER_UNIT;
18341	}
18342    }
18343
18344  /* The immediate must repeat every eight bytes.  */
18345  unsigned int nbytes = bytes.length ();
18346  for (unsigned i = 8; i < nbytes; ++i)
18347    if (bytes[i] != bytes[i - 8])
18348      return false;
18349
18350  /* Get the repeating 8-byte value as an integer.  No endian correction
18351     is needed here because bytes is already in lsb-first order.  */
18352  unsigned HOST_WIDE_INT val64 = 0;
18353  for (unsigned int i = 0; i < 8; i++)
18354    val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
18355	      << (i * BITS_PER_UNIT));
18356
18357  if (vec_flags & VEC_SVE_DATA)
18358    return aarch64_sve_valid_immediate (val64, info);
18359  else
18360    return aarch64_advsimd_valid_immediate (val64, info, which);
18361}
18362
18363/* Check whether X is a VEC_SERIES-like constant that starts at 0 and
18364   has a step in the range of INDEX.  Return the index expression if so,
18365   otherwise return null.  */
18366rtx
18367aarch64_check_zero_based_sve_index_immediate (rtx x)
18368{
18369  rtx base, step;
18370  if (const_vec_series_p (x, &base, &step)
18371      && base == const0_rtx
18372      && aarch64_sve_index_immediate_p (step))
18373    return step;
18374  return NULL_RTX;
18375}
18376
/* Check whether immediate shift constants are within range.  */
18378bool
18379aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
18380{
18381  x = unwrap_const_vec_duplicate (x);
18382  if (!CONST_INT_P (x))
18383    return false;
18384  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
18385  if (left)
18386    return IN_RANGE (INTVAL (x), 0, bit_width - 1);
18387  else
18388    return IN_RANGE (INTVAL (x), 1, bit_width);
18389}
18390
18391/* Return the bitmask CONST_INT to select the bits required by a zero extract
18392   operation of width WIDTH at bit position POS.  */
18393
18394rtx
18395aarch64_mask_from_zextract_ops (rtx width, rtx pos)
18396{
18397  gcc_assert (CONST_INT_P (width));
18398  gcc_assert (CONST_INT_P (pos));
18399
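  /* For example, WIDTH = 4 and POS = 8 give the mask 0xf00.  */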
18400  unsigned HOST_WIDE_INT mask
18401    = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
18402  return GEN_INT (mask << UINTVAL (pos));
18403}
18404
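/* Return true if X is a suitable immediate or symbolic operand for a
   move of mode MODE.  */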
18405bool
18406aarch64_mov_operand_p (rtx x, machine_mode mode)
18407{
18408  if (GET_CODE (x) == HIGH
18409      && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
18410    return true;
18411
18412  if (CONST_INT_P (x))
18413    return true;
18414
18415  if (VECTOR_MODE_P (GET_MODE (x)))
18416    {
18417      /* Require predicate constants to be VNx16BI before RA, so that we
18418	 force everything to have a canonical form.  */
18419      if (!lra_in_progress
18420	  && !reload_completed
18421	  && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
18422	  && GET_MODE (x) != VNx16BImode)
18423	return false;
18424
18425      return aarch64_simd_valid_immediate (x, NULL);
18426    }
18427
18428  x = strip_salt (x);
18429  if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
18430    return true;
18431
18432  if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
18433    return true;
18434
18435  return aarch64_classify_symbolic_expression (x)
18436    == SYMBOL_TINY_ABSOLUTE;
18437}
18438
18439/* Return a const_int vector of VAL.  */
18440rtx
18441aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
18442{
18443  rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
18444  return gen_const_vec_duplicate (mode, c);
18445}
18446
/* Check whether OP is a legal scalar immediate for the MOVI instruction.  */
18448
18449bool
18450aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
18451{
18452  machine_mode vmode;
18453
18454  vmode = aarch64_simd_container_mode (mode, 64);
18455  rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
18456  return aarch64_simd_valid_immediate (op_v, NULL);
18457}
18458
18459/* Construct and return a PARALLEL RTX vector with elements numbering the
18460   lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
18461   the vector - from the perspective of the architecture.  This does not
18462   line up with GCC's perspective on lane numbers, so we end up with
18463   different masks depending on our target endian-ness.  The diagram
18464   below may help.  We must draw the distinction when building masks
18465   which select one half of the vector.  An instruction selecting
18466   architectural low-lanes for a big-endian target, must be described using
18467   a mask selecting GCC high-lanes.
18468
18469                 Big-Endian             Little-Endian
18470
18471GCC             0   1   2   3           3   2   1   0
18472              | x | x | x | x |       | x | x | x | x |
18473Architecture    3   2   1   0           3   2   1   0
18474
18475Low Mask:         { 2, 3 }                { 0, 1 }
18476High Mask:        { 0, 1 }                { 2, 3 }
18477
18478   MODE Is the mode of the vector and NUNITS is the number of units in it.  */
18479
18480rtx
18481aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
18482{
18483  rtvec v = rtvec_alloc (nunits / 2);
18484  int high_base = nunits / 2;
18485  int low_base = 0;
18486  int base;
18487  rtx t1;
18488  int i;
18489
18490  if (BYTES_BIG_ENDIAN)
18491    base = high ? low_base : high_base;
18492  else
18493    base = high ? high_base : low_base;
18494
18495  for (i = 0; i < nunits / 2; i++)
18496    RTVEC_ELT (v, i) = GEN_INT (base + i);
18497
18498  t1 = gen_rtx_PARALLEL (mode, v);
18499  return t1;
18500}
18501
/* Check OP for validity as a PARALLEL RTX vector with elements
   numbering the lanes of either the high (HIGH == TRUE) or low
   (HIGH == FALSE) half of the vector, from the perspective of the
   architecture.  See the diagram above aarch64_simd_vect_par_cnst_half
   for more details.  */
18506
18507bool
18508aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
18509				       bool high)
18510{
18511  int nelts;
18512  if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
18513    return false;
18514
18515  rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
18516  HOST_WIDE_INT count_op = XVECLEN (op, 0);
18517  HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
18518  int i = 0;
18519
18520  if (count_op != count_ideal)
18521    return false;
18522
18523  for (i = 0; i < count_ideal; i++)
18524    {
18525      rtx elt_op = XVECEXP (op, 0, i);
18526      rtx elt_ideal = XVECEXP (ideal, 0, i);
18527
18528      if (!CONST_INT_P (elt_op)
18529	  || INTVAL (elt_ideal) != INTVAL (elt_op))
18530	return false;
18531    }
18532  return true;
18533}
18534
18535/* Return a PARALLEL containing NELTS elements, with element I equal
18536   to BASE + I * STEP.  */
18537
18538rtx
18539aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
18540{
18541  rtvec vec = rtvec_alloc (nelts);
18542  for (unsigned int i = 0; i < nelts; ++i)
18543    RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
18544  return gen_rtx_PARALLEL (VOIDmode, vec);
18545}
18546
18547/* Return true if OP is a PARALLEL of CONST_INTs that form a linear
18548   series with step STEP.  */
18549
18550bool
18551aarch64_stepped_int_parallel_p (rtx op, int step)
18552{
18553  if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
18554    return false;
18555
18556  unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
18557  for (int i = 1; i < XVECLEN (op, 0); ++i)
18558    if (!CONST_INT_P (XVECEXP (op, 0, i))
18559	|| UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
18560      return false;
18561
18562  return true;
18563}
18564
18565/* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
18566   HIGH (exclusive).  */
18567void
18568aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
18569			  const_tree exp)
18570{
18571  HOST_WIDE_INT lane;
18572  gcc_assert (CONST_INT_P (operand));
18573  lane = INTVAL (operand);
18574
18575  if (lane < low || lane >= high)
18576  {
18577    if (exp)
18578      error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
18579    else
18580      error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
18581  }
18582}
18583
/* Perform endian correction on lane number N, which indexes a vector
18585   of mode MODE, and return the result as an SImode rtx.  */
18586
18587rtx
18588aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
18589{
18590  return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
18591}
18592
18593/* Return TRUE if OP is a valid vector addressing mode.  */
18594
18595bool
18596aarch64_simd_mem_operand_p (rtx op)
18597{
18598  return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
18599			|| REG_P (XEXP (op, 0)));
18600}
18601
18602/* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
18603
18604bool
18605aarch64_sve_ld1r_operand_p (rtx op)
18606{
18607  struct aarch64_address_info addr;
18608  scalar_mode mode;
18609
18610  return (MEM_P (op)
18611	  && is_a <scalar_mode> (GET_MODE (op), &mode)
18612	  && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
18613	  && addr.type == ADDRESS_REG_IMM
18614	  && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
18615}
18616
/* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
   where the size of the read data is specified by `mode` and the size of
   the vector elements is specified by `elem_mode`.  */
18620bool
18621aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
18622				   scalar_mode elem_mode)
18623{
18624  struct aarch64_address_info addr;
18625  if (!MEM_P (op)
18626      || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
18627    return false;
18628
18629  if (addr.type == ADDRESS_REG_IMM)
18630    return offset_4bit_signed_scaled_p (mode, addr.const_offset);
18631
18632  if (addr.type == ADDRESS_REG_REG)
18633    return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
18634
18635  return false;
18636}
18637
18638/* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction.  */
18639bool
18640aarch64_sve_ld1rq_operand_p (rtx op)
18641{
18642  return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
18643					    GET_MODE_INNER (GET_MODE (op)));
18644}
18645
18646/* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
18647   accessing a vector where the element size is specified by `elem_mode`.  */
18648bool
18649aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
18650{
18651  return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
18652}
18653
18654/* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction.  */
18655bool
18656aarch64_sve_ldff1_operand_p (rtx op)
18657{
18658  if (!MEM_P (op))
18659    return false;
18660
18661  struct aarch64_address_info addr;
18662  if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
18663    return false;
18664
18665  if (addr.type == ADDRESS_REG_IMM)
18666    return known_eq (addr.const_offset, 0);
18667
18668  return addr.type == ADDRESS_REG_REG;
18669}
18670
18671/* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction.  */
18672bool
18673aarch64_sve_ldnf1_operand_p (rtx op)
18674{
18675  struct aarch64_address_info addr;
18676
18677  return (MEM_P (op)
18678	  && aarch64_classify_address (&addr, XEXP (op, 0),
18679				       GET_MODE (op), false)
18680	  && addr.type == ADDRESS_REG_IMM);
18681}
18682
18683/* Return true if OP is a valid MEM operand for an SVE LDR instruction.
18684   The conditions for STR are the same.  */
18685bool
18686aarch64_sve_ldr_operand_p (rtx op)
18687{
18688  struct aarch64_address_info addr;
18689
18690  return (MEM_P (op)
18691	  && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
18692				       false, ADDR_QUERY_ANY)
18693	  && addr.type == ADDRESS_REG_IMM);
18694}
18695
18696/* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
18697   addressing memory of mode MODE.  */
18698bool
18699aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
18700{
18701  struct aarch64_address_info addr;
18702  if (!aarch64_classify_address (&addr, op, mode, false))
18703    return false;
18704
18705  if (addr.type == ADDRESS_REG_IMM)
18706    return known_eq (addr.const_offset, 0);
18707
18708  return addr.type == ADDRESS_REG_REG;
18709}
18710
18711/* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
18712   We need to be able to access the individual pieces, so the range
18713   is different from LD[234] and ST[234].  */
18714bool
18715aarch64_sve_struct_memory_operand_p (rtx op)
18716{
18717  if (!MEM_P (op))
18718    return false;
18719
18720  machine_mode mode = GET_MODE (op);
18721  struct aarch64_address_info addr;
18722  if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
18723				 ADDR_QUERY_ANY)
18724      || addr.type != ADDRESS_REG_IMM)
18725    return false;
18726
18727  poly_int64 first = addr.const_offset;
18728  poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
18729  return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
18730	  && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
18731}
18732
18733/* Emit a register copy from operand to operand, taking care not to
18734   early-clobber source registers in the process.
18735
18736   COUNT is the number of components into which the copy needs to be
18737   decomposed.  */
18738void
18739aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
18740				unsigned int count)
18741{
18742  unsigned int i;
18743  int rdest = REGNO (operands[0]);
18744  int rsrc = REGNO (operands[1]);
18745
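  /* If the register ranges don't overlap, or if the destination starts
     below the source, copying upwards from the lowest register is safe;
     otherwise copy downwards so that no source register is clobbered
     before it has been read.  */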
18746  if (!reg_overlap_mentioned_p (operands[0], operands[1])
18747      || rdest < rsrc)
18748    for (i = 0; i < count; i++)
18749      emit_move_insn (gen_rtx_REG (mode, rdest + i),
18750		      gen_rtx_REG (mode, rsrc + i));
18751  else
18752    for (i = 0; i < count; i++)
18753      emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
18754		      gen_rtx_REG (mode, rsrc + count - i - 1));
18755}
18756
/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode>
   is one of the VSTRUCT modes: OI, CI, or XI.  */
18759int
18760aarch64_simd_attr_length_rglist (machine_mode mode)
18761{
18762  /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
18763  return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
18764}
18765
18766/* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
18767   alignment of a vector to 128 bits.  SVE predicates have an alignment of
18768   16 bits.  */
18769static HOST_WIDE_INT
18770aarch64_simd_vector_alignment (const_tree type)
18771{
18772  /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
18773     be set for non-predicate vectors of booleans.  Modes are the most
18774     direct way we have of identifying real SVE predicate types.  */
18775  if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
18776    return 16;
18777  widest_int min_size
18778    = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
18779  return wi::umin (min_size, 128).to_uhwi ();
18780}
18781
18782/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
18783static poly_uint64
18784aarch64_vectorize_preferred_vector_alignment (const_tree type)
18785{
18786  if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
18787    {
18788      /* If the length of the vector is a fixed power of 2, try to align
18789	 to that length, otherwise don't try to align at all.  */
18790      HOST_WIDE_INT result;
18791      if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
18792	  || !pow2p_hwi (result))
18793	result = TYPE_ALIGN (TREE_TYPE (type));
18794      return result;
18795    }
18796  return TYPE_ALIGN (type);
18797}
18798
18799/* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
18800static bool
18801aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
18802{
18803  if (is_packed)
18804    return false;
18805
18806  /* For fixed-length vectors, check that the vectorizer will aim for
18807     full-vector alignment.  This isn't true for generic GCC vectors
18808     that are wider than the ABI maximum of 128 bits.  */
18809  poly_uint64 preferred_alignment =
18810    aarch64_vectorize_preferred_vector_alignment (type);
18811  if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
18812      && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
18813		   preferred_alignment))
18814    return false;
18815
18816  /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
18817  return true;
18818}
18819
18820/* Return true if the vector misalignment factor is supported by the
18821   target.  */
18822static bool
18823aarch64_builtin_support_vector_misalignment (machine_mode mode,
18824					     const_tree type, int misalignment,
18825					     bool is_packed)
18826{
18827  if (TARGET_SIMD && STRICT_ALIGNMENT)
18828    {
      /* Return false if the movmisalign pattern is not supported for
	 this mode.  */
18830      if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
18831        return false;
18832
18833      /* Misalignment factor is unknown at compile time.  */
18834      if (misalignment == -1)
18835	return false;
18836    }
18837  return default_builtin_support_vector_misalignment (mode, type, misalignment,
18838						      is_packed);
18839}
18840
18841/* If VALS is a vector constant that can be loaded into a register
18842   using DUP, generate instructions to do so and return an RTX to
18843   assign to the register.  Otherwise return NULL_RTX.  */
18844static rtx
18845aarch64_simd_dup_constant (rtx vals)
18846{
18847  machine_mode mode = GET_MODE (vals);
18848  machine_mode inner_mode = GET_MODE_INNER (mode);
18849  rtx x;
18850
18851  if (!const_vec_duplicate_p (vals, &x))
18852    return NULL_RTX;
18853
18854  /* We can load this constant by using DUP and a constant in a
18855     single ARM register.  This will be cheaper than a vector
18856     load.  */
18857  x = copy_to_mode_reg (inner_mode, x);
18858  return gen_vec_duplicate (mode, x);
18859}
18860
18861
18862/* Generate code to load VALS, which is a PARALLEL containing only
18863   constants (for vec_init) or CONST_VECTOR, efficiently into a
18864   register.  Returns an RTX to copy into the register, or NULL_RTX
18865   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
18866static rtx
18867aarch64_simd_make_constant (rtx vals)
18868{
18869  machine_mode mode = GET_MODE (vals);
18870  rtx const_dup;
18871  rtx const_vec = NULL_RTX;
18872  int n_const = 0;
18873  int i;
18874
18875  if (GET_CODE (vals) == CONST_VECTOR)
18876    const_vec = vals;
18877  else if (GET_CODE (vals) == PARALLEL)
18878    {
18879      /* A CONST_VECTOR must contain only CONST_INTs and
18880	 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
18881	 Only store valid constants in a CONST_VECTOR.  */
18882      int n_elts = XVECLEN (vals, 0);
18883      for (i = 0; i < n_elts; ++i)
18884	{
18885	  rtx x = XVECEXP (vals, 0, i);
18886	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18887	    n_const++;
18888	}
18889      if (n_const == n_elts)
18890	const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
18891    }
18892  else
18893    gcc_unreachable ();
18894
18895  if (const_vec != NULL_RTX
18896      && aarch64_simd_valid_immediate (const_vec, NULL))
18897    /* Load using MOVI/MVNI.  */
18898    return const_vec;
18899  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
18900    /* Loaded using DUP.  */
18901    return const_dup;
18902  else if (const_vec != NULL_RTX)
18903    /* Load from constant pool. We cannot take advantage of single-cycle
18904       LD1 because we need a PC-relative addressing mode.  */
18905    return const_vec;
18906  else
18907    /* A PARALLEL containing something not valid inside CONST_VECTOR.
18908       We cannot construct an initializer.  */
18909    return NULL_RTX;
18910}
18911
18912/* Expand a vector initialisation sequence, such that TARGET is
18913   initialised to contain VALS.  */
18914
18915void
18916aarch64_expand_vector_init (rtx target, rtx vals)
18917{
18918  machine_mode mode = GET_MODE (target);
18919  scalar_mode inner_mode = GET_MODE_INNER (mode);
18920  /* The number of vector elements.  */
18921  int n_elts = XVECLEN (vals, 0);
18922  /* The number of vector elements which are not constant.  */
18923  int n_var = 0;
18924  rtx any_const = NULL_RTX;
18925  /* The first element of vals.  */
18926  rtx v0 = XVECEXP (vals, 0, 0);
18927  bool all_same = true;
18928
18929  /* This is a special vec_init<M><N> where N is not an element mode but a
18930     vector mode with half the elements of M.  We expect to find two entries
     of mode N in VALS and we must put their concatenation into TARGET.  */
18932  if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
18933    {
18934      gcc_assert (known_eq (GET_MODE_SIZE (mode),
18935		  2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
18936      rtx lo = XVECEXP (vals, 0, 0);
18937      rtx hi = XVECEXP (vals, 0, 1);
18938      machine_mode narrow_mode = GET_MODE (lo);
18939      gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
18940      gcc_assert (narrow_mode == GET_MODE (hi));
18941
18942      /* When we want to concatenate a half-width vector with zeroes we can
18943	 use the aarch64_combinez[_be] patterns.  Just make sure that the
18944	 zeroes are in the right half.  */
18945      if (BYTES_BIG_ENDIAN
18946	  && aarch64_simd_imm_zero (lo, narrow_mode)
18947	  && general_operand (hi, narrow_mode))
18948	emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
18949      else if (!BYTES_BIG_ENDIAN
18950	       && aarch64_simd_imm_zero (hi, narrow_mode)
18951	       && general_operand (lo, narrow_mode))
18952	emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
18953      else
18954	{
18955	  /* Else create the two half-width registers and combine them.  */
18956	  if (!REG_P (lo))
18957	    lo = force_reg (GET_MODE (lo), lo);
18958	  if (!REG_P (hi))
18959	    hi = force_reg (GET_MODE (hi), hi);
18960
18961	  if (BYTES_BIG_ENDIAN)
18962	    std::swap (lo, hi);
18963	  emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
18964	}
      return;
    }
18967
18968  /* Count the number of variable elements to initialise.  */
18969  for (int i = 0; i < n_elts; ++i)
18970    {
18971      rtx x = XVECEXP (vals, 0, i);
18972      if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
18973	++n_var;
18974      else
18975	any_const = x;
18976
18977      all_same &= rtx_equal_p (x, v0);
18978    }
18979
18980  /* No variable elements, hand off to aarch64_simd_make_constant which knows
18981     how best to handle this.  */
18982  if (n_var == 0)
18983    {
18984      rtx constant = aarch64_simd_make_constant (vals);
18985      if (constant != NULL_RTX)
18986	{
18987	  emit_move_insn (target, constant);
18988	  return;
18989	}
18990    }
18991
18992  /* Splat a single non-constant element if we can.  */
18993  if (all_same)
18994    {
18995      rtx x = copy_to_mode_reg (inner_mode, v0);
18996      aarch64_emit_move (target, gen_vec_duplicate (mode, x));
18997      return;
18998    }
18999
19000  enum insn_code icode = optab_handler (vec_set_optab, mode);
19001  gcc_assert (icode != CODE_FOR_nothing);
19002
19003  /* If there are only variable elements, try to optimize
19004     the insertion using dup for the most common element
19005     followed by insertions.  */
19006
19007  /* The algorithm will fill matches[*][0] with the earliest matching element,
19008     and matches[X][1] with the count of duplicate elements (if X is the
19009     earliest element which has duplicates).  */
19010
19011  if (n_var == n_elts && n_elts <= 16)
19012    {
19013      int matches[16][2] = {0};
19014      for (int i = 0; i < n_elts; i++)
19015	{
19016	  for (int j = 0; j <= i; j++)
19017	    {
19018	      if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
19019		{
19020		  matches[i][0] = j;
19021		  matches[j][1]++;
19022		  break;
19023		}
19024	    }
19025	}
19026      int maxelement = 0;
19027      int maxv = 0;
19028      for (int i = 0; i < n_elts; i++)
19029	if (matches[i][1] > maxv)
19030	  {
19031	    maxelement = i;
19032	    maxv = matches[i][1];
19033	  }
19034
19035      /* Create a duplicate of the most common element, unless all elements
19036	 are equally useless to us, in which case just immediately set the
19037	 vector register using the first element.  */
19038
19039      if (maxv == 1)
19040	{
19041	  /* For vectors of two 64-bit elements, we can do even better.  */
19042	  if (n_elts == 2
19043	      && (inner_mode == E_DImode
19044		  || inner_mode == E_DFmode))
19046	    {
19047	      rtx x0 = XVECEXP (vals, 0, 0);
19048	      rtx x1 = XVECEXP (vals, 0, 1);
19049	      /* Combine can pick up this case, but handling it directly
19050		 here leaves clearer RTL.
19051
19052		 This is load_pair_lanes<mode>, and also gives us a clean-up
19053		 for store_pair_lanes<mode>.  */
19054	      if (memory_operand (x0, inner_mode)
19055		  && memory_operand (x1, inner_mode)
19056		  && !STRICT_ALIGNMENT
19057		  && rtx_equal_p (XEXP (x1, 0),
19058				  plus_constant (Pmode,
19059						 XEXP (x0, 0),
19060						 GET_MODE_SIZE (inner_mode))))
19061		{
19062		  rtx t;
19063		  if (inner_mode == DFmode)
19064		    t = gen_load_pair_lanesdf (target, x0, x1);
19065		  else
19066		    t = gen_load_pair_lanesdi (target, x0, x1);
19067		  emit_insn (t);
19068		  return;
19069		}
19070	    }
19071	  /* The subreg-move sequence below will move into lane zero of the
19072	     vector register.  For big-endian we want that position to hold
19073	     the last element of VALS.  */
19074	  maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
19075	  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
19076	  aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
19077	}
19078      else
19079	{
19080	  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
19081	  aarch64_emit_move (target, gen_vec_duplicate (mode, x));
19082	}
19083
19084      /* Insert the rest.  */
19085      for (int i = 0; i < n_elts; i++)
19086	{
19087	  rtx x = XVECEXP (vals, 0, i);
19088	  if (matches[i][0] == maxelement)
19089	    continue;
19090	  x = copy_to_mode_reg (inner_mode, x);
19091	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
19092	}
19093      return;
19094    }
19095
19096  /* Initialise a vector which is part-variable.  We want to first try
19097     to build those lanes which are constant in the most efficient way we
19098     can.  */
19099  if (n_var != n_elts)
19100    {
19101      rtx copy = copy_rtx (vals);
19102
19103      /* Load constant part of vector.  We really don't care what goes into the
19104	 parts we will overwrite, but we're more likely to be able to load the
19105	 constant efficiently if it has fewer, larger, repeating parts
19106	 (see aarch64_simd_valid_immediate).  */
19107      for (int i = 0; i < n_elts; i++)
19108	{
19109	  rtx x = XVECEXP (vals, 0, i);
19110	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
19111	    continue;
19112	  rtx subst = any_const;
19113	  for (int bit = n_elts / 2; bit > 0; bit /= 2)
19114	    {
19115	      /* Look in the copied vector, as more elements are const.  */
19116	      rtx test = XVECEXP (copy, 0, i ^ bit);
19117	      if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
19118		{
19119		  subst = test;
19120		  break;
19121		}
19122	    }
19123	  XVECEXP (copy, 0, i) = subst;
19124	}
19125      aarch64_expand_vector_init (target, copy);
19126    }
19127
19128  /* Insert the variable lanes directly.  */
19129  for (int i = 0; i < n_elts; i++)
19130    {
19131      rtx x = XVECEXP (vals, 0, i);
19132      if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
19133	continue;
19134      x = copy_to_mode_reg (inner_mode, x);
19135      emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
19136    }
19137}
19138
19139/* Emit RTL corresponding to:
19140   insr TARGET, ELEM.  */
19141
19142static void
19143emit_insr (rtx target, rtx elem)
19144{
19145  machine_mode mode = GET_MODE (target);
19146  scalar_mode elem_mode = GET_MODE_INNER (mode);
19147  elem = force_reg (elem_mode, elem);
19148
19149  insn_code icode = optab_handler (vec_shl_insert_optab, mode);
19150  gcc_assert (icode != CODE_FOR_nothing);
19151  emit_insn (GEN_FCN (icode) (target, target, elem));
19152}
19153
19154/* Subroutine of aarch64_sve_expand_vector_init for handling
19155   trailing constants.
19156   This function works as follows:
19157   (a) Create a new vector consisting of trailing constants.
19158   (b) Initialize TARGET with the constant vector using emit_move_insn.
19159   (c) Insert remaining elements in TARGET using insr.
   NELTS is the total number of elements in the original vector, while
   NELTS_REQD is the number of elements that are actually significant.

   ??? The heuristic used is to do the above only if the number of constants
   is at least half the total number of elements.  May need fine tuning.  */
19166
19167static bool
19168aarch64_sve_expand_vector_init_handle_trailing_constants
19169 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
19170{
19171  machine_mode mode = GET_MODE (target);
19172  scalar_mode elem_mode = GET_MODE_INNER (mode);
19173  int n_trailing_constants = 0;
19174
19175  for (int i = nelts_reqd - 1;
19176       i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
19177       i--)
19178    n_trailing_constants++;
19179
19180  if (n_trailing_constants >= nelts_reqd / 2)
19181    {
19182      /* Try to use the natural pattern of BUILDER to extend the trailing
19183	 constant elements to a full vector.  Replace any variables in the
19184	 extra elements with zeros.
19185
19186	 ??? It would be better if the builders supported "don't care"
19187	     elements, with the builder filling in whichever elements
19188	     give the most compact encoding.  */
19189      rtx_vector_builder v (mode, nelts, 1);
19190      for (int i = 0; i < nelts; i++)
19191	{
19192	  rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
19193	  if (!valid_for_const_vector_p (elem_mode, x))
19194	    x = const0_rtx;
19195	  v.quick_push (x);
19196	}
19197      rtx const_vec = v.build ();
19198      emit_move_insn (target, const_vec);
19199
19200      for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
19201	emit_insr (target, builder.elt (i));
19202
19203      return true;
19204    }
19205
19206  return false;
19207}
19208
19209/* Subroutine of aarch64_sve_expand_vector_init.
19210   Works as follows:
19211   (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
19212   (b) Skip trailing elements from BUILDER, which are the same as
19213       element NELTS_REQD - 1.
19214   (c) Insert earlier elements in reverse order in TARGET using insr.  */
19215
19216static void
19217aarch64_sve_expand_vector_init_insert_elems (rtx target,
19218					     const rtx_vector_builder &builder,
19219					     int nelts_reqd)
19220{
19221  machine_mode mode = GET_MODE (target);
19222  scalar_mode elem_mode = GET_MODE_INNER (mode);
19223
19224  struct expand_operand ops[2];
19225  enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
19226  gcc_assert (icode != CODE_FOR_nothing);
19227
19228  create_output_operand (&ops[0], target, mode);
19229  create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
19230  expand_insn (icode, 2, ops);
19231
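  /* Skip the trailing elements that duplicate the broadcast value and
     insert the remaining elements from last to first.  */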
19232  int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
19233  for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
19234    emit_insr (target, builder.elt (i));
19235}
19236
19237/* Subroutine of aarch64_sve_expand_vector_init to handle case
19238   when all trailing elements of builder are same.
19239   This works as follows:
19240   (a) Use expand_insn interface to broadcast last vector element in TARGET.
19241   (b) Insert remaining elements in TARGET using insr.
19242
   ??? The heuristic used is to do the above if the number of identical
   trailing elements is at least 3/4 of the total number of elements,
   loosely based on the heuristic from mostly_zeros_p.  May need fine-tuning.  */
19246
19247static bool
19248aarch64_sve_expand_vector_init_handle_trailing_same_elem
19249 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
19250{
19251  int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
19252  if (ndups >= (3 * nelts_reqd) / 4)
19253    {
19254      aarch64_sve_expand_vector_init_insert_elems (target, builder,
19255						   nelts_reqd - ndups + 1);
19256      return true;
19257    }
19258
19259  return false;
19260}
19261
19262/* Initialize register TARGET from BUILDER. NELTS is the constant number
19263   of elements in BUILDER.
19264
19265   The function tries to initialize TARGET from BUILDER if it fits one
19266   of the special cases outlined below.
19267
19268   Failing that, the function divides BUILDER into two sub-vectors:
19269   v_even = even elements of BUILDER;
19270   v_odd = odd elements of BUILDER;
19271
19272   and recursively calls itself with v_even and v_odd.
19273
19274   if (recursive call succeeded for v_even or v_odd)
19275     TARGET = zip (v_even, v_odd)
19276
19277   The function returns true if it managed to build TARGET from BUILDER
19278   with one of the special cases, false otherwise.
19279
19280   Example: {a, 1, b, 2, c, 3, d, 4}
19281
19282   The vector gets divided into:
19283   v_even = {a, b, c, d}
19284   v_odd = {1, 2, 3, 4}
19285
19286   aarch64_sve_expand_vector_init(v_odd) hits case 1 and
   initializes tmp2 from the constant vector v_odd using emit_move_insn.
19288
19289   aarch64_sve_expand_vector_init(v_even) fails since v_even contains
19290   4 elements, so we construct tmp1 from v_even using insr:
19291   tmp1 = dup(d)
19292   insr tmp1, c
19293   insr tmp1, b
19294   insr tmp1, a
19295
19296   And finally:
19297   TARGET = zip (tmp1, tmp2)
19298   which sets TARGET to {a, 1, b, 2, c, 3, d, 4}.  */
19299
19300static bool
19301aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
19302				int nelts, int nelts_reqd)
19303{
19304  machine_mode mode = GET_MODE (target);
19305
19306  /* Case 1: Vector contains trailing constants.  */
19307
19308  if (aarch64_sve_expand_vector_init_handle_trailing_constants
19309       (target, builder, nelts, nelts_reqd))
19310    return true;
19311
19312  /* Case 2: Vector contains leading constants.  */
19313
19314  rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
19315  for (int i = 0; i < nelts_reqd; i++)
19316    rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
19317  rev_builder.finalize ();
19318
19319  if (aarch64_sve_expand_vector_init_handle_trailing_constants
19320       (target, rev_builder, nelts, nelts_reqd))
19321    {
19322      emit_insn (gen_aarch64_sve_rev (mode, target, target));
19323      return true;
19324    }
19325
19326  /* Case 3: Vector contains trailing same element.  */
19327
19328  if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
19329       (target, builder, nelts_reqd))
19330    return true;
19331
19332  /* Case 4: Vector contains leading same element.  */
19333
19334  if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
19335       (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
19336    {
19337      emit_insn (gen_aarch64_sve_rev (mode, target, target));
19338      return true;
19339    }
19340
  /* Avoid recursing below 4 elements.
19342     ??? The threshold 4 may need fine-tuning.  */
19343
19344  if (nelts_reqd <= 4)
19345    return false;
19346
19347  rtx_vector_builder v_even (mode, nelts, 1);
19348  rtx_vector_builder v_odd (mode, nelts, 1);
19349
19350  for (int i = 0; i < nelts * 2; i += 2)
19351    {
19352      v_even.quick_push (builder.elt (i));
19353      v_odd.quick_push (builder.elt (i + 1));
19354    }
19355
19356  v_even.finalize ();
19357  v_odd.finalize ();
19358
19359  rtx tmp1 = gen_reg_rtx (mode);
19360  bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
19361						    nelts, nelts_reqd / 2);
19362
19363  rtx tmp2 = gen_reg_rtx (mode);
19364  bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
19365						   nelts, nelts_reqd / 2);
19366
19367  if (!did_even_p && !did_odd_p)
19368    return false;
19369
  /* Initialize whichever of v_even and v_odd did not match any of the
     special cases using INSR, then zip v_even and v_odd into TARGET.  */
19372
19373  if (!did_even_p)
19374    aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
19375
19376  if (!did_odd_p)
19377    aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
19378
19379  rtvec v = gen_rtvec (2, tmp1, tmp2);
19380  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
19381  return true;
19382}
19383
19384/* Initialize register TARGET from the elements in PARALLEL rtx VALS.  */
19385
19386void
19387aarch64_sve_expand_vector_init (rtx target, rtx vals)
19388{
19389  machine_mode mode = GET_MODE (target);
19390  int nelts = XVECLEN (vals, 0);
19391
19392  rtx_vector_builder v (mode, nelts, 1);
19393  for (int i = 0; i < nelts; i++)
19394    v.quick_push (XVECEXP (vals, 0, i));
19395  v.finalize ();
19396
  /* If neither sub-vector of v could be initialized specially,
     then use INSR to insert all elements from v into TARGET.
     ??? This might not be optimal for vectors with large
     initializers of 16 elements or more.
     For nelts < 4, it probably isn't useful to handle specially.  */
19402
19403  if (nelts < 4
19404      || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
19405    aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
19406}
19407
19408/* Check whether VALUE is a vector constant in which every element
19409   is either a power of 2 or a negated power of 2.  If so, return
19410   a constant vector of log2s, and flip CODE between PLUS and MINUS
19411   if VALUE contains negated powers of 2.  Return NULL_RTX otherwise.  */
19412
19413static rtx
19414aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
19415{
19416  if (GET_CODE (value) != CONST_VECTOR)
19417    return NULL_RTX;
19418
19419  rtx_vector_builder builder;
19420  if (!builder.new_unary_operation (GET_MODE (value), value, false))
19421    return NULL_RTX;
19422
19423  scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
19424  /* 1 if the result of the multiplication must be negated,
19425     0 if it mustn't, or -1 if we don't yet care.  */
19426  int negate = -1;
19427  unsigned int encoded_nelts = const_vector_encoded_nelts (value);
19428  for (unsigned int i = 0; i < encoded_nelts; ++i)
19429    {
19430      rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
19431      if (!CONST_SCALAR_INT_P (elt))
19432	return NULL_RTX;
19433      rtx_mode_t val (elt, int_mode);
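      /* Tentatively negate the element, on the assumption that it is a
	 negated power of 2; the nonnegative case is handled below.  */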
19434      wide_int pow2 = wi::neg (val);
19435      if (val != pow2)
19436	{
19437	  /* It matters whether we negate or not.  Make that choice,
19438	     and make sure that it's consistent with previous elements.  */
19439	  if (negate == !wi::neg_p (val))
19440	    return NULL_RTX;
19441	  negate = wi::neg_p (val);
19442	  if (!negate)
19443	    pow2 = val;
19444	}
19445      /* POW2 is now the value that we want to be a power of 2.  */
19446      int shift = wi::exact_log2 (pow2);
19447      if (shift < 0)
19448	return NULL_RTX;
19449      builder.quick_push (gen_int_mode (shift, int_mode));
19450    }
19451  if (negate == -1)
19452    /* PLUS and MINUS are equivalent; canonicalize on PLUS.  */
19453    code = PLUS;
19454  else if (negate == 1)
19455    code = code == PLUS ? MINUS : PLUS;
19456  return builder.build ();
19457}
19458
19459/* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
19460   CODE is PLUS for the former and MINUS for the latter.  OPERANDS is the
19461   operands array, in the same order as for fma_optab.  Return true if
19462   the function emitted all the necessary instructions, false if the caller
19463   should generate the pattern normally with the new OPERANDS array.  */
19464
19465bool
19466aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
19467{
19468  machine_mode mode = GET_MODE (operands[0]);
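  /* If the multiplier is a vector of powers of 2 (possibly all negated),
     replace the multiplication with a left shift and then add the result
     to or subtract it from operands[3].  */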
19469  if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
19470    {
19471      rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
19472				  NULL_RTX, true, OPTAB_DIRECT);
19473      force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
19474			  operands[3], product, operands[0], true,
19475			  OPTAB_DIRECT);
19476      return true;
19477    }
19478  operands[2] = force_reg (mode, operands[2]);
19479  return false;
19480}
19481
19482/* Likewise, but for a conditional pattern.  */
19483
19484bool
19485aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
19486{
19487  machine_mode mode = GET_MODE (operands[0]);
19488  if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
19489    {
19490      rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
19491				  NULL_RTX, true, OPTAB_DIRECT);
19492      emit_insn (gen_cond (code, mode, operands[0], operands[1],
19493			   operands[4], product, operands[5]));
19494      return true;
19495    }
19496  operands[3] = force_reg (mode, operands[3]);
19497  return false;
19498}
19499
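/* Implement TARGET_SHIFT_TRUNCATION_MASK.  */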
19500static unsigned HOST_WIDE_INT
19501aarch64_shift_truncation_mask (machine_mode mode)
19502{
19503  if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
19504    return 0;
19505  return GET_MODE_UNIT_BITSIZE (mode) - 1;
19506}
19507
19508/* Select a format to encode pointers in exception handling data.  */
19509int
19510aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
19511{
19512   int type;
19513   switch (aarch64_cmodel)
19514     {
19515     case AARCH64_CMODEL_TINY:
19516     case AARCH64_CMODEL_TINY_PIC:
19517     case AARCH64_CMODEL_SMALL:
19518     case AARCH64_CMODEL_SMALL_PIC:
19519     case AARCH64_CMODEL_SMALL_SPIC:
19520       /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
19521	  for everything.  */
19522       type = DW_EH_PE_sdata4;
19523       break;
19524     default:
19525       /* No assumptions here.  8-byte relocs required.  */
19526       type = DW_EH_PE_sdata8;
19527       break;
19528     }
19529   return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
19530}
19531
19532/* Output .variant_pcs for aarch64_vector_pcs function symbols.  */
19533
19534static void
19535aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
19536{
19537  if (TREE_CODE (decl) == FUNCTION_DECL)
19538    {
19539      arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
19540      if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
19541	{
19542	  fprintf (stream, "\t.variant_pcs\t");
19543	  assemble_name (stream, name);
19544	  fprintf (stream, "\n");
19545	}
19546    }
19547}
19548
19549/* The last .arch and .tune assembly strings that we printed.  */
19550static std::string aarch64_last_printed_arch_string;
19551static std::string aarch64_last_printed_tune_string;
19552
19553/* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
19554   by the function fndecl.  */
19555
19556void
19557aarch64_declare_function_name (FILE *stream, const char* name,
19558				tree fndecl)
19559{
19560  tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19561
19562  struct cl_target_option *targ_options;
19563  if (target_parts)
19564    targ_options = TREE_TARGET_OPTION (target_parts);
19565  else
19566    targ_options = TREE_TARGET_OPTION (target_option_current_node);
19567  gcc_assert (targ_options);
19568
19569  const struct processor *this_arch
19570    = aarch64_get_arch (targ_options->x_explicit_arch);
19571
19572  uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
19573  std::string extension
19574    = aarch64_get_extension_string_for_isa_flags (isa_flags,
19575						  this_arch->flags);
19576  /* Only update the assembler .arch string if it is distinct from the last
19577     such string we printed.  */
19578  std::string to_print = this_arch->name + extension;
19579  if (to_print != aarch64_last_printed_arch_string)
19580    {
19581      asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
19582      aarch64_last_printed_arch_string = to_print;
19583    }
19584
  /* Print the cpu name we're tuning for in the comments; it might be
     useful to readers of the generated asm.  Do it only when it changes
     from function to function and verbose assembly is requested.  */
19588  const struct processor *this_tune
19589    = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
19590
19591  if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
19592    {
19593      asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
19594		   this_tune->name);
19595      aarch64_last_printed_tune_string = this_tune->name;
19596    }
19597
19598  aarch64_asm_output_variant_pcs (stream, fndecl, name);
19599
19600  /* Don't forget the type directive for ELF.  */
19601  ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
19602  ASM_OUTPUT_LABEL (stream, name);
19603
19604  cfun->machine->label_is_assembled = true;
19605}
19606
19607/* Implement PRINT_PATCHABLE_FUNCTION_ENTRY.  */
19608
19609void
19610aarch64_print_patchable_function_entry (FILE *file,
19611					unsigned HOST_WIDE_INT patch_area_size,
19612					bool record_p)
19613{
19614  if (!cfun->machine->label_is_assembled)
19615    {
19616      /* Emit the patching area before the entry label, if any.  */
19617      default_print_patchable_function_entry (file, patch_area_size,
19618					      record_p);
19619      return;
19620    }
19621
19622  rtx pa = gen_patchable_area (GEN_INT (patch_area_size),
19623			       GEN_INT (record_p));
19624  basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
19625
19626  if (!aarch64_bti_enabled ()
19627      || cgraph_node::get (cfun->decl)->only_called_directly_p ())
19628    {
19629      /* Emit the patchable_area at the beginning of the function.  */
19630      rtx_insn *insn = emit_insn_before (pa, BB_HEAD (bb));
19631      INSN_ADDRESSES_NEW (insn, -1);
19632      return;
19633    }
19634
19635  rtx_insn *insn = next_real_nondebug_insn (get_insns ());
19636  if (!insn
19637      || !INSN_P (insn)
19638      || GET_CODE (PATTERN (insn)) != UNSPEC_VOLATILE
19639      || XINT (PATTERN (insn), 1) != UNSPECV_BTI_C)
19640    {
19641      /* Emit a BTI_C.  */
19642      insn = emit_insn_before (gen_bti_c (), BB_HEAD (bb));
19643    }
19644
19645  /* Emit the patchable_area after BTI_C.  */
19646  insn = emit_insn_after (pa, insn);
19647  INSN_ADDRESSES_NEW (insn, -1);
19648}
19649
19650/* Output patchable area.  */
19651
19652void
19653aarch64_output_patchable_area (unsigned int patch_area_size, bool record_p)
19654{
19655  default_print_patchable_function_entry (asm_out_file, patch_area_size,
19656					  record_p);
19657}
19658
19659/* Implement ASM_OUTPUT_DEF_FROM_DECLS.  Output .variant_pcs for aliases.  */
19660
19661void
19662aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
19663{
19664  const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
19665  const char *value = IDENTIFIER_POINTER (target);
19666  aarch64_asm_output_variant_pcs (stream, decl, name);
19667  ASM_OUTPUT_DEF (stream, name, value);
19668}
19669
19670/* Implement ASM_OUTPUT_EXTERNAL.  Output .variant_pcs for undefined
19671   function symbol references.  */
19672
19673void
19674aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
19675{
19676  default_elf_asm_output_external (stream, decl, name);
19677  aarch64_asm_output_variant_pcs (stream, decl, name);
19678}
19679
19680/* Triggered after a .cfi_startproc directive is emitted into the assembly file.
19681   Used to output the .cfi_b_key_frame directive when signing the current
19682   function with the B key.  */
19683
19684void
19685aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
19686{
  if (cfun->machine->frame.laid_out
      && aarch64_return_address_signing_enabled ()
      && aarch64_ra_sign_key == AARCH64_KEY_B)
    asm_fprintf (f, "\t.cfi_b_key_frame\n");
19690}
19691
19692/* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
19693
19694static void
19695aarch64_start_file (void)
19696{
19697  struct cl_target_option *default_options
19698    = TREE_TARGET_OPTION (target_option_default_node);
19699
19700  const struct processor *default_arch
19701    = aarch64_get_arch (default_options->x_explicit_arch);
19702  uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
19703  std::string extension
19704    = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
19705						  default_arch->flags);
19706
19707   aarch64_last_printed_arch_string = default_arch->name + extension;
19708   aarch64_last_printed_tune_string = "";
19709   asm_fprintf (asm_out_file, "\t.arch %s\n",
19710		aarch64_last_printed_arch_string.c_str ());
19711
19712   default_file_start ();
19713}
19714
19715/* Emit load exclusive.  */
19716
19717static void
19718aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
19719			     rtx mem, rtx model_rtx)
19720{
19721  if (mode == TImode)
19722    emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
19723						gen_highpart (DImode, rval),
19724						mem, model_rtx));
19725  else
19726    emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
19727}
19728
19729/* Emit store exclusive.  */
19730
19731static void
19732aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
19733			      rtx mem, rtx rval, rtx model_rtx)
19734{
19735  if (mode == TImode)
19736    emit_insn (gen_aarch64_store_exclusive_pair
19737	       (bval, mem, operand_subword (rval, 0, 0, TImode),
19738		operand_subword (rval, 1, 0, TImode), model_rtx));
19739  else
19740    emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
19741}
19742
19743/* Mark the previous jump instruction as unlikely.  */
19744
19745static void
19746aarch64_emit_unlikely_jump (rtx insn)
19747{
19748  rtx_insn *jump = emit_jump_insn (insn);
19749  add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
19750}
19751
19752/* We store the names of the various atomic helpers in a 5x5 array.
19753   Return the libcall function given MODE, MODEL and NAMES.  */
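/* For example, SImode with MEMMODEL_ACQUIRE selects NAMES->str[2][1],
   which for the CAS table below is "__aarch64_cas4_acq".  */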
19754
19755rtx
19756aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
19757			const atomic_ool_names *names)
19758{
19759  memmodel model = memmodel_from_int (INTVAL (model_rtx));
19760  int mode_idx, model_idx;
19761
19762  switch (mode)
19763    {
19764    case E_QImode:
19765      mode_idx = 0;
19766      break;
19767    case E_HImode:
19768      mode_idx = 1;
19769      break;
19770    case E_SImode:
19771      mode_idx = 2;
19772      break;
19773    case E_DImode:
19774      mode_idx = 3;
19775      break;
19776    case E_TImode:
19777      mode_idx = 4;
19778      break;
19779    default:
19780      gcc_unreachable ();
19781    }
19782
19783  switch (model)
19784    {
19785    case MEMMODEL_RELAXED:
19786      model_idx = 0;
19787      break;
19788    case MEMMODEL_CONSUME:
19789    case MEMMODEL_ACQUIRE:
19790      model_idx = 1;
19791      break;
19792    case MEMMODEL_RELEASE:
19793      model_idx = 2;
19794      break;
19795    case MEMMODEL_ACQ_REL:
19796    case MEMMODEL_SEQ_CST:
19797      model_idx = 3;
19798      break;
19799    case MEMMODEL_SYNC_ACQUIRE:
19800    case MEMMODEL_SYNC_RELEASE:
19801    case MEMMODEL_SYNC_SEQ_CST:
19802      model_idx = 4;
19803      break;
19804    default:
19805      gcc_unreachable ();
19806    }
19807
19808  return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
19809				      VISIBILITY_HIDDEN);
19810}
19811
19812#define DEF0(B, N) \
19813  { "__aarch64_" #B #N "_relax", \
19814    "__aarch64_" #B #N "_acq", \
19815    "__aarch64_" #B #N "_rel", \
19816    "__aarch64_" #B #N "_acq_rel", \
19817    "__aarch64_" #B #N "_sync" }
19818
19819#define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
		 { NULL, NULL, NULL, NULL, NULL }
19821#define DEF5(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
19822
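/* For example, DEF0 (cas, 4) expands to the row
   { "__aarch64_cas4_relax", "__aarch64_cas4_acq", "__aarch64_cas4_rel",
     "__aarch64_cas4_acq_rel", "__aarch64_cas4_sync" }.  DEF4 covers the
   1, 2, 4 and 8 byte sizes and leaves the 16-byte row empty; DEF5 also
   provides the 16-byte helpers, which only the CAS table uses.  */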
19823static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
19824const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
19825const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
19826const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
19827const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
19828const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
19829
19830#undef DEF0
19831#undef DEF4
19832#undef DEF5
19833
19834/* Expand a compare and swap pattern.  */
19835
19836void
19837aarch64_expand_compare_and_swap (rtx operands[])
19838{
19839  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
19840  machine_mode mode, r_mode;
19841
19842  bval = operands[0];
19843  rval = operands[1];
19844  mem = operands[2];
19845  oldval = operands[3];
19846  newval = operands[4];
19847  is_weak = operands[5];
19848  mod_s = operands[6];
19849  mod_f = operands[7];
19850  mode = GET_MODE (mem);
19851
19852  /* Normally the succ memory model must be stronger than fail, but in the
19853     unlikely event of fail being ACQUIRE and succ being RELEASE we need to
19854     promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
19855  if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
19856      && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
19857    mod_s = GEN_INT (MEMMODEL_ACQ_REL);
19858
19859  r_mode = mode;
19860  if (mode == QImode || mode == HImode)
19861    {
19862      r_mode = SImode;
19863      rval = gen_reg_rtx (r_mode);
19864    }
19865
19866  if (TARGET_LSE)
19867    {
19868      /* The CAS insn requires oldval and rval overlap, but we need to
19869	 have a copy of oldval saved across the operation to tell if
19870	 the operation is successful.  */
19871      if (reg_overlap_mentioned_p (rval, oldval))
19872        rval = copy_to_mode_reg (r_mode, oldval);
19873      else
19874	emit_move_insn (rval, gen_lowpart (r_mode, oldval));
19875
19876      emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
19877						   newval, mod_s));
19878      cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19879    }
19880  else if (TARGET_OUTLINE_ATOMICS)
19881    {
19882      /* Oldval must satisfy compare afterward.  */
19883      if (!aarch64_plus_operand (oldval, mode))
19884	oldval = force_reg (mode, oldval);
19885      rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
19886      rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
19887				      oldval, mode, newval, mode,
19888				      XEXP (mem, 0), Pmode);
19889      cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19890    }
19891  else
19892    {
19893      /* The oldval predicate varies by mode.  Test it and force to reg.  */
19894      insn_code code = code_for_aarch64_compare_and_swap (mode);
19895      if (!insn_data[code].operand[2].predicate (oldval, mode))
19896	oldval = force_reg (mode, oldval);
19897
19898      emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
19899				 is_weak, mod_s, mod_f));
19900      cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
19901    }
19902
19903  if (r_mode != mode)
19904    rval = gen_lowpart (mode, rval);
19905  emit_move_insn (operands[1], rval);
19906
19907  x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
19908  emit_insn (gen_rtx_SET (bval, x));
19909}
19910
19911/* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
19912   sequence implementing an atomic operation.  */
19913
19914static void
19915aarch64_emit_post_barrier (enum memmodel model)
19916{
19917  const enum memmodel base_model = memmodel_base (model);
19918
19919  if (is_mm_sync (model)
19920      && (base_model == MEMMODEL_ACQUIRE
19921	  || base_model == MEMMODEL_ACQ_REL
19922	  || base_model == MEMMODEL_SEQ_CST))
19923    {
19924      emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
19925    }
19926}
19927
19928/* Split a compare and swap pattern.  */
19929
19930void
19931aarch64_split_compare_and_swap (rtx operands[])
19932{
19933  /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
19934  gcc_assert (epilogue_completed);
19935
19936  rtx rval, mem, oldval, newval, scratch, x, model_rtx;
19937  machine_mode mode;
19938  bool is_weak;
19939  rtx_code_label *label1, *label2;
19940  enum memmodel model;
19941
19942  rval = operands[0];
19943  mem = operands[1];
19944  oldval = operands[2];
19945  newval = operands[3];
19946  is_weak = (operands[4] != const0_rtx);
19947  model_rtx = operands[5];
19948  scratch = operands[7];
19949  mode = GET_MODE (mem);
19950  model = memmodel_from_int (INTVAL (model_rtx));
19951
19952  /* When OLDVAL is zero and we want the strong version we can emit a tighter
19953    loop:
19954    .label1:
19955	LD[A]XR	rval, [mem]
19956	CBNZ	rval, .label2
19957	ST[L]XR	scratch, newval, [mem]
19958	CBNZ	scratch, .label1
19959    .label2:
19960	CMP	rval, 0.  */
  bool strong_zero_p = (!is_weak && !aarch64_track_speculation
			&& oldval == const0_rtx && mode != TImode);
19963
19964  label1 = NULL;
19965  if (!is_weak)
19966    {
19967      label1 = gen_label_rtx ();
19968      emit_label (label1);
19969    }
19970  label2 = gen_label_rtx ();
19971
19972  /* The initial load can be relaxed for a __sync operation since a final
19973     barrier will be emitted to stop code hoisting.  */
19974  if (is_mm_sync (model))
19975    aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
19976  else
19977    aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
19978
19979  if (strong_zero_p)
19980    x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
19981  else
19982    {
19983      rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19984      x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
19985    }
19986  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19987			    gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
19988  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19989
19990  aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
19991
19992  if (!is_weak)
19993    {
19994      if (aarch64_track_speculation)
19995	{
19996	  /* Emit an explicit compare instruction, so that we can correctly
19997	     track the condition codes.  */
19998	  rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
19999	  x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
20000	}
20001      else
20002	x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
20003
20004      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
20005				gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
20006      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
20007    }
20008  else
20009    aarch64_gen_compare_reg (NE, scratch, const0_rtx);
20010
20011  emit_label (label2);
20012
20013  /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
20014     to set the condition flags.  If this is not used it will be removed by
20015     later passes.  */
20016  if (strong_zero_p)
20017    aarch64_gen_compare_reg (NE, rval, const0_rtx);
20018
20019  /* Emit any final barrier needed for a __sync operation.  */
20020  if (is_mm_sync (model))
20021    aarch64_emit_post_barrier (model);
20022}
20023
20024/* Split an atomic operation.  */
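/* For example, an atomic add is split into a loop of the form:

     .retry:
	LD[A]XR	old, [mem]
	ADD	new, old, value
	ST[L]XR	cond, new, [mem]
	CBNZ	cond, .retry

   followed, for __sync operations, by a final barrier.  */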
20025
20026void
20027aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
20028			 rtx value, rtx model_rtx, rtx cond)
20029{
20030  /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
20031  gcc_assert (epilogue_completed);
20032
20033  machine_mode mode = GET_MODE (mem);
20034  machine_mode wmode = (mode == DImode ? DImode : SImode);
20035  const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
20036  const bool is_sync = is_mm_sync (model);
20037  rtx_code_label *label;
20038  rtx x;
20039
20040  /* Split the atomic operation into a sequence.  */
20041  label = gen_label_rtx ();
20042  emit_label (label);
20043
20044  if (new_out)
20045    new_out = gen_lowpart (wmode, new_out);
20046  if (old_out)
20047    old_out = gen_lowpart (wmode, old_out);
20048  else
20049    old_out = new_out;
20050  value = simplify_gen_subreg (wmode, value, mode, 0);
20051
20052  /* The initial load can be relaxed for a __sync operation since a final
20053     barrier will be emitted to stop code hoisting.  */
  if (is_sync)
20055    aarch64_emit_load_exclusive (mode, old_out, mem,
20056				 GEN_INT (MEMMODEL_RELAXED));
20057  else
20058    aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
20059
20060  switch (code)
20061    {
20062    case SET:
20063      new_out = value;
20064      break;
20065
20066    case NOT:
20067      x = gen_rtx_AND (wmode, old_out, value);
20068      emit_insn (gen_rtx_SET (new_out, x));
20069      x = gen_rtx_NOT (wmode, new_out);
20070      emit_insn (gen_rtx_SET (new_out, x));
20071      break;
20072
20073    case MINUS:
20074      if (CONST_INT_P (value))
20075	{
20076	  value = GEN_INT (-INTVAL (value));
20077	  code = PLUS;
20078	}
20079      /* Fall through.  */
20080
20081    default:
20082      x = gen_rtx_fmt_ee (code, wmode, old_out, value);
20083      emit_insn (gen_rtx_SET (new_out, x));
20084      break;
20085    }
20086
20087  aarch64_emit_store_exclusive (mode, cond, mem,
20088				gen_lowpart (mode, new_out), model_rtx);
20089
20090  if (aarch64_track_speculation)
20091    {
20092      /* Emit an explicit compare instruction, so that we can correctly
20093	 track the condition codes.  */
20094      rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
20095      x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
20096    }
20097  else
20098    x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
20099
20100  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
20101			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
20102  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
20103
20104  /* Emit any final barrier needed for a __sync operation.  */
20105  if (is_sync)
20106    aarch64_emit_post_barrier (model);
20107}
20108
20109static void
20110aarch64_init_libfuncs (void)
20111{
20112   /* Half-precision float operations.  The compiler handles all operations
20113     with NULL libfuncs by converting to SFmode.  */
20114
20115  /* Conversions.  */
20116  set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
20117  set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
20118
20119  /* Arithmetic.  */
20120  set_optab_libfunc (add_optab, HFmode, NULL);
20121  set_optab_libfunc (sdiv_optab, HFmode, NULL);
20122  set_optab_libfunc (smul_optab, HFmode, NULL);
20123  set_optab_libfunc (neg_optab, HFmode, NULL);
20124  set_optab_libfunc (sub_optab, HFmode, NULL);
20125
20126  /* Comparisons.  */
20127  set_optab_libfunc (eq_optab, HFmode, NULL);
20128  set_optab_libfunc (ne_optab, HFmode, NULL);
20129  set_optab_libfunc (lt_optab, HFmode, NULL);
20130  set_optab_libfunc (le_optab, HFmode, NULL);
20131  set_optab_libfunc (ge_optab, HFmode, NULL);
20132  set_optab_libfunc (gt_optab, HFmode, NULL);
20133  set_optab_libfunc (unord_optab, HFmode, NULL);
20134}
20135
20136/* Target hook for c_mode_for_suffix.  */
20137static machine_mode
20138aarch64_c_mode_for_suffix (char suffix)
20139{
20140  if (suffix == 'q')
20141    return TFmode;
20142
20143  return VOIDmode;
20144}
20145
20146/* We can only represent floating point constants which will fit in
20147   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
   by:
20150
20151   (-1)^s * (n/16) * 2^r
20152
20153   Where:
20154     's' is the sign bit.
20155     'n' is an integer in the range 16 <= n <= 31.
20156     'r' is an integer in the range -3 <= r <= 4.  */
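/* For example, 1.0 is (16/16) * 2^0, the smallest representable magnitude
   is (16/16) * 2^-3 = 0.125 and the largest is (31/16) * 2^4 = 31.0.  */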
20157
/* Return true iff X can be represented by a quarter-precision
   floating point immediate operand.  Note, we cannot represent 0.0.  */
20160bool
20161aarch64_float_const_representable_p (rtx x)
20162{
20163  /* This represents our current view of how many bits
20164     make up the mantissa.  */
20165  int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
20166  int exponent;
20167  unsigned HOST_WIDE_INT mantissa, mask;
20168  REAL_VALUE_TYPE r, m;
20169  bool fail;
20170
20171  x = unwrap_const_vec_duplicate (x);
20172  if (!CONST_DOUBLE_P (x))
20173    return false;
20174
20175  if (GET_MODE (x) == VOIDmode
20176      || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
20177    return false;
20178
20179  r = *CONST_DOUBLE_REAL_VALUE (x);
20180
20181  /* We cannot represent infinities, NaNs or +/-zero.  We won't
20182     know if we have +zero until we analyse the mantissa, but we
20183     can reject the other invalid values.  */
20184  if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
20185      || REAL_VALUE_MINUS_ZERO (r))
20186    return false;
20187
20188  /* Extract exponent.  */
20189  r = real_value_abs (&r);
20190  exponent = REAL_EXP (&r);
20191
20192  /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
20193     highest (sign) bit, with a fixed binary point at bit point_pos.
20194     m1 holds the low part of the mantissa, m2 the high part.
20195     WARNING: If we ever have a representation using more than 2 * H_W_I - 1
20196     bits for the mantissa, this can fail (low bits will be lost).  */
20197  real_ldexp (&m, &r, point_pos - exponent);
20198  wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
20199
20200  /* If the low part of the mantissa has bits set we cannot represent
20201     the value.  */
20202  if (w.ulow () != 0)
20203    return false;
20204  /* We have rejected the lower HOST_WIDE_INT, so update our
20205     understanding of how many bits lie in the mantissa and
20206     look only at the high HOST_WIDE_INT.  */
20207  mantissa = w.elt (1);
20208  point_pos -= HOST_BITS_PER_WIDE_INT;
20209
20210  /* We can only represent values with a mantissa of the form 1.xxxx.  */
20211  mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
20212  if ((mantissa & mask) != 0)
20213    return false;
20214
20215  /* Having filtered unrepresentable values, we may now remove all
20216     but the highest 5 bits.  */
20217  mantissa >>= point_pos - 5;
20218
20219  /* We cannot represent the value 0.0, so reject it.  This is handled
20220     elsewhere.  */
20221  if (mantissa == 0)
20222    return false;
20223
20224  /* Then, as bit 4 is always set, we can mask it off, leaving
20225     the mantissa in the range [0, 15].  */
20226  mantissa &= ~(1 << 4);
20227  gcc_assert (mantissa <= 15);
20228
  /* GCC internally does not use IEEE754-like encoding (where normalized
     significands are in the range [1, 2)); GCC uses [0.5, 1) (see real.c).
20231     Our mantissa values are shifted 4 places to the left relative to
20232     normalized IEEE754 so we must modify the exponent returned by REAL_EXP
20233     by 5 places to correct for GCC's representation.  */
20234  exponent = 5 - exponent;
20235
20236  return (exponent >= 0 && exponent <= 7);
20237}
20238
20239/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
20240   immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
20241   output MOVI/MVNI, ORR or BIC immediate.  */
20242char*
20243aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
20244				   enum simd_immediate_check which)
20245{
20246  bool is_valid;
20247  static char templ[40];
20248  const char *mnemonic;
20249  const char *shift_op;
20250  unsigned int lane_count = 0;
20251  char element_char;
20252
20253  struct simd_immediate_info info;
20254
20255  /* This will return true to show const_vector is legal for use as either
     an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
20257     It will also update INFO to show how the immediate should be generated.
20258     WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
20259  is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
20260  gcc_assert (is_valid);
20261
20262  element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
20263  lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
20264
20265  if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
20266    {
20267      gcc_assert (info.insn == simd_immediate_info::MOV
20268		  && info.u.mov.shift == 0);
20269      /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
20270	 move immediate path.  */
20271      if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
20272        info.u.mov.value = GEN_INT (0);
20273      else
20274	{
20275	  const unsigned int buf_size = 20;
20276	  char float_buf[buf_size] = {'\0'};
20277	  real_to_decimal_for_mode (float_buf,
20278				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
20279				    buf_size, buf_size, 1, info.elt_mode);
20280
20281	  if (lane_count == 1)
20282	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
20283	  else
20284	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
20285		      lane_count, element_char, float_buf);
20286	  return templ;
20287	}
20288    }
20289
20290  gcc_assert (CONST_INT_P (info.u.mov.value));
20291
20292  if (which == AARCH64_CHECK_MOV)
20293    {
20294      mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
20295      shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
20296		  ? "msl" : "lsl");
20297      if (lane_count == 1)
20298	snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
20299		  mnemonic, UINTVAL (info.u.mov.value));
20300      else if (info.u.mov.shift)
20301	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
20302		  HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
20303		  element_char, UINTVAL (info.u.mov.value), shift_op,
20304		  info.u.mov.shift);
20305      else
20306	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
20307		  HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
20308		  element_char, UINTVAL (info.u.mov.value));
20309    }
20310  else
20311    {
20312      /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
20313      mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
20314      if (info.u.mov.shift)
20315	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
20316		  HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
20317		  element_char, UINTVAL (info.u.mov.value), "lsl",
20318		  info.u.mov.shift);
20319      else
20320	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
20321		  HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
20322		  element_char, UINTVAL (info.u.mov.value));
20323    }
20324  return templ;
20325}
20326
20327char*
20328aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
20329{
20330
  /* If a floating point number was passed and we desire to use it in an
     integer mode, do the conversion to integer.  */
20333  if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
20334    {
20335      unsigned HOST_WIDE_INT ival;
20336      if (!aarch64_reinterpret_float_as_int (immediate, &ival))
20337	  gcc_unreachable ();
20338      immediate = gen_int_mode (ival, mode);
20339    }
20340
20341  machine_mode vmode;
  /* Use a 64-bit mode for everything except DI/DF mode, where we use
     a 128-bit vector mode.  */
20344  int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
20345
20346  vmode = aarch64_simd_container_mode (mode, width);
20347  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
20348  return aarch64_output_simd_mov_immediate (v_op, width);
20349}
20350
20351/* Return the output string to use for moving immediate CONST_VECTOR
20352   into an SVE register.  */
20353
20354char *
20355aarch64_output_sve_mov_immediate (rtx const_vector)
20356{
20357  static char templ[40];
20358  struct simd_immediate_info info;
20359  char element_char;
20360
20361  bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
20362  gcc_assert (is_valid);
20363
20364  element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
20365
20366  machine_mode vec_mode = GET_MODE (const_vector);
20367  if (aarch64_sve_pred_mode_p (vec_mode))
20368    {
20369      static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
20370      if (info.insn == simd_immediate_info::MOV)
20371	{
20372	  gcc_assert (info.u.mov.value == const0_rtx);
20373	  snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
20374	}
20375      else
20376	{
20377	  gcc_assert (info.insn == simd_immediate_info::PTRUE);
20378	  unsigned int total_bytes;
20379	  if (info.u.pattern == AARCH64_SV_ALL
20380	      && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
20381	    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
20382		      total_bytes / GET_MODE_SIZE (info.elt_mode));
20383	  else
20384	    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
20385		      svpattern_token (info.u.pattern));
20386	}
20387      return buf;
20388    }
20389
20390  if (info.insn == simd_immediate_info::INDEX)
20391    {
20392      snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
20393		HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
20394		element_char, INTVAL (info.u.index.base),
20395		INTVAL (info.u.index.step));
20396      return templ;
20397    }
20398
20399  if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
20400    {
20401      if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
20402	info.u.mov.value = GEN_INT (0);
20403      else
20404	{
20405	  const int buf_size = 20;
20406	  char float_buf[buf_size] = {};
20407	  real_to_decimal_for_mode (float_buf,
20408				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
20409				    buf_size, buf_size, 1, info.elt_mode);
20410
20411	  snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
20412		    element_char, float_buf);
20413	  return templ;
20414	}
20415    }
20416
20417  snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
20418	    element_char, INTVAL (info.u.mov.value));
20419  return templ;
20420}
20421
20422/* Return the asm template for a PTRUES.  CONST_UNSPEC is the
20423   aarch64_sve_ptrue_svpattern_immediate that describes the predicate
20424   pattern.  */
20425
20426char *
20427aarch64_output_sve_ptrues (rtx const_unspec)
20428{
20429  static char templ[40];
20430
20431  struct simd_immediate_info info;
20432  bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
20433  gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
20434
20435  char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
20436  snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
20437	    svpattern_token (info.u.pattern));
20438  return templ;
20439}
20440
20441/* Split operands into moves from op[1] + op[2] into op[0].  */
20442
20443void
20444aarch64_split_combinev16qi (rtx operands[3])
20445{
20446  unsigned int dest = REGNO (operands[0]);
20447  unsigned int src1 = REGNO (operands[1]);
20448  unsigned int src2 = REGNO (operands[2]);
20449  machine_mode halfmode = GET_MODE (operands[1]);
20450  unsigned int halfregs = REG_NREGS (operands[1]);
20451  rtx destlo, desthi;
20452
20453  gcc_assert (halfmode == V16QImode);
20454
20455  if (src1 == dest && src2 == dest + halfregs)
20456    {
20457      /* No-op move.  Can't split to nothing; emit something.  */
20458      emit_note (NOTE_INSN_DELETED);
20459      return;
20460    }
20461
20462  /* Preserve register attributes for variable tracking.  */
20463  destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
20464  desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
20465			       GET_MODE_SIZE (halfmode));
20466
20467  /* Special case of reversed high/low parts.  */
20468  if (reg_overlap_mentioned_p (operands[2], destlo)
20469      && reg_overlap_mentioned_p (operands[1], desthi))
20470    {
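      /* Swap the two halves in place using the three-XOR trick.  */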
20471      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
20472      emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
20473      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
20474    }
20475  else if (!reg_overlap_mentioned_p (operands[2], destlo))
20476    {
20477      /* Try to avoid unnecessary moves if part of the result
20478	 is in the right place already.  */
20479      if (src1 != dest)
20480	emit_move_insn (destlo, operands[1]);
20481      if (src2 != dest + halfregs)
20482	emit_move_insn (desthi, operands[2]);
20483    }
20484  else
20485    {
20486      if (src2 != dest + halfregs)
20487	emit_move_insn (desthi, operands[2]);
20488      if (src1 != dest)
20489	emit_move_insn (destlo, operands[1]);
20490    }
20491}
20492
20493/* vec_perm support.  */
20494
20495struct expand_vec_perm_d
20496{
20497  rtx target, op0, op1;
20498  vec_perm_indices perm;
20499  machine_mode vmode;
20500  unsigned int vec_flags;
20501  bool one_vector_p;
20502  bool testing_p;
20503};
20504
20505/* Generate a variable permutation.  */
20506
20507static void
20508aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
20509{
20510  machine_mode vmode = GET_MODE (target);
20511  bool one_vector_p = rtx_equal_p (op0, op1);
20512
20513  gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
20514  gcc_checking_assert (GET_MODE (op0) == vmode);
20515  gcc_checking_assert (GET_MODE (op1) == vmode);
20516  gcc_checking_assert (GET_MODE (sel) == vmode);
20517  gcc_checking_assert (TARGET_SIMD);
20518
20519  if (one_vector_p)
20520    {
20521      if (vmode == V8QImode)
20522	{
20523	  /* Expand the argument to a V16QI mode by duplicating it.  */
20524	  rtx pair = gen_reg_rtx (V16QImode);
20525	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
20526	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
20527	}
20528      else
20529	{
20530	  emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
20531	}
20532    }
20533  else
20534    {
20535      rtx pair;
20536
20537      if (vmode == V8QImode)
20538	{
20539	  pair = gen_reg_rtx (V16QImode);
20540	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
20541	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
20542	}
20543      else
20544	{
20545	  pair = gen_reg_rtx (OImode);
20546	  emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
20547	  emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
20548	}
20549    }
20550}
20551
20552/* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
20553   NELT is the number of elements in the vector.  */
20554
20555void
20556aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
20557			 unsigned int nelt)
20558{
20559  machine_mode vmode = GET_MODE (target);
20560  bool one_vector_p = rtx_equal_p (op0, op1);
20561  rtx mask;
20562
20563  /* The TBL instruction does not use a modulo index, so we must take care
20564     of that ourselves.  */
20565  mask = aarch64_simd_gen_const_vector_dup (vmode,
20566      one_vector_p ? nelt - 1 : 2 * nelt - 1);
20567  sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
20568
20569  /* For big-endian, we also need to reverse the index within the vector
20570     (but not which vector).  */
20571  if (BYTES_BIG_ENDIAN)
20572    {
20573      /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
20574      if (!one_vector_p)
20575        mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
20576      sel = expand_simple_binop (vmode, XOR, sel, mask,
20577				 NULL, 0, OPTAB_LIB_WIDEN);
20578    }
20579  aarch64_expand_vec_perm_1 (target, op0, op1, sel);
20580}
20581
20582/* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */
20583
20584static void
20585emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
20586{
20587  emit_insn (gen_rtx_SET (target,
20588			  gen_rtx_UNSPEC (GET_MODE (target),
20589					  gen_rtvec (2, op0, op1), code)));
20590}
20591
20592/* Expand an SVE vec_perm with the given operands.  */
20593
20594void
20595aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
20596{
20597  machine_mode data_mode = GET_MODE (target);
20598  machine_mode sel_mode = GET_MODE (sel);
20599  /* Enforced by the pattern condition.  */
20600  int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
20601
20602  /* Note: vec_perm indices are supposed to wrap when they go beyond the
20603     size of the two value vectors, i.e. the upper bits of the indices
20604     are effectively ignored.  SVE TBL instead produces 0 for any
20605     out-of-range indices, so we need to modulo all the vec_perm indices
20606     to ensure they are all in range.  */
20607  rtx sel_reg = force_reg (sel_mode, sel);
20608
20609  /* Check if the sel only references the first values vector.  */
20610  if (GET_CODE (sel) == CONST_VECTOR
20611      && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
20612    {
20613      emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
20614      return;
20615    }
20616
20617  /* Check if the two values vectors are the same.  */
20618  if (rtx_equal_p (op0, op1))
20619    {
20620      rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
20621      rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
20622					 NULL, 0, OPTAB_DIRECT);
20623      emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
20624      return;
20625    }
20626
  /* Run TBL on each value vector and combine the results.  */
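  /* In the first TBL, selector values in [NUNITS, 2 * NUNITS - 1] are out
     of range and therefore give zero; subtracting NUNITS brings them into
     range for the second TBL while pushing the values that selected OP0
     out of range instead.  ORing (or IORF-ing for float modes) the two
     results then merges the lookups.  */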
20628
20629  rtx res0 = gen_reg_rtx (data_mode);
20630  rtx res1 = gen_reg_rtx (data_mode);
20631  rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
20632  if (GET_CODE (sel) != CONST_VECTOR
20633      || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
20634    {
20635      rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
20636						       2 * nunits - 1);
20637      sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
20638				     NULL, 0, OPTAB_DIRECT);
20639    }
20640  emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
20641  rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
20642				     NULL, 0, OPTAB_DIRECT);
20643  emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
20644  if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
20645    emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
20646  else
20647    emit_unspec2 (target, UNSPEC_IORF, res0, res1);
20648}
20649
20650/* Recognize patterns suitable for the TRN instructions.  */
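/* For example, on little-endian a V4SI permutation of { 0, 4, 2, 6 }
   maps to TRN1 and { 1, 5, 3, 7 } to TRN2.  */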
20651static bool
20652aarch64_evpc_trn (struct expand_vec_perm_d *d)
20653{
20654  HOST_WIDE_INT odd;
20655  poly_uint64 nelt = d->perm.length ();
20656  rtx out, in0, in1, x;
20657  machine_mode vmode = d->vmode;
20658
20659  if (GET_MODE_UNIT_SIZE (vmode) > 8)
20660    return false;
20661
20662  /* Note that these are little-endian tests.
20663     We correct for big-endian later.  */
20664  if (!d->perm[0].is_constant (&odd)
20665      || (odd != 0 && odd != 1)
20666      || !d->perm.series_p (0, 2, odd, 2)
20667      || !d->perm.series_p (1, 2, nelt + odd, 2))
20668    return false;
20669
20670  /* Success!  */
20671  if (d->testing_p)
20672    return true;
20673
20674  in0 = d->op0;
20675  in1 = d->op1;
20676  /* We don't need a big-endian lane correction for SVE; see the comment
20677     at the head of aarch64-sve.md for details.  */
20678  if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20679    {
20680      x = in0, in0 = in1, in1 = x;
20681      odd = !odd;
20682    }
20683  out = d->target;
20684
20685  emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20686				      odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
20687  return true;
20688}
20689
20690/* Recognize patterns suitable for the UZP instructions.  */
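/* For example, on little-endian a V4SI permutation of { 0, 2, 4, 6 }
   maps to UZP1 and { 1, 3, 5, 7 } to UZP2.  */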
20691static bool
20692aarch64_evpc_uzp (struct expand_vec_perm_d *d)
20693{
20694  HOST_WIDE_INT odd;
20695  rtx out, in0, in1, x;
20696  machine_mode vmode = d->vmode;
20697
20698  if (GET_MODE_UNIT_SIZE (vmode) > 8)
20699    return false;
20700
20701  /* Note that these are little-endian tests.
20702     We correct for big-endian later.  */
20703  if (!d->perm[0].is_constant (&odd)
20704      || (odd != 0 && odd != 1)
20705      || !d->perm.series_p (0, 1, odd, 2))
20706    return false;
20707
20708  /* Success!  */
20709  if (d->testing_p)
20710    return true;
20711
20712  in0 = d->op0;
20713  in1 = d->op1;
20714  /* We don't need a big-endian lane correction for SVE; see the comment
20715     at the head of aarch64-sve.md for details.  */
20716  if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20717    {
20718      x = in0, in0 = in1, in1 = x;
20719      odd = !odd;
20720    }
20721  out = d->target;
20722
20723  emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20724				      odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
20725  return true;
20726}
20727
20728/* Recognize patterns suitable for the ZIP instructions.  */
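/* For example, on little-endian a V4SI permutation of { 0, 4, 1, 5 }
   maps to ZIP1 and { 2, 6, 3, 7 } to ZIP2.  */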
20729static bool
20730aarch64_evpc_zip (struct expand_vec_perm_d *d)
20731{
20732  unsigned int high;
20733  poly_uint64 nelt = d->perm.length ();
20734  rtx out, in0, in1, x;
20735  machine_mode vmode = d->vmode;
20736
20737  if (GET_MODE_UNIT_SIZE (vmode) > 8)
20738    return false;
20739
20740  /* Note that these are little-endian tests.
20741     We correct for big-endian later.  */
20742  poly_uint64 first = d->perm[0];
20743  if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
20744      || !d->perm.series_p (0, 2, first, 1)
20745      || !d->perm.series_p (1, 2, first + nelt, 1))
20746    return false;
20747  high = maybe_ne (first, 0U);
20748
20749  /* Success!  */
20750  if (d->testing_p)
20751    return true;
20752
20753  in0 = d->op0;
20754  in1 = d->op1;
20755  /* We don't need a big-endian lane correction for SVE; see the comment
20756     at the head of aarch64-sve.md for details.  */
20757  if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20758    {
20759      x = in0, in0 = in1, in1 = x;
20760      high = !high;
20761    }
20762  out = d->target;
20763
20764  emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20765				      high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
20766  return true;
20767}
20768
20769/* Recognize patterns for the EXT insn.  */
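/* For example, a V4SI permutation of { 1, 2, 3, 4 } takes three elements
   from the first vector and one from the second, which EXT can do with
   an offset of one element (on little-endian).  */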
20770
20771static bool
20772aarch64_evpc_ext (struct expand_vec_perm_d *d)
20773{
20774  HOST_WIDE_INT location;
20775  rtx offset;
20776
20777  /* The first element always refers to the first vector.
20778     Check if the extracted indices are increasing by one.  */
20779  if (d->vec_flags == VEC_SVE_PRED
20780      || !d->perm[0].is_constant (&location)
20781      || !d->perm.series_p (0, 1, location, 1))
20782    return false;
20783
20784  /* Success! */
20785  if (d->testing_p)
20786    return true;
20787
20788  /* The case where (location == 0) is a no-op for both big- and little-endian,
20789     and is removed by the mid-end at optimization levels -O1 and higher.
20790
20791     We don't need a big-endian lane correction for SVE; see the comment
20792     at the head of aarch64-sve.md for details.  */
20793  if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
20794    {
20795      /* After setup, we want the high elements of the first vector (stored
20796         at the LSB end of the register), and the low elements of the second
20797         vector (stored at the MSB end of the register). So swap.  */
20798      std::swap (d->op0, d->op1);
20799      /* location != 0 (above), so safe to assume (nelt - location) < nelt.
20800	 to_constant () is safe since this is restricted to Advanced SIMD
20801	 vectors.  */
20802      location = d->perm.length ().to_constant () - location;
20803    }
20804
20805  offset = GEN_INT (location);
20806  emit_set_insn (d->target,
20807		 gen_rtx_UNSPEC (d->vmode,
20808				 gen_rtvec (3, d->op0, d->op1, offset),
20809				 UNSPEC_EXT));
20810  return true;
20811}
20812
20813/* Recognize patterns for the REV{64,32,16} insns, which reverse elements
20814   within each 64-bit, 32-bit or 16-bit granule.  */
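/* For example, a V8HI permutation of { 3, 2, 1, 0, 7, 6, 5, 4 } reverses
   the four 16-bit elements within each 64-bit granule and so maps to
   REV64.  */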
20815
20816static bool
20817aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
20818{
20819  HOST_WIDE_INT diff;
20820  unsigned int i, size, unspec;
20821  machine_mode pred_mode;
20822
20823  if (d->vec_flags == VEC_SVE_PRED
20824      || !d->one_vector_p
20825      || !d->perm[0].is_constant (&diff)
20826      || !diff)
20827    return false;
20828
20829  size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
20830  if (size == 8)
20831    {
20832      unspec = UNSPEC_REV64;
20833      pred_mode = VNx2BImode;
20834    }
20835  else if (size == 4)
20836    {
20837      unspec = UNSPEC_REV32;
20838      pred_mode = VNx4BImode;
20839    }
20840  else if (size == 2)
20841    {
20842      unspec = UNSPEC_REV16;
20843      pred_mode = VNx8BImode;
20844    }
20845  else
20846    return false;
20847
20848  unsigned int step = diff + 1;
20849  for (i = 0; i < step; ++i)
20850    if (!d->perm.series_p (i, step, diff - i, step))
20851      return false;
20852
20853  /* Success! */
20854  if (d->testing_p)
20855    return true;
20856
20857  if (d->vec_flags == VEC_SVE_DATA)
20858    {
20859      machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
20860      rtx target = gen_reg_rtx (int_mode);
20861      if (BYTES_BIG_ENDIAN)
20862	/* The act of taking a subreg between INT_MODE and d->vmode
20863	   is itself a reversing operation on big-endian targets;
20864	   see the comment at the head of aarch64-sve.md for details.
20865	   First reinterpret OP0 as INT_MODE without using a subreg
20866	   and without changing the contents.  */
20867	emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
20868      else
20869	{
	  /* For SVE we use REV[BHW] unspecs derived from the element size
	     of d->vmode and vector modes whose elements have SIZE bytes.
	     This ensures that the vector modes match the predicate modes.  */
20873	  int unspec = aarch64_sve_rev_unspec (d->vmode);
20874	  rtx pred = aarch64_ptrue_reg (pred_mode);
20875	  emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
20876				       gen_lowpart (int_mode, d->op0)));
20877	}
20878      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
20879      return true;
20880    }
20881  rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
20882  emit_set_insn (d->target, src);
20883  return true;
20884}
20885
20886/* Recognize patterns for the REV insn, which reverses elements within
20887   a full vector.  */
20888
20889static bool
20890aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
20891{
20892  poly_uint64 nelt = d->perm.length ();
20893
20894  if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
20895    return false;
20896
20897  if (!d->perm.series_p (0, 1, nelt - 1, -1))
20898    return false;
20899
20900  /* Success! */
20901  if (d->testing_p)
20902    return true;
20903
20904  rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
20905  emit_set_insn (d->target, src);
20906  return true;
20907}
20908
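/* Recognize permutations that broadcast a single element of the first
   input across the whole vector, which can be emitted as a
   vec_duplicate of a vec_select (a DUP lane operation).  */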
20909static bool
20910aarch64_evpc_dup (struct expand_vec_perm_d *d)
20911{
20912  rtx out = d->target;
20913  rtx in0;
20914  HOST_WIDE_INT elt;
20915  machine_mode vmode = d->vmode;
20916  rtx lane;
20917
20918  if (d->vec_flags == VEC_SVE_PRED
20919      || d->perm.encoding ().encoded_nelts () != 1
20920      || !d->perm[0].is_constant (&elt))
20921    return false;
20922
20923  if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
20924    return false;
20925
20926  /* Success! */
20927  if (d->testing_p)
20928    return true;
20929
20930  /* The generic preparation in aarch64_expand_vec_perm_const_1
20931     swaps the operand order and the permute indices if it finds
20932     d->perm[0] to be in the second operand.  Thus, we can always
20933     use d->op0 and need not do any extra arithmetic to get the
20934     correct lane number.  */
20935  in0 = d->op0;
20936  lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
20937
20938  rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
20939  rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
20940  emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
20941  return true;
20942}
20943
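/* Handle an arbitrary constant permutation of a V8QI or V16QI vector
   by loading the selector into a register and using TBL.  */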
20944static bool
20945aarch64_evpc_tbl (struct expand_vec_perm_d *d)
20946{
20947  rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
20948  machine_mode vmode = d->vmode;
20949
20950  /* Make sure that the indices are constant.  */
20951  unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
20952  for (unsigned int i = 0; i < encoded_nelts; ++i)
20953    if (!d->perm[i].is_constant ())
20954      return false;
20955
20956  if (d->testing_p)
20957    return true;
20958
  /* Generic code will try constant permutation twice: once with the
     original mode and again with the elements lowered to QImode.
     So wait and don't do the selector expansion ourselves.  */
20962  if (vmode != V8QImode && vmode != V16QImode)
20963    return false;
20964
20965  /* to_constant is safe since this routine is specific to Advanced SIMD
20966     vectors.  */
20967  unsigned int nelt = d->perm.length ().to_constant ();
20968  for (unsigned int i = 0; i < nelt; ++i)
20969    /* If big-endian and two vectors we end up with a weird mixed-endian
20970       mode on NEON.  Reverse the index within each word but not the word
20971       itself.  to_constant is safe because we checked is_constant above.  */
20972    rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
20973			? d->perm[i].to_constant () ^ (nelt - 1)
20974			: d->perm[i].to_constant ());
20975
20976  sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
20977  sel = force_reg (vmode, sel);
20978
20979  aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
20980  return true;
20981}
20982
20983/* Try to implement D using an SVE TBL instruction.  */
20984
20985static bool
20986aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
20987{
20988  unsigned HOST_WIDE_INT nelt;
20989
20990  /* Permuting two variable-length vectors could overflow the
20991     index range.  */
20992  if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
20993    return false;
20994
20995  if (d->testing_p)
20996    return true;
20997
20998  machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
20999  rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
21000  if (d->one_vector_p)
21001    emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
21002  else
21003    aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
21004  return true;
21005}
21006
/* Try to implement D using an SVE SEL instruction.  */
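/* For example, with two patterns the permutation { 0, NELT + 1, 2,
   NELT + 3, ... } takes the even elements from the first vector and the
   odd elements from the second, which SEL can do under the predicate
   { 1, 0, 1, 0, ... }.  */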
21008
21009static bool
21010aarch64_evpc_sel (struct expand_vec_perm_d *d)
21011{
21012  machine_mode vmode = d->vmode;
21013  int unit_size = GET_MODE_UNIT_SIZE (vmode);
21014
21015  if (d->vec_flags != VEC_SVE_DATA
21016      || unit_size > 8)
21017    return false;
21018
21019  int n_patterns = d->perm.encoding ().npatterns ();
21020  poly_int64 vec_len = d->perm.length ();
21021
21022  for (int i = 0; i < n_patterns; ++i)
21023    if (!known_eq (d->perm[i], i)
21024	&& !known_eq (d->perm[i], vec_len + i))
21025      return false;
21026
21027  for (int i = n_patterns; i < n_patterns * 2; i++)
21028    if (!d->perm.series_p (i, n_patterns, i, n_patterns)
21029	&& !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
21030      return false;
21031
21032  if (d->testing_p)
21033    return true;
21034
21035  machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
21036
21037  /* Build a predicate that is true when op0 elements should be used.  */
21038  rtx_vector_builder builder (pred_mode, n_patterns, 2);
21039  for (int i = 0; i < n_patterns * 2; i++)
21040    {
21041      rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
21042					  : CONST0_RTX (BImode);
21043      builder.quick_push (elem);
21044    }
21045
21046  rtx const_vec = builder.build ();
21047  rtx pred = force_reg (pred_mode, const_vec);
21048  /* TARGET = PRED ? OP0 : OP1.  */
21049  emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
21050  return true;
21051}
21052
21053static bool
21054aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
21055{
21056  /* The pattern matching functions above are written to look for a small
21057     number to begin the sequence (0, 1, N/2).  If we begin with an index
21058     from the second operand, we can swap the operands.  */
21059  poly_int64 nelt = d->perm.length ();
21060  if (known_ge (d->perm[0], nelt))
21061    {
21062      d->perm.rotate_inputs (1);
21063      std::swap (d->op0, d->op1);
21064    }
21065
21066  if ((d->vec_flags == VEC_ADVSIMD
21067       || d->vec_flags == VEC_SVE_DATA
21068       || d->vec_flags == VEC_SVE_PRED)
21069      && known_gt (nelt, 1))
21070    {
21071      if (aarch64_evpc_rev_local (d))
21072	return true;
21073      else if (aarch64_evpc_rev_global (d))
21074	return true;
21075      else if (aarch64_evpc_ext (d))
21076	return true;
21077      else if (aarch64_evpc_dup (d))
21078	return true;
21079      else if (aarch64_evpc_zip (d))
21080	return true;
21081      else if (aarch64_evpc_uzp (d))
21082	return true;
21083      else if (aarch64_evpc_trn (d))
21084	return true;
21085      else if (aarch64_evpc_sel (d))
21086	return true;
21087      if (d->vec_flags == VEC_SVE_DATA)
21088	return aarch64_evpc_sve_tbl (d);
21089      else if (d->vec_flags == VEC_ADVSIMD)
21090	return aarch64_evpc_tbl (d);
21091    }
21092  return false;
21093}
21094
21095/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
21096
21097static bool
21098aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
21099				  rtx op1, const vec_perm_indices &sel)
21100{
21101  struct expand_vec_perm_d d;
21102
21103  /* Check whether the mask can be applied to a single vector.  */
21104  if (sel.ninputs () == 1
21105      || (op0 && rtx_equal_p (op0, op1)))
21106    d.one_vector_p = true;
21107  else if (sel.all_from_input_p (0))
21108    {
21109      d.one_vector_p = true;
21110      op1 = op0;
21111    }
21112  else if (sel.all_from_input_p (1))
21113    {
21114      d.one_vector_p = true;
21115      op0 = op1;
21116    }
21117  else
21118    d.one_vector_p = false;
21119
21120  d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
21121		     sel.nelts_per_input ());
21122  d.vmode = vmode;
21123  d.vec_flags = aarch64_classify_vector_mode (d.vmode);
21124  d.target = target;
21125  d.op0 = op0;
21126  d.op1 = op1;
21127  d.testing_p = !target;
21128
21129  if (!d.testing_p)
21130    return aarch64_expand_vec_perm_const_1 (&d);
21131
21132  rtx_insn *last = get_last_insn ();
21133  bool ret = aarch64_expand_vec_perm_const_1 (&d);
21134  gcc_assert (last == get_last_insn ());
21135
21136  return ret;
21137}
21138
21139/* Generate a byte permute mask for a register of mode MODE,
21140   which has NUNITS units.  */
21141
21142rtx
21143aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
21144{
  /* We have to reverse each vector because we don't have
     a permuted load that can reverse-load according to ABI rules.  */
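  /* For example, for V8HImode (eight 16-bit units) the mask is
     { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }, which
     swaps the two bytes within each 16-bit element.  */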
21147  rtx mask;
21148  rtvec v = rtvec_alloc (16);
21149  unsigned int i, j;
21150  unsigned int usize = GET_MODE_UNIT_SIZE (mode);
21151
21152  gcc_assert (BYTES_BIG_ENDIAN);
21153  gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
21154
21155  for (i = 0; i < nunits; i++)
21156    for (j = 0; j < usize; j++)
21157      RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
21158  mask = gen_rtx_CONST_VECTOR (V16QImode, v);
21159  return force_reg (V16QImode, mask);
21160}
21161
21162/* Expand an SVE integer comparison using the SVE equivalent of:
21163
21164     (set TARGET (CODE OP0 OP1)).  */
21165
21166void
21167aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
21168{
21169  machine_mode pred_mode = GET_MODE (target);
21170  machine_mode data_mode = GET_MODE (op0);
21171  rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
21172				      op0, op1);
21173  if (!rtx_equal_p (target, res))
21174    emit_move_insn (target, res);
21175}
21176
21177/* Return the UNSPEC_COND_* code for comparison CODE.  */
21178
21179static unsigned int
21180aarch64_unspec_cond_code (rtx_code code)
21181{
21182  switch (code)
21183    {
21184    case NE:
21185      return UNSPEC_COND_FCMNE;
21186    case EQ:
21187      return UNSPEC_COND_FCMEQ;
21188    case LT:
21189      return UNSPEC_COND_FCMLT;
21190    case GT:
21191      return UNSPEC_COND_FCMGT;
21192    case LE:
21193      return UNSPEC_COND_FCMLE;
21194    case GE:
21195      return UNSPEC_COND_FCMGE;
21196    case UNORDERED:
21197      return UNSPEC_COND_FCMUO;
21198    default:
21199      gcc_unreachable ();
21200    }
21201}
21202
21203/* Emit:
21204
21205      (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
21206
21207   where <X> is the operation associated with comparison CODE.
21208   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
21209
21210static void
21211aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
21212			  bool known_ptrue_p, rtx op0, rtx op1)
21213{
21214  rtx flag = gen_int_mode (known_ptrue_p, SImode);
21215  rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
21216			       gen_rtvec (4, pred, flag, op0, op1),
21217			       aarch64_unspec_cond_code (code));
21218  emit_set_insn (target, unspec);
21219}
21220
21221/* Emit the SVE equivalent of:
21222
21223      (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
21224      (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
21225      (set TARGET (ior:PRED_MODE TMP1 TMP2))
21226
21227   where <Xi> is the operation associated with comparison CODEi.
21228   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
21229
21230static void
21231aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
21232			      rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
21233{
21234  machine_mode pred_mode = GET_MODE (pred);
21235  rtx tmp1 = gen_reg_rtx (pred_mode);
21236  aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
21237  rtx tmp2 = gen_reg_rtx (pred_mode);
21238  aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
21239  aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
21240}
21241
21242/* Emit the SVE equivalent of:
21243
21244      (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
21245      (set TARGET (not TMP))
21246
21247   where <X> is the operation associated with comparison CODE.
21248   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
21249
21250static void
21251aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
21252				 bool known_ptrue_p, rtx op0, rtx op1)
21253{
21254  machine_mode pred_mode = GET_MODE (pred);
21255  rtx tmp = gen_reg_rtx (pred_mode);
21256  aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
21257  aarch64_emit_unop (target, one_cmpl_optab, tmp);
21258}
21259
21260/* Expand an SVE floating-point comparison using the SVE equivalent of:
21261
21262     (set TARGET (CODE OP0 OP1))
21263
21264   If CAN_INVERT_P is true, the caller can also handle inverted results;
21265   return true if the result is in fact inverted.  */
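
/* A worked example (editorial illustration): for UNLT with
   -ftrapping-math, the code below first computes
   ORDERED = ~FCMUO (OP0, OP1) under the governing PTRUE and then
   evaluates GE with ORDERED as the predicate.  Inverting that result
   yields exactly the lanes that are unordered or satisfy OP0 < OP1,
   i.e. UNLT, while the signaling comparison itself is only performed
   on the ordered lanes.  */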
21266
21267bool
21268aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
21269				  rtx op0, rtx op1, bool can_invert_p)
21270{
21271  machine_mode pred_mode = GET_MODE (target);
21272  machine_mode data_mode = GET_MODE (op0);
21273
21274  rtx ptrue = aarch64_ptrue_reg (pred_mode);
21275  switch (code)
21276    {
21277    case UNORDERED:
21278      /* UNORDERED has no immediate form.  */
21279      op1 = force_reg (data_mode, op1);
21280      /* fall through */
21281    case LT:
21282    case LE:
21283    case GT:
21284    case GE:
21285    case EQ:
21286    case NE:
21287      {
21288	/* There is native support for the comparison.  */
21289	aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
21290	return false;
21291      }
21292
21293    case LTGT:
21294      /* This is a trapping operation (LT or GT).  */
21295      aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
21296      return false;
21297
21298    case UNEQ:
21299      if (!flag_trapping_math)
21300	{
21301	  /* This would trap for signaling NaNs.  */
21302	  op1 = force_reg (data_mode, op1);
21303	  aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
21304					ptrue, true, op0, op1);
21305	  return false;
21306	}
21307      /* fall through */
21308    case UNLT:
21309    case UNLE:
21310    case UNGT:
21311    case UNGE:
21312      if (flag_trapping_math)
21313	{
21314	  /* Work out which elements are ordered.  */
21315	  rtx ordered = gen_reg_rtx (pred_mode);
21316	  op1 = force_reg (data_mode, op1);
21317	  aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
21318					   ptrue, true, op0, op1);
21319
21320	  /* Test the opposite condition for the ordered elements,
21321	     then invert the result.  */
21322	  if (code == UNEQ)
21323	    code = NE;
21324	  else
21325	    code = reverse_condition_maybe_unordered (code);
21326	  if (can_invert_p)
21327	    {
21328	      aarch64_emit_sve_fp_cond (target, code,
21329					ordered, false, op0, op1);
21330	      return true;
21331	    }
21332	  aarch64_emit_sve_invert_fp_cond (target, code,
21333					   ordered, false, op0, op1);
21334	  return false;
21335	}
21336      break;
21337
21338    case ORDERED:
21339      /* ORDERED has no immediate form.  */
21340      op1 = force_reg (data_mode, op1);
21341      break;
21342
21343    default:
21344      gcc_unreachable ();
21345    }
21346
21347  /* There is native support for the inverse comparison.  */
21348  code = reverse_condition_maybe_unordered (code);
21349  if (can_invert_p)
21350    {
21351      aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
21352      return true;
21353    }
21354  aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
21355  return false;
21356}
21357
21358/* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
21359   of the data being selected and CMP_MODE is the mode of the values being
21360   compared.  */
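
/* For example (editorial note): a vectorized (a < b ? x : y) reaches this
   function with OPS[3] holding the LT comparison of OPS[4] and OPS[5]; we
   build a governing predicate from that comparison and then emit an
   UNSPEC_SEL that selects OPS[1] where the predicate is true and OPS[2]
   elsewhere.  */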
21361
21362void
21363aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
21364			  rtx *ops)
21365{
21366  machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
21367  rtx pred = gen_reg_rtx (pred_mode);
21368  if (FLOAT_MODE_P (cmp_mode))
21369    {
21370      if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
21371					    ops[4], ops[5], true))
21372	std::swap (ops[1], ops[2]);
21373    }
21374  else
21375    aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
21376
21377  if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
21378    ops[1] = force_reg (data_mode, ops[1]);
21379  /* The "false" value can only be zero if the "true" value is a constant.  */
21380  if (register_operand (ops[1], data_mode)
21381      || !aarch64_simd_reg_or_zero (ops[2], data_mode))
21382    ops[2] = force_reg (data_mode, ops[2]);
21383
21384  rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
21385  emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
21386}
21387
21388/* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
   true.  However, due to issues with register allocation it is preferable
   to avoid tying integer scalar and FP scalar modes.  Executing integer
21391   operations in general registers is better than treating them as scalar
21392   vector operations.  This reduces latency and avoids redundant int<->FP
21393   moves.  So tie modes if they are either the same class, or vector modes
21394   with other vector modes, vector structs or any scalar mode.  */
21395
21396static bool
21397aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
21398{
21399  if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
21400    return true;
21401
21402  /* We specifically want to allow elements of "structure" modes to
21403     be tieable to the structure.  This more general condition allows
21404     other rarer situations too.  The reason we don't extend this to
21405     predicate modes is that there are no predicate structure modes
21406     nor any specific instructions for extracting part of a predicate
21407     register.  */
21408  if (aarch64_vector_data_mode_p (mode1)
21409      && aarch64_vector_data_mode_p (mode2))
21410    return true;
21411
21412  /* Also allow any scalar modes with vectors.  */
21413  if (aarch64_vector_mode_supported_p (mode1)
21414      || aarch64_vector_mode_supported_p (mode2))
21415    return true;
21416
21417  return false;
21418}
21419
21420/* Return a new RTX holding the result of moving POINTER forward by
21421   AMOUNT bytes.  */
21422
21423static rtx
21424aarch64_move_pointer (rtx pointer, poly_int64 amount)
21425{
21426  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
21427
21428  return adjust_automodify_address (pointer, GET_MODE (pointer),
21429				    next, amount);
21430}
21431
21432/* Return a new RTX holding the result of moving POINTER forward by the
21433   size of the mode it points to.  */
21434
21435static rtx
21436aarch64_progress_pointer (rtx pointer)
21437{
21438  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
21439}
21440
21441/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
21442   MODE bytes.  */
21443
21444static void
21445aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
21446					      machine_mode mode)
21447{
21448  rtx reg = gen_reg_rtx (mode);
21449
21450  /* "Cast" the pointers to the correct mode.  */
21451  *src = adjust_address (*src, mode, 0);
21452  *dst = adjust_address (*dst, mode, 0);
21453  /* Emit the memcpy.  */
21454  emit_move_insn (reg, *src);
21455  emit_move_insn (*dst, reg);
21456  /* Move the pointers forward.  */
21457  *src = aarch64_progress_pointer (*src);
21458  *dst = aarch64_progress_pointer (*dst);
21459}
21460
21461/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
21462   we succeed, otherwise return false.  */
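
/* A worked example of the expansion below (editorial illustration): for a
   23-byte copy, the first iteration copies one TImode (16-byte) block,
   leaving 7 bytes.  As the residue is at most 8 bytes, the pointers are
   moved back by 1 byte and a single overlapping DImode (8-byte) copy
   finishes the job, giving two load/store pairs in total instead of
   separate 16 + 4 + 2 + 1 byte copies.  */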
21463
21464bool
21465aarch64_expand_cpymem (rtx *operands)
21466{
  /* These need to be signed, as we perform signed arithmetic on n.  */
21469  int n, mode_bits;
21470  rtx dst = operands[0];
21471  rtx src = operands[1];
21472  rtx base;
21473  machine_mode cur_mode = BLKmode, next_mode;
21474  bool speed_p = !optimize_function_for_size_p (cfun);
21475
  /* When optimizing for size, give a better estimate of the length of a
     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
     will always require an even number of instructions, and each operation
     requires both a load and a store, so divide the maximum instruction
     count by 2.  */
21480  unsigned int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
21481
21482  /* We can't do anything smart if the amount to copy is not constant.  */
21483  if (!CONST_INT_P (operands[2]))
21484    return false;
21485
21486  unsigned HOST_WIDE_INT tmp = INTVAL (operands[2]);
21487
21488  /* Try to keep the number of instructions low.  For all cases we will do at
21489     most two moves for the residual amount, since we'll always overlap the
21490     remainder.  */
21491  if (((tmp / 16) + (tmp % 16 ? 2 : 0)) > max_num_moves)
21492    return false;
21493
  /* At this point tmp is known to fit inside an int.  */
21495  n = tmp;
21496
21497  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
21498  dst = adjust_automodify_address (dst, VOIDmode, base, 0);
21499
21500  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
21501  src = adjust_automodify_address (src, VOIDmode, base, 0);
21502
21503  /* Convert n to bits to make the rest of the code simpler.  */
21504  n = n * BITS_PER_UNIT;
21505
21506  /* Maximum amount to copy in one go.  The AArch64 back-end has integer modes
21507     larger than TImode, but we should not use them for loads/stores here.  */
21508  const int copy_limit = GET_MODE_BITSIZE (TImode);
21509
21510  while (n > 0)
21511    {
      /* Find the largest mode in which to do the copy without over-reading
	 or over-writing.  */
21514      opt_scalar_int_mode mode_iter;
21515      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
21516	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
21517	  cur_mode = mode_iter.require ();
21518
21519      gcc_assert (cur_mode != BLKmode);
21520
21521      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
21522      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
21523
21524      n -= mode_bits;
21525
      /* Do certain trailing copies as overlapping if it's going to be
	 cheaper, i.e. fewer instructions.  For instance, for a 15-byte copy
	 it's more efficient to do two overlapping 8-byte copies than
	 non-overlapping copies of 8 + 4 + 2 + 1 bytes.  */
21530      if (n > 0 && n <= 8 * BITS_PER_UNIT)
21531	{
21532	  next_mode = smallest_mode_for_size (n, MODE_INT);
21533	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
21534	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
21535	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
21536	  n = n_bits;
21537	}
21538    }
21539
21540  return true;
21541}
21542
21543/* Split a DImode store of a CONST_INT SRC to MEM DST as two
21544   SImode stores.  Handle the case when the constant has identical
21545   bottom and top halves.  This is beneficial when the two stores can be
21546   merged into an STP and we avoid synthesising potentially expensive
21547   immediates twice.  Return true if such a split is possible.  */
21548
21549bool
21550aarch64_split_dimode_const_store (rtx dst, rtx src)
21551{
21552  rtx lo = gen_lowpart (SImode, src);
21553  rtx hi = gen_highpart_mode (SImode, DImode, src);
21554
21555  bool size_p = optimize_function_for_size_p (cfun);
21556
21557  if (!rtx_equal_p (lo, hi))
21558    return false;
21559
21560  unsigned int orig_cost
21561    = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
21562  unsigned int lo_cost
21563    = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
21564
21565  /* We want to transform:
21566     MOV	x1, 49370
21567     MOVK	x1, 0x140, lsl 16
21568     MOVK	x1, 0xc0da, lsl 32
21569     MOVK	x1, 0x140, lsl 48
21570     STR	x1, [x0]
21571   into:
21572     MOV	w1, 49370
21573     MOVK	w1, 0x140, lsl 16
21574     STP	w1, w1, [x0]
21575   So we want to perform this only when we save two instructions
21576   or more.  When optimizing for size, however, accept any code size
21577   savings we can.  */
21578  if (size_p && orig_cost <= lo_cost)
21579    return false;
21580
21581  if (!size_p
21582      && (orig_cost <= lo_cost + 1))
21583    return false;
21584
21585  rtx mem_lo = adjust_address (dst, SImode, 0);
21586  if (!aarch64_mem_pair_operand (mem_lo, SImode))
21587    return false;
21588
21589  rtx tmp_reg = gen_reg_rtx (SImode);
21590  aarch64_expand_mov_immediate (tmp_reg, lo);
21591  rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
  /* Don't emit an explicit store pair as this may not always be profitable.
     Let the sched-fusion logic decide whether to merge them.  */
21594  emit_move_insn (mem_lo, tmp_reg);
21595  emit_move_insn (mem_hi, tmp_reg);
21596
21597  return true;
21598}
21599
21600/* Generate RTL for a conditional branch with rtx comparison CODE in
21601   mode CC_MODE.  The destination of the unlikely conditional branch
21602   is LABEL_REF.  */
21603
21604void
21605aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
21606			      rtx label_ref)
21607{
21608  rtx x;
21609  x = gen_rtx_fmt_ee (code, VOIDmode,
21610		      gen_rtx_REG (cc_mode, CC_REGNUM),
21611		      const0_rtx);
21612
21613  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
21614			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
21615			    pc_rtx);
21616  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
21617}
21618
21619/* Generate DImode scratch registers for 128-bit (TImode) addition.
21620
21621   OP1 represents the TImode destination operand 1
21622   OP2 represents the TImode destination operand 2
21623   LOW_DEST represents the low half (DImode) of TImode operand 0
21624   LOW_IN1 represents the low half (DImode) of TImode operand 1
21625   LOW_IN2 represents the low half (DImode) of TImode operand 2
21626   HIGH_DEST represents the high half (DImode) of TImode operand 0
21627   HIGH_IN1 represents the high half (DImode) of TImode operand 1
21628   HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
21629
21630void
21631aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
21632			    rtx *low_in1, rtx *low_in2,
21633			    rtx *high_dest, rtx *high_in1,
21634			    rtx *high_in2)
21635{
21636  *low_dest = gen_reg_rtx (DImode);
21637  *low_in1 = gen_lowpart (DImode, op1);
21638  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
21639				  subreg_lowpart_offset (DImode, TImode));
21640  *high_dest = gen_reg_rtx (DImode);
21641  *high_in1 = gen_highpart (DImode, op1);
21642  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
21643				   subreg_highpart_offset (DImode, TImode));
21644}
21645
21646/* Generate DImode scratch registers for 128-bit (TImode) subtraction.
21647
   This function differs from 'aarch64_addti_scratch_regs' in that
   OP1 can be an immediate constant (zero).  We must call
   subreg_highpart_offset with DImode and TImode arguments, otherwise
   VOIDmode would be used for the const_int, which triggers an internal
   error from subreg_size_highpart_offset, as that function does not
   expect a size of zero.
21653
21654   OP1 represents the TImode destination operand 1
21655   OP2 represents the TImode destination operand 2
21656   LOW_DEST represents the low half (DImode) of TImode operand 0
21657   LOW_IN1 represents the low half (DImode) of TImode operand 1
21658   LOW_IN2 represents the low half (DImode) of TImode operand 2
21659   HIGH_DEST represents the high half (DImode) of TImode operand 0
21660   HIGH_IN1 represents the high half (DImode) of TImode operand 1
21661   HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
21662
21664void
21665aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
21666			     rtx *low_in1, rtx *low_in2,
21667			     rtx *high_dest, rtx *high_in1,
21668			     rtx *high_in2)
21669{
21670  *low_dest = gen_reg_rtx (DImode);
21671  *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
21672				  subreg_lowpart_offset (DImode, TImode));
21673
21674  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
21675				  subreg_lowpart_offset (DImode, TImode));
21676  *high_dest = gen_reg_rtx (DImode);
21677
21678  *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
21679				   subreg_highpart_offset (DImode, TImode));
21680  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
21681				   subreg_highpart_offset (DImode, TImode));
21682}
21683
21684/* Generate RTL for 128-bit (TImode) subtraction with overflow.
21685
21686   OP0 represents the TImode destination operand 0
21687   LOW_DEST represents the low half (DImode) of TImode operand 0
21688   LOW_IN1 represents the low half (DImode) of TImode operand 1
21689   LOW_IN2 represents the low half (DImode) of TImode operand 2
21690   HIGH_DEST represents the high half (DImode) of TImode operand 0
21691   HIGH_IN1 represents the high half (DImode) of TImode operand 1
21692   HIGH_IN2 represents the high half (DImode) of TImode operand 2
21693   UNSIGNED_P is true if the operation is being performed on unsigned
21694   values.  */
21695void
21696aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
21697		       rtx low_in2, rtx high_dest, rtx high_in1,
21698		       rtx high_in2, bool unsigned_p)
21699{
21700  if (low_in2 == const0_rtx)
21701    {
21702      low_dest = low_in1;
21703      high_in2 = force_reg (DImode, high_in2);
21704      if (unsigned_p)
21705	emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
21706      else
21707	emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
21708    }
21709  else
21710    {
21711      if (aarch64_plus_immediate (low_in2, DImode))
21712	emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
21713					    GEN_INT (-INTVAL (low_in2))));
21714      else
21715	{
21716	  low_in2 = force_reg (DImode, low_in2);
21717	  emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
21718	}
21719      high_in2 = force_reg (DImode, high_in2);
21720
21721      if (unsigned_p)
21722	emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
21723      else
21724	emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
21725    }
21726
21727  emit_move_insn (gen_lowpart (DImode, op0), low_dest);
21728  emit_move_insn (gen_highpart (DImode, op0), high_dest);
21730}
21731
21732/* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
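
/* Editorial note: with the generic AddressSanitizer mapping
   shadow = (address >> ASAN_SHADOW_SHIFT) + offset, where the shift of 3
   is assumed from the generic asan code rather than defined here, the
   values below place the shadow offset at 1 << 29 for ILP32 and
   1 << 36 for LP64.  */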
21733
21734static unsigned HOST_WIDE_INT
21735aarch64_asan_shadow_offset (void)
21736{
21737  if (TARGET_ILP32)
21738    return (HOST_WIDE_INT_1 << 29);
21739  else
21740    return (HOST_WIDE_INT_1 << 36);
21741}
21742
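/* Implement TARGET_GEN_CCMP_FIRST (registered further down in this file).
   Expand the first comparison of a conditional-compare chain: PREP_SEQ
   receives the instructions that prepare the operands, GEN_SEQ receives
   the compare itself, and the returned rtx describes the resulting flags
   test.

   As an editorial illustration, a condition such as (a < b && c == d) on
   DImode values is expected to end up as something like

	cmp	x0, x1
	ccmp	x2, x3, 0, lt
	b.eq	...

   where this function emits the initial CMP and aarch64_gen_ccmp_next
   below emits the CCMP (register names here are purely illustrative).  */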
21743static rtx
21744aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
21745			int code, tree treeop0, tree treeop1)
21746{
21747  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
21748  rtx op0, op1;
21749  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
21750  insn_code icode;
21751  struct expand_operand ops[4];
21752
21753  start_sequence ();
21754  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
21755
21756  op_mode = GET_MODE (op0);
21757  if (op_mode == VOIDmode)
21758    op_mode = GET_MODE (op1);
21759
21760  switch (op_mode)
21761    {
21762    case E_QImode:
21763    case E_HImode:
21764    case E_SImode:
21765      cmp_mode = SImode;
21766      icode = CODE_FOR_cmpsi;
21767      break;
21768
21769    case E_DImode:
21770      cmp_mode = DImode;
21771      icode = CODE_FOR_cmpdi;
21772      break;
21773
21774    case E_SFmode:
21775      cmp_mode = SFmode;
21776      cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
21777      icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
21778      break;
21779
21780    case E_DFmode:
21781      cmp_mode = DFmode;
21782      cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
21783      icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
21784      break;
21785
21786    default:
21787      end_sequence ();
21788      return NULL_RTX;
21789    }
21790
21791  op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
21792  op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
21793  if (!op0 || !op1)
21794    {
21795      end_sequence ();
21796      return NULL_RTX;
21797    }
21798  *prep_seq = get_insns ();
21799  end_sequence ();
21800
21801  create_fixed_operand (&ops[0], op0);
21802  create_fixed_operand (&ops[1], op1);
21803
21804  start_sequence ();
21805  if (!maybe_expand_insn (icode, 2, ops))
21806    {
21807      end_sequence ();
21808      return NULL_RTX;
21809    }
21810  *gen_seq = get_insns ();
21811  end_sequence ();
21812
21813  return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
21814			 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
21815}
21816
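/* Implement TARGET_GEN_CCMP_NEXT (registered further down in this file).
   Expand a subsequent comparison of a conditional-compare chain: PREV is
   the rtx returned for the previous comparison, CMP_CODE/TREEOP0/TREEOP1
   describe the new comparison and BIT_CODE is AND or IOR depending on how
   the two conditions are combined.  Editorial summary: for an AND
   combination the emitted CCMP performs the new comparison only when the
   previous condition held and otherwise forces the flags to a failing
   value through its immediate NZCV operand; IOR combinations are handled
   below by inverting the conditions.  */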
21817static rtx
21818aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
21819		       int cmp_code, tree treeop0, tree treeop1, int bit_code)
21820{
21821  rtx op0, op1, target;
21822  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
21823  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
21824  insn_code icode;
21825  struct expand_operand ops[6];
21826  int aarch64_cond;
21827
21828  push_to_sequence (*prep_seq);
21829  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
21830
21831  op_mode = GET_MODE (op0);
21832  if (op_mode == VOIDmode)
21833    op_mode = GET_MODE (op1);
21834
21835  switch (op_mode)
21836    {
21837    case E_QImode:
21838    case E_HImode:
21839    case E_SImode:
21840      cmp_mode = SImode;
21841      break;
21842
21843    case E_DImode:
21844      cmp_mode = DImode;
21845      break;
21846
21847    case E_SFmode:
21848      cmp_mode = SFmode;
21849      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
21850      break;
21851
21852    case E_DFmode:
21853      cmp_mode = DFmode;
21854      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
21855      break;
21856
21857    default:
21858      end_sequence ();
21859      return NULL_RTX;
21860    }
21861
21862  icode = code_for_ccmp (cc_mode, cmp_mode);
21863
21864  op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
21865  op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
21866  if (!op0 || !op1)
21867    {
21868      end_sequence ();
21869      return NULL_RTX;
21870    }
21871  *prep_seq = get_insns ();
21872  end_sequence ();
21873
21874  target = gen_rtx_REG (cc_mode, CC_REGNUM);
21875  aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
21876
21877  if (bit_code != AND)
21878    {
21879      /* Treat the ccmp patterns as canonical and use them where possible,
21880	 but fall back to ccmp_rev patterns if there's no other option.  */
21881      rtx_code prev_code = GET_CODE (prev);
21882      machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
21883      if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
21884	  && !(prev_code == EQ
21885	       || prev_code == NE
21886	       || prev_code == ORDERED
21887	       || prev_code == UNORDERED))
21888	icode = code_for_ccmp_rev (cc_mode, cmp_mode);
21889      else
21890	{
21891	  rtx_code code = reverse_condition (prev_code);
21892	  prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
21893	}
21894      aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
21895    }
21896
21897  create_fixed_operand (&ops[0], XEXP (prev, 0));
21898  create_fixed_operand (&ops[1], target);
21899  create_fixed_operand (&ops[2], op0);
21900  create_fixed_operand (&ops[3], op1);
21901  create_fixed_operand (&ops[4], prev);
21902  create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
21903
21904  push_to_sequence (*gen_seq);
21905  if (!maybe_expand_insn (icode, 6, ops))
21906    {
21907      end_sequence ();
21908      return NULL_RTX;
21909    }
21910
21911  *gen_seq = get_insns ();
21912  end_sequence ();
21913
21914  return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
21915}
21916
21917#undef TARGET_GEN_CCMP_FIRST
21918#define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
21919
21920#undef TARGET_GEN_CCMP_NEXT
21921#define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
21922
21923/* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
21924   instruction fusion of some sort.  */
21925
21926static bool
21927aarch64_macro_fusion_p (void)
21928{
21929  return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
21930}
21931
21932
21933/* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
21934   should be kept together during scheduling.  */
21935
21936static bool
21937aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
21938{
21939  rtx set_dest;
21940  rtx prev_set = single_set (prev);
21941  rtx curr_set = single_set (curr);
21942  /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
21943  bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
21944
21945  if (!aarch64_macro_fusion_p ())
21946    return false;
21947
21948  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
21949    {
21950      /* We are trying to match:
21951         prev (mov)  == (set (reg r0) (const_int imm16))
21952         curr (movk) == (set (zero_extract (reg r0)
21953                                           (const_int 16)
21954                                           (const_int 16))
21955                             (const_int imm16_1))  */
21956
21957      set_dest = SET_DEST (curr_set);
21958
21959      if (GET_CODE (set_dest) == ZERO_EXTRACT
21960          && CONST_INT_P (SET_SRC (curr_set))
21961          && CONST_INT_P (SET_SRC (prev_set))
21962          && CONST_INT_P (XEXP (set_dest, 2))
21963          && INTVAL (XEXP (set_dest, 2)) == 16
21964          && REG_P (XEXP (set_dest, 0))
21965          && REG_P (SET_DEST (prev_set))
21966          && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
21967        {
21968          return true;
21969        }
21970    }
21971
21972  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
21973    {
21974
21975      /*  We're trying to match:
21976          prev (adrp) == (set (reg r1)
21977                              (high (symbol_ref ("SYM"))))
21978          curr (add) == (set (reg r0)
21979                             (lo_sum (reg r1)
21980                                     (symbol_ref ("SYM"))))
21981          Note that r0 need not necessarily be the same as r1, especially
21982          during pre-regalloc scheduling.  */
21983
21984      if (satisfies_constraint_Ush (SET_SRC (prev_set))
21985          && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
21986        {
21987          if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
21988              && REG_P (XEXP (SET_SRC (curr_set), 0))
21989              && REGNO (XEXP (SET_SRC (curr_set), 0))
21990                 == REGNO (SET_DEST (prev_set))
21991              && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
21992                              XEXP (SET_SRC (curr_set), 1)))
21993            return true;
21994        }
21995    }
21996
21997  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
21998    {
21999
22000      /* We're trying to match:
22001         prev (movk) == (set (zero_extract (reg r0)
22002                                           (const_int 16)
22003                                           (const_int 32))
22004                             (const_int imm16_1))
22005         curr (movk) == (set (zero_extract (reg r0)
22006                                           (const_int 16)
22007                                           (const_int 48))
22008                             (const_int imm16_2))  */
22009
22010      if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
22011          && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
22012          && REG_P (XEXP (SET_DEST (prev_set), 0))
22013          && REG_P (XEXP (SET_DEST (curr_set), 0))
22014          && REGNO (XEXP (SET_DEST (prev_set), 0))
22015             == REGNO (XEXP (SET_DEST (curr_set), 0))
22016          && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
22017          && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
22018          && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
22019          && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
22020          && CONST_INT_P (SET_SRC (prev_set))
22021          && CONST_INT_P (SET_SRC (curr_set)))
22022        return true;
22023
22024    }
22025  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
22026    {
22027      /* We're trying to match:
22028          prev (adrp) == (set (reg r0)
22029                              (high (symbol_ref ("SYM"))))
22030          curr (ldr) == (set (reg r1)
22031                             (mem (lo_sum (reg r0)
22032                                             (symbol_ref ("SYM")))))
22033                 or
22034          curr (ldr) == (set (reg r1)
22035                             (zero_extend (mem
22036                                           (lo_sum (reg r0)
22037                                                   (symbol_ref ("SYM"))))))  */
22038      if (satisfies_constraint_Ush (SET_SRC (prev_set))
22039          && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
22040        {
22041          rtx curr_src = SET_SRC (curr_set);
22042
22043          if (GET_CODE (curr_src) == ZERO_EXTEND)
22044            curr_src = XEXP (curr_src, 0);
22045
22046          if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
22047              && REG_P (XEXP (XEXP (curr_src, 0), 0))
22048              && REGNO (XEXP (XEXP (curr_src, 0), 0))
22049                 == REGNO (SET_DEST (prev_set))
22050              && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
22051                              XEXP (SET_SRC (prev_set), 0)))
22052              return true;
22053        }
22054    }
22055
22056  /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch.  */
22057  if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
22058      && prev_set && curr_set && any_condjump_p (curr)
22059      && GET_CODE (SET_SRC (prev_set)) == COMPARE
22060      && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
22061      && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
22062    return true;
22063
22064  /* Fuse flag-setting ALU instructions and conditional branch.  */
22065  if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
22066      && any_condjump_p (curr))
22067    {
22068      unsigned int condreg1, condreg2;
22069      rtx cc_reg_1;
22070      aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
22071      cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
22072
22073      if (reg_referenced_p (cc_reg_1, PATTERN (curr))
22074	  && prev
22075	  && modified_in_p (cc_reg_1, prev))
22076	{
22077	  enum attr_type prev_type = get_attr_type (prev);
22078
	  /* FIXME: this misses some instructions that ThunderX considers
	     simple arithmetic; simple shifts are missed here.  */
22081	  if (prev_type == TYPE_ALUS_SREG
22082	      || prev_type == TYPE_ALUS_IMM
22083	      || prev_type == TYPE_LOGICS_REG
22084	      || prev_type == TYPE_LOGICS_IMM)
22085	    return true;
22086	}
22087    }
22088
22089  /* Fuse ALU instructions and CBZ/CBNZ.  */
22090  if (prev_set
22091      && curr_set
22092      && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
22093      && any_condjump_p (curr))
22094    {
      /* We're trying to match:
	  prev (alu_insn) == (set (r0) (plus (r0) (r1/imm)))
	  curr (cbz) == (set (pc) (if_then_else (eq/ne (r0) (const_int 0))
						(label_ref ("SYM"))
						(pc)))  */
22101      if (SET_DEST (curr_set) == (pc_rtx)
22102	  && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
22103	  && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
22104	  && REG_P (SET_DEST (prev_set))
22105	  && REGNO (SET_DEST (prev_set))
22106	     == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
22107	{
22108	  /* Fuse ALU operations followed by conditional branch instruction.  */
22109	  switch (get_attr_type (prev))
22110	    {
22111	    case TYPE_ALU_IMM:
22112	    case TYPE_ALU_SREG:
22113	    case TYPE_ADC_REG:
22114	    case TYPE_ADC_IMM:
22115	    case TYPE_ADCS_REG:
22116	    case TYPE_ADCS_IMM:
22117	    case TYPE_LOGIC_REG:
22118	    case TYPE_LOGIC_IMM:
22119	    case TYPE_CSEL:
22120	    case TYPE_ADR:
22121	    case TYPE_MOV_IMM:
22122	    case TYPE_SHIFT_REG:
22123	    case TYPE_SHIFT_IMM:
22124	    case TYPE_BFM:
22125	    case TYPE_RBIT:
22126	    case TYPE_REV:
22127	    case TYPE_EXTEND:
22128	      return true;
22129
22130	    default:;
22131	    }
22132	}
22133    }
22134
  /* Fuse A+B+1 and A-B-1.  */
22136  if (simple_sets_p
22137      && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
22138    {
22139      /* We're trying to match:
22140	  prev == (set (r0) (plus (r0) (r1)))
22141	  curr == (set (r0) (plus (r0) (const_int 1)))
22142	or:
22143	  prev == (set (r0) (minus (r0) (r1)))
22144	  curr == (set (r0) (plus (r0) (const_int -1))) */
22145
22146      rtx prev_src = SET_SRC (prev_set);
22147      rtx curr_src = SET_SRC (curr_set);
22148
22149      int polarity = 1;
22150      if (GET_CODE (prev_src) == MINUS)
22151	polarity = -1;
22152
22153      if (GET_CODE (curr_src) == PLUS
22154	  && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
22155	  && CONST_INT_P (XEXP (curr_src, 1))
22156	  && INTVAL (XEXP (curr_src, 1)) == polarity
22157	  && REG_P (XEXP (curr_src, 0))
22158	  && REG_P (SET_DEST (prev_set))
22159	  && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
22160	return true;
22161    }
22162
22163  return false;
22164}
22165
22166/* Return true iff the instruction fusion described by OP is enabled.  */
22167
22168bool
22169aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
22170{
22171  return (aarch64_tune_params.fusible_ops & op) != 0;
22172}
22173
/* If MEM is in the form of [base+offset], extract the two parts
   of the address and store them in BASE and OFFSET; otherwise return
   false after clearing BASE and OFFSET.  */
22177
22178bool
22179extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
22180{
22181  rtx addr;
22182
22183  gcc_assert (MEM_P (mem));
22184
22185  addr = XEXP (mem, 0);
22186
22187  if (REG_P (addr))
22188    {
22189      *base = addr;
22190      *offset = const0_rtx;
22191      return true;
22192    }
22193
22194  if (GET_CODE (addr) == PLUS
22195      && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
22196    {
22197      *base = XEXP (addr, 0);
22198      *offset = XEXP (addr, 1);
22199      return true;
22200    }
22201
22202  *base = NULL_RTX;
22203  *offset = NULL_RTX;
22204
22205  return false;
22206}
22207
22208/* Types for scheduling fusion.  */
22209enum sched_fusion_type
22210{
22211  SCHED_FUSION_NONE = 0,
22212  SCHED_FUSION_LD_SIGN_EXTEND,
22213  SCHED_FUSION_LD_ZERO_EXTEND,
22214  SCHED_FUSION_LD,
22215  SCHED_FUSION_ST,
22216  SCHED_FUSION_NUM
22217};
22218
/* If INSN is a load or store with an address in the form of [base+offset],
   extract the two parts into BASE and OFFSET.  Return the scheduling
   fusion type of this INSN.  */
22222
22223static enum sched_fusion_type
22224fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
22225{
22226  rtx x, dest, src;
22227  enum sched_fusion_type fusion = SCHED_FUSION_LD;
22228
22229  gcc_assert (INSN_P (insn));
22230  x = PATTERN (insn);
22231  if (GET_CODE (x) != SET)
22232    return SCHED_FUSION_NONE;
22233
22234  src = SET_SRC (x);
22235  dest = SET_DEST (x);
22236
22237  machine_mode dest_mode = GET_MODE (dest);
22238
22239  if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
22240    return SCHED_FUSION_NONE;
22241
22242  if (GET_CODE (src) == SIGN_EXTEND)
22243    {
22244      fusion = SCHED_FUSION_LD_SIGN_EXTEND;
22245      src = XEXP (src, 0);
22246      if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
22247	return SCHED_FUSION_NONE;
22248    }
22249  else if (GET_CODE (src) == ZERO_EXTEND)
22250    {
22251      fusion = SCHED_FUSION_LD_ZERO_EXTEND;
22252      src = XEXP (src, 0);
22253      if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
22254	return SCHED_FUSION_NONE;
22255    }
22256
22257  if (GET_CODE (src) == MEM && REG_P (dest))
22258    extract_base_offset_in_addr (src, base, offset);
22259  else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
22260    {
22261      fusion = SCHED_FUSION_ST;
22262      extract_base_offset_in_addr (dest, base, offset);
22263    }
22264  else
22265    return SCHED_FUSION_NONE;
22266
22267  if (*base == NULL_RTX || *offset == NULL_RTX)
22268    fusion = SCHED_FUSION_NONE;
22269
22270  return fusion;
22271}
22272
22273/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
22274
   Currently we only support fusing ldr and str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, other types of instruction fusion can be added by returning
   different priorities.
22279
22280   It's important that irrelevant instructions get the largest FUSION_PRI.  */
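
/* A worked example (editorial): two SImode loads from [x4, 4] and
   [x4, 12] get the same FUSION_PRI, because they share the fusion type
   and the base register, while the load with the smaller offset gets the
   larger PRI and is therefore scheduled first, leaving the pair adjacent
   for a later ldp peephole.  */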
22281
22282static void
22283aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
22284			       int *fusion_pri, int *pri)
22285{
22286  int tmp, off_val;
22287  rtx base, offset;
22288  enum sched_fusion_type fusion;
22289
22290  gcc_assert (INSN_P (insn));
22291
22292  tmp = max_pri - 1;
22293  fusion = fusion_load_store (insn, &base, &offset);
22294  if (fusion == SCHED_FUSION_NONE)
22295    {
22296      *pri = tmp;
22297      *fusion_pri = tmp;
22298      return;
22299    }
22300
22301  /* Set FUSION_PRI according to fusion type and base register.  */
22302  *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
22303
22304  /* Calculate PRI.  */
22305  tmp /= 2;
22306
22307  /* INSN with smaller offset goes first.  */
22308  off_val = (int)(INTVAL (offset));
22309  if (off_val >= 0)
22310    tmp -= (off_val & 0xfffff);
22311  else
22312    tmp += ((- off_val) & 0xfffff);
22313
22314  *pri = tmp;
22315  return;
22316}
22317
22318/* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
22319   Adjust priority of sha1h instructions so they are scheduled before
22320   other SHA1 instructions.  */
22321
22322static int
22323aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
22324{
22325  rtx x = PATTERN (insn);
22326
22327  if (GET_CODE (x) == SET)
22328    {
22329      x = SET_SRC (x);
22330
22331      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
22332	return priority + 10;
22333    }
22334
22335  return priority;
22336}
22337
22338/* Given OPERANDS of consecutive load/store, check if we can merge
22339   them into ldp/stp.  LOAD is true if they are load instructions.
22340   MODE is the mode of memory operands.  */
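
/* For instance (editorial illustration), the checks below allow

     ldr	w0, [x2]
     ldr	w1, [x2, 4]

   to be merged into a single "ldp w0, w1, [x2]", but reject the pair if
   the two destinations are the same register or if the first destination
   is also used in its own address.  */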
22341
22342bool
22343aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
22344				machine_mode mode)
22345{
22346  HOST_WIDE_INT offval_1, offval_2, msize;
22347  enum reg_class rclass_1, rclass_2;
22348  rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
22349
22350  /* Allow the tuning structure to disable LDP instruction formation
22351     from combining instructions (e.g., in peephole2).  */
22352  if (load && (aarch64_tune_params.extra_tuning_flags
22353	       & AARCH64_EXTRA_TUNE_NO_LDP_COMBINE))
22354    return false;
22355
22356  if (load)
22357    {
22358      mem_1 = operands[1];
22359      mem_2 = operands[3];
22360      reg_1 = operands[0];
22361      reg_2 = operands[2];
22362      gcc_assert (REG_P (reg_1) && REG_P (reg_2));
22363      if (REGNO (reg_1) == REGNO (reg_2))
22364	return false;
22365    }
22366  else
22367    {
22368      mem_1 = operands[0];
22369      mem_2 = operands[2];
22370      reg_1 = operands[1];
22371      reg_2 = operands[3];
22372    }
22373
22374  /* The mems cannot be volatile.  */
22375  if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
22376    return false;
22377
  /* If we have SImode and slow unaligned ldp,
     check that the alignment is at least 8 bytes.  */
22380  if (mode == SImode
22381      && (aarch64_tune_params.extra_tuning_flags
22382          & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
22383      && !optimize_size
22384      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
22385    return false;
22386
22387  /* Check if the addresses are in the form of [base+offset].  */
22388  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
22389  if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
22390    return false;
22391  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
22392  if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
22393    return false;
22394
22395  /* Check if the bases are same.  */
22396  if (!rtx_equal_p (base_1, base_2))
22397    return false;
22398
22399  /* The operands must be of the same size.  */
22400  gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
22401			 GET_MODE_SIZE (GET_MODE (mem_2))));
22402
22403  offval_1 = INTVAL (offset_1);
22404  offval_2 = INTVAL (offset_2);
22405  /* We should only be trying this for fixed-sized modes.  There is no
22406     SVE LDP/STP instruction.  */
22407  msize = GET_MODE_SIZE (mode).to_constant ();
22408  /* Check if the offsets are consecutive.  */
22409  if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
22410    return false;
22411
22412  /* Check if the addresses are clobbered by load.  */
22413  if (load)
22414    {
22415      if (reg_mentioned_p (reg_1, mem_1))
22416	return false;
22417
22418      /* In increasing order, the last load can clobber the address.  */
22419      if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
22420	return false;
22421    }
22422
22423  /* One of the memory accesses must be a mempair operand.
22424     If it is not the first one, they need to be swapped by the
22425     peephole.  */
22426  if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
22427       && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
22428    return false;
22429
22430  if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
22431    rclass_1 = FP_REGS;
22432  else
22433    rclass_1 = GENERAL_REGS;
22434
22435  if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
22436    rclass_2 = FP_REGS;
22437  else
22438    rclass_2 = GENERAL_REGS;
22439
22440  /* Check if the registers are of same class.  */
22441  if (rclass_1 != rclass_2)
22442    return false;
22443
22444  return true;
22445}
22446
22447/* Given OPERANDS of consecutive load/store that can be merged,
22448   swap them if they are not in ascending order.  */
22449void
22450aarch64_swap_ldrstr_operands (rtx* operands, bool load)
22451{
22452  rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
22453  HOST_WIDE_INT offval_1, offval_2;
22454
22455  if (load)
22456    {
22457      mem_1 = operands[1];
22458      mem_2 = operands[3];
22459    }
22460  else
22461    {
22462      mem_1 = operands[0];
22463      mem_2 = operands[2];
22464    }
22465
22466  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
22467  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
22468
22469  offval_1 = INTVAL (offset_1);
22470  offval_2 = INTVAL (offset_2);
22471
22472  if (offval_1 > offval_2)
22473    {
22474      /* Irrespective of whether this is a load or a store,
22475	 we do the same swap.  */
22476      std::swap (operands[0], operands[2]);
22477      std::swap (operands[1], operands[3]);
22478    }
22479}
22480
22481/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
22482   comparison between the two.  */
22483int
22484aarch64_host_wide_int_compare (const void *x, const void *y)
22485{
22486  return wi::cmps (* ((const HOST_WIDE_INT *) x),
22487		   * ((const HOST_WIDE_INT *) y));
22488}
22489
/* Taking X and Y to be pairs of RTX, each pair containing a MEM rtx and
   a REG rtx, compare the offsets used in the two MEM addresses.
22493
22494   Return:
22495
22496	1 iff offset (X) > offset (Y)
22497	0 iff offset (X) == offset (Y)
22498	-1 iff offset (X) < offset (Y)  */
22499int
22500aarch64_ldrstr_offset_compare (const void *x, const void *y)
22501{
22502  const rtx * operands_1 = (const rtx *) x;
22503  const rtx * operands_2 = (const rtx *) y;
22504  rtx mem_1, mem_2, base, offset_1, offset_2;
22505
22506  if (MEM_P (operands_1[0]))
22507    mem_1 = operands_1[0];
22508  else
22509    mem_1 = operands_1[1];
22510
22511  if (MEM_P (operands_2[0]))
22512    mem_2 = operands_2[0];
22513  else
22514    mem_2 = operands_2[1];
22515
22516  /* Extract the offsets.  */
22517  extract_base_offset_in_addr (mem_1, &base, &offset_1);
22518  extract_base_offset_in_addr (mem_2, &base, &offset_2);
22519
22520  gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
22521
22522  return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
22523}
22524
22525/* Given OPERANDS of consecutive load/store, check if we can merge
22526   them into ldp/stp by adjusting the offset.  LOAD is true if they
22527   are load instructions.  MODE is the mode of memory operands.
22528
22529   Given below consecutive stores:
22530
22531     str  w1, [xb, 0x100]
22532     str  w1, [xb, 0x104]
22533     str  w1, [xb, 0x108]
22534     str  w1, [xb, 0x10c]
22535
22536   Though the offsets are out of the range supported by stp, we can
22537   still pair them after adjusting the offset, like:
22538
22539     add  scratch, xb, 0x100
22540     stp  w1, w1, [scratch]
22541     stp  w1, w1, [scratch, 0x8]
22542
22543   The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */
22545
22546bool
22547aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
22548				       scalar_mode mode)
22549{
22550  const int num_insns = 4;
22551  enum reg_class rclass;
22552  HOST_WIDE_INT offvals[num_insns], msize;
22553  rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
22554
22555  if (load)
22556    {
22557      for (int i = 0; i < num_insns; i++)
22558	{
22559	  reg[i] = operands[2 * i];
22560	  mem[i] = operands[2 * i + 1];
22561
22562	  gcc_assert (REG_P (reg[i]));
22563	}
22564
22565      /* Do not attempt to merge the loads if the loads clobber each other.  */
22566      for (int i = 0; i < 8; i += 2)
22567	for (int j = i + 2; j < 8; j += 2)
22568	  if (reg_overlap_mentioned_p (operands[i], operands[j]))
22569	    return false;
22570    }
22571  else
22572    for (int i = 0; i < num_insns; i++)
22573      {
22574	mem[i] = operands[2 * i];
22575	reg[i] = operands[2 * i + 1];
22576      }
22577
22578  /* Skip if memory operand is by itself valid for ldp/stp.  */
22579  if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
22580    return false;
22581
22582  for (int i = 0; i < num_insns; i++)
22583    {
22584      /* The mems cannot be volatile.  */
22585      if (MEM_VOLATILE_P (mem[i]))
22586	return false;
22587
22588      /* Check if the addresses are in the form of [base+offset].  */
22589      extract_base_offset_in_addr (mem[i], base + i, offset + i);
22590      if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
22591	return false;
22592    }
22593
22594  /* Check if the registers are of same class.  */
22595  rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
22596    ? FP_REGS : GENERAL_REGS;
22597
22598  for (int i = 1; i < num_insns; i++)
22599    if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
22600      {
22601	if (rclass != FP_REGS)
22602	  return false;
22603      }
22604    else
22605      {
22606	if (rclass != GENERAL_REGS)
22607	  return false;
22608      }
22609
22610  /* Only the last register in the order in which they occur
22611     may be clobbered by the load.  */
22612  if (rclass == GENERAL_REGS && load)
22613    for (int i = 0; i < num_insns - 1; i++)
22614      if (reg_mentioned_p (reg[i], mem[i]))
22615	return false;
22616
22617  /* Check if the bases are same.  */
22618  for (int i = 0; i < num_insns - 1; i++)
22619    if (!rtx_equal_p (base[i], base[i + 1]))
22620      return false;
22621
22622  for (int i = 0; i < num_insns; i++)
22623    offvals[i] = INTVAL (offset[i]);
22624
22625  msize = GET_MODE_SIZE (mode);
22626
22627  /* Check if the offsets can be put in the right order to do a ldp/stp.  */
22628  qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
22629	 aarch64_host_wide_int_compare);
22630
22631  if (!(offvals[1] == offvals[0] + msize
22632	&& offvals[3] == offvals[2] + msize))
22633    return false;
22634
22635  /* Check that offsets are within range of each other.  The ldp/stp
22636     instructions have 7 bit immediate offsets, so use 0x80.  */
22637  if (offvals[2] - offvals[0] >= msize * 0x80)
22638    return false;
22639
22640  /* The offsets must be aligned with respect to each other.  */
22641  if (offvals[0] % msize != offvals[2] % msize)
22642    return false;
22643
  /* If we have SImode and slow unaligned ldp,
     check that the alignment is at least 8 bytes.  */
22646  if (mode == SImode
22647      && (aarch64_tune_params.extra_tuning_flags
22648	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
22649      && !optimize_size
22650      && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
22651    return false;
22652
22653  return true;
22654}
22655
22656/* Given OPERANDS of consecutive load/store, this function pairs them
22657   into LDP/STP after adjusting the offset.  It depends on the fact
22658   that the operands can be sorted so the offsets are correct for STP.
22659   MODE is the mode of memory operands.  CODE is the rtl operator
22660   which should be applied to all memory operands, it's SIGN_EXTEND,
22661   ZERO_EXTEND or UNKNOWN.  */
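
/* A worked example of the offset adjustment below (editorial): for four
   SImode stores at offsets 0x100, 0x104, 0x108 and 0x10c, MSIZE is 4, the
   midpoint 0x104 is already suitably aligned, and the extra MSIZE bias
   gives BASE_OFF = 0x108.  The scratch register is set to base + 0x108
   and the two STPs then use offsets -8 and 0, both inside the [-256, 252]
   range allowed for an SImode STP.  */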
22662
22663bool
22664aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
22665			     scalar_mode mode, RTX_CODE code)
22666{
22667  rtx base, offset_1, offset_3, t1, t2;
22668  rtx mem_1, mem_2, mem_3, mem_4;
22669  rtx temp_operands[8];
22670  HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
22671		stp_off_upper_limit, stp_off_lower_limit, msize;
22672
22673  /* We make changes on a copy as we may still bail out.  */
22674  for (int i = 0; i < 8; i ++)
22675    temp_operands[i] = operands[i];
22676
22677  /* Sort the operands.  */
22678  qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
22679
22680  /* Copy the memory operands so that if we have to bail for some
22681     reason the original addresses are unchanged.  */
22682  if (load)
22683    {
22684      mem_1 = copy_rtx (temp_operands[1]);
22685      mem_2 = copy_rtx (temp_operands[3]);
22686      mem_3 = copy_rtx (temp_operands[5]);
22687      mem_4 = copy_rtx (temp_operands[7]);
22688    }
22689  else
22690    {
22691      mem_1 = copy_rtx (temp_operands[0]);
22692      mem_2 = copy_rtx (temp_operands[2]);
22693      mem_3 = copy_rtx (temp_operands[4]);
22694      mem_4 = copy_rtx (temp_operands[6]);
22695      gcc_assert (code == UNKNOWN);
22696    }
22697
22698  extract_base_offset_in_addr (mem_1, &base, &offset_1);
22699  extract_base_offset_in_addr (mem_3, &base, &offset_3);
22700  gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
22701	      && offset_3 != NULL_RTX);
22702
22703  /* Adjust offset so it can fit in LDP/STP instruction.  */
22704  msize = GET_MODE_SIZE (mode);
22705  stp_off_upper_limit = msize * (0x40 - 1);
22706  stp_off_lower_limit = - msize * 0x40;
22707
22708  off_val_1 = INTVAL (offset_1);
22709  off_val_3 = INTVAL (offset_3);
22710
22711  /* The base offset is optimally half way between the two STP/LDP offsets.  */
22712  if (msize <= 4)
22713    base_off = (off_val_1 + off_val_3) / 2;
22714  else
    /* However, due to issues with negative LDP/STP offset generation for
       larger modes (DF, DI and vector modes), we must not use negative
       addresses smaller than 9 signed unadjusted bits can store.  This
       provides the most range in this case.  */
22719    base_off = off_val_1;
22720
22721  /* Adjust the base so that it is aligned with the addresses but still
22722     optimal.  */
22723  if (base_off % msize != off_val_1 % msize)
22724    /* Fix the offset, bearing in mind we want to make it bigger not
22725       smaller.  */
22726    base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22727  else if (msize <= 4)
22728    /* The negative range of LDP/STP is one larger than the positive range.  */
22729    base_off += msize;
22730
22731  /* Check if base offset is too big or too small.  We can attempt to resolve
22732     this issue by setting it to the maximum value and seeing if the offsets
22733     still fit.  */
22734  if (base_off >= 0x1000)
22735    {
22736      base_off = 0x1000 - 1;
22737      /* We must still make sure that the base offset is aligned with respect
22738	 to the address.  But it may not be made any bigger.  */
22739      base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22740    }
22741
22742  /* Likewise for the case where the base is too small.  */
22743  if (base_off <= -0x1000)
22744    {
22745      base_off = -0x1000 + 1;
22746      base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22747    }
22748
22749  /* Offset of the first STP/LDP.  */
22750  new_off_1 = off_val_1 - base_off;
22751
22752  /* Offset of the second STP/LDP.  */
22753  new_off_3 = off_val_3 - base_off;
22754
22755  /* The offsets must be within the range of the LDP/STP instructions.  */
22756  if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
22757      || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
22758    return false;
22759
22760  replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
22761						  new_off_1), true);
22762  replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
22763						  new_off_1 + msize), true);
22764  replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
22765						  new_off_3), true);
22766  replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
22767						  new_off_3 + msize), true);
22768
22769  if (!aarch64_mem_pair_operand (mem_1, mode)
22770      || !aarch64_mem_pair_operand (mem_3, mode))
22771    return false;
22772
22773  if (code == ZERO_EXTEND)
22774    {
22775      mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
22776      mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
22777      mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
22778      mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
22779    }
22780  else if (code == SIGN_EXTEND)
22781    {
22782      mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
22783      mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
22784      mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
22785      mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
22786    }
22787
22788  if (load)
22789    {
22790      operands[0] = temp_operands[0];
22791      operands[1] = mem_1;
22792      operands[2] = temp_operands[2];
22793      operands[3] = mem_2;
22794      operands[4] = temp_operands[4];
22795      operands[5] = mem_3;
22796      operands[6] = temp_operands[6];
22797      operands[7] = mem_4;
22798    }
22799  else
22800    {
22801      operands[0] = mem_1;
22802      operands[1] = temp_operands[1];
22803      operands[2] = mem_2;
22804      operands[3] = temp_operands[3];
22805      operands[4] = mem_3;
22806      operands[5] = temp_operands[5];
22807      operands[6] = mem_4;
22808      operands[7] = temp_operands[7];
22809    }
22810
22811  /* Emit adjusting instruction.  */
22812  emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
22813  /* Emit ldp/stp instructions.  */
22814  t1 = gen_rtx_SET (operands[0], operands[1]);
22815  t2 = gen_rtx_SET (operands[2], operands[3]);
22816  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
22817  t1 = gen_rtx_SET (operands[4], operands[5]);
22818  t2 = gen_rtx_SET (operands[6], operands[7]);
22819  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
22820  return true;
22821}
22822
22823/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
22824   it isn't worth branching around empty masked ops (including masked
22825   stores).  */
22826
22827static bool
22828aarch64_empty_mask_is_expensive (unsigned)
22829{
22830  return false;
22831}
22832
22833/* Return true if a pseudo register should be created and used to hold
22834   the GOT address for PIC code.  */
22835
22836bool
22837aarch64_use_pseudo_pic_reg (void)
22838{
22839  return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
22840}
22841
22842/* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
22843
22844static int
22845aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
22846{
22847  switch (XINT (x, 1))
22848    {
22849    case UNSPEC_GOTSMALLPIC:
22850    case UNSPEC_GOTSMALLPIC28K:
22851    case UNSPEC_GOTTINYPIC:
22852      return 0;
22853    default:
22854      break;
22855    }
22856
22857  return default_unspec_may_trap_p (x, flags);
22858}
22859
22860
22861/* If X is a positive CONST_DOUBLE with a value that is a power of 2
22862   return the log2 of that value.  Otherwise return -1.  */
22863
22864int
22865aarch64_fpconst_pow_of_2 (rtx x)
22866{
22867  const REAL_VALUE_TYPE *r;
22868
22869  if (!CONST_DOUBLE_P (x))
22870    return -1;
22871
22872  r = CONST_DOUBLE_REAL_VALUE (x);
22873
22874  if (REAL_VALUE_NEGATIVE (*r)
22875      || REAL_VALUE_ISNAN (*r)
22876      || REAL_VALUE_ISINF (*r)
22877      || !real_isinteger (r, DFmode))
22878    return -1;
22879
22880  return exact_log2 (real_to_integer (r));
22881}
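
/* For example, the function above returns 3 for 8.0 and 0 for 1.0, while
   values such as 0.75, -4.0, NaN or infinity all yield -1.  */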
22882
22883/* If X is a positive CONST_DOUBLE whose value is the reciprocal of a
22884   power of 2 (i.e. 1/2^n for some integer n), return n.  Otherwise
22885   return -1.  */
22886
22887int
22888aarch64_fpconst_pow2_recip (rtx x)
22889{
22890  REAL_VALUE_TYPE r0;
22891
22892  if (!CONST_DOUBLE_P (x))
22893    return -1;
22894
22895  r0 = *CONST_DOUBLE_REAL_VALUE (x);
22896  if (exact_real_inverse (DFmode, &r0)
22897      && !REAL_VALUE_NEGATIVE (r0))
22898    {
22899      int ret = exact_log2 (real_to_integer (&r0));
22900      if (ret >= 1 && ret <= 32)
22901	return ret;
22902    }
22903  return -1;
22904}
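
/* For example, the function above returns 3 for 0.125 (1/2^3) and 10 for
   1.0/1024, whereas 1.0, 0.3 and negative values all yield -1.  */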
22905
22906/* If X is a vector of equal CONST_DOUBLE values and that value is
22907   Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
22908
22909int
22910aarch64_vec_fpconst_pow_of_2 (rtx x)
22911{
22912  int nelts;
22913  if (GET_CODE (x) != CONST_VECTOR
22914      || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
22915    return -1;
22916
22917  if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
22918    return -1;
22919
22920  int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
22921  if (firstval <= 0)
22922    return -1;
22923
22924  for (int i = 1; i < nelts; i++)
22925    if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
22926      return -1;
22927
22928  return firstval;
22929}
22930
22931/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
22932   to float.
22933
22934   __fp16 always promotes through this hook.
22935   _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
22936   through the generic excess precision logic rather than here.  */
22937
22938static tree
22939aarch64_promoted_type (const_tree t)
22940{
22941  if (SCALAR_FLOAT_TYPE_P (t)
22942      && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
22943    return float_type_node;
22944
22945  return NULL_TREE;
22946}
22947
22948/* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
22949
22950static bool
22951aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
22952			   optimization_type opt_type)
22953{
22954  switch (op)
22955    {
22956    case rsqrt_optab:
22957      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
22958
22959    default:
22960      return true;
22961    }
22962}
22963
22964/* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */
22965
22966static unsigned int
22967aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
22968					int *offset)
22969{
22970  /* Polynomial invariant 1 == (VG / 2) - 1.  */
22971  gcc_assert (i == 1);
22972  *factor = 2;
22973  *offset = 1;
22974  return AARCH64_DWARF_VG;
22975}
22976
22977/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
22978   if MODE is HFmode, and punt to the generic implementation otherwise.  */
22979
22980static bool
22981aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
22982{
22983  return (mode == HFmode
22984	  ? true
22985	  : default_libgcc_floating_mode_supported_p (mode));
22986}
22987
22988/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
22989   if MODE is HFmode, and punt to the generic implementation otherwise.  */
22990
22991static bool
22992aarch64_scalar_mode_supported_p (scalar_mode mode)
22993{
22994  return (mode == HFmode
22995	  ? true
22996	  : default_scalar_mode_supported_p (mode));
22997}
22998
22999/* Set the value of FLT_EVAL_METHOD.
23000   ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
23001
23002    0: evaluate all operations and constants, whose semantic type has at
23003       most the range and precision of type float, to the range and
23004       precision of float; evaluate all other operations and constants to
23005       the range and precision of the semantic type;
23006
23007    N, where _FloatN is a supported interchange floating type
23008       evaluate all operations and constants, whose semantic type has at
23009       most the range and precision of _FloatN type, to the range and
23010       precision of the _FloatN type; evaluate all other operations and
23011       constants to the range and precision of the semantic type;
23012
23013   If we have the ARMv8.2-A extensions then we support _Float16 in native
23014   precision, so we should set this to 16.  Otherwise, we support the type,
23015   but want to evaluate expressions in float precision, so set this to
23016   0.  */
23017
23018static enum flt_eval_method
23019aarch64_excess_precision (enum excess_precision_type type)
23020{
23021  switch (type)
23022    {
23023      case EXCESS_PRECISION_TYPE_FAST:
23024      case EXCESS_PRECISION_TYPE_STANDARD:
23025	/* We can calculate either in 16-bit range and precision or
23026	   32-bit range and precision.  Make that decision based on whether
23027	   we have native support for the ARMv8.2-A 16-bit floating-point
23028	   instructions or not.  */
23029	return (TARGET_FP_F16INST
23030		? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
23031		: FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
23032      case EXCESS_PRECISION_TYPE_IMPLICIT:
23033	return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
23034      default:
23035	gcc_unreachable ();
23036    }
23037  return FLT_EVAL_METHOD_UNPREDICTABLE;
23038}
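
/* For instance, given

     _Float16 a, b, c, d;
     d = a * b + c;

   with TARGET_FP_F16INST the intermediate results are kept in _Float16 range
   and precision, whereas without it each operation is evaluated in float and
   only the final assignment converts back to _Float16.  (An illustrative
   sketch; the exact behaviour also depends on the -fexcess-precision=
   setting.)  */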
23039
23040/* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
23041   scheduled for speculative execution.  Reject the long-running division
23042   and square-root instructions.  */
23043
23044static bool
23045aarch64_sched_can_speculate_insn (rtx_insn *insn)
23046{
23047  switch (get_attr_type (insn))
23048    {
23049      case TYPE_SDIV:
23050      case TYPE_UDIV:
23051      case TYPE_FDIVS:
23052      case TYPE_FDIVD:
23053      case TYPE_FSQRTS:
23054      case TYPE_FSQRTD:
23055      case TYPE_NEON_FP_SQRT_S:
23056      case TYPE_NEON_FP_SQRT_D:
23057      case TYPE_NEON_FP_SQRT_S_Q:
23058      case TYPE_NEON_FP_SQRT_D_Q:
23059      case TYPE_NEON_FP_DIV_S:
23060      case TYPE_NEON_FP_DIV_D:
23061      case TYPE_NEON_FP_DIV_S_Q:
23062      case TYPE_NEON_FP_DIV_D_Q:
23063	return false;
23064      default:
23065	return true;
23066    }
23067}
23068
23069/* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */
23070
23071static int
23072aarch64_compute_pressure_classes (reg_class *classes)
23073{
23074  int i = 0;
23075  classes[i++] = GENERAL_REGS;
23076  classes[i++] = FP_REGS;
23077  /* PR_REGS isn't a useful pressure class because many predicate pseudo
23078     registers need to go in PR_LO_REGS at some point during their
23079     lifetime.  Splitting it into two halves has the effect of making
23080     all predicates count against PR_LO_REGS, so that we try whenever
23081     possible to restrict the number of live predicates to 8.  This
23082     greatly reduces the amount of spilling in certain loops.  */
23083  classes[i++] = PR_LO_REGS;
23084  classes[i++] = PR_HI_REGS;
23085  return i;
23086}
23087
23088/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
23089
23090static bool
23091aarch64_can_change_mode_class (machine_mode from,
23092			       machine_mode to, reg_class_t)
23093{
23094  unsigned int from_flags = aarch64_classify_vector_mode (from);
23095  unsigned int to_flags = aarch64_classify_vector_mode (to);
23096
23097  bool from_sve_p = (from_flags & VEC_ANY_SVE);
23098  bool to_sve_p = (to_flags & VEC_ANY_SVE);
23099
23100  bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
23101  bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
23102
23103  bool from_pred_p = (from_flags & VEC_SVE_PRED);
23104  bool to_pred_p = (to_flags & VEC_SVE_PRED);
23105
23106  /* Don't allow changes between predicate modes and other modes.
23107     Only predicate registers can hold predicate modes and only
23108     non-predicate registers can hold non-predicate modes, so any
23109     attempt to mix them would require a round trip through memory.  */
23110  if (from_pred_p != to_pred_p)
23111    return false;
23112
23113  /* Don't allow changes between partial SVE modes and other modes.
23114     The contents of partial SVE modes are distributed evenly across
23115     the register, whereas GCC expects them to be clustered together.  */
23116  if (from_partial_sve_p != to_partial_sve_p)
23117    return false;
23118
23119  /* Similarly reject changes between partial SVE modes that have
23120     different patterns of significant and insignificant bits.  */
23121  if (from_partial_sve_p
23122      && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
23123	  || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
23124    return false;
23125
23126  if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
23127    {
23128      /* Don't allow changes between SVE modes and other modes that might
23129	 be bigger than 128 bits.  In particular, OImode, CImode and XImode
23130	 divide into 128-bit quantities while SVE modes divide into
23131	 BITS_PER_SVE_VECTOR quantities.  */
23132      if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
23133	return false;
23134      if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
23135	return false;
23136    }
23137
23138  if (BYTES_BIG_ENDIAN)
23139    {
23140      /* Don't allow changes between SVE data modes and non-SVE modes.
23141	 See the comment at the head of aarch64-sve.md for details.  */
23142      if (from_sve_p != to_sve_p)
23143	return false;
23144
23145      /* Don't allow changes in element size: lane 0 of the new vector
23146	 would not then be lane 0 of the old vector.  See the comment
23147	 above aarch64_maybe_expand_sve_subreg_move for a more detailed
23148	 description.
23149
23150	 In the worst case, this forces a register to be spilled in
23151	 one mode and reloaded in the other, which handles the
23152	 endianness correctly.  */
23153      if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
23154	return false;
23155    }
23156  return true;
23157}
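
/* For example, on big-endian targets the function above rejects a change
   between VNx4SImode and VNx8HImode because the element sizes differ, so
   such a subreg has to be implemented through a spill and reload, whereas
   on little-endian targets the change is allowed.  */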
23158
23159/* Implement TARGET_EARLY_REMAT_MODES.  */
23160
23161static void
23162aarch64_select_early_remat_modes (sbitmap modes)
23163{
23164  /* SVE values are not normally live across a call, so it should be
23165     worth doing early rematerialization even in VL-specific mode.  */
23166  for (int i = 0; i < NUM_MACHINE_MODES; ++i)
23167    if (aarch64_sve_mode_p ((machine_mode) i))
23168      bitmap_set_bit (modes, i);
23169}
23170
23171/* Override the default target speculation_safe_value.  */
23172static rtx
23173aarch64_speculation_safe_value (machine_mode mode,
23174				rtx result, rtx val, rtx failval)
23175{
23176  /* Maybe we should warn if falling back to hard barriers.  They are
23177     likely to be noticeably more expensive than the alternative below.  */
23178  if (!aarch64_track_speculation)
23179    return default_speculation_safe_value (mode, result, val, failval);
23180
23181  if (!REG_P (val))
23182    val = copy_to_mode_reg (mode, val);
23183
23184  if (!aarch64_reg_or_zero (failval, mode))
23185    failval = copy_to_mode_reg (mode, failval);
23186
23187  emit_insn (gen_despeculate_copy (mode, result, val, failval));
23188  return result;
23189}
23190
23191/* Implement TARGET_ESTIMATED_POLY_VALUE.
23192   Look into the tuning structure for an estimate.
23193   VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
23194   Advanced SIMD 128 bits.  */
23195
23196static HOST_WIDE_INT
23197aarch64_estimated_poly_value (poly_int64 val)
23198{
23199  enum aarch64_sve_vector_bits_enum width_source
23200    = aarch64_tune_params.sve_width;
23201
23202  /* If we still don't have an estimate, use the default.  */
23203  if (width_source == SVE_SCALABLE)
23204    return default_estimated_poly_value (val);
23205
23206  HOST_WIDE_INT over_128 = width_source - 128;
23207  return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
23208}
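
/* As a worked example: if the tuning structure sets sve_width to 256, then
   over_128 == 128 and a poly_int64 such as 16 + 16x (the number of bytes in
   an SVE vector) is estimated as 16 + 16 * 128 / 128 == 32 bytes, i.e. a
   256-bit vector.  */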
23209
23210
23211/* Return true for types that could be supported as SIMD return or
23212   argument types.  */
23213
23214static bool
23215supported_simd_type (tree t)
23216{
23217  if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
23218    {
23219      HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
23220      return s == 1 || s == 2 || s == 4 || s == 8;
23221    }
23222  return false;
23223}
23224
23225/* Return true for types that currently are supported as SIMD return
23226   or argument types.  */
23227
23228static bool
23229currently_supported_simd_type (tree t, tree b)
23230{
23231  if (COMPLEX_FLOAT_TYPE_P (t))
23232    return false;
23233
23234  if (TYPE_SIZE (t) != TYPE_SIZE (b))
23235    return false;
23236
23237  return supported_simd_type (t);
23238}
23239
23240/* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN.  */
23241
23242static int
23243aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
23244					struct cgraph_simd_clone *clonei,
23245					tree base_type, int num)
23246{
23247  tree t, ret_type, arg_type;
23248  unsigned int elt_bits, vec_bits, count;
23249
23250  if (!TARGET_SIMD)
23251    return 0;
23252
23253  if (clonei->simdlen
23254      && (clonei->simdlen < 2
23255	  || clonei->simdlen > 1024
23256	  || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
23257    {
23258      warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23259		  "unsupported simdlen %d", clonei->simdlen);
23260      return 0;
23261    }
23262
23263  ret_type = TREE_TYPE (TREE_TYPE (node->decl));
23264  if (TREE_CODE (ret_type) != VOID_TYPE
23265      && !currently_supported_simd_type (ret_type, base_type))
23266    {
23267      if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
23268	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23269		    "GCC does not currently support mixed size types "
23270		    "for %<simd%> functions");
23271      else if (supported_simd_type (ret_type))
23272	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23273		    "GCC does not currently support return type %qT "
23274		    "for %<simd%> functions", ret_type);
23275      else
23276	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23277		    "unsupported return type %qT for %<simd%> functions",
23278		    ret_type);
23279      return 0;
23280    }
23281
23282  int i;
23283  tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
23284  bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
23285
23286  for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
23287       t && t != void_list_node; t = TREE_CHAIN (t), i++)
23288    {
23289      tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
23290
23291      if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
23292	  && !currently_supported_simd_type (arg_type, base_type))
23293	{
23294	  if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
23295	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23296			"GCC does not currently support mixed size types "
23297			"for %<simd%> functions");
23298	  else
23299	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23300			"GCC does not currently support argument type %qT "
23301			"for %<simd%> functions", arg_type);
23302	  return 0;
23303	}
23304    }
23305
23306  clonei->vecsize_mangle = 'n';
23307  clonei->mask_mode = VOIDmode;
23308  elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
23309  if (clonei->simdlen == 0)
23310    {
23311      count = 2;
23312      vec_bits = (num == 0 ? 64 : 128);
23313      clonei->simdlen = vec_bits / elt_bits;
23314    }
23315  else
23316    {
23317      count = 1;
23318      vec_bits = clonei->simdlen * elt_bits;
23319      if (vec_bits != 64 && vec_bits != 128)
23320	{
23321	  warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
23322		      "GCC does not currently support simdlen %d for type %qT",
23323		      clonei->simdlen, base_type);
23324	  return 0;
23325	}
23326    }
23327  clonei->vecsize_int = vec_bits;
23328  clonei->vecsize_float = vec_bits;
23329  return count;
23330}
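
/* For instance, for a 'float' base type with no explicit simdlen clause,
   the function above registers two Advanced SIMD clones: one using 64-bit
   vectors (simdlen 2) and one using 128-bit vectors (simdlen 4), both with
   the 'n' ISA mangling character.  */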
23331
23332/* Implement TARGET_SIMD_CLONE_ADJUST.  */
23333
23334static void
23335aarch64_simd_clone_adjust (struct cgraph_node *node)
23336{
23337  /* Add aarch64_vector_pcs target attribute to SIMD clones so they
23338     use the correct ABI.  */
23339
23340  tree t = TREE_TYPE (node->decl);
23341  TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
23342					TYPE_ATTRIBUTES (t));
23343}
23344
23345/* Implement TARGET_SIMD_CLONE_USABLE.  */
23346
23347static int
23348aarch64_simd_clone_usable (struct cgraph_node *node)
23349{
23350  switch (node->simdclone->vecsize_mangle)
23351    {
23352    case 'n':
23353      if (!TARGET_SIMD)
23354	return -1;
23355      return 0;
23356    default:
23357      gcc_unreachable ();
23358    }
23359}
23360
23361/* Implement TARGET_COMP_TYPE_ATTRIBUTES.  */
23362
23363static int
23364aarch64_comp_type_attributes (const_tree type1, const_tree type2)
23365{
23366  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
23367      != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
23368    return 0;
23369  return 1;
23370}
23371
23372/* Implement TARGET_GET_MULTILIB_ABI_NAME.  */
23373
23374static const char *
23375aarch64_get_multilib_abi_name (void)
23376{
23377  if (TARGET_BIG_END)
23378    return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
23379  return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
23380}
23381
23382/* Implement TARGET_STACK_PROTECT_GUARD.  If a global variable based
23383   guard is in use, return the default guard; otherwise return
23384   NULL_TREE.  */
23385static tree
23386aarch64_stack_protect_guard (void)
23387{
23388  if (aarch64_stack_protector_guard == SSP_GLOBAL)
23389    return default_stack_protect_guard ();
23390
23391  return NULL_TREE;
23392}
23393
23394/* Return the diagnostic message string if conversion from FROMTYPE to
23395   TOTYPE is not allowed, NULL otherwise.  */
23396
23397static const char *
23398aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
23399{
23400  if (element_mode (fromtype) != element_mode (totype))
23401    {
23402      /* Do not allow conversions to/from BFmode scalar types.  */
23403      if (TYPE_MODE (fromtype) == BFmode)
23404	return N_("invalid conversion from type %<bfloat16_t%>");
23405      if (TYPE_MODE (totype) == BFmode)
23406	return N_("invalid conversion to type %<bfloat16_t%>");
23407    }
23408
23409  /* Conversion allowed.  */
23410  return NULL;
23411}
23412
23413/* Return the diagnostic message string if the unary operation OP is
23414   not permitted on TYPE, NULL otherwise.  */
23415
23416static const char *
23417aarch64_invalid_unary_op (int op, const_tree type)
23418{
23419  /* Reject all single-operand operations on BFmode except for &.  */
23420  if (element_mode (type) == BFmode && op != ADDR_EXPR)
23421    return N_("operation not permitted on type %<bfloat16_t%>");
23422
23423  /* Operation allowed.  */
23424  return NULL;
23425}
23426
23427/* Return the diagnostic message string if the binary operation OP is
23428   not permitted on TYPE1 and TYPE2, NULL otherwise.  */
23429
23430static const char *
23431aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
23432			   const_tree type2)
23433{
23434  /* Reject all 2-operand operations on BFmode.  */
23435  if (element_mode (type1) == BFmode
23436      || element_mode (type2) == BFmode)
23437    return N_("operation not permitted on type %<bfloat16_t%>");
23438
23439  if (VECTOR_TYPE_P (type1)
23440      && VECTOR_TYPE_P (type2)
23441      && !TYPE_INDIVISIBLE_P (type1)
23442      && !TYPE_INDIVISIBLE_P (type2)
23443      && (aarch64_sve::builtin_type_p (type1)
23444	  != aarch64_sve::builtin_type_p (type2)))
23445    return N_("cannot combine GNU and SVE vectors in a binary operation");
23446
23447  /* Operation allowed.  */
23448  return NULL;
23449}
23450
23451/* Implement TARGET_ASM_FILE_END for AArch64.  This adds the AArch64 GNU NOTE
23452   section at the end if needed.  */
23453#define GNU_PROPERTY_AARCH64_FEATURE_1_AND	0xc0000000
23454#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI	(1U << 0)
23455#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC	(1U << 1)
23456void
23457aarch64_file_end_indicate_exec_stack ()
23458{
23459  file_end_indicate_exec_stack ();
23460
23461  unsigned feature_1_and = 0;
23462  if (aarch64_bti_enabled ())
23463    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
23464
23465  if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
23466    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
23467
23468  if (feature_1_and)
23469    {
23470      /* Generate .note.gnu.property section.  */
23471      switch_to_section (get_section (".note.gnu.property",
23472				      SECTION_NOTYPE, NULL));
23473
23474      /* PT_NOTE header: namesz, descsz, type.
23475	 namesz = 4 ("GNU\0")
23476	 descsz = 16 (Size of the program property array)
23477		  [(12 + padding) * Number of array elements]
23478	 type   = 5 (NT_GNU_PROPERTY_TYPE_0).  */
23479      assemble_align (POINTER_SIZE);
23480      assemble_integer (GEN_INT (4), 4, 32, 1);
23481      assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
23482      assemble_integer (GEN_INT (5), 4, 32, 1);
23483
23484      /* PT_NOTE name.  */
23485      assemble_string ("GNU", 4);
23486
23487      /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
23488	 type   = GNU_PROPERTY_AARCH64_FEATURE_1_AND
23489	 datasz = 4
23490	 data   = feature_1_and.  */
23491      assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
23492      assemble_integer (GEN_INT (4), 4, 32, 1);
23493      assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
23494
23495      /* Pad the size of the note to the required alignment.  */
23496      assemble_align (POINTER_SIZE);
23497    }
23498}
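
/* Roughly, the note emitted above occupies 32 bytes on LP64 targets:
   { namesz = 4, descsz = 16, type = 5 (NT_GNU_PROPERTY_TYPE_0), "GNU\0",
     pr_type = GNU_PROPERTY_AARCH64_FEATURE_1_AND, pr_datasz = 4,
     pr_data = feature_1_and, 4 bytes of alignment padding }.  */
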
23499#undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
23500#undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
23501#undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
23502
23503/* Helper function for straight line speculation.
23504   Return what barrier should be emitted for straight line speculation
23505   mitigation.
23506   When not mitigating against straight line speculation this function returns
23507   an empty string.
23508   When mitigating against straight line speculation, use:
23509   * SB when the v8.5-A SB extension is enabled.
23510   * DSB+ISB otherwise.  */
23511const char *
23512aarch64_sls_barrier (int mitigation_required)
23513{
23514  return mitigation_required
23515    ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
23516    : "";
23517}
23518
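/* Decls for the shared SLS BLR thunks, indexed by the number of the register
   that each thunk branches through, together with the assembler-level names
   given to them.  The x16 and x17 entries are intentionally empty: no shared
   stub is emitted for those registers.  */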
23519static GTY (()) tree aarch64_sls_shared_thunks[30];
23520static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
23521const char *indirect_symbol_names[30] = {
23522    "__call_indirect_x0",
23523    "__call_indirect_x1",
23524    "__call_indirect_x2",
23525    "__call_indirect_x3",
23526    "__call_indirect_x4",
23527    "__call_indirect_x5",
23528    "__call_indirect_x6",
23529    "__call_indirect_x7",
23530    "__call_indirect_x8",
23531    "__call_indirect_x9",
23532    "__call_indirect_x10",
23533    "__call_indirect_x11",
23534    "__call_indirect_x12",
23535    "__call_indirect_x13",
23536    "__call_indirect_x14",
23537    "__call_indirect_x15",
23538    "", /* "__call_indirect_x16",  */
23539    "", /* "__call_indirect_x17",  */
23540    "__call_indirect_x18",
23541    "__call_indirect_x19",
23542    "__call_indirect_x20",
23543    "__call_indirect_x21",
23544    "__call_indirect_x22",
23545    "__call_indirect_x23",
23546    "__call_indirect_x24",
23547    "__call_indirect_x25",
23548    "__call_indirect_x26",
23549    "__call_indirect_x27",
23550    "__call_indirect_x28",
23551    "__call_indirect_x29",
23552};
23553
23554/* Function to create a BLR thunk.  This thunk is used to mitigate straight
23555   line speculation.  Instead of a simple BLR that can be speculated past,
23556   we emit a BL to this thunk, and this thunk contains a BR to the relevant
23557   register.  These thunks have the relevant speculation barriers put after
23558   their indirect branch so that speculation is blocked.
23559
23560   We use such a thunk so the speculation barriers are kept off the
23561   architecturally executed path in order to reduce the performance overhead.
23562
23563   When optimizing for size we use stubs shared by the linked object.
23564   When optimizing for performance we emit stubs for each function in the hope
23565   that the branch predictor can better train on jumps specific for a given
23566   function.  */
23567rtx
23568aarch64_sls_create_blr_label (int regnum)
23569{
23570  gcc_assert (STUB_REGNUM_P (regnum));
23571  if (optimize_function_for_size_p (cfun))
23572    {
23573      /* For the thunks shared between different functions in this compilation
23574	 unit we use a named symbol -- this is just for users to more easily
23575	 understand the generated assembly.  */
23576      aarch64_sls_shared_thunks_needed = true;
23577      const char *thunk_name = indirect_symbol_names[regnum];
23578      if (aarch64_sls_shared_thunks[regnum] == NULL)
23579	{
23580	  /* Build a decl representing this function stub and record it for
23581	     later.  We build a decl here so we can use the GCC machinery for
23582	     handling sections automatically (through `get_named_section` and
23583	     `make_decl_one_only`).  That saves us a lot of trouble handling
23584	     the specifics of different output file formats.  */
23585	  tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
23586				  get_identifier (thunk_name),
23587				  build_function_type_list (void_type_node,
23588							    NULL_TREE));
23589	  DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
23590					   NULL_TREE, void_type_node);
23591	  TREE_PUBLIC (decl) = 1;
23592	  TREE_STATIC (decl) = 1;
23593	  DECL_IGNORED_P (decl) = 1;
23594	  DECL_ARTIFICIAL (decl) = 1;
23595	  make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
23596	  resolve_unique_section (decl, 0, false);
23597	  aarch64_sls_shared_thunks[regnum] = decl;
23598	}
23599
23600      return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
23601    }
23602
23603  if (cfun->machine->call_via[regnum] == NULL)
23604    cfun->machine->call_via[regnum]
23605      = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
23606  return cfun->machine->call_via[regnum];
23607}
23608
23609/* Helper function for aarch64_sls_emit_blr_function_thunks and
23610   aarch64_sls_emit_shared_blr_thunks below.  */
23611static void
23612aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
23613{
23614  /* Save in x16 and branch to that function so this transformation does
23615     not prevent jumping to `BTI c` instructions.  */
23616  asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
23617  asm_fprintf (out_file, "\tbr\tx16\n");
23618}
23619
23620/* Emit all BLR stubs for this particular function.
23621   Here we emit all the BLR stubs needed for the current function.  Since we
23622   emit these stubs in a consecutive block we know there will be no speculation
23623   gadgets between each stub, and hence we only emit a speculation barrier at
23624   the end of the stub sequences.
23625
23626   This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook.  */
23627void
23628aarch64_sls_emit_blr_function_thunks (FILE *out_file)
23629{
23630  if (! aarch64_harden_sls_blr_p ())
23631    return;
23632
23633  bool any_functions_emitted = false;
23634  /* We must save and restore the current function section since this assembly
23635     is emitted at the end of the function.  This means it can be emitted *just
23636     after* the cold section of a function.  That cold part would be emitted in
23637     a different section.  That switch would trigger a `.cfi_endproc` directive
23638     to be emitted in the original section and a `.cfi_startproc` directive to
23639     be emitted in the new section.  Switching to the original section without
23640     restoring would mean that the `.cfi_endproc` emitted as a function ends
23641     would happen in a different section -- leaving an unmatched
23642     `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
23643     in the standard text section.  */
23644  section *save_text_section = in_section;
23645  switch_to_section (function_section (current_function_decl));
23646  for (int regnum = 0; regnum < 30; ++regnum)
23647    {
23648      rtx specu_label = cfun->machine->call_via[regnum];
23649      if (specu_label == NULL)
23650	continue;
23651
23652      targetm.asm_out.print_operand (out_file, specu_label, 0);
23653      asm_fprintf (out_file, ":\n");
23654      aarch64_sls_emit_function_stub (out_file, regnum);
23655      any_functions_emitted = true;
23656    }
23657  if (any_functions_emitted)
23658    /* We can use SB here if need be, since these stubs will only be used
23659       by the current function, and hence for the current target.  */
23660    asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
23661  switch_to_section (save_text_section);
23662}
23663
23664/* Emit shared BLR stubs for the current compilation unit.
23665   Over the course of compiling this unit we may have converted some BLR
23666   instructions to a BL to a shared stub function.  This is where we emit those
23667   stub functions.
23668   This function is for the stubs shared between different functions in this
23669   compilation unit.  We share when optimizing for size instead of speed.
23670
23671   This function is called through the TARGET_ASM_FILE_END hook.  */
23672void
23673aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
23674{
23675  if (! aarch64_sls_shared_thunks_needed)
23676    return;
23677
23678  for (int regnum = 0; regnum < 30; ++regnum)
23679    {
23680      tree decl = aarch64_sls_shared_thunks[regnum];
23681      if (!decl)
23682	continue;
23683
23684      const char *name = indirect_symbol_names[regnum];
23685      switch_to_section (get_named_section (decl, NULL, 0));
23686      ASM_OUTPUT_ALIGN (out_file, 2);
23687      targetm.asm_out.globalize_label (out_file, name);
23688      /* Only emits if the compiler is configured for an assembler that can
23689	 handle visibility directives.  */
23690      targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
23691      ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
23692      ASM_OUTPUT_LABEL (out_file, name);
23693      aarch64_sls_emit_function_stub (out_file, regnum);
23694      /* Use the most conservative target to ensure it can always be used by any
23695	 function in the translation unit.  */
23696      asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
23697      ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
23698    }
23699}
23700
23701/* Implement TARGET_ASM_FILE_END.  */
23702void
23703aarch64_asm_file_end ()
23704{
23705  aarch64_sls_emit_shared_blr_thunks (asm_out_file);
23706  /* Since this function will be called for the ASM_FILE_END hook, we ensure
23707     that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
23708     for FreeBSD) still gets called.  */
23709#ifdef TARGET_ASM_FILE_END
23710  TARGET_ASM_FILE_END ();
23711#endif
23712}
23713
23714const char *
23715aarch64_indirect_call_asm (rtx addr)
23716{
23717  gcc_assert (REG_P (addr));
23718  if (aarch64_harden_sls_blr_p ())
23719    {
23720      rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
23721      output_asm_insn ("bl\t%0", &stub_label);
23722    }
23723  else
23724   output_asm_insn ("blr\t%0", &addr);
23725  return "";
23726}
23727
23728/* Target-specific selftests.  */
23729
23730#if CHECKING_P
23731
23732namespace selftest {
23733
23734/* Selftest for the RTL loader.
23735   Verify that the RTL loader copes with a dump from
23736   print_rtx_function.  This is essentially just a test that class
23737   function_reader can handle a real dump, but it also verifies
23738   that lookup_reg_by_dump_name correctly handles hard regs.
23739   The presence of hard reg names in the dump means that the test is
23740   target-specific, hence it is in this file.  */
23741
23742static void
23743aarch64_test_loading_full_dump ()
23744{
23745  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
23746
23747  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
23748
23749  rtx_insn *insn_1 = get_insn_by_uid (1);
23750  ASSERT_EQ (NOTE, GET_CODE (insn_1));
23751
23752  rtx_insn *insn_15 = get_insn_by_uid (15);
23753  ASSERT_EQ (INSN, GET_CODE (insn_15));
23754  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
23755
23756  /* Verify crtl->return_rtx.  */
23757  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
23758  ASSERT_EQ (0, REGNO (crtl->return_rtx));
23759  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
23760}
23761
23762/* Run all target-specific selftests.  */
23763
23764static void
23765aarch64_run_selftests (void)
23766{
23767  aarch64_test_loading_full_dump ();
23768}
23769
23770} // namespace selftest
23771
23772#endif /* #if CHECKING_P */
23773
23774#undef TARGET_STACK_PROTECT_GUARD
23775#define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
23776
23777#undef TARGET_ADDRESS_COST
23778#define TARGET_ADDRESS_COST aarch64_address_cost
23779
23780/* This hook determines whether unnamed bitfields affect the alignment
23781   of the containing structure.  The hook returns true if the structure
23782   should inherit the alignment requirements of an unnamed bitfield's
23783   type.  */
23784#undef TARGET_ALIGN_ANON_BITFIELD
23785#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
23786
23787#undef TARGET_ASM_ALIGNED_DI_OP
23788#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
23789
23790#undef TARGET_ASM_ALIGNED_HI_OP
23791#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
23792
23793#undef TARGET_ASM_ALIGNED_SI_OP
23794#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
23795
23796#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
23797#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
23798  hook_bool_const_tree_hwi_hwi_const_tree_true
23799
23800#undef TARGET_ASM_FILE_START
23801#define TARGET_ASM_FILE_START aarch64_start_file
23802
23803#undef TARGET_ASM_OUTPUT_MI_THUNK
23804#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
23805
23806#undef TARGET_ASM_SELECT_RTX_SECTION
23807#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
23808
23809#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
23810#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
23811
23812#undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
23813#define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
23814
23815#undef TARGET_BUILD_BUILTIN_VA_LIST
23816#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
23817
23818#undef TARGET_CALLEE_COPIES
23819#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
23820
23821#undef TARGET_CAN_ELIMINATE
23822#define TARGET_CAN_ELIMINATE aarch64_can_eliminate
23823
23824#undef TARGET_CAN_INLINE_P
23825#define TARGET_CAN_INLINE_P aarch64_can_inline_p
23826
23827#undef TARGET_CANNOT_FORCE_CONST_MEM
23828#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
23829
23830#undef TARGET_CASE_VALUES_THRESHOLD
23831#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
23832
23833#undef TARGET_CONDITIONAL_REGISTER_USAGE
23834#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
23835
23836#undef TARGET_MEMBER_TYPE_FORCES_BLK
23837#define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
23838
23839/* Only the least significant bit is used for initialization guard
23840   variables.  */
23841#undef TARGET_CXX_GUARD_MASK_BIT
23842#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
23843
23844#undef TARGET_C_MODE_FOR_SUFFIX
23845#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
23846
23847#ifdef TARGET_BIG_ENDIAN_DEFAULT
23848#undef  TARGET_DEFAULT_TARGET_FLAGS
23849#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
23850#endif
23851
23852#undef TARGET_CLASS_MAX_NREGS
23853#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
23854
23855#undef TARGET_BUILTIN_DECL
23856#define TARGET_BUILTIN_DECL aarch64_builtin_decl
23857
23858#undef TARGET_BUILTIN_RECIPROCAL
23859#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
23860
23861#undef TARGET_C_EXCESS_PRECISION
23862#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
23863
23864#undef  TARGET_EXPAND_BUILTIN
23865#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
23866
23867#undef TARGET_EXPAND_BUILTIN_VA_START
23868#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
23869
23870#undef TARGET_FOLD_BUILTIN
23871#define TARGET_FOLD_BUILTIN aarch64_fold_builtin
23872
23873#undef TARGET_FUNCTION_ARG
23874#define TARGET_FUNCTION_ARG aarch64_function_arg
23875
23876#undef TARGET_FUNCTION_ARG_ADVANCE
23877#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
23878
23879#undef TARGET_FUNCTION_ARG_BOUNDARY
23880#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
23881
23882#undef TARGET_FUNCTION_ARG_PADDING
23883#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
23884
23885#undef TARGET_GET_RAW_RESULT_MODE
23886#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
23887#undef TARGET_GET_RAW_ARG_MODE
23888#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
23889
23890#undef TARGET_FUNCTION_OK_FOR_SIBCALL
23891#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
23892
23893#undef TARGET_FUNCTION_VALUE
23894#define TARGET_FUNCTION_VALUE aarch64_function_value
23895
23896#undef TARGET_FUNCTION_VALUE_REGNO_P
23897#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
23898
23899#undef TARGET_GIMPLE_FOLD_BUILTIN
23900#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
23901
23902#undef TARGET_GIMPLIFY_VA_ARG_EXPR
23903#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
23904
23905#undef  TARGET_INIT_BUILTINS
23906#define TARGET_INIT_BUILTINS  aarch64_init_builtins
23907
23908#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
23909#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
23910  aarch64_ira_change_pseudo_allocno_class
23911
23912#undef TARGET_LEGITIMATE_ADDRESS_P
23913#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
23914
23915#undef TARGET_LEGITIMATE_CONSTANT_P
23916#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
23917
23918#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
23919#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
23920  aarch64_legitimize_address_displacement
23921
23922#undef TARGET_LIBGCC_CMP_RETURN_MODE
23923#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
23924
23925#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
23926#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
23927aarch64_libgcc_floating_mode_supported_p
23928
23929#undef TARGET_MANGLE_TYPE
23930#define TARGET_MANGLE_TYPE aarch64_mangle_type
23931
23932#undef TARGET_INVALID_CONVERSION
23933#define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
23934
23935#undef TARGET_INVALID_UNARY_OP
23936#define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
23937
23938#undef TARGET_INVALID_BINARY_OP
23939#define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
23940
23941#undef TARGET_VERIFY_TYPE_CONTEXT
23942#define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
23943
23944#undef TARGET_MEMORY_MOVE_COST
23945#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
23946
23947#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
23948#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
23949
23950#undef TARGET_MUST_PASS_IN_STACK
23951#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
23952
23953/* This target hook should return true if accesses to volatile bitfields
23954   should use the narrowest mode possible.  It should return false if these
23955   accesses should use the bitfield container type.  */
23956#undef TARGET_NARROW_VOLATILE_BITFIELD
23957#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
23958
23959#undef  TARGET_OPTION_OVERRIDE
23960#define TARGET_OPTION_OVERRIDE aarch64_override_options
23961
23962#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
23963#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
23964  aarch64_override_options_after_change
23965
23966#undef TARGET_OPTION_SAVE
23967#define TARGET_OPTION_SAVE aarch64_option_save
23968
23969#undef TARGET_OPTION_RESTORE
23970#define TARGET_OPTION_RESTORE aarch64_option_restore
23971
23972#undef TARGET_OPTION_PRINT
23973#define TARGET_OPTION_PRINT aarch64_option_print
23974
23975#undef TARGET_OPTION_VALID_ATTRIBUTE_P
23976#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
23977
23978#undef TARGET_SET_CURRENT_FUNCTION
23979#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
23980
23981#undef TARGET_PASS_BY_REFERENCE
23982#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
23983
23984#undef TARGET_PREFERRED_RELOAD_CLASS
23985#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
23986
23987#undef TARGET_SCHED_REASSOCIATION_WIDTH
23988#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
23989
23990#undef TARGET_PROMOTED_TYPE
23991#define TARGET_PROMOTED_TYPE aarch64_promoted_type
23992
23993#undef TARGET_SECONDARY_RELOAD
23994#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
23995
23996#undef TARGET_SHIFT_TRUNCATION_MASK
23997#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
23998
23999#undef TARGET_SETUP_INCOMING_VARARGS
24000#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
24001
24002#undef TARGET_STRUCT_VALUE_RTX
24003#define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
24004
24005#undef TARGET_REGISTER_MOVE_COST
24006#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
24007
24008#undef TARGET_RETURN_IN_MEMORY
24009#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
24010
24011#undef TARGET_RETURN_IN_MSB
24012#define TARGET_RETURN_IN_MSB aarch64_return_in_msb
24013
24014#undef TARGET_RTX_COSTS
24015#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
24016
24017#undef TARGET_SCALAR_MODE_SUPPORTED_P
24018#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
24019
24020#undef TARGET_SCHED_ISSUE_RATE
24021#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
24022
24023#undef TARGET_SCHED_VARIABLE_ISSUE
24024#define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
24025
24026#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
24027#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
24028  aarch64_sched_first_cycle_multipass_dfa_lookahead
24029
24030#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
24031#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
24032  aarch64_first_cycle_multipass_dfa_lookahead_guard
24033
24034#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
24035#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
24036  aarch64_get_separate_components
24037
24038#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
24039#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
24040  aarch64_components_for_bb
24041
24042#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
24043#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
24044  aarch64_disqualify_components
24045
24046#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
24047#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
24048  aarch64_emit_prologue_components
24049
24050#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
24051#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
24052  aarch64_emit_epilogue_components
24053
24054#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
24055#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
24056  aarch64_set_handled_components
24057
24058#undef TARGET_TRAMPOLINE_INIT
24059#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
24060
24061#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
24062#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
24063
24064#undef TARGET_VECTOR_MODE_SUPPORTED_P
24065#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
24066
24067#undef TARGET_COMPATIBLE_VECTOR_TYPES_P
24068#define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
24069
24070#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
24071#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
24072  aarch64_builtin_support_vector_misalignment
24073
24074#undef TARGET_ARRAY_MODE
24075#define TARGET_ARRAY_MODE aarch64_array_mode
24076
24077#undef TARGET_ARRAY_MODE_SUPPORTED_P
24078#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
24079
24080#undef TARGET_VECTORIZE_ADD_STMT_COST
24081#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
24082
24083#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
24084#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
24085  aarch64_builtin_vectorization_cost
24086
24087#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
24088#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
24089
24090#undef TARGET_VECTORIZE_BUILTINS
24091#define TARGET_VECTORIZE_BUILTINS
24092
24093#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
24094#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
24095  aarch64_builtin_vectorized_function
24096
24097#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
24098#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
24099  aarch64_autovectorize_vector_modes
24100
24101#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
24102#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
24103  aarch64_atomic_assign_expand_fenv
24104
24105/* Section anchor support.  */
24106
24107#undef TARGET_MIN_ANCHOR_OFFSET
24108#define TARGET_MIN_ANCHOR_OFFSET -256
24109
24110/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
24111   byte offset; we can do much more for larger data types, but have no way
24112   to determine the size of the access.  We assume accesses are aligned.  */
24113#undef TARGET_MAX_ANCHOR_OFFSET
24114#define TARGET_MAX_ANCHOR_OFFSET 4095
24115
24116#undef TARGET_VECTOR_ALIGNMENT
24117#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
24118
24119#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
24120#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
24121  aarch64_vectorize_preferred_vector_alignment
24122#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
24123#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
24124  aarch64_simd_vector_alignment_reachable
24125
24126/* vec_perm support.  */
24127
24128#undef TARGET_VECTORIZE_VEC_PERM_CONST
24129#define TARGET_VECTORIZE_VEC_PERM_CONST \
24130  aarch64_vectorize_vec_perm_const
24131
24132#undef TARGET_VECTORIZE_RELATED_MODE
24133#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
24134#undef TARGET_VECTORIZE_GET_MASK_MODE
24135#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
24136#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
24137#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
24138  aarch64_empty_mask_is_expensive
24139#undef TARGET_PREFERRED_ELSE_VALUE
24140#define TARGET_PREFERRED_ELSE_VALUE \
24141  aarch64_preferred_else_value
24142
24143#undef TARGET_INIT_LIBFUNCS
24144#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
24145
24146#undef TARGET_FIXED_CONDITION_CODE_REGS
24147#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
24148
24149#undef TARGET_FLAGS_REGNUM
24150#define TARGET_FLAGS_REGNUM CC_REGNUM
24151
24152#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
24153#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
24154
24155#undef TARGET_ASAN_SHADOW_OFFSET
24156#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
24157
24158#undef TARGET_LEGITIMIZE_ADDRESS
24159#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
24160
24161#undef TARGET_SCHED_CAN_SPECULATE_INSN
24162#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
24163
24164#undef TARGET_CAN_USE_DOLOOP_P
24165#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
24166
24167#undef TARGET_SCHED_ADJUST_PRIORITY
24168#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
24169
24170#undef TARGET_SCHED_MACRO_FUSION_P
24171#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
24172
24173#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
24174#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
24175
24176#undef TARGET_SCHED_FUSION_PRIORITY
24177#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
24178
24179#undef TARGET_UNSPEC_MAY_TRAP_P
24180#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
24181
24182#undef TARGET_USE_PSEUDO_PIC_REG
24183#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
24184
24185#undef TARGET_PRINT_OPERAND
24186#define TARGET_PRINT_OPERAND aarch64_print_operand
24187
24188#undef TARGET_PRINT_OPERAND_ADDRESS
24189#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
24190
24191#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
24192#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
24193
24194#undef TARGET_OPTAB_SUPPORTED_P
24195#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
24196
24197#undef TARGET_OMIT_STRUCT_RETURN_REG
24198#define TARGET_OMIT_STRUCT_RETURN_REG true
24199
24200#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
24201#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
24202  aarch64_dwarf_poly_indeterminate_value
24203
24204/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
24205#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
24206#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
24207
24208#undef TARGET_HARD_REGNO_NREGS
24209#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
24210#undef TARGET_HARD_REGNO_MODE_OK
24211#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
24212
24213#undef TARGET_MODES_TIEABLE_P
24214#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
24215
24216#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
24217#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
24218  aarch64_hard_regno_call_part_clobbered
24219
24220#undef TARGET_INSN_CALLEE_ABI
24221#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
24222
24223#undef TARGET_CONSTANT_ALIGNMENT
24224#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
24225
24226#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
24227#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
24228  aarch64_stack_clash_protection_alloca_probe_range
24229
24230#undef TARGET_COMPUTE_PRESSURE_CLASSES
24231#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
24232
24233#undef TARGET_CAN_CHANGE_MODE_CLASS
24234#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
24235
24236#undef TARGET_SELECT_EARLY_REMAT_MODES
24237#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
24238
24239#undef TARGET_SPECULATION_SAFE_VALUE
24240#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
24241
24242#undef TARGET_ESTIMATED_POLY_VALUE
24243#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
24244
24245#undef TARGET_ATTRIBUTE_TABLE
24246#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
24247
24248#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
24249#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
24250  aarch64_simd_clone_compute_vecsize_and_simdlen
24251
24252#undef TARGET_SIMD_CLONE_ADJUST
24253#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
24254
24255#undef TARGET_SIMD_CLONE_USABLE
24256#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
24257
24258#undef TARGET_COMP_TYPE_ATTRIBUTES
24259#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
24260
24261#undef TARGET_GET_MULTILIB_ABI_NAME
24262#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
24263
24264#undef TARGET_FNTYPE_ABI
24265#define TARGET_FNTYPE_ABI aarch64_fntype_abi
24266
24267#if CHECKING_P
24268#undef TARGET_RUN_TARGET_SELFTESTS
24269#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
24270#endif /* #if CHECKING_P */
24271
24272#undef TARGET_ASM_POST_CFI_STARTPROC
24273#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
24274
24275#undef TARGET_STRICT_ARGUMENT_NAMING
24276#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
24277
24278#undef TARGET_MD_ASM_ADJUST
24279#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
24280
24281#undef TARGET_ASM_FILE_END
24282#define TARGET_ASM_FILE_END aarch64_asm_file_end
24283
24284#undef TARGET_ASM_FUNCTION_EPILOGUE
24285#define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
24286
24287struct gcc_target targetm = TARGET_INITIALIZER;
24288
24289#include "gt-aarch64.h"
24290