1/* Machine description for AArch64 architecture. 2 Copyright (C) 2009-2020 Free Software Foundation, Inc. 3 Contributed by ARM Ltd. 4 5 This file is part of GCC. 6 7 GCC is free software; you can redistribute it and/or modify it 8 under the terms of the GNU General Public License as published by 9 the Free Software Foundation; either version 3, or (at your option) 10 any later version. 11 12 GCC is distributed in the hope that it will be useful, but 13 WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 General Public License for more details. 16 17 You should have received a copy of the GNU General Public License 18 along with GCC; see the file COPYING3. If not see 19 <http://www.gnu.org/licenses/>. */ 20 21#define IN_TARGET_CODE 1 22 23#include "config.h" 24#define INCLUDE_STRING 25#include "system.h" 26#include "coretypes.h" 27#include "backend.h" 28#include "target.h" 29#include "rtl.h" 30#include "tree.h" 31#include "memmodel.h" 32#include "gimple.h" 33#include "cfghooks.h" 34#include "cfgloop.h" 35#include "df.h" 36#include "tm_p.h" 37#include "stringpool.h" 38#include "attribs.h" 39#include "optabs.h" 40#include "regs.h" 41#include "emit-rtl.h" 42#include "recog.h" 43#include "cgraph.h" 44#include "diagnostic.h" 45#include "insn-attr.h" 46#include "alias.h" 47#include "fold-const.h" 48#include "stor-layout.h" 49#include "calls.h" 50#include "varasm.h" 51#include "output.h" 52#include "flags.h" 53#include "explow.h" 54#include "expr.h" 55#include "reload.h" 56#include "langhooks.h" 57#include "opts.h" 58#include "gimplify.h" 59#include "dwarf2.h" 60#include "gimple-iterator.h" 61#include "tree-vectorizer.h" 62#include "aarch64-cost-tables.h" 63#include "dumpfile.h" 64#include "builtins.h" 65#include "rtl-iter.h" 66#include "tm-constrs.h" 67#include "sched-int.h" 68#include "target-globals.h" 69#include "common/common-target.h" 70#include "cfgrtl.h" 71#include "selftest.h" 72#include "selftest-rtl.h" 73#include "rtx-vector-builder.h" 74#include "intl.h" 75#include "expmed.h" 76#include "function-abi.h" 77 78/* This file should be included last. */ 79#include "target-def.h" 80 81/* Defined for convenience. */ 82#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT) 83 84/* Information about a legitimate vector immediate operand. */ 85struct simd_immediate_info 86{ 87 enum insn_type { MOV, MVN, INDEX, PTRUE }; 88 enum modifier_type { LSL, MSL }; 89 90 simd_immediate_info () {} 91 simd_immediate_info (scalar_float_mode, rtx); 92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT, 93 insn_type = MOV, modifier_type = LSL, 94 unsigned int = 0); 95 simd_immediate_info (scalar_mode, rtx, rtx); 96 simd_immediate_info (scalar_int_mode, aarch64_svpattern); 97 98 /* The mode of the elements. */ 99 scalar_mode elt_mode; 100 101 /* The instruction to use to move the immediate into a vector. */ 102 insn_type insn; 103 104 union 105 { 106 /* For MOV and MVN. */ 107 struct 108 { 109 /* The value of each element. */ 110 rtx value; 111 112 /* The kind of shift modifier to use, and the number of bits to shift. 113 This is (LSL, 0) if no shift is needed. */ 114 modifier_type modifier; 115 unsigned int shift; 116 } mov; 117 118 /* For INDEX. */ 119 struct 120 { 121 /* The value of the first element and the step to be added for each 122 subsequent element. */ 123 rtx base, step; 124 } index; 125 126 /* For PTRUE. 
*/ 127 aarch64_svpattern pattern; 128 } u; 129}; 130 131/* Construct a floating-point immediate in which each element has mode 132 ELT_MODE_IN and value VALUE_IN. */ 133inline simd_immediate_info 134::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in) 135 : elt_mode (elt_mode_in), insn (MOV) 136{ 137 u.mov.value = value_in; 138 u.mov.modifier = LSL; 139 u.mov.shift = 0; 140} 141 142/* Construct an integer immediate in which each element has mode ELT_MODE_IN 143 and value VALUE_IN. The other parameters are as for the structure 144 fields. */ 145inline simd_immediate_info 146::simd_immediate_info (scalar_int_mode elt_mode_in, 147 unsigned HOST_WIDE_INT value_in, 148 insn_type insn_in, modifier_type modifier_in, 149 unsigned int shift_in) 150 : elt_mode (elt_mode_in), insn (insn_in) 151{ 152 u.mov.value = gen_int_mode (value_in, elt_mode_in); 153 u.mov.modifier = modifier_in; 154 u.mov.shift = shift_in; 155} 156 157/* Construct an integer immediate in which each element has mode ELT_MODE_IN 158 and where element I is equal to BASE_IN + I * STEP_IN. */ 159inline simd_immediate_info 160::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in) 161 : elt_mode (elt_mode_in), insn (INDEX) 162{ 163 u.index.base = base_in; 164 u.index.step = step_in; 165} 166 167/* Construct a predicate that controls elements of mode ELT_MODE_IN 168 and has PTRUE pattern PATTERN_IN. */ 169inline simd_immediate_info 170::simd_immediate_info (scalar_int_mode elt_mode_in, 171 aarch64_svpattern pattern_in) 172 : elt_mode (elt_mode_in), insn (PTRUE) 173{ 174 u.pattern = pattern_in; 175} 176 177namespace { 178 179/* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */ 180class pure_scalable_type_info 181{ 182public: 183 /* Represents the result of analyzing a type. All values are nonzero, 184 in the possibly forlorn hope that accidental conversions to bool 185 trigger a warning. */ 186 enum analysis_result 187 { 188 /* The type does not have an ABI identity; i.e. it doesn't contain 189 at least one object whose type is a Fundamental Data Type. */ 190 NO_ABI_IDENTITY = 1, 191 192 /* The type is definitely a Pure Scalable Type. */ 193 IS_PST, 194 195 /* The type is definitely not a Pure Scalable Type. */ 196 ISNT_PST, 197 198 /* It doesn't matter for PCS purposes whether the type is a Pure 199 Scalable Type or not, since the type will be handled the same 200 way regardless. 201 202 Specifically, this means that if the type is a Pure Scalable Type, 203 there aren't enough argument registers to hold it, and so it will 204 need to be passed or returned in memory. If the type isn't a 205 Pure Scalable Type, it's too big to be passed or returned in core 206 or SIMD&FP registers, and so again will need to go in memory. */ 207 DOESNT_MATTER 208 }; 209 210 /* Aggregates of 17 bytes or more are normally passed and returned 211 in memory, so aggregates of that size can safely be analyzed as 212 DOESNT_MATTER. We need to be able to collect enough pieces to 213 represent a PST that is smaller than that. Since predicates are 214 2 bytes in size for -msve-vector-bits=128, that means we need to be 215 able to store at least 8 pieces. 216 217 We also need to be able to store enough pieces to represent 218 a single vector in each vector argument register and a single 219 predicate in each predicate argument register. This means that 220 we need at least 12 pieces. 
*/ 221 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS; 222#if __cplusplus >= 201103L 223 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates"); 224#endif 225 226 /* Describes one piece of a PST. Each piece is one of: 227 228 - a single Scalable Vector Type (SVT) 229 - a single Scalable Predicate Type (SPT) 230 - a PST containing 2, 3 or 4 SVTs, with no padding 231 232 It either represents a single built-in type or a PST formed from 233 multiple homogeneous built-in types. */ 234 struct piece 235 { 236 rtx get_rtx (unsigned int, unsigned int) const; 237 238 /* The number of vector and predicate registers that the piece 239 occupies. One of the two is always zero. */ 240 unsigned int num_zr; 241 unsigned int num_pr; 242 243 /* The mode of the registers described above. */ 244 machine_mode mode; 245 246 /* If this piece is formed from multiple homogeneous built-in types, 247 this is the mode of the built-in types, otherwise it is MODE. */ 248 machine_mode orig_mode; 249 250 /* The offset in bytes of the piece from the start of the type. */ 251 poly_uint64_pod offset; 252 }; 253 254 /* Divides types analyzed as IS_PST into individual pieces. The pieces 255 are in memory order. */ 256 auto_vec<piece, MAX_PIECES> pieces; 257 258 unsigned int num_zr () const; 259 unsigned int num_pr () const; 260 261 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const; 262 263 analysis_result analyze (const_tree); 264 bool analyze_registers (const_tree); 265 266private: 267 analysis_result analyze_array (const_tree); 268 analysis_result analyze_record (const_tree); 269 void add_piece (const piece &); 270}; 271} 272 273/* The current code model. */ 274enum aarch64_code_model aarch64_cmodel; 275 276/* The number of 64-bit elements in an SVE vector. */ 277poly_uint16 aarch64_sve_vg; 278 279#ifdef HAVE_AS_TLS 280#undef TARGET_HAVE_TLS 281#define TARGET_HAVE_TLS 1 282#endif 283 284static bool aarch64_composite_type_p (const_tree, machine_mode); 285static bool aarch64_return_in_memory_1 (const_tree); 286static bool aarch64_vfp_is_call_or_return_candidate (machine_mode, 287 const_tree, 288 machine_mode *, int *, 289 bool *, bool); 290static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED; 291static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED; 292static void aarch64_override_options_after_change (void); 293static bool aarch64_vector_mode_supported_p (machine_mode); 294static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool); 295static bool aarch64_builtin_support_vector_misalignment (machine_mode mode, 296 const_tree type, 297 int misalignment, 298 bool is_packed); 299static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64); 300static bool aarch64_print_address_internal (FILE*, machine_mode, rtx, 301 aarch64_addr_query_type); 302static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val); 303 304/* Major revision number of the ARM Architecture implemented by the target. */ 305unsigned aarch64_architecture_version; 306 307/* The processor for which instructions should be scheduled. */ 308enum aarch64_processor aarch64_tune = cortexa53; 309 310/* Mask to specify which instruction scheduling options should be used. */ 311uint64_t aarch64_tune_flags = 0; 312 313/* Global flag for PC relative loads. */ 314bool aarch64_pcrelative_literal_loads; 315 316/* Global flag for whether frame pointer is enabled. 
*/ 317bool aarch64_use_frame_pointer; 318 319#define BRANCH_PROTECT_STR_MAX 255 320char *accepted_branch_protection_string = NULL; 321 322static enum aarch64_parse_opt_result 323aarch64_parse_branch_protection (const char*, char**); 324 325/* Support for command line parsing of boolean flags in the tuning 326 structures. */ 327struct aarch64_flag_desc 328{ 329 const char* name; 330 unsigned int flag; 331}; 332 333#define AARCH64_FUSION_PAIR(name, internal_name) \ 334 { name, AARCH64_FUSE_##internal_name }, 335static const struct aarch64_flag_desc aarch64_fusible_pairs[] = 336{ 337 { "none", AARCH64_FUSE_NOTHING }, 338#include "aarch64-fusion-pairs.def" 339 { "all", AARCH64_FUSE_ALL }, 340 { NULL, AARCH64_FUSE_NOTHING } 341}; 342 343#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \ 344 { name, AARCH64_EXTRA_TUNE_##internal_name }, 345static const struct aarch64_flag_desc aarch64_tuning_flags[] = 346{ 347 { "none", AARCH64_EXTRA_TUNE_NONE }, 348#include "aarch64-tuning-flags.def" 349 { "all", AARCH64_EXTRA_TUNE_ALL }, 350 { NULL, AARCH64_EXTRA_TUNE_NONE } 351}; 352 353/* Tuning parameters. */ 354 355static const struct cpu_addrcost_table generic_addrcost_table = 356{ 357 { 358 1, /* hi */ 359 0, /* si */ 360 0, /* di */ 361 1, /* ti */ 362 }, 363 0, /* pre_modify */ 364 0, /* post_modify */ 365 0, /* register_offset */ 366 0, /* register_sextend */ 367 0, /* register_zextend */ 368 0 /* imm_offset */ 369}; 370 371static const struct cpu_addrcost_table exynosm1_addrcost_table = 372{ 373 { 374 0, /* hi */ 375 0, /* si */ 376 0, /* di */ 377 2, /* ti */ 378 }, 379 0, /* pre_modify */ 380 0, /* post_modify */ 381 1, /* register_offset */ 382 1, /* register_sextend */ 383 2, /* register_zextend */ 384 0, /* imm_offset */ 385}; 386 387static const struct cpu_addrcost_table xgene1_addrcost_table = 388{ 389 { 390 1, /* hi */ 391 0, /* si */ 392 0, /* di */ 393 1, /* ti */ 394 }, 395 1, /* pre_modify */ 396 1, /* post_modify */ 397 0, /* register_offset */ 398 1, /* register_sextend */ 399 1, /* register_zextend */ 400 0, /* imm_offset */ 401}; 402 403static const struct cpu_addrcost_table thunderx2t99_addrcost_table = 404{ 405 { 406 1, /* hi */ 407 1, /* si */ 408 1, /* di */ 409 2, /* ti */ 410 }, 411 0, /* pre_modify */ 412 0, /* post_modify */ 413 2, /* register_offset */ 414 3, /* register_sextend */ 415 3, /* register_zextend */ 416 0, /* imm_offset */ 417}; 418 419static const struct cpu_addrcost_table thunderx3t110_addrcost_table = 420{ 421 { 422 1, /* hi */ 423 1, /* si */ 424 1, /* di */ 425 2, /* ti */ 426 }, 427 0, /* pre_modify */ 428 0, /* post_modify */ 429 2, /* register_offset */ 430 3, /* register_sextend */ 431 3, /* register_zextend */ 432 0, /* imm_offset */ 433}; 434 435static const struct cpu_addrcost_table tsv110_addrcost_table = 436{ 437 { 438 1, /* hi */ 439 0, /* si */ 440 0, /* di */ 441 1, /* ti */ 442 }, 443 0, /* pre_modify */ 444 0, /* post_modify */ 445 0, /* register_offset */ 446 1, /* register_sextend */ 447 1, /* register_zextend */ 448 0, /* imm_offset */ 449}; 450 451static const struct cpu_addrcost_table qdf24xx_addrcost_table = 452{ 453 { 454 1, /* hi */ 455 1, /* si */ 456 1, /* di */ 457 2, /* ti */ 458 }, 459 1, /* pre_modify */ 460 1, /* post_modify */ 461 3, /* register_offset */ 462 3, /* register_sextend */ 463 3, /* register_zextend */ 464 2, /* imm_offset */ 465}; 466 467static const struct cpu_addrcost_table a64fx_addrcost_table = 468{ 469 { 470 1, /* hi */ 471 1, /* si */ 472 1, /* di */ 473 2, /* ti */ 474 }, 475 0, /* pre_modify */ 476 0, 
/* post_modify */ 477 2, /* register_offset */ 478 3, /* register_sextend */ 479 3, /* register_zextend */ 480 0, /* imm_offset */ 481}; 482 483static const struct cpu_regmove_cost generic_regmove_cost = 484{ 485 1, /* GP2GP */ 486 /* Avoid the use of slow int<->fp moves for spilling by setting 487 their cost higher than memmov_cost. */ 488 5, /* GP2FP */ 489 5, /* FP2GP */ 490 2 /* FP2FP */ 491}; 492 493static const struct cpu_regmove_cost cortexa57_regmove_cost = 494{ 495 1, /* GP2GP */ 496 /* Avoid the use of slow int<->fp moves for spilling by setting 497 their cost higher than memmov_cost. */ 498 5, /* GP2FP */ 499 5, /* FP2GP */ 500 2 /* FP2FP */ 501}; 502 503static const struct cpu_regmove_cost cortexa53_regmove_cost = 504{ 505 1, /* GP2GP */ 506 /* Avoid the use of slow int<->fp moves for spilling by setting 507 their cost higher than memmov_cost. */ 508 5, /* GP2FP */ 509 5, /* FP2GP */ 510 2 /* FP2FP */ 511}; 512 513static const struct cpu_regmove_cost exynosm1_regmove_cost = 514{ 515 1, /* GP2GP */ 516 /* Avoid the use of slow int<->fp moves for spilling by setting 517 their cost higher than memmov_cost (actual, 4 and 9). */ 518 9, /* GP2FP */ 519 9, /* FP2GP */ 520 1 /* FP2FP */ 521}; 522 523static const struct cpu_regmove_cost thunderx_regmove_cost = 524{ 525 2, /* GP2GP */ 526 2, /* GP2FP */ 527 6, /* FP2GP */ 528 4 /* FP2FP */ 529}; 530 531static const struct cpu_regmove_cost xgene1_regmove_cost = 532{ 533 1, /* GP2GP */ 534 /* Avoid the use of slow int<->fp moves for spilling by setting 535 their cost higher than memmov_cost. */ 536 8, /* GP2FP */ 537 8, /* FP2GP */ 538 2 /* FP2FP */ 539}; 540 541static const struct cpu_regmove_cost qdf24xx_regmove_cost = 542{ 543 2, /* GP2GP */ 544 /* Avoid the use of int<->fp moves for spilling. */ 545 6, /* GP2FP */ 546 6, /* FP2GP */ 547 4 /* FP2FP */ 548}; 549 550static const struct cpu_regmove_cost thunderx2t99_regmove_cost = 551{ 552 1, /* GP2GP */ 553 /* Avoid the use of int<->fp moves for spilling. */ 554 5, /* GP2FP */ 555 6, /* FP2GP */ 556 3, /* FP2FP */ 557}; 558 559static const struct cpu_regmove_cost thunderx3t110_regmove_cost = 560{ 561 1, /* GP2GP */ 562 /* Avoid the use of int<->fp moves for spilling. */ 563 4, /* GP2FP */ 564 5, /* FP2GP */ 565 4 /* FP2FP */ 566}; 567 568static const struct cpu_regmove_cost tsv110_regmove_cost = 569{ 570 1, /* GP2GP */ 571 /* Avoid the use of slow int<->fp moves for spilling by setting 572 their cost higher than memmov_cost. */ 573 2, /* GP2FP */ 574 3, /* FP2GP */ 575 2 /* FP2FP */ 576}; 577 578static const struct cpu_regmove_cost a64fx_regmove_cost = 579{ 580 1, /* GP2GP */ 581 /* Avoid the use of slow int<->fp moves for spilling by setting 582 their cost higher than memmov_cost. */ 583 5, /* GP2FP */ 584 7, /* FP2GP */ 585 2 /* FP2FP */ 586}; 587 588/* Generic costs for vector insn classes. */ 589static const struct cpu_vector_cost generic_vector_cost = 590{ 591 1, /* scalar_int_stmt_cost */ 592 1, /* scalar_fp_stmt_cost */ 593 1, /* scalar_load_cost */ 594 1, /* scalar_store_cost */ 595 1, /* vec_int_stmt_cost */ 596 1, /* vec_fp_stmt_cost */ 597 2, /* vec_permute_cost */ 598 2, /* vec_to_scalar_cost */ 599 1, /* scalar_to_vec_cost */ 600 1, /* vec_align_load_cost */ 601 1, /* vec_unalign_load_cost */ 602 1, /* vec_unalign_store_cost */ 603 1, /* vec_store_cost */ 604 3, /* cond_taken_branch_cost */ 605 1 /* cond_not_taken_branch_cost */ 606}; 607 608/* QDF24XX costs for vector insn classes. 
*/ 609static const struct cpu_vector_cost qdf24xx_vector_cost = 610{ 611 1, /* scalar_int_stmt_cost */ 612 1, /* scalar_fp_stmt_cost */ 613 1, /* scalar_load_cost */ 614 1, /* scalar_store_cost */ 615 1, /* vec_int_stmt_cost */ 616 3, /* vec_fp_stmt_cost */ 617 2, /* vec_permute_cost */ 618 1, /* vec_to_scalar_cost */ 619 1, /* scalar_to_vec_cost */ 620 1, /* vec_align_load_cost */ 621 1, /* vec_unalign_load_cost */ 622 1, /* vec_unalign_store_cost */ 623 1, /* vec_store_cost */ 624 3, /* cond_taken_branch_cost */ 625 1 /* cond_not_taken_branch_cost */ 626}; 627 628/* ThunderX costs for vector insn classes. */ 629static const struct cpu_vector_cost thunderx_vector_cost = 630{ 631 1, /* scalar_int_stmt_cost */ 632 1, /* scalar_fp_stmt_cost */ 633 3, /* scalar_load_cost */ 634 1, /* scalar_store_cost */ 635 4, /* vec_int_stmt_cost */ 636 1, /* vec_fp_stmt_cost */ 637 4, /* vec_permute_cost */ 638 2, /* vec_to_scalar_cost */ 639 2, /* scalar_to_vec_cost */ 640 3, /* vec_align_load_cost */ 641 5, /* vec_unalign_load_cost */ 642 5, /* vec_unalign_store_cost */ 643 1, /* vec_store_cost */ 644 3, /* cond_taken_branch_cost */ 645 3 /* cond_not_taken_branch_cost */ 646}; 647 648static const struct cpu_vector_cost tsv110_vector_cost = 649{ 650 1, /* scalar_int_stmt_cost */ 651 1, /* scalar_fp_stmt_cost */ 652 5, /* scalar_load_cost */ 653 1, /* scalar_store_cost */ 654 2, /* vec_int_stmt_cost */ 655 2, /* vec_fp_stmt_cost */ 656 2, /* vec_permute_cost */ 657 3, /* vec_to_scalar_cost */ 658 2, /* scalar_to_vec_cost */ 659 5, /* vec_align_load_cost */ 660 5, /* vec_unalign_load_cost */ 661 1, /* vec_unalign_store_cost */ 662 1, /* vec_store_cost */ 663 1, /* cond_taken_branch_cost */ 664 1 /* cond_not_taken_branch_cost */ 665}; 666 667/* Generic costs for vector insn classes. */ 668static const struct cpu_vector_cost cortexa57_vector_cost = 669{ 670 1, /* scalar_int_stmt_cost */ 671 1, /* scalar_fp_stmt_cost */ 672 4, /* scalar_load_cost */ 673 1, /* scalar_store_cost */ 674 2, /* vec_int_stmt_cost */ 675 2, /* vec_fp_stmt_cost */ 676 3, /* vec_permute_cost */ 677 8, /* vec_to_scalar_cost */ 678 8, /* scalar_to_vec_cost */ 679 4, /* vec_align_load_cost */ 680 4, /* vec_unalign_load_cost */ 681 1, /* vec_unalign_store_cost */ 682 1, /* vec_store_cost */ 683 1, /* cond_taken_branch_cost */ 684 1 /* cond_not_taken_branch_cost */ 685}; 686 687static const struct cpu_vector_cost exynosm1_vector_cost = 688{ 689 1, /* scalar_int_stmt_cost */ 690 1, /* scalar_fp_stmt_cost */ 691 5, /* scalar_load_cost */ 692 1, /* scalar_store_cost */ 693 3, /* vec_int_stmt_cost */ 694 3, /* vec_fp_stmt_cost */ 695 3, /* vec_permute_cost */ 696 3, /* vec_to_scalar_cost */ 697 3, /* scalar_to_vec_cost */ 698 5, /* vec_align_load_cost */ 699 5, /* vec_unalign_load_cost */ 700 1, /* vec_unalign_store_cost */ 701 1, /* vec_store_cost */ 702 1, /* cond_taken_branch_cost */ 703 1 /* cond_not_taken_branch_cost */ 704}; 705 706/* Generic costs for vector insn classes. 
*/ 707static const struct cpu_vector_cost xgene1_vector_cost = 708{ 709 1, /* scalar_int_stmt_cost */ 710 1, /* scalar_fp_stmt_cost */ 711 5, /* scalar_load_cost */ 712 1, /* scalar_store_cost */ 713 2, /* vec_int_stmt_cost */ 714 2, /* vec_fp_stmt_cost */ 715 2, /* vec_permute_cost */ 716 4, /* vec_to_scalar_cost */ 717 4, /* scalar_to_vec_cost */ 718 10, /* vec_align_load_cost */ 719 10, /* vec_unalign_load_cost */ 720 2, /* vec_unalign_store_cost */ 721 2, /* vec_store_cost */ 722 2, /* cond_taken_branch_cost */ 723 1 /* cond_not_taken_branch_cost */ 724}; 725 726/* Costs for vector insn classes for Vulcan. */ 727static const struct cpu_vector_cost thunderx2t99_vector_cost = 728{ 729 1, /* scalar_int_stmt_cost */ 730 6, /* scalar_fp_stmt_cost */ 731 4, /* scalar_load_cost */ 732 1, /* scalar_store_cost */ 733 4, /* vec_int_stmt_cost */ 734 5, /* vec_fp_stmt_cost */ 735 10, /* vec_permute_cost */ 736 6, /* vec_to_scalar_cost */ 737 5, /* scalar_to_vec_cost */ 738 4, /* vec_align_load_cost */ 739 4, /* vec_unalign_load_cost */ 740 1, /* vec_unalign_store_cost */ 741 1, /* vec_store_cost */ 742 2, /* cond_taken_branch_cost */ 743 1 /* cond_not_taken_branch_cost */ 744}; 745 746static const struct cpu_vector_cost thunderx3t110_vector_cost = 747{ 748 1, /* scalar_int_stmt_cost */ 749 5, /* scalar_fp_stmt_cost */ 750 4, /* scalar_load_cost */ 751 1, /* scalar_store_cost */ 752 5, /* vec_int_stmt_cost */ 753 5, /* vec_fp_stmt_cost */ 754 10, /* vec_permute_cost */ 755 5, /* vec_to_scalar_cost */ 756 5, /* scalar_to_vec_cost */ 757 4, /* vec_align_load_cost */ 758 4, /* vec_unalign_load_cost */ 759 4, /* vec_unalign_store_cost */ 760 4, /* vec_store_cost */ 761 2, /* cond_taken_branch_cost */ 762 1 /* cond_not_taken_branch_cost */ 763}; 764 765static const struct cpu_vector_cost a64fx_vector_cost = 766{ 767 1, /* scalar_int_stmt_cost */ 768 5, /* scalar_fp_stmt_cost */ 769 4, /* scalar_load_cost */ 770 1, /* scalar_store_cost */ 771 2, /* vec_int_stmt_cost */ 772 5, /* vec_fp_stmt_cost */ 773 3, /* vec_permute_cost */ 774 13, /* vec_to_scalar_cost */ 775 4, /* scalar_to_vec_cost */ 776 6, /* vec_align_load_cost */ 777 6, /* vec_unalign_load_cost */ 778 1, /* vec_unalign_store_cost */ 779 1, /* vec_store_cost */ 780 3, /* cond_taken_branch_cost */ 781 1 /* cond_not_taken_branch_cost */ 782}; 783 784/* Ampere-1 costs for vector insn classes. */ 785static const struct cpu_vector_cost ampere1_vector_cost = 786{ 787 1, /* scalar_int_stmt_cost */ 788 3, /* scalar_fp_stmt_cost */ 789 4, /* scalar_load_cost */ 790 1, /* scalar_store_cost */ 791 1, /* int_stmt_cost */ 792 3, /* fp_stmt_cost */ 793 2, /* permute_cost */ 794 6, /* vec_to_scalar_cost */ 795 7, /* scalar_to_vec_cost */ 796 4, /* align_load_cost */ 797 4, /* unalign_load_cost */ 798 1, /* unalign_store_cost */ 799 1, /* store_cost */ 800 1, /* cond_taken_branch_cost */ 801 1 /* cond_not_taken_branch_cost */ 802}; 803 804/* Generic costs for branch instructions. */ 805static const struct cpu_branch_cost generic_branch_cost = 806{ 807 1, /* Predictable. */ 808 3 /* Unpredictable. */ 809}; 810 811/* Generic approximation modes. */ 812static const cpu_approx_modes generic_approx_modes = 813{ 814 AARCH64_APPROX_NONE, /* division */ 815 AARCH64_APPROX_NONE, /* sqrt */ 816 AARCH64_APPROX_NONE /* recip_sqrt */ 817}; 818 819/* Approximation modes for Exynos M1. 
*/ 820static const cpu_approx_modes exynosm1_approx_modes = 821{ 822 AARCH64_APPROX_NONE, /* division */ 823 AARCH64_APPROX_ALL, /* sqrt */ 824 AARCH64_APPROX_ALL /* recip_sqrt */ 825}; 826 827/* Approximation modes for X-Gene 1. */ 828static const cpu_approx_modes xgene1_approx_modes = 829{ 830 AARCH64_APPROX_NONE, /* division */ 831 AARCH64_APPROX_NONE, /* sqrt */ 832 AARCH64_APPROX_ALL /* recip_sqrt */ 833}; 834 835/* Generic prefetch settings (which disable prefetch). */ 836static const cpu_prefetch_tune generic_prefetch_tune = 837{ 838 0, /* num_slots */ 839 -1, /* l1_cache_size */ 840 -1, /* l1_cache_line_size */ 841 -1, /* l2_cache_size */ 842 true, /* prefetch_dynamic_strides */ 843 -1, /* minimum_stride */ 844 -1 /* default_opt_level */ 845}; 846 847static const cpu_prefetch_tune exynosm1_prefetch_tune = 848{ 849 0, /* num_slots */ 850 -1, /* l1_cache_size */ 851 64, /* l1_cache_line_size */ 852 -1, /* l2_cache_size */ 853 true, /* prefetch_dynamic_strides */ 854 -1, /* minimum_stride */ 855 -1 /* default_opt_level */ 856}; 857 858static const cpu_prefetch_tune qdf24xx_prefetch_tune = 859{ 860 4, /* num_slots */ 861 32, /* l1_cache_size */ 862 64, /* l1_cache_line_size */ 863 512, /* l2_cache_size */ 864 false, /* prefetch_dynamic_strides */ 865 2048, /* minimum_stride */ 866 3 /* default_opt_level */ 867}; 868 869static const cpu_prefetch_tune thunderxt88_prefetch_tune = 870{ 871 8, /* num_slots */ 872 32, /* l1_cache_size */ 873 128, /* l1_cache_line_size */ 874 16*1024, /* l2_cache_size */ 875 true, /* prefetch_dynamic_strides */ 876 -1, /* minimum_stride */ 877 3 /* default_opt_level */ 878}; 879 880static const cpu_prefetch_tune thunderx_prefetch_tune = 881{ 882 8, /* num_slots */ 883 32, /* l1_cache_size */ 884 128, /* l1_cache_line_size */ 885 -1, /* l2_cache_size */ 886 true, /* prefetch_dynamic_strides */ 887 -1, /* minimum_stride */ 888 -1 /* default_opt_level */ 889}; 890 891static const cpu_prefetch_tune thunderx2t99_prefetch_tune = 892{ 893 8, /* num_slots */ 894 32, /* l1_cache_size */ 895 64, /* l1_cache_line_size */ 896 256, /* l2_cache_size */ 897 true, /* prefetch_dynamic_strides */ 898 -1, /* minimum_stride */ 899 -1 /* default_opt_level */ 900}; 901 902static const cpu_prefetch_tune thunderx3t110_prefetch_tune = 903{ 904 8, /* num_slots */ 905 32, /* l1_cache_size */ 906 64, /* l1_cache_line_size */ 907 256, /* l2_cache_size */ 908 true, /* prefetch_dynamic_strides */ 909 -1, /* minimum_stride */ 910 -1 /* default_opt_level */ 911}; 912 913static const cpu_prefetch_tune tsv110_prefetch_tune = 914{ 915 0, /* num_slots */ 916 64, /* l1_cache_size */ 917 64, /* l1_cache_line_size */ 918 512, /* l2_cache_size */ 919 true, /* prefetch_dynamic_strides */ 920 -1, /* minimum_stride */ 921 -1 /* default_opt_level */ 922}; 923 924static const cpu_prefetch_tune xgene1_prefetch_tune = 925{ 926 8, /* num_slots */ 927 32, /* l1_cache_size */ 928 64, /* l1_cache_line_size */ 929 256, /* l2_cache_size */ 930 true, /* prefetch_dynamic_strides */ 931 -1, /* minimum_stride */ 932 -1 /* default_opt_level */ 933}; 934 935static const cpu_prefetch_tune a64fx_prefetch_tune = 936{ 937 8, /* num_slots */ 938 64, /* l1_cache_size */ 939 256, /* l1_cache_line_size */ 940 32768, /* l2_cache_size */ 941 true, /* prefetch_dynamic_strides */ 942 -1, /* minimum_stride */ 943 -1 /* default_opt_level */ 944}; 945 946static const cpu_prefetch_tune ampere1_prefetch_tune = 947{ 948 0, /* num_slots */ 949 64, /* l1_cache_size */ 950 64, /* l1_cache_line_size */ 951 2048, /* l2_cache_size */ 952 
true, /* prefetch_dynamic_strides */ 953 -1, /* minimum_stride */ 954 -1 /* default_opt_level */ 955}; 956 957static const struct tune_params generic_tunings = 958{ 959 &cortexa57_extra_costs, 960 &generic_addrcost_table, 961 &generic_regmove_cost, 962 &generic_vector_cost, 963 &generic_branch_cost, 964 &generic_approx_modes, 965 SVE_NOT_IMPLEMENTED, /* sve_width */ 966 4, /* memmov_cost */ 967 2, /* issue_rate */ 968 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ 969 "16:12", /* function_align. */ 970 "4", /* jump_align. */ 971 "8", /* loop_align. */ 972 2, /* int_reassoc_width. */ 973 4, /* fp_reassoc_width. */ 974 1, /* vec_reassoc_width. */ 975 2, /* min_div_recip_mul_sf. */ 976 2, /* min_div_recip_mul_df. */ 977 0, /* max_case_values. */ 978 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 979 /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits 980 Neoverse V1. It does not have a noticeable effect on A64FX and should 981 have at most a very minor effect on SVE2 cores. */ 982 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */ 983 &generic_prefetch_tune 984}; 985 986static const struct tune_params cortexa35_tunings = 987{ 988 &cortexa53_extra_costs, 989 &generic_addrcost_table, 990 &cortexa53_regmove_cost, 991 &generic_vector_cost, 992 &generic_branch_cost, 993 &generic_approx_modes, 994 SVE_NOT_IMPLEMENTED, /* sve_width */ 995 4, /* memmov_cost */ 996 1, /* issue_rate */ 997 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD 998 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ 999 "16", /* function_align. */ 1000 "4", /* jump_align. */ 1001 "8", /* loop_align. */ 1002 2, /* int_reassoc_width. */ 1003 4, /* fp_reassoc_width. */ 1004 1, /* vec_reassoc_width. */ 1005 2, /* min_div_recip_mul_sf. */ 1006 2, /* min_div_recip_mul_df. */ 1007 0, /* max_case_values. */ 1008 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1009 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 1010 &generic_prefetch_tune 1011}; 1012 1013static const struct tune_params cortexa53_tunings = 1014{ 1015 &cortexa53_extra_costs, 1016 &generic_addrcost_table, 1017 &cortexa53_regmove_cost, 1018 &generic_vector_cost, 1019 &generic_branch_cost, 1020 &generic_approx_modes, 1021 SVE_NOT_IMPLEMENTED, /* sve_width */ 1022 4, /* memmov_cost */ 1023 2, /* issue_rate */ 1024 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD 1025 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ 1026 "16", /* function_align. */ 1027 "4", /* jump_align. */ 1028 "8", /* loop_align. */ 1029 2, /* int_reassoc_width. */ 1030 4, /* fp_reassoc_width. */ 1031 1, /* vec_reassoc_width. */ 1032 2, /* min_div_recip_mul_sf. */ 1033 2, /* min_div_recip_mul_df. */ 1034 0, /* max_case_values. */ 1035 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1036 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 1037 &generic_prefetch_tune 1038}; 1039 1040static const struct tune_params cortexa57_tunings = 1041{ 1042 &cortexa57_extra_costs, 1043 &generic_addrcost_table, 1044 &cortexa57_regmove_cost, 1045 &cortexa57_vector_cost, 1046 &generic_branch_cost, 1047 &generic_approx_modes, 1048 SVE_NOT_IMPLEMENTED, /* sve_width */ 1049 4, /* memmov_cost */ 1050 3, /* issue_rate */ 1051 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD 1052 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */ 1053 "16", /* function_align. */ 1054 "4", /* jump_align. */ 1055 "8", /* loop_align. 
*/ 1056 2, /* int_reassoc_width. */ 1057 4, /* fp_reassoc_width. */ 1058 1, /* vec_reassoc_width. */ 1059 2, /* min_div_recip_mul_sf. */ 1060 2, /* min_div_recip_mul_df. */ 1061 0, /* max_case_values. */ 1062 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1063 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */ 1064 &generic_prefetch_tune 1065}; 1066 1067static const struct tune_params cortexa72_tunings = 1068{ 1069 &cortexa57_extra_costs, 1070 &generic_addrcost_table, 1071 &cortexa57_regmove_cost, 1072 &cortexa57_vector_cost, 1073 &generic_branch_cost, 1074 &generic_approx_modes, 1075 SVE_NOT_IMPLEMENTED, /* sve_width */ 1076 4, /* memmov_cost */ 1077 3, /* issue_rate */ 1078 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD 1079 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */ 1080 "16", /* function_align. */ 1081 "4", /* jump_align. */ 1082 "8", /* loop_align. */ 1083 2, /* int_reassoc_width. */ 1084 4, /* fp_reassoc_width. */ 1085 1, /* vec_reassoc_width. */ 1086 2, /* min_div_recip_mul_sf. */ 1087 2, /* min_div_recip_mul_df. */ 1088 0, /* max_case_values. */ 1089 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1090 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 1091 &generic_prefetch_tune 1092}; 1093 1094static const struct tune_params cortexa73_tunings = 1095{ 1096 &cortexa57_extra_costs, 1097 &generic_addrcost_table, 1098 &cortexa57_regmove_cost, 1099 &cortexa57_vector_cost, 1100 &generic_branch_cost, 1101 &generic_approx_modes, 1102 SVE_NOT_IMPLEMENTED, /* sve_width */ 1103 4, /* memmov_cost. */ 1104 2, /* issue_rate. */ 1105 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD 1106 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ 1107 "16", /* function_align. */ 1108 "4", /* jump_align. */ 1109 "8", /* loop_align. */ 1110 2, /* int_reassoc_width. */ 1111 4, /* fp_reassoc_width. */ 1112 1, /* vec_reassoc_width. */ 1113 2, /* min_div_recip_mul_sf. */ 1114 2, /* min_div_recip_mul_df. */ 1115 0, /* max_case_values. */ 1116 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1117 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 1118 &generic_prefetch_tune 1119}; 1120 1121 1122 1123static const struct tune_params exynosm1_tunings = 1124{ 1125 &exynosm1_extra_costs, 1126 &exynosm1_addrcost_table, 1127 &exynosm1_regmove_cost, 1128 &exynosm1_vector_cost, 1129 &generic_branch_cost, 1130 &exynosm1_approx_modes, 1131 SVE_NOT_IMPLEMENTED, /* sve_width */ 1132 4, /* memmov_cost */ 1133 3, /* issue_rate */ 1134 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */ 1135 "4", /* function_align. */ 1136 "4", /* jump_align. */ 1137 "4", /* loop_align. */ 1138 2, /* int_reassoc_width. */ 1139 4, /* fp_reassoc_width. */ 1140 1, /* vec_reassoc_width. */ 1141 2, /* min_div_recip_mul_sf. */ 1142 2, /* min_div_recip_mul_df. */ 1143 48, /* max_case_values. */ 1144 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1145 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 1146 &exynosm1_prefetch_tune 1147}; 1148 1149static const struct tune_params thunderxt88_tunings = 1150{ 1151 &thunderx_extra_costs, 1152 &generic_addrcost_table, 1153 &thunderx_regmove_cost, 1154 &thunderx_vector_cost, 1155 &generic_branch_cost, 1156 &generic_approx_modes, 1157 SVE_NOT_IMPLEMENTED, /* sve_width */ 1158 6, /* memmov_cost */ 1159 2, /* issue_rate */ 1160 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */ 1161 "8", /* function_align. */ 1162 "8", /* jump_align. */ 1163 "8", /* loop_align. */ 1164 2, /* int_reassoc_width. 
*/ 1165 4, /* fp_reassoc_width. */ 1166 1, /* vec_reassoc_width. */ 1167 2, /* min_div_recip_mul_sf. */ 1168 2, /* min_div_recip_mul_df. */ 1169 0, /* max_case_values. */ 1170 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ 1171 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */ 1172 &thunderxt88_prefetch_tune 1173}; 1174 1175static const struct tune_params thunderx_tunings = 1176{ 1177 &thunderx_extra_costs, 1178 &generic_addrcost_table, 1179 &thunderx_regmove_cost, 1180 &thunderx_vector_cost, 1181 &generic_branch_cost, 1182 &generic_approx_modes, 1183 SVE_NOT_IMPLEMENTED, /* sve_width */ 1184 6, /* memmov_cost */ 1185 2, /* issue_rate */ 1186 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */ 1187 "8", /* function_align. */ 1188 "8", /* jump_align. */ 1189 "8", /* loop_align. */ 1190 2, /* int_reassoc_width. */ 1191 4, /* fp_reassoc_width. */ 1192 1, /* vec_reassoc_width. */ 1193 2, /* min_div_recip_mul_sf. */ 1194 2, /* min_div_recip_mul_df. */ 1195 0, /* max_case_values. */ 1196 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ 1197 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW 1198 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ 1199 &thunderx_prefetch_tune 1200}; 1201 1202static const struct tune_params tsv110_tunings = 1203{ 1204 &tsv110_extra_costs, 1205 &tsv110_addrcost_table, 1206 &tsv110_regmove_cost, 1207 &tsv110_vector_cost, 1208 &generic_branch_cost, 1209 &generic_approx_modes, 1210 SVE_NOT_IMPLEMENTED, /* sve_width */ 1211 4, /* memmov_cost */ 1212 4, /* issue_rate */ 1213 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH 1214 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */ 1215 "16", /* function_align. */ 1216 "4", /* jump_align. */ 1217 "8", /* loop_align. */ 1218 2, /* int_reassoc_width. */ 1219 4, /* fp_reassoc_width. */ 1220 1, /* vec_reassoc_width. */ 1221 2, /* min_div_recip_mul_sf. */ 1222 2, /* min_div_recip_mul_df. */ 1223 0, /* max_case_values. */ 1224 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1225 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 1226 &tsv110_prefetch_tune 1227}; 1228 1229static const struct tune_params xgene1_tunings = 1230{ 1231 &xgene1_extra_costs, 1232 &xgene1_addrcost_table, 1233 &xgene1_regmove_cost, 1234 &xgene1_vector_cost, 1235 &generic_branch_cost, 1236 &xgene1_approx_modes, 1237 SVE_NOT_IMPLEMENTED, /* sve_width */ 1238 6, /* memmov_cost */ 1239 4, /* issue_rate */ 1240 AARCH64_FUSE_NOTHING, /* fusible_ops */ 1241 "16", /* function_align. */ 1242 "16", /* jump_align. */ 1243 "16", /* loop_align. */ 1244 2, /* int_reassoc_width. */ 1245 4, /* fp_reassoc_width. */ 1246 1, /* vec_reassoc_width. */ 1247 2, /* min_div_recip_mul_sf. */ 1248 2, /* min_div_recip_mul_df. */ 1249 17, /* max_case_values. */ 1250 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ 1251 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */ 1252 &xgene1_prefetch_tune 1253}; 1254 1255static const struct tune_params emag_tunings = 1256{ 1257 &xgene1_extra_costs, 1258 &xgene1_addrcost_table, 1259 &xgene1_regmove_cost, 1260 &xgene1_vector_cost, 1261 &generic_branch_cost, 1262 &xgene1_approx_modes, 1263 SVE_NOT_IMPLEMENTED, 1264 6, /* memmov_cost */ 1265 4, /* issue_rate */ 1266 AARCH64_FUSE_NOTHING, /* fusible_ops */ 1267 "16", /* function_align. */ 1268 "16", /* jump_align. */ 1269 "16", /* loop_align. */ 1270 2, /* int_reassoc_width. */ 1271 4, /* fp_reassoc_width. */ 1272 1, /* vec_reassoc_width. */ 1273 2, /* min_div_recip_mul_sf. */ 1274 2, /* min_div_recip_mul_df. */ 1275 17, /* max_case_values. 
*/ 1276 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ 1277 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */ 1278 &xgene1_prefetch_tune 1279}; 1280 1281static const struct tune_params qdf24xx_tunings = 1282{ 1283 &qdf24xx_extra_costs, 1284 &qdf24xx_addrcost_table, 1285 &qdf24xx_regmove_cost, 1286 &qdf24xx_vector_cost, 1287 &generic_branch_cost, 1288 &generic_approx_modes, 1289 SVE_NOT_IMPLEMENTED, /* sve_width */ 1290 4, /* memmov_cost */ 1291 4, /* issue_rate */ 1292 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD 1293 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */ 1294 "16", /* function_align. */ 1295 "8", /* jump_align. */ 1296 "16", /* loop_align. */ 1297 2, /* int_reassoc_width. */ 1298 4, /* fp_reassoc_width. */ 1299 1, /* vec_reassoc_width. */ 1300 2, /* min_div_recip_mul_sf. */ 1301 2, /* min_div_recip_mul_df. */ 1302 0, /* max_case_values. */ 1303 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1304 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */ 1305 &qdf24xx_prefetch_tune 1306}; 1307 1308/* Tuning structure for the Qualcomm Saphira core. Default to falkor values 1309 for now. */ 1310static const struct tune_params saphira_tunings = 1311{ 1312 &generic_extra_costs, 1313 &generic_addrcost_table, 1314 &generic_regmove_cost, 1315 &generic_vector_cost, 1316 &generic_branch_cost, 1317 &generic_approx_modes, 1318 SVE_NOT_IMPLEMENTED, /* sve_width */ 1319 4, /* memmov_cost */ 1320 4, /* issue_rate */ 1321 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD 1322 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */ 1323 "16", /* function_align. */ 1324 "8", /* jump_align. */ 1325 "16", /* loop_align. */ 1326 2, /* int_reassoc_width. */ 1327 4, /* fp_reassoc_width. */ 1328 1, /* vec_reassoc_width. */ 1329 2, /* min_div_recip_mul_sf. */ 1330 2, /* min_div_recip_mul_df. */ 1331 0, /* max_case_values. */ 1332 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1333 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 1334 &generic_prefetch_tune 1335}; 1336 1337static const struct tune_params thunderx2t99_tunings = 1338{ 1339 &thunderx2t99_extra_costs, 1340 &thunderx2t99_addrcost_table, 1341 &thunderx2t99_regmove_cost, 1342 &thunderx2t99_vector_cost, 1343 &generic_branch_cost, 1344 &generic_approx_modes, 1345 SVE_NOT_IMPLEMENTED, /* sve_width */ 1346 4, /* memmov_cost. */ 1347 4, /* issue_rate. */ 1348 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC 1349 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */ 1350 "16", /* function_align. */ 1351 "8", /* jump_align. */ 1352 "16", /* loop_align. */ 1353 3, /* int_reassoc_width. */ 1354 2, /* fp_reassoc_width. */ 1355 2, /* vec_reassoc_width. */ 1356 2, /* min_div_recip_mul_sf. */ 1357 2, /* min_div_recip_mul_df. */ 1358 0, /* max_case_values. */ 1359 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1360 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 1361 &thunderx2t99_prefetch_tune 1362}; 1363 1364static const struct tune_params thunderx3t110_tunings = 1365{ 1366 &thunderx3t110_extra_costs, 1367 &thunderx3t110_addrcost_table, 1368 &thunderx3t110_regmove_cost, 1369 &thunderx3t110_vector_cost, 1370 &generic_branch_cost, 1371 &generic_approx_modes, 1372 SVE_NOT_IMPLEMENTED, /* sve_width */ 1373 4, /* memmov_cost. */ 1374 6, /* issue_rate. */ 1375 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC 1376 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */ 1377 "16", /* function_align. */ 1378 "8", /* jump_align. */ 1379 "16", /* loop_align. */ 1380 3, /* int_reassoc_width. */ 1381 2, /* fp_reassoc_width. 
*/ 1382 2, /* vec_reassoc_width. */ 1383 2, /* min_div_recip_mul_sf. */ 1384 2, /* min_div_recip_mul_df. */ 1385 0, /* max_case_values. */ 1386 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1387 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 1388 &thunderx3t110_prefetch_tune 1389}; 1390 1391static const struct tune_params neoversen1_tunings = 1392{ 1393 &cortexa57_extra_costs, 1394 &generic_addrcost_table, 1395 &generic_regmove_cost, 1396 &cortexa57_vector_cost, 1397 &generic_branch_cost, 1398 &generic_approx_modes, 1399 SVE_NOT_IMPLEMENTED, /* sve_width */ 1400 4, /* memmov_cost */ 1401 3, /* issue_rate */ 1402 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ 1403 "32:16", /* function_align. */ 1404 "4", /* jump_align. */ 1405 "32:16", /* loop_align. */ 1406 2, /* int_reassoc_width. */ 1407 4, /* fp_reassoc_width. */ 1408 2, /* vec_reassoc_width. */ 1409 2, /* min_div_recip_mul_sf. */ 1410 2, /* min_div_recip_mul_df. */ 1411 0, /* max_case_values. */ 1412 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1413 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 1414 &generic_prefetch_tune 1415}; 1416 1417static const struct tune_params ampere1_tunings = 1418{ 1419 &ere1_extra_costs, 1420 &generic_addrcost_table, 1421 &generic_regmove_cost, 1422 &ere1_vector_cost, 1423 &generic_branch_cost, 1424 &generic_approx_modes, 1425 SVE_NOT_IMPLEMENTED, /* sve_width */ 1426 4, /* memmov_cost */ 1427 4, /* issue_rate */ 1428 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC | 1429 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK | 1430 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ | 1431 AARCH64_FUSE_CMP_BRANCH), 1432 /* fusible_ops */ 1433 "32", /* function_align. */ 1434 "4", /* jump_align. */ 1435 "32:16", /* loop_align. */ 1436 2, /* int_reassoc_width. */ 1437 4, /* fp_reassoc_width. */ 1438 2, /* vec_reassoc_width. */ 1439 2, /* min_div_recip_mul_sf. */ 1440 2, /* min_div_recip_mul_df. */ 1441 0, /* max_case_values. */ 1442 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1443 (AARCH64_EXTRA_TUNE_NO_LDP_COMBINE), /* tune_flags. */ 1444 &ere1_prefetch_tune 1445}; 1446 1447static const struct tune_params ampere1a_tunings = 1448{ 1449 &ere1a_extra_costs, 1450 &generic_addrcost_table, 1451 &generic_regmove_cost, 1452 &ere1_vector_cost, 1453 &generic_branch_cost, 1454 &generic_approx_modes, 1455 SVE_NOT_IMPLEMENTED, /* sve_width */ 1456 4, /* memmov_cost */ 1457 4, /* issue_rate */ 1458 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC | 1459 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK | 1460 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ | 1461 AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ | 1462 AARCH64_FUSE_ADDSUB_2REG_CONST1), 1463 /* fusible_ops */ 1464 "32", /* function_align. */ 1465 "4", /* jump_align. */ 1466 "32:16", /* loop_align. */ 1467 2, /* int_reassoc_width. */ 1468 4, /* fp_reassoc_width. */ 1469 2, /* vec_reassoc_width. */ 1470 2, /* min_div_recip_mul_sf. */ 1471 2, /* min_div_recip_mul_df. */ 1472 0, /* max_case_values. */ 1473 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1474 (AARCH64_EXTRA_TUNE_NO_LDP_COMBINE), /* tune_flags. 
*/ 1475 &ere1_prefetch_tune 1476}; 1477 1478static const struct tune_params neoversev1_tunings = 1479{ 1480 &cortexa57_extra_costs, 1481 &generic_addrcost_table, 1482 &generic_regmove_cost, 1483 &cortexa57_vector_cost, 1484 &generic_branch_cost, 1485 &generic_approx_modes, 1486 SVE_256, /* sve_width */ 1487 4, /* memmov_cost */ 1488 3, /* issue_rate */ 1489 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ 1490 "32:16", /* function_align. */ 1491 "4", /* jump_align. */ 1492 "32:16", /* loop_align. */ 1493 2, /* int_reassoc_width. */ 1494 4, /* fp_reassoc_width. */ 1495 2, /* vec_reassoc_width. */ 1496 2, /* min_div_recip_mul_sf. */ 1497 2, /* min_div_recip_mul_df. */ 1498 0, /* max_case_values. */ 1499 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1500 (AARCH64_EXTRA_TUNE_PREFER_ADVSIMD_AUTOVEC 1501 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */ 1502 &generic_prefetch_tune 1503}; 1504 1505static const struct tune_params neoversen2_tunings = 1506{ 1507 &cortexa57_extra_costs, 1508 &generic_addrcost_table, 1509 &generic_regmove_cost, 1510 &cortexa57_vector_cost, 1511 &generic_branch_cost, 1512 &generic_approx_modes, 1513 SVE_128, /* sve_width */ 1514 4, /* memmov_cost */ 1515 3, /* issue_rate */ 1516 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ 1517 "32:16", /* function_align. */ 1518 "4", /* jump_align. */ 1519 "32:16", /* loop_align. */ 1520 2, /* int_reassoc_width. */ 1521 4, /* fp_reassoc_width. */ 1522 2, /* vec_reassoc_width. */ 1523 2, /* min_div_recip_mul_sf. */ 1524 2, /* min_div_recip_mul_df. */ 1525 0, /* max_case_values. */ 1526 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1527 (AARCH64_EXTRA_TUNE_PREFER_ADVSIMD_AUTOVEC), /* tune_flags. */ 1528 &generic_prefetch_tune 1529}; 1530 1531static const struct tune_params a64fx_tunings = 1532{ 1533 &a64fx_extra_costs, 1534 &a64fx_addrcost_table, 1535 &a64fx_regmove_cost, 1536 &a64fx_vector_cost, 1537 &generic_branch_cost, 1538 &generic_approx_modes, 1539 SVE_512, /* sve_width */ 1540 4, /* memmov_cost */ 1541 7, /* issue_rate */ 1542 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ 1543 "32", /* function_align. */ 1544 "16", /* jump_align. */ 1545 "32", /* loop_align. */ 1546 4, /* int_reassoc_width. */ 1547 2, /* fp_reassoc_width. */ 1548 2, /* vec_reassoc_width. */ 1549 2, /* min_div_recip_mul_sf. */ 1550 2, /* min_div_recip_mul_df. */ 1551 0, /* max_case_values. */ 1552 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1553 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 1554 &a64fx_prefetch_tune 1555}; 1556 1557/* Support for fine-grained override of the tuning structures. */ 1558struct aarch64_tuning_override_function 1559{ 1560 const char* name; 1561 void (*parse_override)(const char*, struct tune_params*); 1562}; 1563 1564static void aarch64_parse_fuse_string (const char*, struct tune_params*); 1565static void aarch64_parse_tune_string (const char*, struct tune_params*); 1566static void aarch64_parse_sve_width_string (const char*, struct tune_params*); 1567 1568static const struct aarch64_tuning_override_function 1569aarch64_tuning_override_functions[] = 1570{ 1571 { "fuse", aarch64_parse_fuse_string }, 1572 { "tune", aarch64_parse_tune_string }, 1573 { "sve_width", aarch64_parse_sve_width_string }, 1574 { NULL, NULL } 1575}; 1576 1577/* A processor implementing AArch64. 
*/ 1578struct processor 1579{ 1580 const char *const name; 1581 enum aarch64_processor ident; 1582 enum aarch64_processor sched_core; 1583 enum aarch64_arch arch; 1584 unsigned architecture_version; 1585 const uint64_t flags; 1586 const struct tune_params *const tune; 1587}; 1588 1589/* Architectures implementing AArch64. */ 1590static const struct processor all_architectures[] = 1591{ 1592#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \ 1593 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL}, 1594#include "aarch64-arches.def" 1595 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL} 1596}; 1597 1598/* Processor cores implementing AArch64. */ 1599static const struct processor all_cores[] = 1600{ 1601#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \ 1602 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \ 1603 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \ 1604 FLAGS, &COSTS##_tunings}, 1605#include "aarch64-cores.def" 1606 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8, 1607 AARCH64_FL_FOR_ARCH8, &generic_tunings}, 1608 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL} 1609}; 1610 1611 1612/* Target specification. These are populated by the -march, -mtune, -mcpu 1613 handling code or by target attributes. */ 1614static const struct processor *selected_arch; 1615static const struct processor *selected_cpu; 1616static const struct processor *selected_tune; 1617 1618enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A; 1619 1620/* The current tuning set. */ 1621struct tune_params aarch64_tune_params = generic_tunings; 1622 1623/* Check whether an 'aarch64_vector_pcs' attribute is valid. */ 1624 1625static tree 1626handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree, 1627 int, bool *no_add_attrs) 1628{ 1629 /* Since we set fn_type_req to true, the caller should have checked 1630 this for us. */ 1631 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node)); 1632 switch ((arm_pcs) fntype_abi (*node).id ()) 1633 { 1634 case ARM_PCS_AAPCS64: 1635 case ARM_PCS_SIMD: 1636 return NULL_TREE; 1637 1638 case ARM_PCS_SVE: 1639 error ("the %qE attribute cannot be applied to an SVE function type", 1640 name); 1641 *no_add_attrs = true; 1642 return NULL_TREE; 1643 1644 case ARM_PCS_TLSDESC: 1645 case ARM_PCS_UNKNOWN: 1646 break; 1647 } 1648 gcc_unreachable (); 1649} 1650 1651/* Table of machine attributes. */ 1652static const struct attribute_spec aarch64_attribute_table[] = 1653{ 1654 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, 1655 affects_type_identity, handler, exclude } */ 1656 { "aarch64_vector_pcs", 0, 0, false, true, true, true, 1657 handle_aarch64_vector_pcs_attribute, NULL }, 1658 { "arm_sve_vector_bits", 1, 1, false, true, false, true, 1659 aarch64_sve::handle_arm_sve_vector_bits_attribute, 1660 NULL }, 1661 { "Advanced SIMD type", 0, 0, false, true, false, true, NULL, NULL }, 1662 { "SVE type", 3, 3, false, true, false, true, NULL, NULL }, 1663 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL }, 1664 { NULL, 0, 0, false, false, false, false, NULL, NULL } 1665}; 1666 1667#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0) 1668 1669/* An ISA extension in the co-processor and main instruction set space. 
*/ 1670struct aarch64_option_extension 1671{ 1672 const char *const name; 1673 const unsigned long flags_on; 1674 const unsigned long flags_off; 1675}; 1676 1677typedef enum aarch64_cond_code 1678{ 1679 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL, 1680 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT, 1681 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV 1682} 1683aarch64_cc; 1684 1685#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1)) 1686 1687struct aarch64_branch_protect_type 1688{ 1689 /* The type's name that the user passes to the branch-protection option 1690 string. */ 1691 const char* name; 1692 /* Function to handle the protection type and set global variables. 1693 First argument is the string token corresponding with this type and the 1694 second argument is the next token in the option string. 1695 Return values: 1696 * AARCH64_PARSE_OK: Handling was sucessful. 1697 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller 1698 should print an error. 1699 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its 1700 own error. */ 1701 enum aarch64_parse_opt_result (*handler)(char*, char*); 1702 /* A list of types that can follow this type in the option string. */ 1703 const aarch64_branch_protect_type* subtypes; 1704 unsigned int num_subtypes; 1705}; 1706 1707static enum aarch64_parse_opt_result 1708aarch64_handle_no_branch_protection (char* str, char* rest) 1709{ 1710 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE; 1711 aarch64_enable_bti = 0; 1712 if (rest) 1713 { 1714 error ("unexpected %<%s%> after %<%s%>", rest, str); 1715 return AARCH64_PARSE_INVALID_FEATURE; 1716 } 1717 return AARCH64_PARSE_OK; 1718} 1719 1720static enum aarch64_parse_opt_result 1721aarch64_handle_standard_branch_protection (char* str, char* rest) 1722{ 1723 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF; 1724 aarch64_ra_sign_key = AARCH64_KEY_A; 1725 aarch64_enable_bti = 1; 1726 if (rest) 1727 { 1728 error ("unexpected %<%s%> after %<%s%>", rest, str); 1729 return AARCH64_PARSE_INVALID_FEATURE; 1730 } 1731 return AARCH64_PARSE_OK; 1732} 1733 1734static enum aarch64_parse_opt_result 1735aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED, 1736 char* rest ATTRIBUTE_UNUSED) 1737{ 1738 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF; 1739 aarch64_ra_sign_key = AARCH64_KEY_A; 1740 return AARCH64_PARSE_OK; 1741} 1742 1743static enum aarch64_parse_opt_result 1744aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED, 1745 char* rest ATTRIBUTE_UNUSED) 1746{ 1747 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL; 1748 return AARCH64_PARSE_OK; 1749} 1750 1751static enum aarch64_parse_opt_result 1752aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED, 1753 char* rest ATTRIBUTE_UNUSED) 1754{ 1755 aarch64_ra_sign_key = AARCH64_KEY_B; 1756 return AARCH64_PARSE_OK; 1757} 1758 1759static enum aarch64_parse_opt_result 1760aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED, 1761 char* rest ATTRIBUTE_UNUSED) 1762{ 1763 aarch64_enable_bti = 1; 1764 return AARCH64_PARSE_OK; 1765} 1766 1767static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = { 1768 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 }, 1769 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 }, 1770 { NULL, NULL, NULL, 0 } 1771}; 1772 1773static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = { 1774 { "none", aarch64_handle_no_branch_protection, NULL, 0 }, 1775 { "standard", 
aarch64_handle_standard_branch_protection, NULL, 0 }, 1776 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes, 1777 ARRAY_SIZE (aarch64_pac_ret_subtypes) }, 1778 { "bti", aarch64_handle_bti_protection, NULL, 0 }, 1779 { NULL, NULL, NULL, 0 } 1780}; 1781 1782/* The condition codes of the processor, and the inverse function. */ 1783static const char * const aarch64_condition_codes[] = 1784{ 1785 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc", 1786 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv" 1787}; 1788 1789/* The preferred condition codes for SVE conditions. */ 1790static const char *const aarch64_sve_condition_codes[] = 1791{ 1792 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc", 1793 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv" 1794}; 1795 1796/* Return the assembly token for svpattern value VALUE. */ 1797 1798static const char * 1799svpattern_token (enum aarch64_svpattern pattern) 1800{ 1801 switch (pattern) 1802 { 1803#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER; 1804 AARCH64_FOR_SVPATTERN (CASE) 1805#undef CASE 1806 case AARCH64_NUM_SVPATTERNS: 1807 break; 1808 } 1809 gcc_unreachable (); 1810} 1811 1812/* Return the location of a piece that is known to be passed or returned 1813 in registers. FIRST_ZR is the first unused vector argument register 1814 and FIRST_PR is the first unused predicate argument register. */ 1815 1816rtx 1817pure_scalable_type_info::piece::get_rtx (unsigned int first_zr, 1818 unsigned int first_pr) const 1819{ 1820 gcc_assert (VECTOR_MODE_P (mode) 1821 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS 1822 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS); 1823 1824 if (num_zr > 0 && num_pr == 0) 1825 return gen_rtx_REG (mode, first_zr); 1826 1827 if (num_zr == 0 && num_pr == 1) 1828 return gen_rtx_REG (mode, first_pr); 1829 1830 gcc_unreachable (); 1831} 1832 1833/* Return the total number of vector registers required by the PST. */ 1834 1835unsigned int 1836pure_scalable_type_info::num_zr () const 1837{ 1838 unsigned int res = 0; 1839 for (unsigned int i = 0; i < pieces.length (); ++i) 1840 res += pieces[i].num_zr; 1841 return res; 1842} 1843 1844/* Return the total number of predicate registers required by the PST. */ 1845 1846unsigned int 1847pure_scalable_type_info::num_pr () const 1848{ 1849 unsigned int res = 0; 1850 for (unsigned int i = 0; i < pieces.length (); ++i) 1851 res += pieces[i].num_pr; 1852 return res; 1853} 1854 1855/* Return the location of a PST that is known to be passed or returned 1856 in registers. FIRST_ZR is the first unused vector argument register 1857 and FIRST_PR is the first unused predicate argument register. */ 1858 1859rtx 1860pure_scalable_type_info::get_rtx (machine_mode mode, 1861 unsigned int first_zr, 1862 unsigned int first_pr) const 1863{ 1864 /* Try to return a single REG if possible. This leads to better 1865 code generation; it isn't required for correctness. */ 1866 if (mode == pieces[0].mode) 1867 { 1868 gcc_assert (pieces.length () == 1); 1869 return pieces[0].get_rtx (first_zr, first_pr); 1870 } 1871 1872 /* Build up a PARALLEL that contains the individual pieces. 
*/ 1873 rtvec rtxes = rtvec_alloc (pieces.length ()); 1874 for (unsigned int i = 0; i < pieces.length (); ++i) 1875 { 1876 rtx reg = pieces[i].get_rtx (first_zr, first_pr); 1877 rtx offset = gen_int_mode (pieces[i].offset, Pmode); 1878 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset); 1879 first_zr += pieces[i].num_zr; 1880 first_pr += pieces[i].num_pr; 1881 } 1882 return gen_rtx_PARALLEL (mode, rtxes); 1883} 1884 1885/* Analyze whether TYPE is a Pure Scalable Type according to the rules 1886 in the AAPCS64. */ 1887 1888pure_scalable_type_info::analysis_result 1889pure_scalable_type_info::analyze (const_tree type) 1890{ 1891 /* Prevent accidental reuse. */ 1892 gcc_assert (pieces.is_empty ()); 1893 1894 /* No code will be generated for erroneous types, so we won't establish 1895 an ABI mapping. */ 1896 if (type == error_mark_node) 1897 return NO_ABI_IDENTITY; 1898 1899 /* Zero-sized types disappear in the language->ABI mapping. */ 1900 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type))) 1901 return NO_ABI_IDENTITY; 1902 1903 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */ 1904 piece p = {}; 1905 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr)) 1906 { 1907 machine_mode mode = TYPE_MODE_RAW (type); 1908 gcc_assert (VECTOR_MODE_P (mode) 1909 && (!TARGET_SVE || aarch64_sve_mode_p (mode))); 1910 1911 p.mode = p.orig_mode = mode; 1912 add_piece (p); 1913 return IS_PST; 1914 } 1915 1916 /* Check for user-defined PSTs. */ 1917 if (TREE_CODE (type) == ARRAY_TYPE) 1918 return analyze_array (type); 1919 if (TREE_CODE (type) == RECORD_TYPE) 1920 return analyze_record (type); 1921 1922 return ISNT_PST; 1923} 1924 1925/* Analyze a type that is known not to be passed or returned in memory. 1926 Return true if it has an ABI identity and is a Pure Scalable Type. */ 1927 1928bool 1929pure_scalable_type_info::analyze_registers (const_tree type) 1930{ 1931 analysis_result result = analyze (type); 1932 gcc_assert (result != DOESNT_MATTER); 1933 return result == IS_PST; 1934} 1935 1936/* Subroutine of analyze for handling ARRAY_TYPEs. */ 1937 1938pure_scalable_type_info::analysis_result 1939pure_scalable_type_info::analyze_array (const_tree type) 1940{ 1941 /* Analyze the element type. */ 1942 pure_scalable_type_info element_info; 1943 analysis_result result = element_info.analyze (TREE_TYPE (type)); 1944 if (result != IS_PST) 1945 return result; 1946 1947 /* An array of unknown, flexible or variable length will be passed and 1948 returned by reference whatever we do. */ 1949 tree nelts_minus_one = array_type_nelts (type); 1950 if (!tree_fits_uhwi_p (nelts_minus_one)) 1951 return DOESNT_MATTER; 1952 1953 /* Likewise if the array is constant-sized but too big to be interesting. 1954 The double checks against MAX_PIECES are to protect against overflow. */ 1955 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one); 1956 if (count > MAX_PIECES) 1957 return DOESNT_MATTER; 1958 count += 1; 1959 if (count * element_info.pieces.length () > MAX_PIECES) 1960 return DOESNT_MATTER; 1961 1962 /* The above checks should have weeded out elements of unknown size. */ 1963 poly_uint64 element_bytes; 1964 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes)) 1965 gcc_unreachable (); 1966 1967 /* Build up the list of individual vectors and predicates. 
*/ 1968 gcc_assert (!element_info.pieces.is_empty ()); 1969 for (unsigned int i = 0; i < count; ++i) 1970 for (unsigned int j = 0; j < element_info.pieces.length (); ++j) 1971 { 1972 piece p = element_info.pieces[j]; 1973 p.offset += i * element_bytes; 1974 add_piece (p); 1975 } 1976 return IS_PST; 1977} 1978 1979/* Subroutine of analyze for handling RECORD_TYPEs. */ 1980 1981pure_scalable_type_info::analysis_result 1982pure_scalable_type_info::analyze_record (const_tree type) 1983{ 1984 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field)) 1985 { 1986 if (TREE_CODE (field) != FIELD_DECL) 1987 continue; 1988 1989 /* Zero-sized fields disappear in the language->ABI mapping. */ 1990 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field))) 1991 continue; 1992 1993 /* All fields with an ABI identity must be PSTs for the record as 1994 a whole to be a PST. If any individual field is too big to be 1995 interesting then the record is too. */ 1996 pure_scalable_type_info field_info; 1997 analysis_result subresult = field_info.analyze (TREE_TYPE (field)); 1998 if (subresult == NO_ABI_IDENTITY) 1999 continue; 2000 if (subresult != IS_PST) 2001 return subresult; 2002 2003 /* Since all previous fields are PSTs, we ought to be able to track 2004 the field offset using poly_ints. */ 2005 tree bitpos = bit_position (field); 2006 gcc_assert (poly_int_tree_p (bitpos)); 2007 2008 /* For the same reason, it shouldn't be possible to create a PST field 2009 whose offset isn't byte-aligned. */ 2010 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos), 2011 BITS_PER_UNIT); 2012 2013 /* Punt if the record is too big to be interesting. */ 2014 poly_uint64 bytepos; 2015 if (!wide_bytepos.to_uhwi (&bytepos) 2016 || pieces.length () + field_info.pieces.length () > MAX_PIECES) 2017 return DOESNT_MATTER; 2018 2019 /* Add the individual vectors and predicates in the field to the 2020 record's list. */ 2021 gcc_assert (!field_info.pieces.is_empty ()); 2022 for (unsigned int i = 0; i < field_info.pieces.length (); ++i) 2023 { 2024 piece p = field_info.pieces[i]; 2025 p.offset += bytepos; 2026 add_piece (p); 2027 } 2028 } 2029 /* Empty structures disappear in the language->ABI mapping. */ 2030 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST; 2031} 2032 2033/* Add P to the list of pieces in the type. */ 2034 2035void 2036pure_scalable_type_info::add_piece (const piece &p) 2037{ 2038 /* Try to fold the new piece into the previous one to form a 2039 single-mode PST. For example, if we see three consecutive vectors 2040 of the same mode, we can represent them using the corresponding 2041 3-tuple mode. 2042 2043 This is purely an optimization. 
*/ 2044 if (!pieces.is_empty ()) 2045 { 2046 piece &prev = pieces.last (); 2047 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode)); 2048 unsigned int nelems1, nelems2; 2049 if (prev.orig_mode == p.orig_mode 2050 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset) 2051 && constant_multiple_p (GET_MODE_NUNITS (prev.mode), 2052 GET_MODE_NUNITS (p.orig_mode), &nelems1) 2053 && constant_multiple_p (GET_MODE_NUNITS (p.mode), 2054 GET_MODE_NUNITS (p.orig_mode), &nelems2) 2055 && targetm.array_mode (p.orig_mode, 2056 nelems1 + nelems2).exists (&prev.mode)) 2057 { 2058 prev.num_zr += p.num_zr; 2059 prev.num_pr += p.num_pr; 2060 return; 2061 } 2062 } 2063 pieces.quick_push (p); 2064} 2065 2066/* Return true if at least one possible value of type TYPE includes at 2067 least one object of Pure Scalable Type, in the sense of the AAPCS64. 2068 2069 This is a relatively expensive test for some types, so it should 2070 generally be made as late as possible. */ 2071 2072static bool 2073aarch64_some_values_include_pst_objects_p (const_tree type) 2074{ 2075 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type))) 2076 return false; 2077 2078 if (aarch64_sve::builtin_type_p (type)) 2079 return true; 2080 2081 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE) 2082 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type)); 2083 2084 if (RECORD_OR_UNION_TYPE_P (type)) 2085 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field)) 2086 if (TREE_CODE (field) == FIELD_DECL 2087 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field))) 2088 return true; 2089 2090 return false; 2091} 2092 2093/* Return the descriptor of the SIMD ABI. */ 2094 2095static const predefined_function_abi & 2096aarch64_simd_abi (void) 2097{ 2098 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD]; 2099 if (!simd_abi.initialized_p ()) 2100 { 2101 HARD_REG_SET full_reg_clobbers 2102 = default_function_abi.full_reg_clobbers (); 2103 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) 2104 if (FP_SIMD_SAVED_REGNUM_P (regno)) 2105 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno); 2106 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers); 2107 } 2108 return simd_abi; 2109} 2110 2111/* Return the descriptor of the SVE PCS. */ 2112 2113static const predefined_function_abi & 2114aarch64_sve_abi (void) 2115{ 2116 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE]; 2117 if (!sve_abi.initialized_p ()) 2118 { 2119 HARD_REG_SET full_reg_clobbers 2120 = default_function_abi.full_reg_clobbers (); 2121 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno) 2122 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno); 2123 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno) 2124 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno); 2125 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers); 2126 } 2127 return sve_abi; 2128} 2129 2130/* If X is an UNSPEC_SALT_ADDR expression, return the address that it 2131 wraps, otherwise return X itself. */ 2132 2133static rtx 2134strip_salt (rtx x) 2135{ 2136 rtx search = x; 2137 if (GET_CODE (search) == CONST) 2138 search = XEXP (search, 0); 2139 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR) 2140 x = XVECEXP (search, 0, 0); 2141 return x; 2142} 2143 2144/* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the 2145 expression. 
*/ 2146 2147static rtx 2148strip_offset_and_salt (rtx addr, poly_int64 *offset) 2149{ 2150 return strip_salt (strip_offset (addr, offset)); 2151} 2152 2153/* Generate code to enable conditional branches in functions over 1 MiB. */ 2154const char * 2155aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest, 2156 const char * branch_format) 2157{ 2158 rtx_code_label * tmp_label = gen_label_rtx (); 2159 char label_buf[256]; 2160 char buffer[128]; 2161 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest, 2162 CODE_LABEL_NUMBER (tmp_label)); 2163 const char *label_ptr = targetm.strip_name_encoding (label_buf); 2164 rtx dest_label = operands[pos_label]; 2165 operands[pos_label] = tmp_label; 2166 2167 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr); 2168 output_asm_insn (buffer, operands); 2169 2170 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr); 2171 operands[pos_label] = dest_label; 2172 output_asm_insn (buffer, operands); 2173 return ""; 2174} 2175 2176void 2177aarch64_err_no_fpadvsimd (machine_mode mode) 2178{ 2179 if (TARGET_GENERAL_REGS_ONLY) 2180 if (FLOAT_MODE_P (mode)) 2181 error ("%qs is incompatible with the use of floating-point types", 2182 "-mgeneral-regs-only"); 2183 else 2184 error ("%qs is incompatible with the use of vector types", 2185 "-mgeneral-regs-only"); 2186 else 2187 if (FLOAT_MODE_P (mode)) 2188 error ("%qs feature modifier is incompatible with the use of" 2189 " floating-point types", "+nofp"); 2190 else 2191 error ("%qs feature modifier is incompatible with the use of" 2192 " vector types", "+nofp"); 2193} 2194 2195/* Report when we try to do something that requires SVE when SVE is disabled. 2196 This is an error of last resort and isn't very high-quality. It usually 2197 involves attempts to measure the vector length in some way. */ 2198static void 2199aarch64_report_sve_required (void) 2200{ 2201 static bool reported_p = false; 2202 2203 /* Avoid reporting a slew of messages for a single oversight. */ 2204 if (reported_p) 2205 return; 2206 2207 error ("this operation requires the SVE ISA extension"); 2208 inform (input_location, "you can enable SVE using the command-line" 2209 " option %<-march%>, or by using the %<target%>" 2210 " attribute or pragma"); 2211 reported_p = true; 2212} 2213 2214/* Return true if REGNO is P0-P15 or one of the special FFR-related 2215 registers. */ 2216inline bool 2217pr_or_ffr_regnum_p (unsigned int regno) 2218{ 2219 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM; 2220} 2221 2222/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS. 2223 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and 2224 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much 2225 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS 2226 and GENERAL_REGS is lower than the memory cost (in this case the best class 2227 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its 2228 cost results in bad allocations with many redundant int<->FP moves which 2229 are expensive on various cores. 2230 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but 2231 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class 2232 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't 2233 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode. 
2234 The result of this is that it is no longer inefficient to have a higher 2235 memory move cost than the register move cost. 2236*/ 2237 2238static reg_class_t 2239aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class, 2240 reg_class_t best_class) 2241{ 2242 machine_mode mode; 2243 2244 if (!reg_class_subset_p (GENERAL_REGS, allocno_class) 2245 || !reg_class_subset_p (FP_REGS, allocno_class)) 2246 return allocno_class; 2247 2248 if (!reg_class_subset_p (GENERAL_REGS, best_class) 2249 || !reg_class_subset_p (FP_REGS, best_class)) 2250 return best_class; 2251 2252 mode = PSEUDO_REGNO_MODE (regno); 2253 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS; 2254} 2255 2256static unsigned int 2257aarch64_min_divisions_for_recip_mul (machine_mode mode) 2258{ 2259 if (GET_MODE_UNIT_SIZE (mode) == 4) 2260 return aarch64_tune_params.min_div_recip_mul_sf; 2261 return aarch64_tune_params.min_div_recip_mul_df; 2262} 2263 2264/* Return the reassociation width of treeop OPC with mode MODE. */ 2265static int 2266aarch64_reassociation_width (unsigned opc, machine_mode mode) 2267{ 2268 if (VECTOR_MODE_P (mode)) 2269 return aarch64_tune_params.vec_reassoc_width; 2270 if (INTEGRAL_MODE_P (mode)) 2271 return aarch64_tune_params.int_reassoc_width; 2272 /* Avoid reassociating floating point addition so we emit more FMAs. */ 2273 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR) 2274 return aarch64_tune_params.fp_reassoc_width; 2275 return 1; 2276} 2277 2278/* Provide a mapping from gcc register numbers to dwarf register numbers. */ 2279unsigned 2280aarch64_dbx_register_number (unsigned regno) 2281{ 2282 if (GP_REGNUM_P (regno)) 2283 return AARCH64_DWARF_R0 + regno - R0_REGNUM; 2284 else if (regno == SP_REGNUM) 2285 return AARCH64_DWARF_SP; 2286 else if (FP_REGNUM_P (regno)) 2287 return AARCH64_DWARF_V0 + regno - V0_REGNUM; 2288 else if (PR_REGNUM_P (regno)) 2289 return AARCH64_DWARF_P0 + regno - P0_REGNUM; 2290 else if (regno == VG_REGNUM) 2291 return AARCH64_DWARF_VG; 2292 2293 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no 2294 equivalent DWARF register. */ 2295 return DWARF_FRAME_REGISTERS; 2296} 2297 2298/* If X is a CONST_DOUBLE, return its bit representation as a constant 2299 integer, otherwise return X unmodified. */ 2300static rtx 2301aarch64_bit_representation (rtx x) 2302{ 2303 if (CONST_DOUBLE_P (x)) 2304 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x); 2305 return x; 2306} 2307 2308/* Return true if MODE is any of the Advanced SIMD structure modes. */ 2309static bool 2310aarch64_advsimd_struct_mode_p (machine_mode mode) 2311{ 2312 return (TARGET_SIMD 2313 && (mode == OImode || mode == CImode || mode == XImode)); 2314} 2315 2316/* Return true if MODE is an SVE predicate mode. */ 2317static bool 2318aarch64_sve_pred_mode_p (machine_mode mode) 2319{ 2320 return (TARGET_SVE 2321 && (mode == VNx16BImode 2322 || mode == VNx8BImode 2323 || mode == VNx4BImode 2324 || mode == VNx2BImode)); 2325} 2326 2327/* Three mutually-exclusive flags describing a vector or predicate type. */ 2328const unsigned int VEC_ADVSIMD = 1; 2329const unsigned int VEC_SVE_DATA = 2; 2330const unsigned int VEC_SVE_PRED = 4; 2331/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate 2332 a structure of 2, 3 or 4 vectors. */ 2333const unsigned int VEC_STRUCT = 8; 2334/* Can be used in combination with VEC_SVE_DATA to indicate that the 2335 vector has fewer significant bytes than a full SVE vector. 
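   For example, VNx4HImode occupies only 64 bits of each 128-bit block of an
   SVE register, so it is classified below as VEC_SVE_DATA | VEC_PARTIAL.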
*/ 2336const unsigned int VEC_PARTIAL = 16; 2337/* Useful combinations of the above. */ 2338const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED; 2339const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA; 2340 2341/* Return a set of flags describing the vector properties of mode MODE. 2342 Ignore modes that are not supported by the current target. */ 2343static unsigned int 2344aarch64_classify_vector_mode (machine_mode mode) 2345{ 2346 if (aarch64_advsimd_struct_mode_p (mode)) 2347 return VEC_ADVSIMD | VEC_STRUCT; 2348 2349 if (aarch64_sve_pred_mode_p (mode)) 2350 return VEC_SVE_PRED; 2351 2352 /* Make the decision based on the mode's enum value rather than its 2353 properties, so that we keep the correct classification regardless 2354 of -msve-vector-bits. */ 2355 switch (mode) 2356 { 2357 /* Partial SVE QI vectors. */ 2358 case E_VNx2QImode: 2359 case E_VNx4QImode: 2360 case E_VNx8QImode: 2361 /* Partial SVE HI vectors. */ 2362 case E_VNx2HImode: 2363 case E_VNx4HImode: 2364 /* Partial SVE SI vector. */ 2365 case E_VNx2SImode: 2366 /* Partial SVE HF vectors. */ 2367 case E_VNx2HFmode: 2368 case E_VNx4HFmode: 2369 /* Partial SVE SF vector. */ 2370 case E_VNx2SFmode: 2371 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0; 2372 2373 case E_VNx16QImode: 2374 case E_VNx8HImode: 2375 case E_VNx4SImode: 2376 case E_VNx2DImode: 2377 case E_VNx8BFmode: 2378 case E_VNx8HFmode: 2379 case E_VNx4SFmode: 2380 case E_VNx2DFmode: 2381 return TARGET_SVE ? VEC_SVE_DATA : 0; 2382 2383 /* x2 SVE vectors. */ 2384 case E_VNx32QImode: 2385 case E_VNx16HImode: 2386 case E_VNx8SImode: 2387 case E_VNx4DImode: 2388 case E_VNx16BFmode: 2389 case E_VNx16HFmode: 2390 case E_VNx8SFmode: 2391 case E_VNx4DFmode: 2392 /* x3 SVE vectors. */ 2393 case E_VNx48QImode: 2394 case E_VNx24HImode: 2395 case E_VNx12SImode: 2396 case E_VNx6DImode: 2397 case E_VNx24BFmode: 2398 case E_VNx24HFmode: 2399 case E_VNx12SFmode: 2400 case E_VNx6DFmode: 2401 /* x4 SVE vectors. */ 2402 case E_VNx64QImode: 2403 case E_VNx32HImode: 2404 case E_VNx16SImode: 2405 case E_VNx8DImode: 2406 case E_VNx32BFmode: 2407 case E_VNx32HFmode: 2408 case E_VNx16SFmode: 2409 case E_VNx8DFmode: 2410 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0; 2411 2412 /* 64-bit Advanced SIMD vectors. */ 2413 case E_V8QImode: 2414 case E_V4HImode: 2415 case E_V2SImode: 2416 /* ...E_V1DImode doesn't exist. */ 2417 case E_V4HFmode: 2418 case E_V4BFmode: 2419 case E_V2SFmode: 2420 case E_V1DFmode: 2421 /* 128-bit Advanced SIMD vectors. */ 2422 case E_V16QImode: 2423 case E_V8HImode: 2424 case E_V4SImode: 2425 case E_V2DImode: 2426 case E_V8HFmode: 2427 case E_V8BFmode: 2428 case E_V4SFmode: 2429 case E_V2DFmode: 2430 return TARGET_SIMD ? VEC_ADVSIMD : 0; 2431 2432 default: 2433 return 0; 2434 } 2435} 2436 2437/* Return true if MODE is any of the data vector modes, including 2438 structure modes. */ 2439static bool 2440aarch64_vector_data_mode_p (machine_mode mode) 2441{ 2442 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA; 2443} 2444 2445/* Return true if MODE is any form of SVE mode, including predicates, 2446 vectors and structures. */ 2447bool 2448aarch64_sve_mode_p (machine_mode mode) 2449{ 2450 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE; 2451} 2452 2453/* Return true if MODE is an SVE data vector mode; either a single vector 2454 or a structure of vectors. 
*/ 2455static bool 2456aarch64_sve_data_mode_p (machine_mode mode) 2457{ 2458 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA; 2459} 2460 2461/* Return the number of defined bytes in one constituent vector of 2462 SVE mode MODE, which has vector flags VEC_FLAGS. */ 2463static poly_int64 2464aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags) 2465{ 2466 if (vec_flags & VEC_PARTIAL) 2467 /* A single partial vector. */ 2468 return GET_MODE_SIZE (mode); 2469 2470 if (vec_flags & VEC_SVE_DATA) 2471 /* A single vector or a tuple. */ 2472 return BYTES_PER_SVE_VECTOR; 2473 2474 /* A single predicate. */ 2475 gcc_assert (vec_flags & VEC_SVE_PRED); 2476 return BYTES_PER_SVE_PRED; 2477} 2478 2479/* Implement target hook TARGET_ARRAY_MODE. */ 2480static opt_machine_mode 2481aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems) 2482{ 2483 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA 2484 && IN_RANGE (nelems, 2, 4)) 2485 return mode_for_vector (GET_MODE_INNER (mode), 2486 GET_MODE_NUNITS (mode) * nelems); 2487 2488 return opt_machine_mode (); 2489} 2490 2491/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */ 2492static bool 2493aarch64_array_mode_supported_p (machine_mode mode, 2494 unsigned HOST_WIDE_INT nelems) 2495{ 2496 if (TARGET_SIMD 2497 && (AARCH64_VALID_SIMD_QREG_MODE (mode) 2498 || AARCH64_VALID_SIMD_DREG_MODE (mode)) 2499 && (nelems >= 2 && nelems <= 4)) 2500 return true; 2501 2502 return false; 2503} 2504 2505/* MODE is some form of SVE vector mode. For data modes, return the number 2506 of vector register bits that each element of MODE occupies, such as 64 2507 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored 2508 in a 64-bit container). For predicate modes, return the number of 2509 data bits controlled by each significant predicate bit. */ 2510 2511static unsigned int 2512aarch64_sve_container_bits (machine_mode mode) 2513{ 2514 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 2515 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED) 2516 ? BITS_PER_SVE_VECTOR 2517 : GET_MODE_BITSIZE (mode)); 2518 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode)); 2519} 2520 2521/* Return the SVE predicate mode to use for elements that have 2522 ELEM_NBYTES bytes, if such a mode exists. */ 2523 2524opt_machine_mode 2525aarch64_sve_pred_mode (unsigned int elem_nbytes) 2526{ 2527 if (TARGET_SVE) 2528 { 2529 if (elem_nbytes == 1) 2530 return VNx16BImode; 2531 if (elem_nbytes == 2) 2532 return VNx8BImode; 2533 if (elem_nbytes == 4) 2534 return VNx4BImode; 2535 if (elem_nbytes == 8) 2536 return VNx2BImode; 2537 } 2538 return opt_machine_mode (); 2539} 2540 2541/* Return the SVE predicate mode that should be used to control 2542 SVE mode MODE. */ 2543 2544machine_mode 2545aarch64_sve_pred_mode (machine_mode mode) 2546{ 2547 unsigned int bits = aarch64_sve_container_bits (mode); 2548 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require (); 2549} 2550 2551/* Implement TARGET_VECTORIZE_GET_MASK_MODE. */ 2552 2553static opt_machine_mode 2554aarch64_get_mask_mode (machine_mode mode) 2555{ 2556 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 2557 if (vec_flags & VEC_SVE_DATA) 2558 return aarch64_sve_pred_mode (mode); 2559 2560 return default_get_mask_mode (mode); 2561} 2562 2563/* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. 
*/ 2564 2565opt_machine_mode 2566aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits) 2567{ 2568 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode) 2569 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT); 2570 machine_mode mode; 2571 FOR_EACH_MODE_IN_CLASS (mode, mclass) 2572 if (inner_mode == GET_MODE_INNER (mode) 2573 && known_eq (nunits, GET_MODE_NUNITS (mode)) 2574 && aarch64_sve_data_mode_p (mode)) 2575 return mode; 2576 return opt_machine_mode (); 2577} 2578 2579/* Return the integer element mode associated with SVE mode MODE. */ 2580 2581static scalar_int_mode 2582aarch64_sve_element_int_mode (machine_mode mode) 2583{ 2584 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL 2585 ? BITS_PER_SVE_VECTOR 2586 : GET_MODE_BITSIZE (mode)); 2587 unsigned int elt_bits = vector_element_size (vector_bits, 2588 GET_MODE_NUNITS (mode)); 2589 return int_mode_for_size (elt_bits, 0).require (); 2590} 2591 2592/* Return an integer element mode that contains exactly 2593 aarch64_sve_container_bits (MODE) bits. This is wider than 2594 aarch64_sve_element_int_mode if MODE is a partial vector, 2595 otherwise it's the same. */ 2596 2597static scalar_int_mode 2598aarch64_sve_container_int_mode (machine_mode mode) 2599{ 2600 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require (); 2601} 2602 2603/* Return the integer vector mode associated with SVE mode MODE. 2604 Unlike related_int_vector_mode, this can handle the case in which 2605 MODE is a predicate (and thus has a different total size). */ 2606 2607machine_mode 2608aarch64_sve_int_mode (machine_mode mode) 2609{ 2610 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode); 2611 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require (); 2612} 2613 2614/* Implement TARGET_VECTORIZE_RELATED_MODE. */ 2615 2616static opt_machine_mode 2617aarch64_vectorize_related_mode (machine_mode vector_mode, 2618 scalar_mode element_mode, 2619 poly_uint64 nunits) 2620{ 2621 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode); 2622 2623 /* If we're operating on SVE vectors, try to return an SVE mode. */ 2624 poly_uint64 sve_nunits; 2625 if ((vec_flags & VEC_SVE_DATA) 2626 && multiple_p (BYTES_PER_SVE_VECTOR, 2627 GET_MODE_SIZE (element_mode), &sve_nunits)) 2628 { 2629 machine_mode sve_mode; 2630 if (maybe_ne (nunits, 0U)) 2631 { 2632 /* Try to find a full or partial SVE mode with exactly 2633 NUNITS units. */ 2634 if (multiple_p (sve_nunits, nunits) 2635 && aarch64_sve_data_mode (element_mode, 2636 nunits).exists (&sve_mode)) 2637 return sve_mode; 2638 } 2639 else 2640 { 2641 /* Take the preferred number of units from the number of bytes 2642 that fit in VECTOR_MODE. We always start by "autodetecting" 2643 a full vector mode with preferred_simd_mode, so vectors 2644 chosen here will also be full vector modes. Then 2645 autovectorize_vector_modes tries smaller starting modes 2646 and thus smaller preferred numbers of units. */ 2647 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode)); 2648 if (aarch64_sve_data_mode (element_mode, 2649 sve_nunits).exists (&sve_mode)) 2650 return sve_mode; 2651 } 2652 } 2653 2654 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. 
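   For example, if VECTOR_MODE is V2SImode and ELEMENT_MODE is DImode, we
   return V2DImode (one 128-bit vector) rather than splitting the operation
   across two 64-bit vectors.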
*/ 2655 if ((vec_flags & VEC_ADVSIMD) 2656 && known_eq (nunits, 0U) 2657 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U) 2658 && maybe_ge (GET_MODE_BITSIZE (element_mode) 2659 * GET_MODE_NUNITS (vector_mode), 128U)) 2660 { 2661 machine_mode res = aarch64_simd_container_mode (element_mode, 128); 2662 if (VECTOR_MODE_P (res)) 2663 return res; 2664 } 2665 2666 return default_vectorize_related_mode (vector_mode, element_mode, nunits); 2667} 2668 2669/* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations, 2670 prefer to use the first arithmetic operand as the else value if 2671 the else value doesn't matter, since that exactly matches the SVE 2672 destructive merging form. For ternary operations we could either 2673 pick the first operand and use FMAD-like instructions or the last 2674 operand and use FMLA-like instructions; the latter seems more 2675 natural. */ 2676 2677static tree 2678aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops) 2679{ 2680 return nops == 3 ? ops[2] : ops[0]; 2681} 2682 2683/* Implement TARGET_HARD_REGNO_NREGS. */ 2684 2685static unsigned int 2686aarch64_hard_regno_nregs (unsigned regno, machine_mode mode) 2687{ 2688 /* ??? Logically we should only need to provide a value when 2689 HARD_REGNO_MODE_OK says that the combination is valid, 2690 but at the moment we need to handle all modes. Just ignore 2691 any runtime parts for registers that can't store them. */ 2692 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode)); 2693 switch (aarch64_regno_regclass (regno)) 2694 { 2695 case FP_REGS: 2696 case FP_LO_REGS: 2697 case FP_LO8_REGS: 2698 { 2699 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 2700 if (vec_flags & VEC_SVE_DATA) 2701 return exact_div (GET_MODE_SIZE (mode), 2702 aarch64_vl_bytes (mode, vec_flags)).to_constant (); 2703 return CEIL (lowest_size, UNITS_PER_VREG); 2704 } 2705 case PR_REGS: 2706 case PR_LO_REGS: 2707 case PR_HI_REGS: 2708 case FFR_REGS: 2709 case PR_AND_FFR_REGS: 2710 return 1; 2711 default: 2712 return CEIL (lowest_size, UNITS_PER_WORD); 2713 } 2714 gcc_unreachable (); 2715} 2716 2717/* Implement TARGET_HARD_REGNO_MODE_OK. */ 2718 2719static bool 2720aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode) 2721{ 2722 if (GET_MODE_CLASS (mode) == MODE_CC) 2723 return regno == CC_REGNUM; 2724 2725 if (regno == VG_REGNUM) 2726 /* This must have the same size as _Unwind_Word. */ 2727 return mode == DImode; 2728 2729 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 2730 if (vec_flags & VEC_SVE_PRED) 2731 return pr_or_ffr_regnum_p (regno); 2732 2733 if (pr_or_ffr_regnum_p (regno)) 2734 return false; 2735 2736 if (regno == SP_REGNUM) 2737 /* The purpose of comparing with ptr_mode is to support the 2738 global register variable associated with the stack pointer 2739 register via the syntax of asm ("wsp") in ILP32. 
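   (Under ILP32 such a variable has ptr_mode, i.e. SImode, which is why
   ptr_mode is accepted here in addition to Pmode.)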
*/ 2740 return mode == Pmode || mode == ptr_mode; 2741 2742 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM) 2743 return mode == Pmode; 2744 2745 if (GP_REGNUM_P (regno)) 2746 { 2747 if (vec_flags & VEC_ANY_SVE) 2748 return false; 2749 if (known_le (GET_MODE_SIZE (mode), 8)) 2750 return true; 2751 if (known_le (GET_MODE_SIZE (mode), 16)) 2752 return (regno & 1) == 0; 2753 } 2754 else if (FP_REGNUM_P (regno)) 2755 { 2756 if (vec_flags & VEC_STRUCT) 2757 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM; 2758 else 2759 return !VECTOR_MODE_P (mode) || vec_flags != 0; 2760 } 2761 2762 return false; 2763} 2764 2765/* Return true if a function with type FNTYPE returns its value in 2766 SVE vector or predicate registers. */ 2767 2768static bool 2769aarch64_returns_value_in_sve_regs_p (const_tree fntype) 2770{ 2771 tree return_type = TREE_TYPE (fntype); 2772 2773 pure_scalable_type_info pst_info; 2774 switch (pst_info.analyze (return_type)) 2775 { 2776 case pure_scalable_type_info::IS_PST: 2777 return (pst_info.num_zr () <= NUM_FP_ARG_REGS 2778 && pst_info.num_pr () <= NUM_PR_ARG_REGS); 2779 2780 case pure_scalable_type_info::DOESNT_MATTER: 2781 gcc_assert (aarch64_return_in_memory_1 (return_type)); 2782 return false; 2783 2784 case pure_scalable_type_info::NO_ABI_IDENTITY: 2785 case pure_scalable_type_info::ISNT_PST: 2786 return false; 2787 } 2788 gcc_unreachable (); 2789} 2790 2791/* Return true if a function with type FNTYPE takes arguments in 2792 SVE vector or predicate registers. */ 2793 2794static bool 2795aarch64_takes_arguments_in_sve_regs_p (const_tree fntype) 2796{ 2797 CUMULATIVE_ARGS args_so_far_v; 2798 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX, 2799 NULL_TREE, 0, true); 2800 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v); 2801 2802 for (tree chain = TYPE_ARG_TYPES (fntype); 2803 chain && chain != void_list_node; 2804 chain = TREE_CHAIN (chain)) 2805 { 2806 tree arg_type = TREE_VALUE (chain); 2807 if (arg_type == error_mark_node) 2808 return false; 2809 2810 function_arg_info arg (arg_type, /*named=*/true); 2811 apply_pass_by_reference_rules (&args_so_far_v, arg); 2812 pure_scalable_type_info pst_info; 2813 if (pst_info.analyze_registers (arg.type)) 2814 { 2815 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr (); 2816 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr (); 2817 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS); 2818 return true; 2819 } 2820 2821 targetm.calls.function_arg_advance (args_so_far, arg); 2822 } 2823 return false; 2824} 2825 2826/* Implement TARGET_FNTYPE_ABI. */ 2827 2828static const predefined_function_abi & 2829aarch64_fntype_abi (const_tree fntype) 2830{ 2831 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype))) 2832 return aarch64_simd_abi (); 2833 2834 if (aarch64_returns_value_in_sve_regs_p (fntype) 2835 || aarch64_takes_arguments_in_sve_regs_p (fntype)) 2836 return aarch64_sve_abi (); 2837 2838 return default_function_abi; 2839} 2840 2841/* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */ 2842 2843static bool 2844aarch64_compatible_vector_types_p (const_tree type1, const_tree type2) 2845{ 2846 return (aarch64_sve::builtin_type_p (type1) 2847 == aarch64_sve::builtin_type_p (type2)); 2848} 2849 2850/* Return true if we should emit CFI for register REGNO. 
*/ 2851 2852static bool 2853aarch64_emit_cfi_for_reg_p (unsigned int regno) 2854{ 2855 return (GP_REGNUM_P (regno) 2856 || !default_function_abi.clobbers_full_reg_p (regno)); 2857} 2858 2859/* Return the mode we should use to save and restore register REGNO. */ 2860 2861static machine_mode 2862aarch64_reg_save_mode (unsigned int regno) 2863{ 2864 if (GP_REGNUM_P (regno)) 2865 return DImode; 2866 2867 if (FP_REGNUM_P (regno)) 2868 switch (crtl->abi->id ()) 2869 { 2870 case ARM_PCS_AAPCS64: 2871 /* Only the low 64 bits are saved by the base PCS. */ 2872 return DFmode; 2873 2874 case ARM_PCS_SIMD: 2875 /* The vector PCS saves the low 128 bits (which is the full 2876 register on non-SVE targets). */ 2877 return TFmode; 2878 2879 case ARM_PCS_SVE: 2880 /* Use vectors of DImode for registers that need frame 2881 information, so that the first 64 bytes of the save slot 2882 are always the equivalent of what storing D<n> would give. */ 2883 if (aarch64_emit_cfi_for_reg_p (regno)) 2884 return VNx2DImode; 2885 2886 /* Use vectors of bytes otherwise, so that the layout is 2887 endian-agnostic, and so that we can use LDR and STR for 2888 big-endian targets. */ 2889 return VNx16QImode; 2890 2891 case ARM_PCS_TLSDESC: 2892 case ARM_PCS_UNKNOWN: 2893 break; 2894 } 2895 2896 if (PR_REGNUM_P (regno)) 2897 /* Save the full predicate register. */ 2898 return VNx16BImode; 2899 2900 gcc_unreachable (); 2901} 2902 2903/* Implement TARGET_INSN_CALLEE_ABI. */ 2904 2905const predefined_function_abi & 2906aarch64_insn_callee_abi (const rtx_insn *insn) 2907{ 2908 rtx pat = PATTERN (insn); 2909 gcc_assert (GET_CODE (pat) == PARALLEL); 2910 rtx unspec = XVECEXP (pat, 0, 1); 2911 gcc_assert (GET_CODE (unspec) == UNSPEC 2912 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI); 2913 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))]; 2914} 2915 2916/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves 2917 the lower 64 bits of a 128-bit register. Tell the compiler the callee 2918 clobbers the top 64 bits when restoring the bottom 64 bits. */ 2919 2920static bool 2921aarch64_hard_regno_call_part_clobbered (unsigned int abi_id, 2922 unsigned int regno, 2923 machine_mode mode) 2924{ 2925 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE) 2926 { 2927 poly_int64 per_register_size = GET_MODE_SIZE (mode); 2928 unsigned int nregs = hard_regno_nregs (regno, mode); 2929 if (nregs > 1) 2930 per_register_size = exact_div (per_register_size, nregs); 2931 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC) 2932 return maybe_gt (per_register_size, 16); 2933 return maybe_gt (per_register_size, 8); 2934 } 2935 return false; 2936} 2937 2938/* Implement REGMODE_NATURAL_SIZE. */ 2939poly_uint64 2940aarch64_regmode_natural_size (machine_mode mode) 2941{ 2942 /* The natural size for SVE data modes is one SVE data vector, 2943 and similarly for predicates. We can't independently modify 2944 anything smaller than that. */ 2945 /* ??? For now, only do this for variable-width SVE registers. 2946 Doing it for constant-sized registers breaks lower-subreg.c. */ 2947 /* ??? And once that's fixed, we should probably have similar 2948 code for Advanced SIMD. */ 2949 if (!aarch64_sve_vg.is_constant ()) 2950 { 2951 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 2952 if (vec_flags & VEC_SVE_PRED) 2953 return BYTES_PER_SVE_PRED; 2954 if (vec_flags & VEC_SVE_DATA) 2955 return BYTES_PER_SVE_VECTOR; 2956 } 2957 return UNITS_PER_WORD; 2958} 2959 2960/* Implement HARD_REGNO_CALLER_SAVE_MODE. 
*/ 2961machine_mode 2962aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned, 2963 machine_mode mode) 2964{ 2965 /* The predicate mode determines which bits are significant and 2966 which are "don't care". Decreasing the number of lanes would 2967 lose data while increasing the number of lanes would make bits 2968 unnecessarily significant. */ 2969 if (PR_REGNUM_P (regno)) 2970 return mode; 2971 if (known_ge (GET_MODE_SIZE (mode), 4)) 2972 return mode; 2973 else 2974 return SImode; 2975} 2976 2977/* Return true if I's bits are consecutive ones from the MSB. */ 2978bool 2979aarch64_high_bits_all_ones_p (HOST_WIDE_INT i) 2980{ 2981 return exact_log2 (-i) != HOST_WIDE_INT_M1; 2982} 2983 2984/* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so 2985 that strcpy from constants will be faster. */ 2986 2987static HOST_WIDE_INT 2988aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align) 2989{ 2990 if (TREE_CODE (exp) == STRING_CST && !optimize_size) 2991 return MAX (align, BITS_PER_WORD); 2992 return align; 2993} 2994 2995/* Return true if calls to DECL should be treated as 2996 long-calls (ie called via a register). */ 2997static bool 2998aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED) 2999{ 3000 return false; 3001} 3002 3003/* Return true if calls to symbol-ref SYM should be treated as 3004 long-calls (ie called via a register). */ 3005bool 3006aarch64_is_long_call_p (rtx sym) 3007{ 3008 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym)); 3009} 3010 3011/* Return true if calls to symbol-ref SYM should not go through 3012 plt stubs. */ 3013 3014bool 3015aarch64_is_noplt_call_p (rtx sym) 3016{ 3017 const_tree decl = SYMBOL_REF_DECL (sym); 3018 3019 if (flag_pic 3020 && decl 3021 && (!flag_plt 3022 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl))) 3023 && !targetm.binds_local_p (decl)) 3024 return true; 3025 3026 return false; 3027} 3028 3029/* Return true if the offsets to a zero/sign-extract operation 3030 represent an expression that matches an extend operation. The 3031 operands represent the parameters from 3032 3033 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */ 3034bool 3035aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm, 3036 rtx extract_imm) 3037{ 3038 HOST_WIDE_INT mult_val, extract_val; 3039 3040 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm)) 3041 return false; 3042 3043 mult_val = INTVAL (mult_imm); 3044 extract_val = INTVAL (extract_imm); 3045 3046 if (extract_val > 8 3047 && extract_val < GET_MODE_BITSIZE (mode) 3048 && exact_log2 (extract_val & ~7) > 0 3049 && (extract_val & 7) <= 4 3050 && mult_val == (1 << (extract_val & 7))) 3051 return true; 3052 3053 return false; 3054} 3055 3056/* Emit an insn that's a simple single-set. Both the operands must be 3057 known to be valid. */ 3058inline static rtx_insn * 3059emit_set_insn (rtx x, rtx y) 3060{ 3061 return emit_insn (gen_rtx_SET (x, y)); 3062} 3063 3064/* X and Y are two things to compare using CODE. Emit the compare insn and 3065 return the rtx for register 0 in the proper mode. 
*/ 3066rtx 3067aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y) 3068{ 3069 machine_mode cmp_mode = GET_MODE (x); 3070 machine_mode cc_mode; 3071 rtx cc_reg; 3072 3073 if (cmp_mode == TImode) 3074 { 3075 gcc_assert (code == NE); 3076 3077 cc_mode = CCmode; 3078 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM); 3079 3080 rtx x_lo = operand_subword (x, 0, 0, TImode); 3081 rtx y_lo = operand_subword (y, 0, 0, TImode); 3082 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo)); 3083 3084 rtx x_hi = operand_subword (x, 1, 0, TImode); 3085 rtx y_hi = operand_subword (y, 1, 0, TImode); 3086 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi, 3087 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx), 3088 GEN_INT (AARCH64_EQ))); 3089 } 3090 else 3091 { 3092 cc_mode = SELECT_CC_MODE (code, x, y); 3093 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM); 3094 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y)); 3095 } 3096 return cc_reg; 3097} 3098 3099/* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */ 3100 3101static rtx 3102aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y, 3103 machine_mode y_mode) 3104{ 3105 if (y_mode == E_QImode || y_mode == E_HImode) 3106 { 3107 if (CONST_INT_P (y)) 3108 { 3109 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode)); 3110 y_mode = SImode; 3111 } 3112 else 3113 { 3114 rtx t, cc_reg; 3115 machine_mode cc_mode; 3116 3117 t = gen_rtx_ZERO_EXTEND (SImode, y); 3118 t = gen_rtx_COMPARE (CC_SWPmode, t, x); 3119 cc_mode = CC_SWPmode; 3120 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM); 3121 emit_set_insn (cc_reg, t); 3122 return cc_reg; 3123 } 3124 } 3125 3126 if (!aarch64_plus_operand (y, y_mode)) 3127 y = force_reg (y_mode, y); 3128 3129 return aarch64_gen_compare_reg (code, x, y); 3130} 3131 3132/* Build the SYMBOL_REF for __tls_get_addr. */ 3133 3134static GTY(()) rtx tls_get_addr_libfunc; 3135 3136rtx 3137aarch64_tls_get_addr (void) 3138{ 3139 if (!tls_get_addr_libfunc) 3140 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr"); 3141 return tls_get_addr_libfunc; 3142} 3143 3144/* Return the TLS model to use for ADDR. */ 3145 3146static enum tls_model 3147tls_symbolic_operand_type (rtx addr) 3148{ 3149 enum tls_model tls_kind = TLS_MODEL_NONE; 3150 poly_int64 offset; 3151 addr = strip_offset_and_salt (addr, &offset); 3152 if (GET_CODE (addr) == SYMBOL_REF) 3153 tls_kind = SYMBOL_REF_TLS_MODEL (addr); 3154 3155 return tls_kind; 3156} 3157 3158/* We'll allow lo_sum's in addresses in our legitimate addresses 3159 so that combine would take care of combining addresses where 3160 necessary, but for generation purposes, we'll generate the address 3161 as : 3162 RTL Absolute 3163 tmp = hi (symbol_ref); adrp x1, foo 3164 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo 3165 nop 3166 3167 PIC TLS 3168 adrp x1, :got:foo adrp tmp, :tlsgd:foo 3169 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo 3170 bl __tls_get_addr 3171 nop 3172 3173 Load TLS symbol, depending on TLS mechanism and TLS access model. 
3174
3175 Global Dynamic - Traditional TLS:
3176 adrp tmp, :tlsgd:imm
3177 add dest, tmp, #:tlsgd_lo12:imm
3178 bl __tls_get_addr
3179
3180 Global Dynamic - TLS Descriptors:
3181 adrp dest, :tlsdesc:imm
3182 ldr tmp, [dest, #:tlsdesc_lo12:imm]
3183 add dest, dest, #:tlsdesc_lo12:imm
3184 blr tmp
3185 mrs tp, tpidr_el0
3186 add dest, dest, tp
3187
3188 Initial Exec:
3189 mrs tp, tpidr_el0
3190 adrp tmp, :gottprel:imm
3191 ldr dest, [tmp, #:gottprel_lo12:imm]
3192 add dest, dest, tp
3193
3194 Local Exec:
3195 mrs tp, tpidr_el0
3196 add t0, tp, #:tprel_hi12:imm, lsl #12
3197 add t0, t0, #:tprel_lo12_nc:imm
3198*/
3199
3200static void
3201aarch64_load_symref_appropriately (rtx dest, rtx imm,
3202 enum aarch64_symbol_type type)
3203{
3204 switch (type)
3205 {
3206 case SYMBOL_SMALL_ABSOLUTE:
3207 {
3208 /* In ILP32, the mode of dest can be either SImode or DImode. */
3209 rtx tmp_reg = dest;
3210 machine_mode mode = GET_MODE (dest);
3211
3212 gcc_assert (mode == Pmode || mode == ptr_mode);
3213
3214 if (can_create_pseudo_p ())
3215 tmp_reg = gen_reg_rtx (mode);
3216
3217 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
3218 emit_insn (gen_add_losym (dest, tmp_reg, imm));
3219 return;
3220 }
3221
3222 case SYMBOL_TINY_ABSOLUTE:
3223 emit_insn (gen_rtx_SET (dest, imm));
3224 return;
3225
3226 case SYMBOL_SMALL_GOT_28K:
3227 {
3228 machine_mode mode = GET_MODE (dest);
3229 rtx gp_rtx = pic_offset_table_rtx;
3230 rtx insn;
3231 rtx mem;
3232
3233 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
3234 here before RTL expansion. Tree IVOPTS generates RTL patterns to
3235 estimate rtx costs, in which case pic_offset_table_rtx is not
3236 initialized. In that case there is no need to generate the first
3237 adrp instruction, as the final cost for global variable access is
3238 one instruction. */
3239 if (gp_rtx != NULL)
3240 {
3241 /* -fpic with -mcmodel=small allows a 32K GOT table size (but since
3242 we use the page base as the GOT base, the first page may be
3243 wasted; in the worst case only 28K of space is left for the GOT).
3244
3245 The generated instruction sequence for accessing a global
3246 variable is:
3247
3248 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
3249
3250 Only one instruction is needed, but we must initialize
3251 pic_offset_table_rtx properly. We generate an initialization insn
3252 for every global access and rely on CSE to remove redundant ones.
3253
3254 The final instruction sequence for multiple global variable
3255 accesses will look like:
3256
3257 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
3258
3259 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
3260 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
3261 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
3262 ... */
3263
3264 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
3265 crtl->uses_pic_offset_table = 1;
3266 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
3267
3268 if (mode != GET_MODE (gp_rtx))
3269 gp_rtx = gen_lowpart (mode, gp_rtx);
3270
3271 }
3272
3273 if (mode == ptr_mode)
3274 {
3275 if (mode == DImode)
3276 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
3277 else
3278 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
3279
3280 mem = XVECEXP (SET_SRC (insn), 0, 0);
3281 }
3282 else
3283 {
3284 gcc_assert (mode == Pmode);
3285
3286 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
3287 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3288 }
3289
3290 /* The operand is expected to be a MEM. Whenever the related insn
3291 pattern changes, the code above that extracts MEM should be
3292 updated. */
3293 gcc_assert (GET_CODE (mem) == MEM);
3294 MEM_READONLY_P (mem) = 1;
3295 MEM_NOTRAP_P (mem) = 1;
3296 emit_insn (insn);
3297 return;
3298 }
3299
3300 case SYMBOL_SMALL_GOT_4G:
3301 {
3302 /* In ILP32, the mode of dest can be either SImode or DImode,
3303 while the got entry is always of SImode size. The mode of
3304 dest depends on how dest is used: if dest is assigned to a
3305 pointer (e.g. stored in memory), it has SImode; it may have
3306 DImode if dest is dereferenced to access memory.
3307 This is why we have to handle three different ldr_got_small
3308 patterns here (two patterns for ILP32). */
3309
3310 rtx insn;
3311 rtx mem;
3312 rtx tmp_reg = dest;
3313 machine_mode mode = GET_MODE (dest);
3314
3315 if (can_create_pseudo_p ())
3316 tmp_reg = gen_reg_rtx (mode);
3317
3318 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
3319 if (mode == ptr_mode)
3320 {
3321 if (mode == DImode)
3322 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
3323 else
3324 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
3325
3326 mem = XVECEXP (SET_SRC (insn), 0, 0);
3327 }
3328 else
3329 {
3330 gcc_assert (mode == Pmode);
3331
3332 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
3333 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3334 }
3335
3336 gcc_assert (GET_CODE (mem) == MEM);
3337 MEM_READONLY_P (mem) = 1;
3338 MEM_NOTRAP_P (mem) = 1;
3339 emit_insn (insn);
3340 return;
3341 }
3342
3343 case SYMBOL_SMALL_TLSGD:
3344 {
3345 rtx_insn *insns;
3346 /* The return type of __tls_get_addr is the C pointer type
3347 so use ptr_mode. */
3348 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
3349 rtx tmp_reg = dest;
3350
3351 if (GET_MODE (dest) != ptr_mode)
3352 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
3353
3354 start_sequence ();
3355 if (ptr_mode == SImode)
3356 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
3357 else
3358 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
3359 insns = get_insns ();
3360 end_sequence ();
3361
3362 RTL_CONST_CALL_P (insns) = 1;
3363 emit_libcall_block (insns, tmp_reg, result, imm);
3364 /* Convert back to the mode of dest, adding a zero_extend
3365 from SImode (ptr_mode) to DImode (Pmode). */
3366 if (dest != tmp_reg)
3367 convert_move (dest, tmp_reg, true);
3368 return;
3369 }
3370
3371 case SYMBOL_SMALL_TLSDESC:
3372 {
3373 machine_mode mode = GET_MODE (dest);
3374 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
3375 rtx tp;
3376
3377 gcc_assert (mode == Pmode || mode == ptr_mode);
3378
3379 /* In ILP32, the got entry is always of SImode size. Unlike
3380 small GOT, the dest is fixed at reg 0. */
3381 if (TARGET_ILP32)
3382 emit_insn (gen_tlsdesc_small_si (imm));
3383 else
3384 emit_insn (gen_tlsdesc_small_di (imm));
3385 tp = aarch64_load_tp (NULL);
3386
3387 if (mode != Pmode)
3388 tp = gen_lowpart (mode, tp);
3389
3390 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
3391 if (REG_P (dest))
3392 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3393 return;
3394 }
3395
3396 case SYMBOL_SMALL_TLSIE:
3397 {
3398 /* In ILP32, the mode of dest can be either SImode or DImode,
3399 while the got entry is always of SImode size. The mode of
3400 dest depends on how dest is used: if dest is assigned to a
3401 pointer (e.g. stored in memory), it has SImode; it may have
3402 DImode if dest is dereferenced to access memory.
3403 This is why we have to handle three different tlsie_small 3404 patterns here (two patterns for ILP32). */ 3405 machine_mode mode = GET_MODE (dest); 3406 rtx tmp_reg = gen_reg_rtx (mode); 3407 rtx tp = aarch64_load_tp (NULL); 3408 3409 if (mode == ptr_mode) 3410 { 3411 if (mode == DImode) 3412 emit_insn (gen_tlsie_small_di (tmp_reg, imm)); 3413 else 3414 { 3415 emit_insn (gen_tlsie_small_si (tmp_reg, imm)); 3416 tp = gen_lowpart (mode, tp); 3417 } 3418 } 3419 else 3420 { 3421 gcc_assert (mode == Pmode); 3422 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm)); 3423 } 3424 3425 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg))); 3426 if (REG_P (dest)) 3427 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm); 3428 return; 3429 } 3430 3431 case SYMBOL_TLSLE12: 3432 case SYMBOL_TLSLE24: 3433 case SYMBOL_TLSLE32: 3434 case SYMBOL_TLSLE48: 3435 { 3436 machine_mode mode = GET_MODE (dest); 3437 rtx tp = aarch64_load_tp (NULL); 3438 3439 if (mode != Pmode) 3440 tp = gen_lowpart (mode, tp); 3441 3442 switch (type) 3443 { 3444 case SYMBOL_TLSLE12: 3445 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si) 3446 (dest, tp, imm)); 3447 break; 3448 case SYMBOL_TLSLE24: 3449 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si) 3450 (dest, tp, imm)); 3451 break; 3452 case SYMBOL_TLSLE32: 3453 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si) 3454 (dest, imm)); 3455 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3) 3456 (dest, dest, tp)); 3457 break; 3458 case SYMBOL_TLSLE48: 3459 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si) 3460 (dest, imm)); 3461 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3) 3462 (dest, dest, tp)); 3463 break; 3464 default: 3465 gcc_unreachable (); 3466 } 3467 3468 if (REG_P (dest)) 3469 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm); 3470 return; 3471 } 3472 3473 case SYMBOL_TINY_GOT: 3474 { 3475 rtx insn; 3476 machine_mode mode = GET_MODE (dest); 3477 3478 if (mode == ptr_mode) 3479 insn = gen_ldr_got_tiny (mode, dest, imm); 3480 else 3481 { 3482 gcc_assert (mode == Pmode); 3483 insn = gen_ldr_got_tiny_sidi (dest, imm); 3484 } 3485 3486 emit_insn (insn); 3487 return; 3488 } 3489 3490 case SYMBOL_TINY_TLSIE: 3491 { 3492 machine_mode mode = GET_MODE (dest); 3493 rtx tp = aarch64_load_tp (NULL); 3494 3495 if (mode == ptr_mode) 3496 { 3497 if (mode == DImode) 3498 emit_insn (gen_tlsie_tiny_di (dest, imm, tp)); 3499 else 3500 { 3501 tp = gen_lowpart (mode, tp); 3502 emit_insn (gen_tlsie_tiny_si (dest, imm, tp)); 3503 } 3504 } 3505 else 3506 { 3507 gcc_assert (mode == Pmode); 3508 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp)); 3509 } 3510 3511 if (REG_P (dest)) 3512 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm); 3513 return; 3514 } 3515 3516 default: 3517 gcc_unreachable (); 3518 } 3519} 3520 3521/* Emit a move from SRC to DEST. Assume that the move expanders can 3522 handle all moves if !can_create_pseudo_p (). The distinction is 3523 important because, unlike emit_move_insn, the move expanders know 3524 how to force Pmode objects into the constant pool even when the 3525 constant pool address is not itself legitimate. */ 3526static rtx 3527aarch64_emit_move (rtx dest, rtx src) 3528{ 3529 return (can_create_pseudo_p () 3530 ? emit_move_insn (dest, src) 3531 : emit_move_insn_1 (dest, src)); 3532} 3533 3534/* Apply UNOPTAB to OP and store the result in DEST. 
*/ 3535 3536static void 3537aarch64_emit_unop (rtx dest, optab unoptab, rtx op) 3538{ 3539 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0); 3540 if (dest != tmp) 3541 emit_move_insn (dest, tmp); 3542} 3543 3544/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */ 3545 3546static void 3547aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1) 3548{ 3549 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0, 3550 OPTAB_DIRECT); 3551 if (dest != tmp) 3552 emit_move_insn (dest, tmp); 3553} 3554 3555/* Split a 128-bit move operation into two 64-bit move operations, 3556 taking care to handle partial overlap of register to register 3557 copies. Special cases are needed when moving between GP regs and 3558 FP regs. SRC can be a register, constant or memory; DST a register 3559 or memory. If either operand is memory it must not have any side 3560 effects. */ 3561void 3562aarch64_split_128bit_move (rtx dst, rtx src) 3563{ 3564 rtx dst_lo, dst_hi; 3565 rtx src_lo, src_hi; 3566 3567 machine_mode mode = GET_MODE (dst); 3568 3569 gcc_assert (mode == TImode || mode == TFmode); 3570 gcc_assert (!(side_effects_p (src) || side_effects_p (dst))); 3571 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode); 3572 3573 if (REG_P (dst) && REG_P (src)) 3574 { 3575 int src_regno = REGNO (src); 3576 int dst_regno = REGNO (dst); 3577 3578 /* Handle FP <-> GP regs. */ 3579 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno)) 3580 { 3581 src_lo = gen_lowpart (word_mode, src); 3582 src_hi = gen_highpart (word_mode, src); 3583 3584 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo)); 3585 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi)); 3586 return; 3587 } 3588 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno)) 3589 { 3590 dst_lo = gen_lowpart (word_mode, dst); 3591 dst_hi = gen_highpart (word_mode, dst); 3592 3593 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src)); 3594 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src)); 3595 return; 3596 } 3597 } 3598 3599 dst_lo = gen_lowpart (word_mode, dst); 3600 dst_hi = gen_highpart (word_mode, dst); 3601 src_lo = gen_lowpart (word_mode, src); 3602 src_hi = gen_highpart_mode (word_mode, mode, src); 3603 3604 /* At most one pairing may overlap. */ 3605 if (reg_overlap_mentioned_p (dst_lo, src_hi)) 3606 { 3607 aarch64_emit_move (dst_hi, src_hi); 3608 aarch64_emit_move (dst_lo, src_lo); 3609 } 3610 else 3611 { 3612 aarch64_emit_move (dst_lo, src_lo); 3613 aarch64_emit_move (dst_hi, src_hi); 3614 } 3615} 3616 3617bool 3618aarch64_split_128bit_move_p (rtx dst, rtx src) 3619{ 3620 return (! REG_P (src) 3621 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src)))); 3622} 3623 3624/* Split a complex SIMD combine. */ 3625 3626void 3627aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2) 3628{ 3629 machine_mode src_mode = GET_MODE (src1); 3630 machine_mode dst_mode = GET_MODE (dst); 3631 3632 gcc_assert (VECTOR_MODE_P (dst_mode)); 3633 gcc_assert (register_operand (dst, dst_mode) 3634 && register_operand (src1, src_mode) 3635 && register_operand (src2, src_mode)); 3636 3637 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2)); 3638 return; 3639} 3640 3641/* Split a complex SIMD move. 
*/ 3642 3643void 3644aarch64_split_simd_move (rtx dst, rtx src) 3645{ 3646 machine_mode src_mode = GET_MODE (src); 3647 machine_mode dst_mode = GET_MODE (dst); 3648 3649 gcc_assert (VECTOR_MODE_P (dst_mode)); 3650 3651 if (REG_P (dst) && REG_P (src)) 3652 { 3653 gcc_assert (VECTOR_MODE_P (src_mode)); 3654 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src)); 3655 } 3656} 3657 3658bool 3659aarch64_zero_extend_const_eq (machine_mode xmode, rtx x, 3660 machine_mode ymode, rtx y) 3661{ 3662 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode); 3663 gcc_assert (r != NULL); 3664 return rtx_equal_p (x, r); 3665} 3666 3667/* Return TARGET if it is nonnull and a register of mode MODE. 3668 Otherwise, return a fresh register of mode MODE if we can, 3669 or TARGET reinterpreted as MODE if we can't. */ 3670 3671static rtx 3672aarch64_target_reg (rtx target, machine_mode mode) 3673{ 3674 if (target && REG_P (target) && GET_MODE (target) == mode) 3675 return target; 3676 if (!can_create_pseudo_p ()) 3677 { 3678 gcc_assert (target); 3679 return gen_lowpart (mode, target); 3680 } 3681 return gen_reg_rtx (mode); 3682} 3683 3684/* Return a register that contains the constant in BUILDER, given that 3685 the constant is a legitimate move operand. Use TARGET as the register 3686 if it is nonnull and convenient. */ 3687 3688static rtx 3689aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder) 3690{ 3691 rtx src = builder.build (); 3692 target = aarch64_target_reg (target, GET_MODE (src)); 3693 emit_insn (gen_rtx_SET (target, src)); 3694 return target; 3695} 3696 3697static rtx 3698aarch64_force_temporary (machine_mode mode, rtx x, rtx value) 3699{ 3700 if (can_create_pseudo_p ()) 3701 return force_reg (mode, value); 3702 else 3703 { 3704 gcc_assert (x); 3705 aarch64_emit_move (x, value); 3706 return x; 3707 } 3708} 3709 3710/* Return true if predicate value X is a constant in which every element 3711 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI 3712 value, i.e. as a predicate in which all bits are significant. */ 3713 3714static bool 3715aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x) 3716{ 3717 if (GET_CODE (x) != CONST_VECTOR) 3718 return false; 3719 3720 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode), 3721 GET_MODE_NUNITS (GET_MODE (x))); 3722 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor; 3723 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x); 3724 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern); 3725 3726 unsigned int nelts = const_vector_encoded_nelts (x); 3727 for (unsigned int i = 0; i < nelts; ++i) 3728 { 3729 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i); 3730 if (!CONST_INT_P (elt)) 3731 return false; 3732 3733 builder.quick_push (elt); 3734 for (unsigned int j = 1; j < factor; ++j) 3735 builder.quick_push (const0_rtx); 3736 } 3737 builder.finalize (); 3738 return true; 3739} 3740 3741/* BUILDER contains a predicate constant of mode VNx16BI. Return the 3742 widest predicate element size it can have (that is, the largest size 3743 for which each element would still be 0 or 1). */ 3744 3745unsigned int 3746aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder) 3747{ 3748 /* Start with the most optimistic assumption: that we only need 3749 one bit per pattern. This is what we will use if only the first 3750 bit in each pattern is ever set. 
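   The code below ORs into MASK the index of every nonzero element it finds
   (together with the number of patterns and a cap of GET_MODE_SIZE (DImode),
   i.e. 8 bytes); the lowest set bit of the result, MASK & -MASK, is then the
   widest element size in bytes for which each element is still 0 or 1.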
*/ 3751 unsigned int mask = GET_MODE_SIZE (DImode); 3752 mask |= builder.npatterns (); 3753 3754 /* Look for set bits. */ 3755 unsigned int nelts = builder.encoded_nelts (); 3756 for (unsigned int i = 1; i < nelts; ++i) 3757 if (INTVAL (builder.elt (i)) != 0) 3758 { 3759 if (i & 1) 3760 return 1; 3761 mask |= i; 3762 } 3763 return mask & -mask; 3764} 3765 3766/* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode, 3767 return that predicate mode, otherwise return opt_machine_mode (). */ 3768 3769opt_machine_mode 3770aarch64_ptrue_all_mode (rtx x) 3771{ 3772 gcc_assert (GET_MODE (x) == VNx16BImode); 3773 if (GET_CODE (x) != CONST_VECTOR 3774 || !CONST_VECTOR_DUPLICATE_P (x) 3775 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0)) 3776 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0) 3777 return opt_machine_mode (); 3778 3779 unsigned int nelts = const_vector_encoded_nelts (x); 3780 for (unsigned int i = 1; i < nelts; ++i) 3781 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx) 3782 return opt_machine_mode (); 3783 3784 return aarch64_sve_pred_mode (nelts); 3785} 3786 3787/* BUILDER is a predicate constant of mode VNx16BI. Consider the value 3788 that the constant would have with predicate element size ELT_SIZE 3789 (ignoring the upper bits in each element) and return: 3790 3791 * -1 if all bits are set 3792 * N if the predicate has N leading set bits followed by all clear bits 3793 * 0 if the predicate does not have any of these forms. */ 3794 3795int 3796aarch64_partial_ptrue_length (rtx_vector_builder &builder, 3797 unsigned int elt_size) 3798{ 3799 /* If nelts_per_pattern is 3, we have set bits followed by clear bits 3800 followed by set bits. */ 3801 if (builder.nelts_per_pattern () == 3) 3802 return 0; 3803 3804 /* Skip over leading set bits. */ 3805 unsigned int nelts = builder.encoded_nelts (); 3806 unsigned int i = 0; 3807 for (; i < nelts; i += elt_size) 3808 if (INTVAL (builder.elt (i)) == 0) 3809 break; 3810 unsigned int vl = i / elt_size; 3811 3812 /* Check for the all-true case. */ 3813 if (i == nelts) 3814 return -1; 3815 3816 /* If nelts_per_pattern is 1, then either VL is zero, or we have a 3817 repeating pattern of set bits followed by clear bits. */ 3818 if (builder.nelts_per_pattern () != 2) 3819 return 0; 3820 3821 /* We have a "foreground" value and a duplicated "background" value. 3822 If the background might repeat and the last set bit belongs to it, 3823 we might have set bits followed by clear bits followed by set bits. */ 3824 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ())) 3825 return 0; 3826 3827 /* Make sure that the rest are all clear. */ 3828 for (; i < nelts; i += elt_size) 3829 if (INTVAL (builder.elt (i)) != 0) 3830 return 0; 3831 3832 return vl; 3833} 3834 3835/* See if there is an svpattern that encodes an SVE predicate of mode 3836 PRED_MODE in which the first VL bits are set and the rest are clear. 3837 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS. 3838 A VL of -1 indicates an all-true vector. 
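   For example, VL values 1 to 8 map directly to the VL1 to VL8 patterns
   and power-of-two values from 16 to 256 map to VL16 to VL256.  Other
   lengths only have a pattern when the number of elements is known at
   compile time and the length happens to match MUL3, MUL4, POW2 or ALL;
   e.g. for a fixed-length .B predicate of 64 elements, a VL of 63 can
   use MUL3, but a VL of 48 has no single pattern.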
*/ 3839 3840aarch64_svpattern 3841aarch64_svpattern_for_vl (machine_mode pred_mode, int vl) 3842{ 3843 if (vl < 0) 3844 return AARCH64_SV_ALL; 3845 3846 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode))) 3847 return AARCH64_NUM_SVPATTERNS; 3848 3849 if (vl >= 1 && vl <= 8) 3850 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1)); 3851 3852 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl)) 3853 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4)); 3854 3855 int max_vl; 3856 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl)) 3857 { 3858 if (vl == (max_vl / 3) * 3) 3859 return AARCH64_SV_MUL3; 3860 /* These would only trigger for non-power-of-2 lengths. */ 3861 if (vl == (max_vl & -4)) 3862 return AARCH64_SV_MUL4; 3863 if (vl == (1 << floor_log2 (max_vl))) 3864 return AARCH64_SV_POW2; 3865 if (vl == max_vl) 3866 return AARCH64_SV_ALL; 3867 } 3868 return AARCH64_NUM_SVPATTERNS; 3869} 3870 3871/* Return a VNx16BImode constant in which every sequence of ELT_SIZE 3872 bits has the lowest bit set and the upper bits clear. This is the 3873 VNx16BImode equivalent of a PTRUE for controlling elements of 3874 ELT_SIZE bytes. However, because the constant is VNx16BImode, 3875 all bits are significant, even the upper zeros. */ 3876 3877rtx 3878aarch64_ptrue_all (unsigned int elt_size) 3879{ 3880 rtx_vector_builder builder (VNx16BImode, elt_size, 1); 3881 builder.quick_push (const1_rtx); 3882 for (unsigned int i = 1; i < elt_size; ++i) 3883 builder.quick_push (const0_rtx); 3884 return builder.build (); 3885} 3886 3887/* Return an all-true predicate register of mode MODE. */ 3888 3889rtx 3890aarch64_ptrue_reg (machine_mode mode) 3891{ 3892 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL); 3893 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode)); 3894 return gen_lowpart (mode, reg); 3895} 3896 3897/* Return an all-false predicate register of mode MODE. */ 3898 3899rtx 3900aarch64_pfalse_reg (machine_mode mode) 3901{ 3902 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL); 3903 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode)); 3904 return gen_lowpart (mode, reg); 3905} 3906 3907/* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag 3908 for it. PRED2[0] is the predicate for the instruction whose result 3909 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag 3910 for it. Return true if we can prove that the two predicates are 3911 equivalent for PTEST purposes; that is, if we can replace PRED2[0] 3912 with PRED1[0] without changing behavior. */ 3913 3914bool 3915aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2) 3916{ 3917 machine_mode mode = GET_MODE (pred1[0]); 3918 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL 3919 && mode == GET_MODE (pred2[0]) 3920 && aarch64_sve_ptrue_flag (pred1[1], SImode) 3921 && aarch64_sve_ptrue_flag (pred2[1], SImode)); 3922 3923 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode) 3924 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE); 3925 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode) 3926 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE); 3927 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]); 3928} 3929 3930/* Emit a comparison CMP between OP0 and OP1, both of which have mode 3931 DATA_MODE, and return the result in a predicate of mode PRED_MODE. 3932 Use TARGET as the target register if nonnull and convenient. 
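   The comparison is always governed by an all-true predicate that is
   built here and marked as a known PTRUE.  For instance,
   aarch64_convert_sve_data_to_pred below uses this with a != 0
   comparison, so that a data vector of 0 and -1 elements is turned into
   the corresponding predicate by what is essentially a single predicated
   compare (a CMPNE against zero).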
*/ 3933 3934static rtx 3935aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp, 3936 machine_mode data_mode, rtx op1, rtx op2) 3937{ 3938 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode); 3939 expand_operand ops[5]; 3940 create_output_operand (&ops[0], target, pred_mode); 3941 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode); 3942 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE); 3943 create_input_operand (&ops[3], op1, data_mode); 3944 create_input_operand (&ops[4], op2, data_mode); 3945 expand_insn (icode, 5, ops); 3946 return ops[0].value; 3947} 3948 3949/* Use a comparison to convert integer vector SRC into MODE, which is 3950 the corresponding SVE predicate mode. Use TARGET for the result 3951 if it's nonnull and convenient. */ 3952 3953rtx 3954aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src) 3955{ 3956 machine_mode src_mode = GET_MODE (src); 3957 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode, 3958 src, CONST0_RTX (src_mode)); 3959} 3960 3961/* Return the assembly token for svprfop value PRFOP. */ 3962 3963static const char * 3964svprfop_token (enum aarch64_svprfop prfop) 3965{ 3966 switch (prfop) 3967 { 3968#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER; 3969 AARCH64_FOR_SVPRFOP (CASE) 3970#undef CASE 3971 case AARCH64_NUM_SVPRFOPS: 3972 break; 3973 } 3974 gcc_unreachable (); 3975} 3976 3977/* Return the assembly string for an SVE prefetch operation with 3978 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation 3979 and that SUFFIX is the format for the remaining operands. */ 3980 3981char * 3982aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx, 3983 const char *suffix) 3984{ 3985 static char buffer[128]; 3986 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx); 3987 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s", 3988 mnemonic, svprfop_token (prfop), suffix); 3989 gcc_assert (written < sizeof (buffer)); 3990 return buffer; 3991} 3992 3993/* Check whether we can calculate the number of elements in PATTERN 3994 at compile time, given that there are NELTS_PER_VQ elements per 3995 128-bit block. Return the value if so, otherwise return -1. */ 3996 3997HOST_WIDE_INT 3998aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq) 3999{ 4000 unsigned int vl, const_vg; 4001 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8) 4002 vl = 1 + (pattern - AARCH64_SV_VL1); 4003 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256) 4004 vl = 16 << (pattern - AARCH64_SV_VL16); 4005 else if (aarch64_sve_vg.is_constant (&const_vg)) 4006 { 4007 /* There are two vector granules per quadword. */ 4008 unsigned int nelts = (const_vg / 2) * nelts_per_vq; 4009 switch (pattern) 4010 { 4011 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts); 4012 case AARCH64_SV_MUL4: return nelts & -4; 4013 case AARCH64_SV_MUL3: return (nelts / 3) * 3; 4014 case AARCH64_SV_ALL: return nelts; 4015 default: gcc_unreachable (); 4016 } 4017 } 4018 else 4019 return -1; 4020 4021 /* There are two vector granules per quadword. */ 4022 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq; 4023 if (known_le (vl, nelts_all)) 4024 return vl; 4025 4026 /* Requesting more elements than are available results in a PFALSE. */ 4027 if (known_gt (vl, nelts_all)) 4028 return 0; 4029 4030 return -1; 4031} 4032 4033/* Return true if we can move VALUE into a register using a single 4034 CNT[BHWD] instruction. 
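   For example, (2, 2) -- the number of .D elements in the whole
   vector -- is a plain CNTD, (6, 6) can be done as CNTD with MUL #3,
   and (32, 32) as CNTB with MUL #2; an odd factor such as (3, 3),
   or one that would need a multiplier above 16 such as (34, 34),
   cannot be done in one instruction.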
*/ 4035 4036 static bool 4037 aarch64_sve_cnt_immediate_p (poly_int64 value) 4038 { 4039 HOST_WIDE_INT factor = value.coeffs[0]; 4040 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */ 4041 return (value.coeffs[1] == factor 4042 && IN_RANGE (factor, 2, 16 * 16) 4043 && (factor & 1) == 0 4044 && factor <= 16 * (factor & -factor)); 4045 } 4046 4047 /* Likewise for rtx X. */ 4048 4049 bool 4050 aarch64_sve_cnt_immediate_p (rtx x) 4051 { 4052 poly_int64 value; 4053 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value); 4054 } 4055 4056 /* Return the asm string for an instruction with a CNT-like vector size 4057 operand (a vector pattern followed by a multiplier in the range [1, 16]). 4058 PREFIX is the mnemonic without the size suffix and OPERANDS is the 4059 first part of the operands template (the part that comes before the 4060 vector size itself). PATTERN is the pattern to use. FACTOR is the 4061 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements 4062 in each quadword. If it is zero, we can use any element size. */ 4063 4064 static char * 4065 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands, 4066 aarch64_svpattern pattern, 4067 unsigned int factor, 4068 unsigned int nelts_per_vq) 4069 { 4070 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")]; 4071 4072 if (nelts_per_vq == 0) 4073 /* There is some overlap in the ranges of the four CNT instructions. 4074 Here we always use the smallest possible element size, so that the 4075 multiplier is 1 wherever possible. */ 4076 nelts_per_vq = factor & -factor; 4077 int shift = std::min (exact_log2 (nelts_per_vq), 4); 4078 gcc_assert (IN_RANGE (shift, 1, 4)); 4079 char suffix = "dwhb"[shift - 1]; 4080 4081 factor >>= shift; 4082 unsigned int written; 4083 if (pattern == AARCH64_SV_ALL && factor == 1) 4084 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s", 4085 prefix, suffix, operands); 4086 else if (factor == 1) 4087 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s", 4088 prefix, suffix, operands, svpattern_token (pattern)); 4089 else 4090 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d", 4091 prefix, suffix, operands, svpattern_token (pattern), 4092 factor); 4093 gcc_assert (written < sizeof (buffer)); 4094 return buffer; 4095 } 4096 4097 /* Return the asm string for an instruction with a CNT-like vector size 4098 operand (a vector pattern followed by a multiplier in the range [1, 16]). 4099 PREFIX is the mnemonic without the size suffix and OPERANDS is the 4100 first part of the operands template (the part that comes before the 4101 vector size itself). X is the value of the vector size operand, 4102 as a polynomial integer rtx; we need to convert this into an "all" 4103 pattern with a multiplier. */ 4104 4105 char * 4106 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands, 4107 rtx x) 4108 { 4109 poly_int64 value = rtx_to_poly_int64 (x); 4110 gcc_assert (aarch64_sve_cnt_immediate_p (value)); 4111 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL, 4112 value.coeffs[1], 0); 4113 } 4114 4115 /* Return the asm string for an instruction with a CNT-like vector size 4116 operand (a vector pattern followed by a multiplier in the range [1, 16]). 4117 PREFIX is the mnemonic without the size suffix and OPERANDS is the 4118 first part of the operands template (the part that comes before the 4119 vector size itself). CNT_PAT[0..2] are the operands of the 4120 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details.
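   For example, a CNT_PAT of { VL8, 4, 2 } -- pattern VL8, four elements
   per 128-bit quadword (so .S), multiplier 2 -- prints as a CNTW whose
   final operands are "vl8, mul #2".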
*/ 4121 4122char * 4123aarch64_output_sve_cnt_pat_immediate (const char *prefix, 4124 const char *operands, rtx *cnt_pat) 4125{ 4126 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]); 4127 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]); 4128 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq; 4129 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern, 4130 factor, nelts_per_vq); 4131} 4132 4133/* Return true if we can add X using a single SVE INC or DEC instruction. */ 4134 4135bool 4136aarch64_sve_scalar_inc_dec_immediate_p (rtx x) 4137{ 4138 poly_int64 value; 4139 return (poly_int_rtx_p (x, &value) 4140 && (aarch64_sve_cnt_immediate_p (value) 4141 || aarch64_sve_cnt_immediate_p (-value))); 4142} 4143 4144/* Return the asm string for adding SVE INC/DEC immediate OFFSET to 4145 operand 0. */ 4146 4147char * 4148aarch64_output_sve_scalar_inc_dec (rtx offset) 4149{ 4150 poly_int64 offset_value = rtx_to_poly_int64 (offset); 4151 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]); 4152 if (offset_value.coeffs[1] > 0) 4153 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL, 4154 offset_value.coeffs[1], 0); 4155 else 4156 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL, 4157 -offset_value.coeffs[1], 0); 4158} 4159 4160/* Return true if we can add VALUE to a register using a single ADDVL 4161 or ADDPL instruction. */ 4162 4163static bool 4164aarch64_sve_addvl_addpl_immediate_p (poly_int64 value) 4165{ 4166 HOST_WIDE_INT factor = value.coeffs[0]; 4167 if (factor == 0 || value.coeffs[1] != factor) 4168 return false; 4169 /* FACTOR counts VG / 2, so a value of 2 is one predicate width 4170 and a value of 16 is one vector width. */ 4171 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16)) 4172 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2))); 4173} 4174 4175/* Likewise for rtx X. */ 4176 4177bool 4178aarch64_sve_addvl_addpl_immediate_p (rtx x) 4179{ 4180 poly_int64 value; 4181 return (poly_int_rtx_p (x, &value) 4182 && aarch64_sve_addvl_addpl_immediate_p (value)); 4183} 4184 4185/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET 4186 to operand 1 and storing the result in operand 0. */ 4187 4188char * 4189aarch64_output_sve_addvl_addpl (rtx offset) 4190{ 4191 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)]; 4192 poly_int64 offset_value = rtx_to_poly_int64 (offset); 4193 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value)); 4194 4195 int factor = offset_value.coeffs[1]; 4196 if ((factor & 15) == 0) 4197 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16); 4198 else 4199 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2); 4200 return buffer; 4201} 4202 4203/* Return true if X is a valid immediate for an SVE vector INC or DEC 4204 instruction. If it is, store the number of elements in each vector 4205 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication 4206 factor in *FACTOR_OUT (if nonnull). */ 4207 4208bool 4209aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out, 4210 unsigned int *nelts_per_vq_out) 4211{ 4212 rtx elt; 4213 poly_int64 value; 4214 4215 if (!const_vec_duplicate_p (x, &elt) 4216 || !poly_int_rtx_p (elt, &value)) 4217 return false; 4218 4219 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x)); 4220 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2) 4221 /* There's no vector INCB. 
*/ 4222 return false; 4223 4224 HOST_WIDE_INT factor = value.coeffs[0]; 4225 if (value.coeffs[1] != factor) 4226 return false; 4227 4228 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */ 4229 if ((factor % nelts_per_vq) != 0 4230 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq)) 4231 return false; 4232 4233 if (factor_out) 4234 *factor_out = factor; 4235 if (nelts_per_vq_out) 4236 *nelts_per_vq_out = nelts_per_vq; 4237 return true; 4238} 4239 4240/* Return true if X is a valid immediate for an SVE vector INC or DEC 4241 instruction. */ 4242 4243bool 4244aarch64_sve_vector_inc_dec_immediate_p (rtx x) 4245{ 4246 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL); 4247} 4248 4249/* Return the asm template for an SVE vector INC or DEC instruction. 4250 OPERANDS gives the operands before the vector count and X is the 4251 value of the vector count operand itself. */ 4252 4253char * 4254aarch64_output_sve_vector_inc_dec (const char *operands, rtx x) 4255{ 4256 int factor; 4257 unsigned int nelts_per_vq; 4258 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq)) 4259 gcc_unreachable (); 4260 if (factor < 0) 4261 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL, 4262 -factor, nelts_per_vq); 4263 else 4264 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL, 4265 factor, nelts_per_vq); 4266} 4267 4268static int 4269aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate, 4270 scalar_int_mode mode) 4271{ 4272 int i; 4273 unsigned HOST_WIDE_INT val, val2, mask; 4274 int one_match, zero_match; 4275 int num_insns; 4276 4277 val = INTVAL (imm); 4278 4279 if (aarch64_move_imm (val, mode)) 4280 { 4281 if (generate) 4282 emit_insn (gen_rtx_SET (dest, imm)); 4283 return 1; 4284 } 4285 4286 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff 4287 (with XXXX non-zero). In that case check to see if the move can be done in 4288 a smaller mode. */ 4289 val2 = val & 0xffffffff; 4290 if (mode == DImode 4291 && aarch64_move_imm (val2, SImode) 4292 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0)) 4293 { 4294 if (generate) 4295 emit_insn (gen_rtx_SET (dest, GEN_INT (val2))); 4296 4297 /* Check if we have to emit a second instruction by checking to see 4298 if any of the upper 32 bits of the original DI mode value is set. */ 4299 if (val == val2) 4300 return 1; 4301 4302 i = (val >> 48) ? 48 : 32; 4303 4304 if (generate) 4305 emit_insn (gen_insv_immdi (dest, GEN_INT (i), 4306 GEN_INT ((val >> i) & 0xffff))); 4307 4308 return 2; 4309 } 4310 4311 if ((val >> 32) == 0 || mode == SImode) 4312 { 4313 if (generate) 4314 { 4315 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff))); 4316 if (mode == SImode) 4317 emit_insn (gen_insv_immsi (dest, GEN_INT (16), 4318 GEN_INT ((val >> 16) & 0xffff))); 4319 else 4320 emit_insn (gen_insv_immdi (dest, GEN_INT (16), 4321 GEN_INT ((val >> 16) & 0xffff))); 4322 } 4323 return 2; 4324 } 4325 4326 /* Remaining cases are all for DImode. */ 4327 4328 mask = 0xffff; 4329 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) + 4330 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0); 4331 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) + 4332 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0); 4333 4334 if (zero_match != 2 && one_match != 2) 4335 { 4336 /* Try emitting a bitmask immediate with a movk replacing 16 bits. 4337 For a 64-bit bitmask try whether changing 16 bits to all ones or 4338 zeroes creates a valid bitmask. 
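   For example, given a value such as 0x7fff00007fff1234, clearing the
   low 16 bits leaves 0x7fff00007fff0000, which is a valid bitmask
   immediate, so the constant can be built as that MOV followed by a
   MOVK of 0x1234.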
To check any repeated bitmask, 4339 try using 16 bits from the other 32-bit half of val. */ 4340 4341 for (i = 0; i < 64; i += 16, mask <<= 16) 4342 { 4343 val2 = val & ~mask; 4344 if (val2 != val && aarch64_bitmask_imm (val2, mode)) 4345 break; 4346 val2 = val | mask; 4347 if (val2 != val && aarch64_bitmask_imm (val2, mode)) 4348 break; 4349 val2 = val2 & ~mask; 4350 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask); 4351 if (val2 != val && aarch64_bitmask_imm (val2, mode)) 4352 break; 4353 } 4354 if (i != 64) 4355 { 4356 if (generate) 4357 { 4358 emit_insn (gen_rtx_SET (dest, GEN_INT (val2))); 4359 emit_insn (gen_insv_immdi (dest, GEN_INT (i), 4360 GEN_INT ((val >> i) & 0xffff))); 4361 } 4362 return 2; 4363 } 4364 } 4365 4366 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which 4367 are emitted by the initial mov. If one_match > zero_match, skip set bits, 4368 otherwise skip zero bits. */ 4369 4370 num_insns = 1; 4371 mask = 0xffff; 4372 val2 = one_match > zero_match ? ~val : val; 4373 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32; 4374 4375 if (generate) 4376 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match 4377 ? (val | ~(mask << i)) 4378 : (val & (mask << i))))); 4379 for (i += 16; i < 64; i += 16) 4380 { 4381 if ((val2 & (mask << i)) == 0) 4382 continue; 4383 if (generate) 4384 emit_insn (gen_insv_immdi (dest, GEN_INT (i), 4385 GEN_INT ((val >> i) & 0xffff))); 4386 num_insns ++; 4387 } 4388 4389 return num_insns; 4390} 4391 4392/* Return whether imm is a 128-bit immediate which is simple enough to 4393 expand inline. */ 4394bool 4395aarch64_mov128_immediate (rtx imm) 4396{ 4397 if (GET_CODE (imm) == CONST_INT) 4398 return true; 4399 4400 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2); 4401 4402 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0)); 4403 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1)); 4404 4405 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode) 4406 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4; 4407} 4408 4409 4410/* Return the number of temporary registers that aarch64_add_offset_1 4411 would need to add OFFSET to a register. */ 4412 4413static unsigned int 4414aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset) 4415{ 4416 return absu_hwi (offset) < 0x1000000 ? 0 : 1; 4417} 4418 4419/* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for 4420 a non-polynomial OFFSET. MODE is the mode of the addition. 4421 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should 4422 be set and CFA adjustments added to the generated instructions. 4423 4424 TEMP1, if nonnull, is a register of mode MODE that can be used as a 4425 temporary if register allocation is already complete. This temporary 4426 register may overlap DEST but must not overlap SRC. If TEMP1 is known 4427 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting 4428 the immediate again. 4429 4430 Since this function may be used to adjust the stack pointer, we must 4431 ensure that it cannot cause transient stack deallocation (for example 4432 by first incrementing SP and then decrementing when adjusting by a 4433 large immediate). 
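   For example, an offset of 0x123456 (which would itself need two
   instructions to load as an immediate) can be applied as an addition
   of 0x456 followed by an addition of 0x123000; each part is a valid
   12-bit immediate, the second one shifted left by 12.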
*/ 4434 4435static void 4436aarch64_add_offset_1 (scalar_int_mode mode, rtx dest, 4437 rtx src, HOST_WIDE_INT offset, rtx temp1, 4438 bool frame_related_p, bool emit_move_imm) 4439{ 4440 gcc_assert (emit_move_imm || temp1 != NULL_RTX); 4441 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src)); 4442 4443 unsigned HOST_WIDE_INT moffset = absu_hwi (offset); 4444 rtx_insn *insn; 4445 4446 if (!moffset) 4447 { 4448 if (!rtx_equal_p (dest, src)) 4449 { 4450 insn = emit_insn (gen_rtx_SET (dest, src)); 4451 RTX_FRAME_RELATED_P (insn) = frame_related_p; 4452 } 4453 return; 4454 } 4455 4456 /* Single instruction adjustment. */ 4457 if (aarch64_uimm12_shift (moffset)) 4458 { 4459 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset))); 4460 RTX_FRAME_RELATED_P (insn) = frame_related_p; 4461 return; 4462 } 4463 4464 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits 4465 and either: 4466 4467 a) the offset cannot be loaded by a 16-bit move or 4468 b) there is no spare register into which we can move it. */ 4469 if (moffset < 0x1000000 4470 && ((!temp1 && !can_create_pseudo_p ()) 4471 || !aarch64_move_imm (moffset, mode))) 4472 { 4473 HOST_WIDE_INT low_off = moffset & 0xfff; 4474 4475 low_off = offset < 0 ? -low_off : low_off; 4476 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off))); 4477 RTX_FRAME_RELATED_P (insn) = frame_related_p; 4478 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off))); 4479 RTX_FRAME_RELATED_P (insn) = frame_related_p; 4480 return; 4481 } 4482 4483 /* Emit a move immediate if required and an addition/subtraction. */ 4484 if (emit_move_imm) 4485 { 4486 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ()); 4487 temp1 = aarch64_force_temporary (mode, temp1, 4488 gen_int_mode (moffset, mode)); 4489 } 4490 insn = emit_insn (offset < 0 4491 ? gen_sub3_insn (dest, src, temp1) 4492 : gen_add3_insn (dest, src, temp1)); 4493 if (frame_related_p) 4494 { 4495 RTX_FRAME_RELATED_P (insn) = frame_related_p; 4496 rtx adj = plus_constant (mode, src, offset); 4497 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj)); 4498 } 4499} 4500 4501/* Return the number of temporary registers that aarch64_add_offset 4502 would need to move OFFSET into a register or add OFFSET to a register; 4503 ADD_P is true if we want the latter rather than the former. */ 4504 4505static unsigned int 4506aarch64_offset_temporaries (bool add_p, poly_int64 offset) 4507{ 4508 /* This follows the same structure as aarch64_add_offset. */ 4509 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset)) 4510 return 0; 4511 4512 unsigned int count = 0; 4513 HOST_WIDE_INT factor = offset.coeffs[1]; 4514 HOST_WIDE_INT constant = offset.coeffs[0] - factor; 4515 poly_int64 poly_offset (factor, factor); 4516 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset)) 4517 /* Need one register for the ADDVL/ADDPL result. */ 4518 count += 1; 4519 else if (factor != 0) 4520 { 4521 factor = abs (factor); 4522 if (factor > 16 * (factor & -factor)) 4523 /* Need one register for the CNT result and one for the multiplication 4524 factor. If necessary, the second temporary can be reused for the 4525 constant part of the offset. */ 4526 return 2; 4527 /* Need one register for the CNT result (which might then 4528 be shifted). */ 4529 count += 1; 4530 } 4531 return count + aarch64_add_offset_1_temporaries (constant); 4532} 4533 4534/* If X can be represented as a poly_int64, return the number 4535 of temporaries that are required to add it to a register. 
4536 Return -1 otherwise. */ 4537 4538int 4539aarch64_add_offset_temporaries (rtx x) 4540{ 4541 poly_int64 offset; 4542 if (!poly_int_rtx_p (x, &offset)) 4543 return -1; 4544 return aarch64_offset_temporaries (true, offset); 4545} 4546 4547/* Set DEST to SRC + OFFSET. MODE is the mode of the addition. 4548 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should 4549 be set and CFA adjustments added to the generated instructions. 4550 4551 TEMP1, if nonnull, is a register of mode MODE that can be used as a 4552 temporary if register allocation is already complete. This temporary 4553 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC. 4554 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to 4555 false to avoid emitting the immediate again. 4556 4557 TEMP2, if nonnull, is a second temporary register that doesn't 4558 overlap either DEST or REG. 4559 4560 Since this function may be used to adjust the stack pointer, we must 4561 ensure that it cannot cause transient stack deallocation (for example 4562 by first incrementing SP and then decrementing when adjusting by a 4563 large immediate). */ 4564 4565static void 4566aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src, 4567 poly_int64 offset, rtx temp1, rtx temp2, 4568 bool frame_related_p, bool emit_move_imm = true) 4569{ 4570 gcc_assert (emit_move_imm || temp1 != NULL_RTX); 4571 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src)); 4572 gcc_assert (temp1 == NULL_RTX 4573 || !frame_related_p 4574 || !reg_overlap_mentioned_p (temp1, dest)); 4575 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2)); 4576 4577 /* Try using ADDVL or ADDPL to add the whole value. */ 4578 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset)) 4579 { 4580 rtx offset_rtx = gen_int_mode (offset, mode); 4581 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx)); 4582 RTX_FRAME_RELATED_P (insn) = frame_related_p; 4583 return; 4584 } 4585 4586 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an 4587 SVE vector register, over and above the minimum size of 128 bits. 4588 This is equivalent to half the value returned by CNTD with a 4589 vector shape of ALL. */ 4590 HOST_WIDE_INT factor = offset.coeffs[1]; 4591 HOST_WIDE_INT constant = offset.coeffs[0] - factor; 4592 4593 /* Try using ADDVL or ADDPL to add the VG-based part. */ 4594 poly_int64 poly_offset (factor, factor); 4595 if (src != const0_rtx 4596 && aarch64_sve_addvl_addpl_immediate_p (poly_offset)) 4597 { 4598 rtx offset_rtx = gen_int_mode (poly_offset, mode); 4599 if (frame_related_p) 4600 { 4601 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx)); 4602 RTX_FRAME_RELATED_P (insn) = true; 4603 src = dest; 4604 } 4605 else 4606 { 4607 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx); 4608 src = aarch64_force_temporary (mode, temp1, addr); 4609 temp1 = temp2; 4610 temp2 = NULL_RTX; 4611 } 4612 } 4613 /* Otherwise use a CNT-based sequence. */ 4614 else if (factor != 0) 4615 { 4616 /* Use a subtraction if we have a negative factor. */ 4617 rtx_code code = PLUS; 4618 if (factor < 0) 4619 { 4620 factor = -factor; 4621 code = MINUS; 4622 } 4623 4624 /* Calculate CNTD * FACTOR / 2. First try to fold the division 4625 into the multiplication. */ 4626 rtx val; 4627 int shift = 0; 4628 if (factor & 1) 4629 /* Use a right shift by 1. 
*/ 4630 shift = -1; 4631 else 4632 factor /= 2; 4633 HOST_WIDE_INT low_bit = factor & -factor; 4634 if (factor <= 16 * low_bit) 4635 { 4636 if (factor > 16 * 8) 4637 { 4638 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate 4639 the value with the minimum multiplier and shift it into 4640 position. */ 4641 int extra_shift = exact_log2 (low_bit); 4642 shift += extra_shift; 4643 factor >>= extra_shift; 4644 } 4645 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode); 4646 } 4647 else 4648 { 4649 /* Base the factor on LOW_BIT if we can calculate LOW_BIT 4650 directly, since that should increase the chances of being 4651 able to use a shift and add sequence. If LOW_BIT itself 4652 is out of range, just use CNTD. */ 4653 if (low_bit <= 16 * 8) 4654 factor /= low_bit; 4655 else 4656 low_bit = 1; 4657 4658 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode); 4659 val = aarch64_force_temporary (mode, temp1, val); 4660 4661 if (can_create_pseudo_p ()) 4662 { 4663 rtx coeff1 = gen_int_mode (factor, mode); 4664 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true); 4665 } 4666 else 4667 { 4668 /* Go back to using a negative multiplication factor if we have 4669 no register from which to subtract. */ 4670 if (code == MINUS && src == const0_rtx) 4671 { 4672 factor = -factor; 4673 code = PLUS; 4674 } 4675 rtx coeff1 = gen_int_mode (factor, mode); 4676 coeff1 = aarch64_force_temporary (mode, temp2, coeff1); 4677 val = gen_rtx_MULT (mode, val, coeff1); 4678 } 4679 } 4680 4681 if (shift > 0) 4682 { 4683 /* Multiply by 1 << SHIFT. */ 4684 val = aarch64_force_temporary (mode, temp1, val); 4685 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift)); 4686 } 4687 else if (shift == -1) 4688 { 4689 /* Divide by 2. */ 4690 val = aarch64_force_temporary (mode, temp1, val); 4691 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx); 4692 } 4693 4694 /* Calculate SRC +/- CNTD * FACTOR / 2. */ 4695 if (src != const0_rtx) 4696 { 4697 val = aarch64_force_temporary (mode, temp1, val); 4698 val = gen_rtx_fmt_ee (code, mode, src, val); 4699 } 4700 else if (code == MINUS) 4701 { 4702 val = aarch64_force_temporary (mode, temp1, val); 4703 val = gen_rtx_NEG (mode, val); 4704 } 4705 4706 if (constant == 0 || frame_related_p) 4707 { 4708 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val)); 4709 if (frame_related_p) 4710 { 4711 RTX_FRAME_RELATED_P (insn) = true; 4712 add_reg_note (insn, REG_CFA_ADJUST_CFA, 4713 gen_rtx_SET (dest, plus_constant (Pmode, src, 4714 poly_offset))); 4715 } 4716 src = dest; 4717 if (constant == 0) 4718 return; 4719 } 4720 else 4721 { 4722 src = aarch64_force_temporary (mode, temp1, val); 4723 temp1 = temp2; 4724 temp2 = NULL_RTX; 4725 } 4726 4727 emit_move_imm = true; 4728 } 4729 4730 aarch64_add_offset_1 (mode, dest, src, constant, temp1, 4731 frame_related_p, emit_move_imm); 4732} 4733 4734/* Like aarch64_add_offset, but the offset is given as an rtx rather 4735 than a poly_int64. */ 4736 4737void 4738aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src, 4739 rtx offset_rtx, rtx temp1, rtx temp2) 4740{ 4741 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx), 4742 temp1, temp2, false); 4743} 4744 4745/* Add DELTA to the stack pointer, marking the instructions frame-related. 4746 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false 4747 if TEMP1 already contains abs (DELTA). 
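   For example, assuming a DELTA of one full SVE vector plus 32 bytes --
   poly_int64 (48, 16) -- the expansion would typically be an ADDVL of 1
   followed by an ADD of 32.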
*/ 4748 4749static inline void 4750aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm) 4751{ 4752 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta, 4753 temp1, temp2, true, emit_move_imm); 4754} 4755 4756/* Subtract DELTA from the stack pointer, marking the instructions 4757 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary 4758 if nonnull. */ 4759 4760static inline void 4761aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p, 4762 bool emit_move_imm = true) 4763{ 4764 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta, 4765 temp1, temp2, frame_related_p, emit_move_imm); 4766} 4767 4768/* Set DEST to (vec_series BASE STEP). */ 4769 4770static void 4771aarch64_expand_vec_series (rtx dest, rtx base, rtx step) 4772{ 4773 machine_mode mode = GET_MODE (dest); 4774 scalar_mode inner = GET_MODE_INNER (mode); 4775 4776 /* Each operand can be a register or an immediate in the range [-16, 15]. */ 4777 if (!aarch64_sve_index_immediate_p (base)) 4778 base = force_reg (inner, base); 4779 if (!aarch64_sve_index_immediate_p (step)) 4780 step = force_reg (inner, step); 4781 4782 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step)); 4783} 4784 4785/* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE 4786 register of mode MODE. Use TARGET for the result if it's nonnull 4787 and convenient. 4788 4789 The two vector modes must have the same element mode. The behavior 4790 is to duplicate architectural lane N of SRC into architectural lanes 4791 N + I * STEP of the result. On big-endian targets, architectural 4792 lane 0 of an Advanced SIMD vector is the last element of the vector 4793 in memory layout, so for big-endian targets this operation has the 4794 effect of reversing SRC before duplicating it. Callers need to 4795 account for this. */ 4796 4797rtx 4798aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src) 4799{ 4800 machine_mode src_mode = GET_MODE (src); 4801 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode)); 4802 insn_code icode = (BYTES_BIG_ENDIAN 4803 ? code_for_aarch64_vec_duplicate_vq_be (mode) 4804 : code_for_aarch64_vec_duplicate_vq_le (mode)); 4805 4806 unsigned int i = 0; 4807 expand_operand ops[3]; 4808 create_output_operand (&ops[i++], target, mode); 4809 create_output_operand (&ops[i++], src, src_mode); 4810 if (BYTES_BIG_ENDIAN) 4811 { 4812 /* Create a PARALLEL describing the reversal of SRC. */ 4813 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode); 4814 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq, 4815 nelts_per_vq - 1, -1); 4816 create_fixed_operand (&ops[i++], sel); 4817 } 4818 expand_insn (icode, i, ops); 4819 return ops[0].value; 4820} 4821 4822/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch 4823 the memory image into DEST. Return true on success. */ 4824 4825static bool 4826aarch64_expand_sve_ld1rq (rtx dest, rtx src) 4827{ 4828 src = force_const_mem (GET_MODE (src), src); 4829 if (!src) 4830 return false; 4831 4832 /* Make sure that the address is legitimate. 
*/ 4833 if (!aarch64_sve_ld1rq_operand_p (src)) 4834 { 4835 rtx addr = force_reg (Pmode, XEXP (src, 0)); 4836 src = replace_equiv_address (src, addr); 4837 } 4838 4839 machine_mode mode = GET_MODE (dest); 4840 machine_mode pred_mode = aarch64_sve_pred_mode (mode); 4841 rtx ptrue = aarch64_ptrue_reg (pred_mode); 4842 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue)); 4843 return true; 4844} 4845 4846/* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed 4847 by N "background" values. Try to move it into TARGET using: 4848 4849 PTRUE PRED.<T>, VL<N> 4850 MOV TRUE.<T>, #<foreground> 4851 MOV FALSE.<T>, #<background> 4852 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T> 4853 4854 The PTRUE is always a single instruction but the MOVs might need a 4855 longer sequence. If the background value is zero (as it often is), 4856 the sequence can sometimes collapse to a PTRUE followed by a 4857 zero-predicated move. 4858 4859 Return the target on success, otherwise return null. */ 4860 4861static rtx 4862aarch64_expand_sve_const_vector_sel (rtx target, rtx src) 4863{ 4864 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2); 4865 4866 /* Make sure that the PTRUE is valid. */ 4867 machine_mode mode = GET_MODE (src); 4868 machine_mode pred_mode = aarch64_sve_pred_mode (mode); 4869 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src); 4870 if (aarch64_svpattern_for_vl (pred_mode, npatterns) 4871 == AARCH64_NUM_SVPATTERNS) 4872 return NULL_RTX; 4873 4874 rtx_vector_builder pred_builder (pred_mode, npatterns, 2); 4875 rtx_vector_builder true_builder (mode, npatterns, 1); 4876 rtx_vector_builder false_builder (mode, npatterns, 1); 4877 for (unsigned int i = 0; i < npatterns; ++i) 4878 { 4879 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i)); 4880 pred_builder.quick_push (CONST1_RTX (BImode)); 4881 } 4882 for (unsigned int i = 0; i < npatterns; ++i) 4883 { 4884 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns)); 4885 pred_builder.quick_push (CONST0_RTX (BImode)); 4886 } 4887 expand_operand ops[4]; 4888 create_output_operand (&ops[0], target, mode); 4889 create_input_operand (&ops[1], true_builder.build (), mode); 4890 create_input_operand (&ops[2], false_builder.build (), mode); 4891 create_input_operand (&ops[3], pred_builder.build (), pred_mode); 4892 expand_insn (code_for_vcond_mask (mode, mode), 4, ops); 4893 return target; 4894} 4895 4896/* Return a register containing CONST_VECTOR SRC, given that SRC has an 4897 SVE data mode and isn't a legitimate constant. Use TARGET for the 4898 result if convenient. 4899 4900 The returned register can have whatever mode seems most natural 4901 given the contents of SRC. */ 4902 4903static rtx 4904aarch64_expand_sve_const_vector (rtx target, rtx src) 4905{ 4906 machine_mode mode = GET_MODE (src); 4907 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src); 4908 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src); 4909 scalar_mode elt_mode = GET_MODE_INNER (mode); 4910 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode); 4911 unsigned int container_bits = aarch64_sve_container_bits (mode); 4912 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits; 4913 4914 if (nelts_per_pattern == 1 4915 && encoded_bits <= 128 4916 && container_bits != elt_bits) 4917 { 4918 /* We have a partial vector mode and a constant whose full-vector 4919 equivalent would occupy a repeating 128-bit sequence. 
Build that 4920 full-vector equivalent instead, so that we have the option of 4921 using LD1RQ and Advanced SIMD operations. */ 4922 unsigned int repeat = container_bits / elt_bits; 4923 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require (); 4924 rtx_vector_builder builder (full_mode, npatterns * repeat, 1); 4925 for (unsigned int i = 0; i < npatterns; ++i) 4926 for (unsigned int j = 0; j < repeat; ++j) 4927 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i)); 4928 target = aarch64_target_reg (target, full_mode); 4929 return aarch64_expand_sve_const_vector (target, builder.build ()); 4930 } 4931 4932 if (nelts_per_pattern == 1 && encoded_bits == 128) 4933 { 4934 /* The constant is a duplicated quadword but can't be narrowed 4935 beyond a quadword. Get the memory image of the first quadword 4936 as a 128-bit vector and try using LD1RQ to load it from memory. 4937 4938 The effect for both endiannesses is to load memory lane N into 4939 architectural lanes N + I * STEP of the result. On big-endian 4940 targets, the layout of the 128-bit vector in an Advanced SIMD 4941 register would be different from its layout in an SVE register, 4942 but this 128-bit vector is a memory value only. */ 4943 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require (); 4944 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0); 4945 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value)) 4946 return target; 4947 } 4948 4949 if (nelts_per_pattern == 1 && encoded_bits < 128) 4950 { 4951 /* The vector is a repeating sequence of 64 bits or fewer. 4952 See if we can load them using an Advanced SIMD move and then 4953 duplicate it to fill a vector. This is better than using a GPR 4954 move because it keeps everything in the same register file. */ 4955 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require (); 4956 rtx_vector_builder builder (vq_mode, npatterns, 1); 4957 for (unsigned int i = 0; i < npatterns; ++i) 4958 { 4959 /* We want memory lane N to go into architectural lane N, 4960 so reverse for big-endian targets. The DUP .Q pattern 4961 has a compensating reverse built-in. */ 4962 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i; 4963 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci)); 4964 } 4965 rtx vq_src = builder.build (); 4966 if (aarch64_simd_valid_immediate (vq_src, NULL)) 4967 { 4968 vq_src = force_reg (vq_mode, vq_src); 4969 return aarch64_expand_sve_dupq (target, mode, vq_src); 4970 } 4971 4972 /* Get an integer representation of the repeating part of Advanced 4973 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC, 4974 which for big-endian targets is lane-swapped wrt a normal 4975 Advanced SIMD vector. This means that for both endiannesses, 4976 memory lane N of SVE vector SRC corresponds to architectural 4977 lane N of a register holding VQ_SRC. This in turn means that 4978 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed 4979 as a single 128-bit value) and thus that memory lane 0 of SRC is 4980 in the lsb of the integer. Duplicating the integer therefore 4981 ensures that memory lane N of SRC goes into architectural lane 4982 N + I * INDEX of the SVE register. */ 4983 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require (); 4984 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0); 4985 if (elt_value) 4986 { 4987 /* Pretend that we had a vector of INT_MODE to start with. 
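   For example, on a little-endian target a repeating pair of .H elements
   { 0, 0x1234 } has the 32-bit repeating part 0x12340000; if the pair
   cannot be loaded directly as an Advanced SIMD immediate, a single MOV
   can put 0x12340000 in a general register and a .S broadcast of that
   value then gives the required vector.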
*/ 4988 elt_mode = int_mode; 4989 mode = aarch64_full_sve_mode (int_mode).require (); 4990 4991 /* If the integer can be moved into a general register by a 4992 single instruction, do that and duplicate the result. */ 4993 if (CONST_INT_P (elt_value) 4994 && aarch64_move_imm (INTVAL (elt_value), elt_mode)) 4995 { 4996 elt_value = force_reg (elt_mode, elt_value); 4997 return expand_vector_broadcast (mode, elt_value); 4998 } 4999 } 5000 else if (npatterns == 1) 5001 /* We're duplicating a single value, but can't do better than 5002 force it to memory and load from there. This handles things 5003 like symbolic constants. */ 5004 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0); 5005 5006 if (elt_value) 5007 { 5008 /* Load the element from memory if we can, otherwise move it into 5009 a register and use a DUP. */ 5010 rtx op = force_const_mem (elt_mode, elt_value); 5011 if (!op) 5012 op = force_reg (elt_mode, elt_value); 5013 return expand_vector_broadcast (mode, op); 5014 } 5015 } 5016 5017 /* Try using INDEX. */ 5018 rtx base, step; 5019 if (const_vec_series_p (src, &base, &step)) 5020 { 5021 aarch64_expand_vec_series (target, base, step); 5022 return target; 5023 } 5024 5025 /* From here on, it's better to force the whole constant to memory 5026 if we can. */ 5027 if (GET_MODE_NUNITS (mode).is_constant ()) 5028 return NULL_RTX; 5029 5030 if (nelts_per_pattern == 2) 5031 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src)) 5032 return res; 5033 5034 /* Expand each pattern individually. */ 5035 gcc_assert (npatterns > 1); 5036 rtx_vector_builder builder; 5037 auto_vec<rtx, 16> vectors (npatterns); 5038 for (unsigned int i = 0; i < npatterns; ++i) 5039 { 5040 builder.new_vector (mode, 1, nelts_per_pattern); 5041 for (unsigned int j = 0; j < nelts_per_pattern; ++j) 5042 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns)); 5043 vectors.quick_push (force_reg (mode, builder.build ())); 5044 } 5045 5046 /* Use permutes to interleave the separate vectors. */ 5047 while (npatterns > 1) 5048 { 5049 npatterns /= 2; 5050 for (unsigned int i = 0; i < npatterns; ++i) 5051 { 5052 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode)); 5053 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]); 5054 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1)); 5055 vectors[i] = tmp; 5056 } 5057 } 5058 gcc_assert (vectors[0] == target); 5059 return target; 5060} 5061 5062/* Use WHILE to set a predicate register of mode MODE in which the first 5063 VL bits are set and the rest are clear. Use TARGET for the register 5064 if it's nonnull and convenient. */ 5065 5066static rtx 5067aarch64_sve_move_pred_via_while (rtx target, machine_mode mode, 5068 unsigned int vl) 5069{ 5070 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode)); 5071 target = aarch64_target_reg (target, mode); 5072 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode, 5073 target, const0_rtx, limit)); 5074 return target; 5075} 5076 5077static rtx 5078aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool); 5079 5080/* BUILDER is a constant predicate in which the index of every set bit 5081 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant 5082 by inverting every element at a multiple of ELT_SIZE and EORing the 5083 result with an ELT_SIZE PTRUE. 5084 5085 Return a register that contains the constant on success, otherwise 5086 return null. Use TARGET as the register if it is nonnull and 5087 convenient. 
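   For example, the predicate in which every element except the first is
   active has the inverse { 1, 0, 0, 0, ... }, which is just a VL1 PTRUE;
   EORing that with an all-true PTRUE of the same element size recreates
   the original value.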
*/ 5088 5089static rtx 5090aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder, 5091 unsigned int elt_size) 5092{ 5093 /* Invert every element at a multiple of ELT_SIZE, keeping the 5094 other bits zero. */ 5095 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (), 5096 builder.nelts_per_pattern ()); 5097 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i) 5098 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0) 5099 inv_builder.quick_push (const1_rtx); 5100 else 5101 inv_builder.quick_push (const0_rtx); 5102 inv_builder.finalize (); 5103 5104 /* See if we can load the constant cheaply. */ 5105 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false); 5106 if (!inv) 5107 return NULL_RTX; 5108 5109 /* EOR the result with an ELT_SIZE PTRUE. */ 5110 rtx mask = aarch64_ptrue_all (elt_size); 5111 mask = force_reg (VNx16BImode, mask); 5112 inv = gen_lowpart (VNx16BImode, inv); 5113 target = aarch64_target_reg (target, VNx16BImode); 5114 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask)); 5115 return target; 5116} 5117 5118/* BUILDER is a constant predicate in which the index of every set bit 5119 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant 5120 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the 5121 register on success, otherwise return null. Use TARGET as the register 5122 if nonnull and convenient. */ 5123 5124static rtx 5125aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder, 5126 unsigned int elt_size, 5127 unsigned int permute_size) 5128{ 5129 /* We're going to split the constant into two new constants A and B, 5130 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0 5131 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1: 5132 5133 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ } 5134 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ } 5135 5136 where _ indicates elements that will be discarded by the permute. 5137 5138 First calculate the ELT_SIZEs for A and B. */ 5139 unsigned int a_elt_size = GET_MODE_SIZE (DImode); 5140 unsigned int b_elt_size = GET_MODE_SIZE (DImode); 5141 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size) 5142 if (INTVAL (builder.elt (i)) != 0) 5143 { 5144 if (i & permute_size) 5145 b_elt_size |= i - permute_size; 5146 else 5147 a_elt_size |= i; 5148 } 5149 a_elt_size &= -a_elt_size; 5150 b_elt_size &= -b_elt_size; 5151 5152 /* Now construct the vectors themselves. */ 5153 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (), 5154 builder.nelts_per_pattern ()); 5155 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (), 5156 builder.nelts_per_pattern ()); 5157 unsigned int nelts = builder.encoded_nelts (); 5158 for (unsigned int i = 0; i < nelts; ++i) 5159 if (i & (elt_size - 1)) 5160 { 5161 a_builder.quick_push (const0_rtx); 5162 b_builder.quick_push (const0_rtx); 5163 } 5164 else if ((i & permute_size) == 0) 5165 { 5166 /* The A and B elements are significant. */ 5167 a_builder.quick_push (builder.elt (i)); 5168 b_builder.quick_push (builder.elt (i + permute_size)); 5169 } 5170 else 5171 { 5172 /* The A and B elements are going to be discarded, so pick whatever 5173 is likely to give a nice constant. We are targeting element 5174 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively, 5175 with the aim of each being a sequence of ones followed by 5176 a sequence of zeros. 
So: 5177 5178 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to 5179 duplicate the last X_ELT_SIZE element, to extend the 5180 current sequence of ones or zeros. 5181 5182 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a 5183 zero, so that the constant really does have X_ELT_SIZE and 5184 not a smaller size. */ 5185 if (a_elt_size > permute_size) 5186 a_builder.quick_push (const0_rtx); 5187 else 5188 a_builder.quick_push (a_builder.elt (i - a_elt_size)); 5189 if (b_elt_size > permute_size) 5190 b_builder.quick_push (const0_rtx); 5191 else 5192 b_builder.quick_push (b_builder.elt (i - b_elt_size)); 5193 } 5194 a_builder.finalize (); 5195 b_builder.finalize (); 5196 5197 /* Try loading A into a register. */ 5198 rtx_insn *last = get_last_insn (); 5199 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false); 5200 if (!a) 5201 return NULL_RTX; 5202 5203 /* Try loading B into a register. */ 5204 rtx b = a; 5205 if (a_builder != b_builder) 5206 { 5207 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false); 5208 if (!b) 5209 { 5210 delete_insns_since (last); 5211 return NULL_RTX; 5212 } 5213 } 5214 5215 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI 5216 operands but permutes them as though they had mode MODE. */ 5217 machine_mode mode = aarch64_sve_pred_mode (permute_size).require (); 5218 target = aarch64_target_reg (target, GET_MODE (a)); 5219 rtx type_reg = CONST0_RTX (mode); 5220 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg)); 5221 return target; 5222} 5223 5224/* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI 5225 constant in BUILDER into an SVE predicate register. Return the register 5226 on success, otherwise return null. Use TARGET for the register if 5227 nonnull and convenient. 5228 5229 ALLOW_RECURSE_P is true if we can use methods that would call this 5230 function recursively. */ 5231 5232static rtx 5233aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder, 5234 bool allow_recurse_p) 5235{ 5236 if (builder.encoded_nelts () == 1) 5237 /* A PFALSE or a PTRUE .B ALL. */ 5238 return aarch64_emit_set_immediate (target, builder); 5239 5240 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder); 5241 if (int vl = aarch64_partial_ptrue_length (builder, elt_size)) 5242 { 5243 /* If we can load the constant using PTRUE, use it as-is. */ 5244 machine_mode mode = aarch64_sve_pred_mode (elt_size).require (); 5245 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS) 5246 return aarch64_emit_set_immediate (target, builder); 5247 5248 /* Otherwise use WHILE to set the first VL bits. */ 5249 return aarch64_sve_move_pred_via_while (target, mode, vl); 5250 } 5251 5252 if (!allow_recurse_p) 5253 return NULL_RTX; 5254 5255 /* Try inverting the vector in element size ELT_SIZE and then EORing 5256 the result with an ELT_SIZE PTRUE. */ 5257 if (INTVAL (builder.elt (0)) == 0) 5258 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder, 5259 elt_size)) 5260 return res; 5261 5262 /* Try using TRN1 to permute two simpler constants. */ 5263 for (unsigned int i = elt_size; i <= 8; i *= 2) 5264 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder, 5265 elt_size, i)) 5266 return res; 5267 5268 return NULL_RTX; 5269} 5270 5271/* Return an SVE predicate register that contains the VNx16BImode 5272 constant in BUILDER, without going through the move expanders. 
5273 5274 The returned register can have whatever mode seems most natural 5275 given the contents of BUILDER. Use TARGET for the result if 5276 convenient. */ 5277 5278 static rtx 5279 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder) 5280 { 5281 /* Try loading the constant using pure predicate operations. */ 5282 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true)) 5283 return res; 5284 5285 /* Try forcing the constant to memory. */ 5286 if (builder.full_nelts ().is_constant ()) 5287 if (rtx mem = force_const_mem (VNx16BImode, builder.build ())) 5288 { 5289 target = aarch64_target_reg (target, VNx16BImode); 5290 emit_move_insn (target, mem); 5291 return target; 5292 } 5293 5294 /* The last resort is to load the constant as an integer and then 5295 compare it against zero. Use -1 for set bits in order to increase 5296 the chances of using SVE DUPM or an Advanced SIMD byte mask. */ 5297 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (), 5298 builder.nelts_per_pattern ()); 5299 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i) 5300 int_builder.quick_push (INTVAL (builder.elt (i)) 5301 ? constm1_rtx : const0_rtx); 5302 return aarch64_convert_sve_data_to_pred (target, VNx16BImode, 5303 int_builder.build ()); 5304 } 5305 5306 /* Set DEST to immediate IMM. */ 5307 5308 void 5309 aarch64_expand_mov_immediate (rtx dest, rtx imm) 5310 { 5311 machine_mode mode = GET_MODE (dest); 5312 5313 /* Check on what type of symbol it is. */ 5314 scalar_int_mode int_mode; 5315 if ((GET_CODE (imm) == SYMBOL_REF 5316 || GET_CODE (imm) == LABEL_REF 5317 || GET_CODE (imm) == CONST 5318 || GET_CODE (imm) == CONST_POLY_INT) 5319 && is_a <scalar_int_mode> (mode, &int_mode)) 5320 { 5321 rtx mem; 5322 poly_int64 offset; 5323 HOST_WIDE_INT const_offset; 5324 enum aarch64_symbol_type sty; 5325 5326 /* If we have (const (plus symbol offset)), separate out the offset 5327 before we start classifying the symbol. */ 5328 rtx base = strip_offset (imm, &offset); 5329 5330 /* We must always add an offset involving VL separately, rather than 5331 folding it into the relocation. */ 5332 if (!offset.is_constant (&const_offset)) 5333 { 5334 if (!TARGET_SVE) 5335 { 5336 aarch64_report_sve_required (); 5337 return; 5338 } 5339 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset)) 5340 emit_insn (gen_rtx_SET (dest, imm)); 5341 else 5342 { 5343 /* Do arithmetic on 32-bit values if the result is smaller 5344 than that. */ 5345 if (partial_subreg_p (int_mode, SImode)) 5346 { 5347 /* It is invalid to do symbol calculations in modes 5348 narrower than SImode.
*/ 5349 gcc_assert (base == const0_rtx); 5350 dest = gen_lowpart (SImode, dest); 5351 int_mode = SImode; 5352 } 5353 if (base != const0_rtx) 5354 { 5355 base = aarch64_force_temporary (int_mode, dest, base); 5356 aarch64_add_offset (int_mode, dest, base, offset, 5357 NULL_RTX, NULL_RTX, false); 5358 } 5359 else 5360 aarch64_add_offset (int_mode, dest, base, offset, 5361 dest, NULL_RTX, false); 5362 } 5363 return; 5364 } 5365 5366 sty = aarch64_classify_symbol (base, const_offset); 5367 switch (sty) 5368 { 5369 case SYMBOL_FORCE_TO_MEM: 5370 if (const_offset != 0 5371 && targetm.cannot_force_const_mem (int_mode, imm)) 5372 { 5373 gcc_assert (can_create_pseudo_p ()); 5374 base = aarch64_force_temporary (int_mode, dest, base); 5375 aarch64_add_offset (int_mode, dest, base, const_offset, 5376 NULL_RTX, NULL_RTX, false); 5377 return; 5378 } 5379 5380 mem = force_const_mem (ptr_mode, imm); 5381 gcc_assert (mem); 5382 5383 /* If we aren't generating PC relative literals, then 5384 we need to expand the literal pool access carefully. 5385 This is something that needs to be done in a number 5386 of places, so could well live as a separate function. */ 5387 if (!aarch64_pcrelative_literal_loads) 5388 { 5389 gcc_assert (can_create_pseudo_p ()); 5390 base = gen_reg_rtx (ptr_mode); 5391 aarch64_expand_mov_immediate (base, XEXP (mem, 0)); 5392 if (ptr_mode != Pmode) 5393 base = convert_memory_address (Pmode, base); 5394 mem = gen_rtx_MEM (ptr_mode, base); 5395 } 5396 5397 if (int_mode != ptr_mode) 5398 mem = gen_rtx_ZERO_EXTEND (int_mode, mem); 5399 5400 emit_insn (gen_rtx_SET (dest, mem)); 5401 5402 return; 5403 5404 case SYMBOL_SMALL_TLSGD: 5405 case SYMBOL_SMALL_TLSDESC: 5406 case SYMBOL_SMALL_TLSIE: 5407 case SYMBOL_SMALL_GOT_28K: 5408 case SYMBOL_SMALL_GOT_4G: 5409 case SYMBOL_TINY_GOT: 5410 case SYMBOL_TINY_TLSIE: 5411 if (const_offset != 0) 5412 { 5413 gcc_assert(can_create_pseudo_p ()); 5414 base = aarch64_force_temporary (int_mode, dest, base); 5415 aarch64_add_offset (int_mode, dest, base, const_offset, 5416 NULL_RTX, NULL_RTX, false); 5417 return; 5418 } 5419 /* FALLTHRU */ 5420 5421 case SYMBOL_SMALL_ABSOLUTE: 5422 case SYMBOL_TINY_ABSOLUTE: 5423 case SYMBOL_TLSLE12: 5424 case SYMBOL_TLSLE24: 5425 case SYMBOL_TLSLE32: 5426 case SYMBOL_TLSLE48: 5427 aarch64_load_symref_appropriately (dest, imm, sty); 5428 return; 5429 5430 default: 5431 gcc_unreachable (); 5432 } 5433 } 5434 5435 if (!CONST_INT_P (imm)) 5436 { 5437 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) 5438 { 5439 /* Only the low bit of each .H, .S and .D element is defined, 5440 so we can set the upper bits to whatever we like. If the 5441 predicate is all-true in MODE, prefer to set all the undefined 5442 bits as well, so that we can share a single .B predicate for 5443 all modes. */ 5444 if (imm == CONSTM1_RTX (mode)) 5445 imm = CONSTM1_RTX (VNx16BImode); 5446 5447 /* All methods for constructing predicate modes wider than VNx16BI 5448 will set the upper bits of each element to zero. Expose this 5449 by moving such constants as a VNx16BI, so that all bits are 5450 significant and so that constants for different modes can be 5451 shared. The wider constant will still be available as a 5452 REG_EQUAL note. 
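       For example (an illustrative sketch, assuming -msve-vector-bits=128 so
       that a predicate register is 16 bits wide): the VNx4BImode constant

	 { 1, 0, 1, 0 }

       selects .S elements 0 and 2, and only every fourth bit of the
       underlying predicate is significant.  We therefore move the equivalent
       VNx16BImode constant

	 { 1, 0, 0, 0,  0, 0, 0, 0,  1, 0, 0, 0,  0, 0, 0, 0 }

       instead, in which every bit is significant and which can be shared
       with constants for other predicate modes.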
*/ 5453 rtx_vector_builder builder; 5454 if (aarch64_get_sve_pred_bits (builder, imm)) 5455 { 5456 rtx res = aarch64_expand_sve_const_pred (dest, builder); 5457 if (dest != res) 5458 emit_move_insn (dest, gen_lowpart (mode, res)); 5459 return; 5460 } 5461 } 5462 5463 if (GET_CODE (imm) == HIGH 5464 || aarch64_simd_valid_immediate (imm, NULL)) 5465 { 5466 emit_insn (gen_rtx_SET (dest, imm)); 5467 return; 5468 } 5469 5470 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode)) 5471 if (rtx res = aarch64_expand_sve_const_vector (dest, imm)) 5472 { 5473 if (dest != res) 5474 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res)); 5475 return; 5476 } 5477 5478 rtx mem = force_const_mem (mode, imm); 5479 gcc_assert (mem); 5480 emit_move_insn (dest, mem); 5481 return; 5482 } 5483 5484 aarch64_internal_mov_immediate (dest, imm, true, 5485 as_a <scalar_int_mode> (mode)); 5486} 5487 5488/* Return the MEM rtx that provides the canary value that should be used 5489 for stack-smashing protection. MODE is the mode of the memory. 5490 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable 5491 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE 5492 indicates whether the caller is performing a SET or a TEST operation. */ 5493 5494rtx 5495aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl, 5496 aarch64_salt_type salt_type) 5497{ 5498 rtx addr; 5499 if (aarch64_stack_protector_guard == SSP_GLOBAL) 5500 { 5501 gcc_assert (MEM_P (decl_rtl)); 5502 addr = XEXP (decl_rtl, 0); 5503 poly_int64 offset; 5504 rtx base = strip_offset_and_salt (addr, &offset); 5505 if (!SYMBOL_REF_P (base)) 5506 return decl_rtl; 5507 5508 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type)); 5509 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR); 5510 addr = gen_rtx_CONST (Pmode, addr); 5511 addr = plus_constant (Pmode, addr, offset); 5512 } 5513 else 5514 { 5515 /* Calculate the address from the system register. */ 5516 rtx salt = GEN_INT (salt_type); 5517 addr = gen_reg_rtx (mode); 5518 if (mode == DImode) 5519 emit_insn (gen_reg_stack_protect_address_di (addr, salt)); 5520 else 5521 { 5522 emit_insn (gen_reg_stack_protect_address_si (addr, salt)); 5523 addr = convert_memory_address (Pmode, addr); 5524 } 5525 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset); 5526 } 5527 return gen_rtx_MEM (mode, force_reg (Pmode, addr)); 5528} 5529 5530/* Emit an SVE predicated move from SRC to DEST. PRED is a predicate 5531 that is known to contain PTRUE. */ 5532 5533void 5534aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src) 5535{ 5536 expand_operand ops[3]; 5537 machine_mode mode = GET_MODE (dest); 5538 create_output_operand (&ops[0], dest, mode); 5539 create_input_operand (&ops[1], pred, GET_MODE(pred)); 5540 create_input_operand (&ops[2], src, mode); 5541 temporary_volatile_ok v (true); 5542 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops); 5543} 5544 5545/* Expand a pre-RA SVE data move from SRC to DEST in which at least one 5546 operand is in memory. In this case we need to use the predicated LD1 5547 and ST1 instead of LDR and STR, both for correctness on big-endian 5548 targets and because LD1 and ST1 support a wider range of addressing modes. 5549 PRED_MODE is the mode of the predicate. 5550 5551 See the comment at the head of aarch64-sve.md for details about the 5552 big-endian handling. 
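   As a rough sketch (register numbers chosen purely for illustration),
   copying one VNx4SImode object from memory to memory goes through a
   temporary register and ends up as something like:

	ptrue	p3.b, all
	ld1w	{z0.s}, p3/z, [x0]
	st1w	{z0.s}, p3, [x1]

   A plain LDR/STR pair would copy raw register bytes instead and so would
   give the wrong element order on big-endian targets.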
*/ 5553 5554void 5555aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode) 5556{ 5557 machine_mode mode = GET_MODE (dest); 5558 rtx ptrue = aarch64_ptrue_reg (pred_mode); 5559 if (!register_operand (src, mode) 5560 && !register_operand (dest, mode)) 5561 { 5562 rtx tmp = gen_reg_rtx (mode); 5563 if (MEM_P (src)) 5564 aarch64_emit_sve_pred_move (tmp, ptrue, src); 5565 else 5566 emit_move_insn (tmp, src); 5567 src = tmp; 5568 } 5569 aarch64_emit_sve_pred_move (dest, ptrue, src); 5570} 5571 5572/* Called only on big-endian targets. See whether an SVE vector move 5573 from SRC to DEST is effectively a REV[BHW] instruction, because at 5574 least one operand is a subreg of an SVE vector that has wider or 5575 narrower elements. Return true and emit the instruction if so. 5576 5577 For example: 5578 5579 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0)) 5580 5581 represents a VIEW_CONVERT between the following vectors, viewed 5582 in memory order: 5583 5584 R2: { [0].high, [0].low, [1].high, [1].low, ... } 5585 R1: { [0], [1], [2], [3], ... } 5586 5587 The high part of lane X in R2 should therefore correspond to lane X*2 5588 of R1, but the register representations are: 5589 5590 msb lsb 5591 R2: ...... [1].high [1].low [0].high [0].low 5592 R1: ...... [3] [2] [1] [0] 5593 5594 where the low part of lane X in R2 corresponds to lane X*2 in R1. 5595 We therefore need a reverse operation to swap the high and low values 5596 around. 5597 5598 This is purely an optimization. Without it we would spill the 5599 subreg operand to the stack in one mode and reload it in the 5600 other mode, which has the same effect as the REV. */ 5601 5602bool 5603aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src) 5604{ 5605 gcc_assert (BYTES_BIG_ENDIAN); 5606 5607 /* Do not try to optimize subregs that LRA has created for matched 5608 reloads. These subregs only exist as a temporary measure to make 5609 the RTL well-formed, but they are exempt from the usual 5610 TARGET_CAN_CHANGE_MODE_CLASS rules. 5611 5612 For example, if we have: 5613 5614 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2))) 5615 5616 and the constraints require R1 and R2 to be in the same register, 5617 LRA may need to create RTL such as: 5618 5619 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2)) 5620 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0))) 5621 (set (reg:VNx8HI R1) (reg:VNx8HI TMP)) 5622 5623 which forces both the input and output of the original instruction 5624 to use the same hard register. But for this to work, the normal 5625 rules have to be suppressed on the subreg input, otherwise LRA 5626 would need to reload that input too, meaning that the process 5627 would never terminate. To compensate for this, the normal rules 5628 are also suppressed for the subreg output of the first move. 5629 Ignoring the special case and handling the first move normally 5630 would therefore generate wrong code: we would reverse the elements 5631 for the first subreg but not reverse them back for the second subreg. */ 5632 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest)) 5633 dest = SUBREG_REG (dest); 5634 if (SUBREG_P (src) && !LRA_SUBREG_P (src)) 5635 src = SUBREG_REG (src); 5636 5637 /* The optimization handles two single SVE REGs with different element 5638 sizes. 
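   For the VNx8HI/VNx16QI example above, the UNSPEC_REV_SUBREG emitted below
   is later split into a predicated byte reversal within each halfword,
   along the lines of (register numbers purely illustrative):

	ptrue	p0.b, all
	revb	z0.h, p0/m, z1.h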
*/ 5639 if (!REG_P (dest) 5640 || !REG_P (src) 5641 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA 5642 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA 5643 || (GET_MODE_UNIT_SIZE (GET_MODE (dest)) 5644 == GET_MODE_UNIT_SIZE (GET_MODE (src)))) 5645 return false; 5646 5647 /* Generate *aarch64_sve_mov<mode>_subreg_be. */ 5648 rtx ptrue = aarch64_ptrue_reg (VNx16BImode); 5649 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src), 5650 UNSPEC_REV_SUBREG); 5651 emit_insn (gen_rtx_SET (dest, unspec)); 5652 return true; 5653} 5654 5655/* Return a copy of X with mode MODE, without changing its other 5656 attributes. Unlike gen_lowpart, this doesn't care whether the 5657 mode change is valid. */ 5658 5659rtx 5660aarch64_replace_reg_mode (rtx x, machine_mode mode) 5661{ 5662 if (GET_MODE (x) == mode) 5663 return x; 5664 5665 x = shallow_copy_rtx (x); 5666 set_mode_and_regno (x, mode, REGNO (x)); 5667 return x; 5668} 5669 5670/* Return the SVE REV[BHW] unspec for reversing quantites of mode MODE 5671 stored in wider integer containers. */ 5672 5673static unsigned int 5674aarch64_sve_rev_unspec (machine_mode mode) 5675{ 5676 switch (GET_MODE_UNIT_SIZE (mode)) 5677 { 5678 case 1: return UNSPEC_REVB; 5679 case 2: return UNSPEC_REVH; 5680 case 4: return UNSPEC_REVW; 5681 } 5682 gcc_unreachable (); 5683} 5684 5685/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given 5686 operands. */ 5687 5688void 5689aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src) 5690{ 5691 /* Decide which REV operation we need. The mode with wider elements 5692 determines the mode of the operands and the mode with the narrower 5693 elements determines the reverse width. */ 5694 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest)); 5695 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src)); 5696 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts) 5697 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts)) 5698 std::swap (mode_with_wider_elts, mode_with_narrower_elts); 5699 5700 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts); 5701 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts); 5702 5703 /* Get the operands in the appropriate modes and emit the instruction. */ 5704 ptrue = gen_lowpart (pred_mode, ptrue); 5705 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts); 5706 src = aarch64_replace_reg_mode (src, mode_with_wider_elts); 5707 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts, 5708 dest, ptrue, src)); 5709} 5710 5711static bool 5712aarch64_function_ok_for_sibcall (tree, tree exp) 5713{ 5714 if (crtl->abi->id () != expr_callee_abi (exp).id ()) 5715 return false; 5716 5717 return true; 5718} 5719 5720/* Subroutine of aarch64_pass_by_reference for arguments that are not 5721 passed in SVE registers. */ 5722 5723static bool 5724aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum, 5725 const function_arg_info &arg) 5726{ 5727 HOST_WIDE_INT size; 5728 machine_mode dummymode; 5729 int nregs; 5730 5731 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */ 5732 if (arg.mode == BLKmode && arg.type) 5733 size = int_size_in_bytes (arg.type); 5734 else 5735 /* No frontends can create types with variable-sized modes, so we 5736 shouldn't be asked to pass or return them. */ 5737 size = GET_MODE_SIZE (arg.mode).to_constant (); 5738 5739 /* Aggregates are passed by reference based on their size. 
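   As a concrete sketch (type names are illustrative only, and the register
   assignments assume enough argument registers are still free):

	struct big  { int64_t a, b, c; }	24 bytes, not an HFA:
						passed by reference
	struct pair { int64_t a, b; }		16 bytes: passed by value
						in two general registers
	struct hfa  { double a, b, c, d; }	32 bytes but an HFA: caught
						by the fp/simd candidate
						check below and passed in
						v0-v3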
*/ 5740 if (arg.aggregate_type_p ()) 5741 size = int_size_in_bytes (arg.type); 5742 5743 /* Variable sized arguments are always returned by reference. */ 5744 if (size < 0) 5745 return true; 5746 5747 /* Can this be a candidate to be passed in fp/simd register(s)? */ 5748 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type, 5749 &dummymode, &nregs, NULL, 5750 !pcum || pcum->silent_p)) 5751 return false; 5752 5753 /* Arguments which are variable sized or larger than 2 registers are 5754 passed by reference unless they are a homogenous floating point 5755 aggregate. */ 5756 return size > 2 * UNITS_PER_WORD; 5757} 5758 5759/* Implement TARGET_PASS_BY_REFERENCE. */ 5760 5761static bool 5762aarch64_pass_by_reference (cumulative_args_t pcum_v, 5763 const function_arg_info &arg) 5764{ 5765 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); 5766 5767 if (!arg.type) 5768 return aarch64_pass_by_reference_1 (pcum, arg); 5769 5770 pure_scalable_type_info pst_info; 5771 switch (pst_info.analyze (arg.type)) 5772 { 5773 case pure_scalable_type_info::IS_PST: 5774 if (pcum && !pcum->silent_p && !TARGET_SVE) 5775 /* We can't gracefully recover at this point, so make this a 5776 fatal error. */ 5777 fatal_error (input_location, "arguments of type %qT require" 5778 " the SVE ISA extension", arg.type); 5779 5780 /* Variadic SVE types are passed by reference. Normal non-variadic 5781 arguments are too if we've run out of registers. */ 5782 return (!arg.named 5783 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS 5784 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS); 5785 5786 case pure_scalable_type_info::DOESNT_MATTER: 5787 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg)); 5788 return true; 5789 5790 case pure_scalable_type_info::NO_ABI_IDENTITY: 5791 case pure_scalable_type_info::ISNT_PST: 5792 return aarch64_pass_by_reference_1 (pcum, arg); 5793 } 5794 gcc_unreachable (); 5795} 5796 5797/* Return TRUE if VALTYPE is padded to its least significant bits. */ 5798static bool 5799aarch64_return_in_msb (const_tree valtype) 5800{ 5801 machine_mode dummy_mode; 5802 int dummy_int; 5803 5804 /* Never happens in little-endian mode. */ 5805 if (!BYTES_BIG_ENDIAN) 5806 return false; 5807 5808 /* Only composite types smaller than or equal to 16 bytes can 5809 be potentially returned in registers. */ 5810 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype)) 5811 || int_size_in_bytes (valtype) <= 0 5812 || int_size_in_bytes (valtype) > 16) 5813 return false; 5814 5815 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate) 5816 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite 5817 is always passed/returned in the least significant bits of fp/simd 5818 register(s). */ 5819 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype, 5820 &dummy_mode, &dummy_int, NULL, 5821 false)) 5822 return false; 5823 5824 /* Likewise pure scalable types for SVE vector and predicate registers. */ 5825 pure_scalable_type_info pst_info; 5826 if (pst_info.analyze_registers (valtype)) 5827 return false; 5828 5829 return true; 5830} 5831 5832/* Implement TARGET_FUNCTION_VALUE. 5833 Define how to find the value returned by a function. 
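   Some illustrative cases (assuming the usual modes for these types):

	int			 returned in x0 (used as w0)
	struct { double a, b; }	 an HFA, returned in v0 and v1
	struct { char c[16]; }	 a 16-byte composite, returned in x0 and x1
	svint32_t		 a pure scalable type, returned in z0
	svbool_t		 likewise, returned in p0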
*/ 5834 5835static rtx 5836aarch64_function_value (const_tree type, const_tree func, 5837 bool outgoing ATTRIBUTE_UNUSED) 5838{ 5839 machine_mode mode; 5840 int unsignedp; 5841 5842 mode = TYPE_MODE (type); 5843 if (INTEGRAL_TYPE_P (type)) 5844 mode = promote_function_mode (type, mode, &unsignedp, func, 1); 5845 5846 pure_scalable_type_info pst_info; 5847 if (type && pst_info.analyze_registers (type)) 5848 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM); 5849 5850 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N 5851 are returned in memory, not by value. */ 5852 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 5853 bool sve_p = (vec_flags & VEC_ANY_SVE); 5854 5855 if (aarch64_return_in_msb (type)) 5856 { 5857 HOST_WIDE_INT size = int_size_in_bytes (type); 5858 5859 if (size % UNITS_PER_WORD != 0) 5860 { 5861 size += UNITS_PER_WORD - size % UNITS_PER_WORD; 5862 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require (); 5863 } 5864 } 5865 5866 int count; 5867 machine_mode ag_mode; 5868 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count, 5869 NULL, false)) 5870 { 5871 gcc_assert (!sve_p); 5872 if (!aarch64_composite_type_p (type, mode)) 5873 { 5874 gcc_assert (count == 1 && mode == ag_mode); 5875 return gen_rtx_REG (mode, V0_REGNUM); 5876 } 5877 else 5878 { 5879 int i; 5880 rtx par; 5881 5882 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count)); 5883 for (i = 0; i < count; i++) 5884 { 5885 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i); 5886 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode); 5887 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset); 5888 XVECEXP (par, 0, i) = tmp; 5889 } 5890 return par; 5891 } 5892 } 5893 else 5894 { 5895 if (sve_p) 5896 { 5897 /* Vector types can acquire a partial SVE mode using things like 5898 __attribute__((vector_size(N))), and this is potentially useful. 5899 However, the choice of mode doesn't affect the type's ABI 5900 identity, so we should treat the types as though they had 5901 the associated integer mode, just like they did before SVE 5902 was introduced. 5903 5904 We know that the vector must be 128 bits or smaller, 5905 otherwise we'd have returned it in memory instead. */ 5906 gcc_assert (type 5907 && (aarch64_some_values_include_pst_objects_p (type) 5908 || (vec_flags & VEC_PARTIAL))); 5909 5910 scalar_int_mode int_mode = int_mode_for_mode (mode).require (); 5911 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM); 5912 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx); 5913 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair)); 5914 } 5915 return gen_rtx_REG (mode, R0_REGNUM); 5916 } 5917} 5918 5919/* Implements TARGET_FUNCTION_VALUE_REGNO_P. 5920 Return true if REGNO is the number of a hard register in which the values 5921 of called function may come back. */ 5922 5923static bool 5924aarch64_function_value_regno_p (const unsigned int regno) 5925{ 5926 /* Maximum of 16 bytes can be returned in the general registers. Examples 5927 of 16-byte return values are: 128-bit integers and 16-byte small 5928 structures (excluding homogeneous floating-point aggregates). */ 5929 if (regno == R0_REGNUM || regno == R1_REGNUM) 5930 return true; 5931 5932 /* Up to four fp/simd registers can return a function value, e.g. a 5933 homogeneous floating-point aggregate having four members. 
*/ 5934 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS) 5935 return TARGET_FLOAT; 5936 5937 return false; 5938} 5939 5940/* Subroutine for aarch64_return_in_memory for types that are not returned 5941 in SVE registers. */ 5942 5943static bool 5944aarch64_return_in_memory_1 (const_tree type) 5945{ 5946 HOST_WIDE_INT size; 5947 machine_mode ag_mode; 5948 int count; 5949 5950 if (!AGGREGATE_TYPE_P (type) 5951 && TREE_CODE (type) != COMPLEX_TYPE 5952 && TREE_CODE (type) != VECTOR_TYPE) 5953 /* Simple scalar types always returned in registers. */ 5954 return false; 5955 5956 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type, 5957 &ag_mode, &count, NULL, false)) 5958 return false; 5959 5960 /* Types larger than 2 registers returned in memory. */ 5961 size = int_size_in_bytes (type); 5962 return (size < 0 || size > 2 * UNITS_PER_WORD); 5963} 5964 5965/* Implement TARGET_RETURN_IN_MEMORY. 5966 5967 If the type T of the result of a function is such that 5968 void func (T arg) 5969 would require that arg be passed as a value in a register (or set of 5970 registers) according to the parameter passing rules, then the result 5971 is returned in the same registers as would be used for such an 5972 argument. */ 5973 5974static bool 5975aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED) 5976{ 5977 pure_scalable_type_info pst_info; 5978 switch (pst_info.analyze (type)) 5979 { 5980 case pure_scalable_type_info::IS_PST: 5981 return (pst_info.num_zr () > NUM_FP_ARG_REGS 5982 || pst_info.num_pr () > NUM_PR_ARG_REGS); 5983 5984 case pure_scalable_type_info::DOESNT_MATTER: 5985 gcc_assert (aarch64_return_in_memory_1 (type)); 5986 return true; 5987 5988 case pure_scalable_type_info::NO_ABI_IDENTITY: 5989 case pure_scalable_type_info::ISNT_PST: 5990 return aarch64_return_in_memory_1 (type); 5991 } 5992 gcc_unreachable (); 5993} 5994 5995static bool 5996aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode, 5997 const_tree type, int *nregs) 5998{ 5999 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); 6000 return aarch64_vfp_is_call_or_return_candidate (mode, type, 6001 &pcum->aapcs_vfp_rmode, 6002 nregs, NULL, pcum->silent_p); 6003} 6004 6005/* Given MODE and TYPE of a function argument, return the alignment in 6006 bits. The idea is to suppress any stronger alignment requested by 6007 the user and opt for the natural alignment (specified in AAPCS64 \S 6008 4.1). ABI_BREAK is set to the old alignment if the alignment was 6009 incorrectly calculated in versions of GCC prior to GCC-9. This is 6010 a helper function for local use only. */ 6011 6012static unsigned int 6013aarch64_function_arg_alignment (machine_mode mode, const_tree type, 6014 bool *abi_break) 6015{ 6016 *abi_break = false; 6017 if (!type) 6018 return GET_MODE_ALIGNMENT (mode); 6019 6020 if (integer_zerop (TYPE_SIZE (type))) 6021 return 0; 6022 6023 gcc_assert (TYPE_MODE (type) == mode); 6024 6025 if (!AGGREGATE_TYPE_P (type)) 6026 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type)); 6027 6028 if (TREE_CODE (type) == ARRAY_TYPE) 6029 return TYPE_ALIGN (TREE_TYPE (type)); 6030 6031 unsigned int alignment = 0; 6032 unsigned int bitfield_alignment = 0; 6033 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) 6034 if (TREE_CODE (field) == FIELD_DECL) 6035 { 6036 /* Note that we explicitly consider zero-sized fields here, 6037 even though they don't map to AAPCS64 machine types. 
6038 For example, in: 6039 6040 struct __attribute__((aligned(8))) empty {}; 6041 6042 struct s { 6043 [[no_unique_address]] empty e; 6044 int x; 6045 }; 6046 6047 "s" contains only one Fundamental Data Type (the int field) 6048 but gains 8-byte alignment and size thanks to "e". */ 6049 alignment = std::max (alignment, DECL_ALIGN (field)); 6050 if (DECL_BIT_FIELD_TYPE (field)) 6051 bitfield_alignment 6052 = std::max (bitfield_alignment, 6053 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field))); 6054 } 6055 6056 if (bitfield_alignment > alignment) 6057 { 6058 *abi_break = true; 6059 return bitfield_alignment; 6060 } 6061 6062 return alignment; 6063} 6064 6065/* Layout a function argument according to the AAPCS64 rules. The rule 6066 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the 6067 mode that was originally given to us by the target hook, whereas the 6068 mode in ARG might be the result of replacing partial SVE modes with 6069 the equivalent integer mode. */ 6070 6071static void 6072aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg) 6073{ 6074 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); 6075 tree type = arg.type; 6076 machine_mode mode = arg.mode; 6077 int ncrn, nvrn, nregs; 6078 bool allocate_ncrn, allocate_nvrn; 6079 HOST_WIDE_INT size; 6080 bool abi_break; 6081 6082 /* We need to do this once per argument. */ 6083 if (pcum->aapcs_arg_processed) 6084 return; 6085 6086 bool warn_pcs_change 6087 = (warn_psabi 6088 && !pcum->silent_p 6089 && (currently_expanding_function_start 6090 || currently_expanding_gimple_stmt)); 6091 6092 unsigned int alignment 6093 = aarch64_function_arg_alignment (mode, type, &abi_break); 6094 gcc_assert (!alignment || abi_break < alignment); 6095 6096 pcum->aapcs_arg_processed = true; 6097 6098 pure_scalable_type_info pst_info; 6099 if (type && pst_info.analyze_registers (type)) 6100 { 6101 /* aarch64_function_arg_alignment has never had an effect on 6102 this case. */ 6103 6104 /* The PCS says that it is invalid to pass an SVE value to an 6105 unprototyped function. There is no ABI-defined location we 6106 can return in this case, so we have no real choice but to raise 6107 an error immediately, even though this is only a query function. */ 6108 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE) 6109 { 6110 gcc_assert (!pcum->silent_p); 6111 error ("SVE type %qT cannot be passed to an unprototyped function", 6112 arg.type); 6113 /* Avoid repeating the message, and avoid tripping the assert 6114 below. */ 6115 pcum->pcs_variant = ARM_PCS_SVE; 6116 } 6117 6118 /* We would have converted the argument into pass-by-reference 6119 form if it didn't fit in registers. */ 6120 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr (); 6121 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr (); 6122 gcc_assert (arg.named 6123 && pcum->pcs_variant == ARM_PCS_SVE 6124 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS 6125 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS); 6126 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn, 6127 P0_REGNUM + pcum->aapcs_nprn); 6128 return; 6129 } 6130 6131 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N 6132 are passed by reference, not by value. */ 6133 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 6134 bool sve_p = (vec_flags & VEC_ANY_SVE); 6135 if (sve_p) 6136 /* Vector types can acquire a partial SVE mode using things like 6137 __attribute__((vector_size(N))), and this is potentially useful. 
6138 However, the choice of mode doesn't affect the type's ABI 6139 identity, so we should treat the types as though they had 6140 the associated integer mode, just like they did before SVE 6141 was introduced. 6142 6143 We know that the vector must be 128 bits or smaller, 6144 otherwise we'd have passed it in memory instead. */ 6145 gcc_assert (type 6146 && (aarch64_some_values_include_pst_objects_p (type) 6147 || (vec_flags & VEC_PARTIAL))); 6148 6149 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */ 6150 if (type) 6151 size = int_size_in_bytes (type); 6152 else 6153 /* No frontends can create types with variable-sized modes, so we 6154 shouldn't be asked to pass or return them. */ 6155 size = GET_MODE_SIZE (mode).to_constant (); 6156 size = ROUND_UP (size, UNITS_PER_WORD); 6157 6158 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode); 6159 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v, 6160 mode, 6161 type, 6162 &nregs); 6163 gcc_assert (!sve_p || !allocate_nvrn); 6164 6165 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable. 6166 The following code thus handles passing by SIMD/FP registers first. */ 6167 6168 nvrn = pcum->aapcs_nvrn; 6169 6170 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA) 6171 and homogenous short-vector aggregates (HVA). */ 6172 if (allocate_nvrn) 6173 { 6174 /* aarch64_function_arg_alignment has never had an effect on 6175 this case. */ 6176 if (!pcum->silent_p && !TARGET_FLOAT) 6177 aarch64_err_no_fpadvsimd (mode); 6178 6179 if (nvrn + nregs <= NUM_FP_ARG_REGS) 6180 { 6181 pcum->aapcs_nextnvrn = nvrn + nregs; 6182 if (!aarch64_composite_type_p (type, mode)) 6183 { 6184 gcc_assert (nregs == 1); 6185 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn); 6186 } 6187 else 6188 { 6189 rtx par; 6190 int i; 6191 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs)); 6192 for (i = 0; i < nregs; i++) 6193 { 6194 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode, 6195 V0_REGNUM + nvrn + i); 6196 rtx offset = gen_int_mode 6197 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode); 6198 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset); 6199 XVECEXP (par, 0, i) = tmp; 6200 } 6201 pcum->aapcs_reg = par; 6202 } 6203 return; 6204 } 6205 else 6206 { 6207 /* C.3 NSRN is set to 8. */ 6208 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS; 6209 goto on_stack; 6210 } 6211 } 6212 6213 ncrn = pcum->aapcs_ncrn; 6214 nregs = size / UNITS_PER_WORD; 6215 6216 /* C6 - C9. though the sign and zero extension semantics are 6217 handled elsewhere. This is the case where the argument fits 6218 entirely general registers. */ 6219 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS)) 6220 { 6221 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2); 6222 6223 /* C.8 if the argument has an alignment of 16 then the NGRN is 6224 rounded up to the next even number. */ 6225 if (nregs == 2 6226 && ncrn % 2 6227 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT 6228 comparison is there because for > 16 * BITS_PER_UNIT 6229 alignment nregs should be > 2 and therefore it should be 6230 passed by reference rather than value. 
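		 For example (a sketch): in a call to f (int x, __int128 y),
		 x occupies w0, so the NGRN is 1 when y is laid out.  y needs
		 two registers and has 16-byte alignment, so the NGRN is
		 rounded up to 2 and y is passed in x2 and x3, leaving x1
		 unused.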
*/ 6231 && (aarch64_function_arg_alignment (mode, type, &abi_break) 6232 == 16 * BITS_PER_UNIT)) 6233 { 6234 if (warn_pcs_change && abi_break) 6235 inform (input_location, "parameter passing for argument of type " 6236 "%qT changed in GCC 9.1", type); 6237 ++ncrn; 6238 gcc_assert (ncrn + nregs <= NUM_ARG_REGS); 6239 } 6240 6241 /* If an argument with an SVE mode needs to be shifted up to the 6242 high part of the register, treat it as though it had an integer mode. 6243 Using the normal (parallel [...]) would suppress the shifting. */ 6244 if (sve_p 6245 && BYTES_BIG_ENDIAN 6246 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD) 6247 && aarch64_pad_reg_upward (mode, type, false)) 6248 { 6249 mode = int_mode_for_mode (mode).require (); 6250 sve_p = false; 6251 } 6252 6253 /* NREGS can be 0 when e.g. an empty structure is to be passed. 6254 A reg is still generated for it, but the caller should be smart 6255 enough not to use it. */ 6256 if (nregs == 0 6257 || (nregs == 1 && !sve_p) 6258 || GET_MODE_CLASS (mode) == MODE_INT) 6259 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn); 6260 else 6261 { 6262 rtx par; 6263 int i; 6264 6265 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs)); 6266 for (i = 0; i < nregs; i++) 6267 { 6268 scalar_int_mode reg_mode = word_mode; 6269 if (nregs == 1) 6270 reg_mode = int_mode_for_mode (mode).require (); 6271 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i); 6272 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, 6273 GEN_INT (i * UNITS_PER_WORD)); 6274 XVECEXP (par, 0, i) = tmp; 6275 } 6276 pcum->aapcs_reg = par; 6277 } 6278 6279 pcum->aapcs_nextncrn = ncrn + nregs; 6280 return; 6281 } 6282 6283 /* C.11 */ 6284 pcum->aapcs_nextncrn = NUM_ARG_REGS; 6285 6286 /* The argument is passed on stack; record the needed number of words for 6287 this argument and align the total size if necessary. */ 6288on_stack: 6289 pcum->aapcs_stack_words = size / UNITS_PER_WORD; 6290 6291 if (aarch64_function_arg_alignment (mode, type, &abi_break) 6292 == 16 * BITS_PER_UNIT) 6293 { 6294 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD); 6295 if (pcum->aapcs_stack_size != new_size) 6296 { 6297 if (warn_pcs_change && abi_break) 6298 inform (input_location, "parameter passing for argument of type " 6299 "%qT changed in GCC 9.1", type); 6300 pcum->aapcs_stack_size = new_size; 6301 } 6302 } 6303 return; 6304} 6305 6306/* Implement TARGET_FUNCTION_ARG. 
*/ 6307 6308static rtx 6309aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg) 6310{ 6311 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); 6312 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64 6313 || pcum->pcs_variant == ARM_PCS_SIMD 6314 || pcum->pcs_variant == ARM_PCS_SVE); 6315 6316 if (arg.end_marker_p ()) 6317 return gen_int_mode (pcum->pcs_variant, DImode); 6318 6319 aarch64_layout_arg (pcum_v, arg); 6320 return pcum->aapcs_reg; 6321} 6322 6323void 6324aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum, 6325 const_tree fntype, 6326 rtx libname ATTRIBUTE_UNUSED, 6327 const_tree fndecl ATTRIBUTE_UNUSED, 6328 unsigned n_named ATTRIBUTE_UNUSED, 6329 bool silent_p) 6330{ 6331 pcum->aapcs_ncrn = 0; 6332 pcum->aapcs_nvrn = 0; 6333 pcum->aapcs_nprn = 0; 6334 pcum->aapcs_nextncrn = 0; 6335 pcum->aapcs_nextnvrn = 0; 6336 pcum->aapcs_nextnprn = 0; 6337 if (fntype) 6338 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id (); 6339 else 6340 pcum->pcs_variant = ARM_PCS_AAPCS64; 6341 pcum->aapcs_reg = NULL_RTX; 6342 pcum->aapcs_arg_processed = false; 6343 pcum->aapcs_stack_words = 0; 6344 pcum->aapcs_stack_size = 0; 6345 pcum->silent_p = silent_p; 6346 6347 if (!silent_p 6348 && !TARGET_FLOAT 6349 && fndecl && TREE_PUBLIC (fndecl) 6350 && fntype && fntype != error_mark_node) 6351 { 6352 const_tree type = TREE_TYPE (fntype); 6353 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */ 6354 int nregs ATTRIBUTE_UNUSED; /* Likewise. */ 6355 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type, 6356 &mode, &nregs, NULL, false)) 6357 aarch64_err_no_fpadvsimd (TYPE_MODE (type)); 6358 } 6359 6360 if (!silent_p 6361 && !TARGET_SVE 6362 && pcum->pcs_variant == ARM_PCS_SVE) 6363 { 6364 /* We can't gracefully recover at this point, so make this a 6365 fatal error. */ 6366 if (fndecl) 6367 fatal_error (input_location, "%qE requires the SVE ISA extension", 6368 fndecl); 6369 else 6370 fatal_error (input_location, "calls to functions of type %qT require" 6371 " the SVE ISA extension", fntype); 6372 } 6373} 6374 6375static void 6376aarch64_function_arg_advance (cumulative_args_t pcum_v, 6377 const function_arg_info &arg) 6378{ 6379 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); 6380 if (pcum->pcs_variant == ARM_PCS_AAPCS64 6381 || pcum->pcs_variant == ARM_PCS_SIMD 6382 || pcum->pcs_variant == ARM_PCS_SVE) 6383 { 6384 aarch64_layout_arg (pcum_v, arg); 6385 gcc_assert ((pcum->aapcs_reg != NULL_RTX) 6386 != (pcum->aapcs_stack_words != 0)); 6387 pcum->aapcs_arg_processed = false; 6388 pcum->aapcs_ncrn = pcum->aapcs_nextncrn; 6389 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn; 6390 pcum->aapcs_nprn = pcum->aapcs_nextnprn; 6391 pcum->aapcs_stack_size += pcum->aapcs_stack_words; 6392 pcum->aapcs_stack_words = 0; 6393 pcum->aapcs_reg = NULL_RTX; 6394 } 6395} 6396 6397bool 6398aarch64_function_arg_regno_p (unsigned regno) 6399{ 6400 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS) 6401 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS)); 6402} 6403 6404/* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least 6405 PARM_BOUNDARY bits of alignment, but will be given anything up 6406 to STACK_BOUNDARY bits if the type requires it. This makes sure 6407 that both before and after the layout of each argument, the Next 6408 Stacked Argument Address (NSAA) will have a minimum alignment of 6409 8 bytes. 
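   A couple of illustrative values (a sketch, using the usual 64-bit
   PARM_BOUNDARY and 128-bit STACK_BOUNDARY):

	char		MAX (8, 64)		  = 64-bit boundary
	__int128	MIN (MAX (128, 64), 128)  = 128-bit boundary

   Anything whose computed alignment is larger than STACK_BOUNDARY is
   simply capped at STACK_BOUNDARY.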
*/ 6410 6411static unsigned int 6412aarch64_function_arg_boundary (machine_mode mode, const_tree type) 6413{ 6414 bool abi_break; 6415 unsigned int alignment = aarch64_function_arg_alignment (mode, type, 6416 &abi_break); 6417 if (abi_break && warn_psabi) 6418 inform (input_location, "parameter passing for argument of type " 6419 "%qT changed in GCC 9.1", type); 6420 6421 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY); 6422} 6423 6424/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */ 6425 6426static fixed_size_mode 6427aarch64_get_reg_raw_mode (int regno) 6428{ 6429 if (TARGET_SVE && FP_REGNUM_P (regno)) 6430 /* Don't use the SVE part of the register for __builtin_apply and 6431 __builtin_return. The SVE registers aren't used by the normal PCS, 6432 so using them there would be a waste of time. The PCS extensions 6433 for SVE types are fundamentally incompatible with the 6434 __builtin_return/__builtin_apply interface. */ 6435 return as_a <fixed_size_mode> (V16QImode); 6436 return default_get_reg_raw_mode (regno); 6437} 6438 6439/* Implement TARGET_FUNCTION_ARG_PADDING. 6440 6441 Small aggregate types are placed in the lowest memory address. 6442 6443 The related parameter passing rules are B.4, C.3, C.5 and C.14. */ 6444 6445static pad_direction 6446aarch64_function_arg_padding (machine_mode mode, const_tree type) 6447{ 6448 /* On little-endian targets, the least significant byte of every stack 6449 argument is passed at the lowest byte address of the stack slot. */ 6450 if (!BYTES_BIG_ENDIAN) 6451 return PAD_UPWARD; 6452 6453 /* Otherwise, integral, floating-point and pointer types are padded downward: 6454 the least significant byte of a stack argument is passed at the highest 6455 byte address of the stack slot. */ 6456 if (type 6457 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type) 6458 || POINTER_TYPE_P (type)) 6459 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode))) 6460 return PAD_DOWNWARD; 6461 6462 /* Everything else padded upward, i.e. data in first byte of stack slot. */ 6463 return PAD_UPWARD; 6464} 6465 6466/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST). 6467 6468 It specifies padding for the last (may also be the only) 6469 element of a block move between registers and memory. If 6470 assuming the block is in the memory, padding upward means that 6471 the last element is padded after its highest significant byte, 6472 while in downward padding, the last element is padded at the 6473 its least significant byte side. 6474 6475 Small aggregates and small complex types are always padded 6476 upwards. 6477 6478 We don't need to worry about homogeneous floating-point or 6479 short-vector aggregates; their move is not affected by the 6480 padding direction determined here. Regardless of endianness, 6481 each element of such an aggregate is put in the least 6482 significant bits of a fp/simd register. 6483 6484 Return !BYTES_BIG_ENDIAN if the least significant byte of the 6485 register has useful data, and return the opposite if the most 6486 significant byte does. */ 6487 6488bool 6489aarch64_pad_reg_upward (machine_mode mode, const_tree type, 6490 bool first ATTRIBUTE_UNUSED) 6491{ 6492 6493 /* Aside from pure scalable types, small composite types are always 6494 padded upward. 
*/ 6495 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode)) 6496 { 6497 HOST_WIDE_INT size; 6498 if (type) 6499 size = int_size_in_bytes (type); 6500 else 6501 /* No frontends can create types with variable-sized modes, so we 6502 shouldn't be asked to pass or return them. */ 6503 size = GET_MODE_SIZE (mode).to_constant (); 6504 if (size < 2 * UNITS_PER_WORD) 6505 { 6506 pure_scalable_type_info pst_info; 6507 if (pst_info.analyze_registers (type)) 6508 return false; 6509 return true; 6510 } 6511 } 6512 6513 /* Otherwise, use the default padding. */ 6514 return !BYTES_BIG_ENDIAN; 6515} 6516 6517static scalar_int_mode 6518aarch64_libgcc_cmp_return_mode (void) 6519{ 6520 return SImode; 6521} 6522 6523#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP) 6524 6525/* We use the 12-bit shifted immediate arithmetic instructions so values 6526 must be multiple of (1 << 12), i.e. 4096. */ 6527#define ARITH_FACTOR 4096 6528 6529#if (PROBE_INTERVAL % ARITH_FACTOR) != 0 6530#error Cannot use simple address calculation for stack probing 6531#endif 6532 6533/* The pair of scratch registers used for stack probing. */ 6534#define PROBE_STACK_FIRST_REG R9_REGNUM 6535#define PROBE_STACK_SECOND_REG R10_REGNUM 6536 6537/* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE, 6538 inclusive. These are offsets from the current stack pointer. */ 6539 6540static void 6541aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size) 6542{ 6543 HOST_WIDE_INT size; 6544 if (!poly_size.is_constant (&size)) 6545 { 6546 sorry ("stack probes for SVE frames"); 6547 return; 6548 } 6549 6550 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG); 6551 6552 /* See the same assertion on PROBE_INTERVAL above. */ 6553 gcc_assert ((first % ARITH_FACTOR) == 0); 6554 6555 /* See if we have a constant small number of probes to generate. If so, 6556 that's the easy case. */ 6557 if (size <= PROBE_INTERVAL) 6558 { 6559 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR); 6560 6561 emit_set_insn (reg1, 6562 plus_constant (Pmode, 6563 stack_pointer_rtx, -(first + base))); 6564 emit_stack_probe (plus_constant (Pmode, reg1, base - size)); 6565 } 6566 6567 /* The run-time loop is made up of 8 insns in the generic case while the 6568 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */ 6569 else if (size <= 4 * PROBE_INTERVAL) 6570 { 6571 HOST_WIDE_INT i, rem; 6572 6573 emit_set_insn (reg1, 6574 plus_constant (Pmode, 6575 stack_pointer_rtx, 6576 -(first + PROBE_INTERVAL))); 6577 emit_stack_probe (reg1); 6578 6579 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until 6580 it exceeds SIZE. If only two probes are needed, this will not 6581 generate any code. Then probe at FIRST + SIZE. */ 6582 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL) 6583 { 6584 emit_set_insn (reg1, 6585 plus_constant (Pmode, reg1, -PROBE_INTERVAL)); 6586 emit_stack_probe (reg1); 6587 } 6588 6589 rem = size - (i - PROBE_INTERVAL); 6590 if (rem > 256) 6591 { 6592 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR); 6593 6594 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base)); 6595 emit_stack_probe (plus_constant (Pmode, reg1, base - rem)); 6596 } 6597 else 6598 emit_stack_probe (plus_constant (Pmode, reg1, -rem)); 6599 } 6600 6601 /* Otherwise, do the same as above, but in a loop. 
Note that we must be 6602 extra careful with variables wrapping around because we might be at 6603 the very top (or the very bottom) of the address space and we have 6604 to be able to handle this case properly; in particular, we use an 6605 equality test for the loop condition. */ 6606 else 6607 { 6608 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG); 6609 6610 /* Step 1: round SIZE to the previous multiple of the interval. */ 6611 6612 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL; 6613 6614 6615 /* Step 2: compute initial and final value of the loop counter. */ 6616 6617 /* TEST_ADDR = SP + FIRST. */ 6618 emit_set_insn (reg1, 6619 plus_constant (Pmode, stack_pointer_rtx, -first)); 6620 6621 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */ 6622 HOST_WIDE_INT adjustment = - (first + rounded_size); 6623 if (! aarch64_uimm12_shift (adjustment)) 6624 { 6625 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment), 6626 true, Pmode); 6627 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2)); 6628 } 6629 else 6630 emit_set_insn (reg2, 6631 plus_constant (Pmode, stack_pointer_rtx, adjustment)); 6632 6633 /* Step 3: the loop 6634 6635 do 6636 { 6637 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL 6638 probe at TEST_ADDR 6639 } 6640 while (TEST_ADDR != LAST_ADDR) 6641 6642 probes at FIRST + N * PROBE_INTERVAL for values of N from 1 6643 until it is equal to ROUNDED_SIZE. */ 6644 6645 emit_insn (gen_probe_stack_range (reg1, reg1, reg2)); 6646 6647 6648 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time 6649 that SIZE is equal to ROUNDED_SIZE. */ 6650 6651 if (size != rounded_size) 6652 { 6653 HOST_WIDE_INT rem = size - rounded_size; 6654 6655 if (rem > 256) 6656 { 6657 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR); 6658 6659 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base)); 6660 emit_stack_probe (plus_constant (Pmode, reg2, base - rem)); 6661 } 6662 else 6663 emit_stack_probe (plus_constant (Pmode, reg2, -rem)); 6664 } 6665 } 6666 6667 /* Make sure nothing is scheduled before we are done. */ 6668 emit_insn (gen_blockage ()); 6669} 6670 6671/* Probe a range of stack addresses from REG1 to REG2 inclusive. These are 6672 absolute addresses. */ 6673 6674const char * 6675aarch64_output_probe_stack_range (rtx reg1, rtx reg2) 6676{ 6677 static int labelno = 0; 6678 char loop_lab[32]; 6679 rtx xops[2]; 6680 6681 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++); 6682 6683 /* Loop. */ 6684 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab); 6685 6686 HOST_WIDE_INT stack_clash_probe_interval 6687 = 1 << param_stack_clash_protection_guard_size; 6688 6689 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */ 6690 xops[0] = reg1; 6691 HOST_WIDE_INT interval; 6692 if (flag_stack_clash_protection) 6693 interval = stack_clash_probe_interval; 6694 else 6695 interval = PROBE_INTERVAL; 6696 6697 gcc_assert (aarch64_uimm12_shift (interval)); 6698 xops[1] = GEN_INT (interval); 6699 6700 output_asm_insn ("sub\t%0, %0, %1", xops); 6701 6702 /* If doing stack clash protection then we probe up by the ABI specified 6703 amount. We do this because we're dropping full pages at a time in the 6704 loop. But if we're doing non-stack clash probing, probe at SP 0. */ 6705 if (flag_stack_clash_protection) 6706 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD); 6707 else 6708 xops[1] = CONST0_RTX (GET_MODE (xops[1])); 6709 6710 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe 6711 by this amount for each iteration. 
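   Putting the pieces together, the loop printed for the default 4 kB probe
   interval in the non-stack-clash case looks roughly like:

	.LPSRL0:
		sub	x9, x9, #4096
		str	xzr, [x9]
		cmp	x9, x10
		b.ne	.LPSRL0

   with x9 and x10 typically being PROBE_STACK_FIRST_REG and
   PROBE_STACK_SECOND_REG.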
*/ 6712 output_asm_insn ("str\txzr, [%0, %1]", xops); 6713 6714 /* Test if TEST_ADDR == LAST_ADDR. */ 6715 xops[1] = reg2; 6716 output_asm_insn ("cmp\t%0, %1", xops); 6717 6718 /* Branch. */ 6719 fputs ("\tb.ne\t", asm_out_file); 6720 assemble_name_raw (asm_out_file, loop_lab); 6721 fputc ('\n', asm_out_file); 6722 6723 return ""; 6724} 6725 6726/* Emit the probe loop for doing stack clash probes and stack adjustments for 6727 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size 6728 of GUARD_SIZE. When a probe is emitted it is done at most 6729 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of 6730 at most MIN_PROBE_THRESHOLD. By the end of this function 6731 BASE = BASE - ADJUSTMENT. */ 6732 6733const char * 6734aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment, 6735 rtx min_probe_threshold, rtx guard_size) 6736{ 6737 /* This function is not allowed to use any instruction generation function 6738 like gen_ and friends. If you do you'll likely ICE during CFG validation, 6739 so instead emit the code you want using output_asm_insn. */ 6740 gcc_assert (flag_stack_clash_protection); 6741 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size)); 6742 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold)); 6743 6744 /* The minimum required allocation before the residual requires probing. */ 6745 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold); 6746 6747 /* Clamp the value down to the nearest value that can be used with a cmp. */ 6748 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard); 6749 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode); 6750 6751 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard); 6752 gcc_assert (aarch64_uimm12_shift (residual_probe_guard)); 6753 6754 static int labelno = 0; 6755 char loop_start_lab[32]; 6756 char loop_end_lab[32]; 6757 rtx xops[2]; 6758 6759 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno); 6760 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++); 6761 6762 /* Emit loop start label. */ 6763 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab); 6764 6765 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */ 6766 xops[0] = adjustment; 6767 xops[1] = probe_offset_value_rtx; 6768 output_asm_insn ("cmp\t%0, %1", xops); 6769 6770 /* Branch to end if not enough adjustment to probe. */ 6771 fputs ("\tb.lt\t", asm_out_file); 6772 assemble_name_raw (asm_out_file, loop_end_lab); 6773 fputc ('\n', asm_out_file); 6774 6775 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */ 6776 xops[0] = base; 6777 xops[1] = probe_offset_value_rtx; 6778 output_asm_insn ("sub\t%0, %0, %1", xops); 6779 6780 /* Probe at BASE. */ 6781 xops[1] = const0_rtx; 6782 output_asm_insn ("str\txzr, [%0, %1]", xops); 6783 6784 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */ 6785 xops[0] = adjustment; 6786 xops[1] = probe_offset_value_rtx; 6787 output_asm_insn ("sub\t%0, %0, %1", xops); 6788 6789 /* Branch to start if still more bytes to allocate. */ 6790 fputs ("\tb\t", asm_out_file); 6791 assemble_name_raw (asm_out_file, loop_start_lab); 6792 fputc ('\n', asm_out_file); 6793 6794 /* No probe leave. */ 6795 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab); 6796 6797 /* BASE = BASE - ADJUSTMENT. */ 6798 xops[0] = base; 6799 xops[1] = adjustment; 6800 output_asm_insn ("sub\t%0, %0, %1", xops); 6801 return ""; 6802} 6803 6804/* Determine whether a frame chain needs to be generated. 
*/ 6805static bool 6806aarch64_needs_frame_chain (void) 6807{ 6808 /* Force a frame chain for EH returns so the return address is at FP+8. */ 6809 if (frame_pointer_needed || crtl->calls_eh_return) 6810 return true; 6811 6812 /* A leaf function cannot have calls or write LR. */ 6813 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM); 6814 6815 /* Don't use a frame chain in leaf functions if leaf frame pointers 6816 are disabled. */ 6817 if (flag_omit_leaf_frame_pointer && is_leaf) 6818 return false; 6819 6820 return aarch64_use_frame_pointer; 6821} 6822 6823/* Mark the registers that need to be saved by the callee and calculate 6824 the size of the callee-saved registers area and frame record (both FP 6825 and LR may be omitted). */ 6826static void 6827aarch64_layout_frame (void) 6828{ 6829 poly_int64 offset = 0; 6830 int regno, last_fp_reg = INVALID_REGNUM; 6831 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM); 6832 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode); 6833 bool frame_related_fp_reg_p = false; 6834 aarch64_frame &frame = cfun->machine->frame; 6835 6836 frame.emit_frame_chain = aarch64_needs_frame_chain (); 6837 6838 /* Adjust the outgoing arguments size if required. Keep it in sync with what 6839 the mid-end is doing. */ 6840 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun); 6841 6842#define SLOT_NOT_REQUIRED (-2) 6843#define SLOT_REQUIRED (-1) 6844 6845 frame.wb_candidate1 = INVALID_REGNUM; 6846 frame.wb_candidate2 = INVALID_REGNUM; 6847 frame.spare_pred_reg = INVALID_REGNUM; 6848 6849 /* First mark all the registers that really need to be saved... */ 6850 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++) 6851 frame.reg_offset[regno] = SLOT_NOT_REQUIRED; 6852 6853 /* ... that includes the eh data registers (if needed)... */ 6854 if (crtl->calls_eh_return) 6855 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++) 6856 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED; 6857 6858 /* ... and any callee saved register that dataflow says is live. */ 6859 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) 6860 if (df_regs_ever_live_p (regno) 6861 && !fixed_regs[regno] 6862 && (regno == R30_REGNUM 6863 || !crtl->abi->clobbers_full_reg_p (regno))) 6864 frame.reg_offset[regno] = SLOT_REQUIRED; 6865 6866 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) 6867 if (df_regs_ever_live_p (regno) 6868 && !fixed_regs[regno] 6869 && !crtl->abi->clobbers_full_reg_p (regno)) 6870 { 6871 frame.reg_offset[regno] = SLOT_REQUIRED; 6872 last_fp_reg = regno; 6873 if (aarch64_emit_cfi_for_reg_p (regno)) 6874 frame_related_fp_reg_p = true; 6875 } 6876 6877 /* Big-endian SVE frames need a spare predicate register in order 6878 to save Z8-Z15. Decide which register they should use. Prefer 6879 an unused argument register if possible, so that we don't force P4 6880 to be saved unnecessarily. 
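     With a spare predicate register available, each such vector save can
     then be emitted as a predicated store, along the lines of (register
     and offset chosen purely for illustration):

	ptrue	p0.b, all
	st1d	{z8.d}, p0, [sp, #1, mul vl]

     rather than as a plain STR; see the comment at the head of
     aarch64-sve.md for why LD1/ST1 have to be used for big-endian
     SVE moves.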
*/ 6881 if (frame_related_fp_reg_p 6882 && crtl->abi->id () == ARM_PCS_SVE 6883 && BYTES_BIG_ENDIAN) 6884 { 6885 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)); 6886 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun)); 6887 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++) 6888 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno)) 6889 break; 6890 gcc_assert (regno <= P7_REGNUM); 6891 frame.spare_pred_reg = regno; 6892 df_set_regs_ever_live (regno, true); 6893 } 6894 6895 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++) 6896 if (df_regs_ever_live_p (regno) 6897 && !fixed_regs[regno] 6898 && !crtl->abi->clobbers_full_reg_p (regno)) 6899 frame.reg_offset[regno] = SLOT_REQUIRED; 6900 6901 /* With stack-clash, LR must be saved in non-leaf functions. */ 6902 gcc_assert (crtl->is_leaf 6903 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED)); 6904 6905 /* Now assign stack slots for the registers. Start with the predicate 6906 registers, since predicate LDR and STR have a relatively small 6907 offset range. These saves happen below the hard frame pointer. */ 6908 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++) 6909 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) 6910 { 6911 frame.reg_offset[regno] = offset; 6912 offset += BYTES_PER_SVE_PRED; 6913 } 6914 6915 if (maybe_ne (offset, 0)) 6916 { 6917 /* If we have any vector registers to save above the predicate registers, 6918 the offset of the vector register save slots need to be a multiple 6919 of the vector size. This lets us use the immediate forms of LDR/STR 6920 (or LD1/ST1 for big-endian). 6921 6922 A vector register is 8 times the size of a predicate register, 6923 and we need to save a maximum of 12 predicate registers, so the 6924 first vector register will be at either #1, MUL VL or #2, MUL VL. 6925 6926 If we don't have any vector registers to save, and we know how 6927 big the predicate save area is, we can just round it up to the 6928 next 16-byte boundary. */ 6929 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ()) 6930 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); 6931 else 6932 { 6933 if (known_le (offset, vector_save_size)) 6934 offset = vector_save_size; 6935 else if (known_le (offset, vector_save_size * 2)) 6936 offset = vector_save_size * 2; 6937 else 6938 gcc_unreachable (); 6939 } 6940 } 6941 6942 /* If we need to save any SVE vector registers, add them next. */ 6943 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE) 6944 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) 6945 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) 6946 { 6947 frame.reg_offset[regno] = offset; 6948 offset += vector_save_size; 6949 } 6950 6951 /* OFFSET is now the offset of the hard frame pointer from the bottom 6952 of the callee save area. */ 6953 bool saves_below_hard_fp_p = maybe_ne (offset, 0); 6954 frame.below_hard_fp_saved_regs_size = offset; 6955 if (frame.emit_frame_chain) 6956 { 6957 /* FP and LR are placed in the linkage record. 
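	 For a small frame that uses the frame chain, this shows up at the
	 start of the prologue as, roughly:

	    stp	x29, x30, [sp, #-frame_size]!
	    mov	x29, sp

	 with the saved x29 stored immediately below the saved x30 to form
	 the frame record.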
*/ 6958 frame.reg_offset[R29_REGNUM] = offset; 6959 frame.wb_candidate1 = R29_REGNUM; 6960 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD; 6961 frame.wb_candidate2 = R30_REGNUM; 6962 offset += 2 * UNITS_PER_WORD; 6963 } 6964 6965 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) 6966 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) 6967 { 6968 frame.reg_offset[regno] = offset; 6969 if (frame.wb_candidate1 == INVALID_REGNUM) 6970 frame.wb_candidate1 = regno; 6971 else if (frame.wb_candidate2 == INVALID_REGNUM) 6972 frame.wb_candidate2 = regno; 6973 offset += UNITS_PER_WORD; 6974 } 6975 6976 poly_int64 max_int_offset = offset; 6977 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); 6978 bool has_align_gap = maybe_ne (offset, max_int_offset); 6979 6980 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) 6981 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) 6982 { 6983 /* If there is an alignment gap between integer and fp callee-saves, 6984 allocate the last fp register to it if possible. */ 6985 if (regno == last_fp_reg 6986 && has_align_gap 6987 && known_eq (vector_save_size, 8) 6988 && multiple_p (offset, 16)) 6989 { 6990 frame.reg_offset[regno] = max_int_offset; 6991 break; 6992 } 6993 6994 frame.reg_offset[regno] = offset; 6995 if (frame.wb_candidate1 == INVALID_REGNUM) 6996 frame.wb_candidate1 = regno; 6997 else if (frame.wb_candidate2 == INVALID_REGNUM 6998 && frame.wb_candidate1 >= V0_REGNUM) 6999 frame.wb_candidate2 = regno; 7000 offset += vector_save_size; 7001 } 7002 7003 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); 7004 7005 frame.saved_regs_size = offset; 7006 7007 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size; 7008 7009 poly_int64 above_outgoing_args 7010 = aligned_upper_bound (varargs_and_saved_regs_size 7011 + get_frame_size (), 7012 STACK_BOUNDARY / BITS_PER_UNIT); 7013 7014 frame.hard_fp_offset 7015 = above_outgoing_args - frame.below_hard_fp_saved_regs_size; 7016 7017 /* Both these values are already aligned. */ 7018 gcc_assert (multiple_p (crtl->outgoing_args_size, 7019 STACK_BOUNDARY / BITS_PER_UNIT)); 7020 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size; 7021 7022 frame.locals_offset = frame.saved_varargs_size; 7023 7024 frame.initial_adjust = 0; 7025 frame.final_adjust = 0; 7026 frame.callee_adjust = 0; 7027 frame.sve_callee_adjust = 0; 7028 frame.callee_offset = 0; 7029 7030 HOST_WIDE_INT max_push_offset = 0; 7031 if (frame.wb_candidate2 != INVALID_REGNUM) 7032 max_push_offset = 512; 7033 else if (frame.wb_candidate1 != INVALID_REGNUM) 7034 max_push_offset = 256; 7035 7036 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset; 7037 HOST_WIDE_INT const_saved_regs_size; 7038 if (frame.frame_size.is_constant (&const_size) 7039 && const_size < max_push_offset 7040 && known_eq (frame.hard_fp_offset, const_size)) 7041 { 7042 /* Simple, small frame with no outgoing arguments: 7043 7044 stp reg1, reg2, [sp, -frame_size]! 7045 stp reg3, reg4, [sp, 16] */ 7046 frame.callee_adjust = const_size; 7047 } 7048 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size) 7049 && frame.saved_regs_size.is_constant (&const_saved_regs_size) 7050 && const_outgoing_args_size + const_saved_regs_size < 512 7051 /* We could handle this case even with outgoing args, provided 7052 that the number of args left us with valid offsets for all 7053 predicate and vector save slots. It's such a rare case that 7054 it hardly seems worth the effort though. 
*/ 7055 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0) 7056 && !(cfun->calls_alloca 7057 && frame.hard_fp_offset.is_constant (&const_fp_offset) 7058 && const_fp_offset < max_push_offset)) 7059 { 7060 /* Frame with small outgoing arguments: 7061 7062 sub sp, sp, frame_size 7063 stp reg1, reg2, [sp, outgoing_args_size] 7064 stp reg3, reg4, [sp, outgoing_args_size + 16] */ 7065 frame.initial_adjust = frame.frame_size; 7066 frame.callee_offset = const_outgoing_args_size; 7067 } 7068 else if (saves_below_hard_fp_p 7069 && known_eq (frame.saved_regs_size, 7070 frame.below_hard_fp_saved_regs_size)) 7071 { 7072 /* Frame in which all saves are SVE saves: 7073 7074 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size 7075 save SVE registers relative to SP 7076 sub sp, sp, outgoing_args_size */ 7077 frame.initial_adjust = (frame.hard_fp_offset 7078 + frame.below_hard_fp_saved_regs_size); 7079 frame.final_adjust = crtl->outgoing_args_size; 7080 } 7081 else if (frame.hard_fp_offset.is_constant (&const_fp_offset) 7082 && const_fp_offset < max_push_offset) 7083 { 7084 /* Frame with large outgoing arguments or SVE saves, but with 7085 a small local area: 7086 7087 stp reg1, reg2, [sp, -hard_fp_offset]! 7088 stp reg3, reg4, [sp, 16] 7089 [sub sp, sp, below_hard_fp_saved_regs_size] 7090 [save SVE registers relative to SP] 7091 sub sp, sp, outgoing_args_size */ 7092 frame.callee_adjust = const_fp_offset; 7093 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; 7094 frame.final_adjust = crtl->outgoing_args_size; 7095 } 7096 else 7097 { 7098 /* Frame with large local area and outgoing arguments or SVE saves, 7099 using frame pointer: 7100 7101 sub sp, sp, hard_fp_offset 7102 stp x29, x30, [sp, 0] 7103 add x29, sp, 0 7104 stp reg3, reg4, [sp, 16] 7105 [sub sp, sp, below_hard_fp_saved_regs_size] 7106 [save SVE registers relative to SP] 7107 sub sp, sp, outgoing_args_size */ 7108 frame.initial_adjust = frame.hard_fp_offset; 7109 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; 7110 frame.final_adjust = crtl->outgoing_args_size; 7111 } 7112 7113 /* Make sure the individual adjustments add up to the full frame size. */ 7114 gcc_assert (known_eq (frame.initial_adjust 7115 + frame.callee_adjust 7116 + frame.sve_callee_adjust 7117 + frame.final_adjust, frame.frame_size)); 7118 7119 frame.laid_out = true; 7120} 7121 7122/* Return true if the register REGNO is saved on entry to 7123 the current function. */ 7124 7125static bool 7126aarch64_register_saved_on_entry (int regno) 7127{ 7128 return known_ge (cfun->machine->frame.reg_offset[regno], 0); 7129} 7130 7131/* Return the next register up from REGNO up to LIMIT for the callee 7132 to save. */ 7133 7134static unsigned 7135aarch64_next_callee_save (unsigned regno, unsigned limit) 7136{ 7137 while (regno <= limit && !aarch64_register_saved_on_entry (regno)) 7138 regno ++; 7139 return regno; 7140} 7141 7142/* Push the register number REGNO of mode MODE to the stack with write-back 7143 adjusting the stack by ADJUSTMENT. 
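   For example, pushing a single X register with an adjustment of 16 is typically
   emitted as "str reg, [sp, -16]!", a pre-indexed store that saves the register and
   allocates the stack in one instruction.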
*/ 7144 7145static void 7146aarch64_pushwb_single_reg (machine_mode mode, unsigned regno, 7147 HOST_WIDE_INT adjustment) 7148 { 7149 rtx base_rtx = stack_pointer_rtx; 7150 rtx insn, reg, mem; 7151 7152 reg = gen_rtx_REG (mode, regno); 7153 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx, 7154 plus_constant (Pmode, base_rtx, -adjustment)); 7155 mem = gen_frame_mem (mode, mem); 7156 7157 insn = emit_move_insn (mem, reg); 7158 RTX_FRAME_RELATED_P (insn) = 1; 7159} 7160 7161/* Generate and return an instruction to store the pair of registers 7162 REG and REG2 of mode MODE to location BASE with write-back adjusting 7163 the stack location BASE by ADJUSTMENT. */ 7164 7165static rtx 7166aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2, 7167 HOST_WIDE_INT adjustment) 7168{ 7169 switch (mode) 7170 { 7171 case E_DImode: 7172 return gen_storewb_pairdi_di (base, base, reg, reg2, 7173 GEN_INT (-adjustment), 7174 GEN_INT (UNITS_PER_WORD - adjustment)); 7175 case E_DFmode: 7176 return gen_storewb_pairdf_di (base, base, reg, reg2, 7177 GEN_INT (-adjustment), 7178 GEN_INT (UNITS_PER_WORD - adjustment)); 7179 case E_TFmode: 7180 return gen_storewb_pairtf_di (base, base, reg, reg2, 7181 GEN_INT (-adjustment), 7182 GEN_INT (UNITS_PER_VREG - adjustment)); 7183 default: 7184 gcc_unreachable (); 7185 } 7186} 7187 7188/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the 7189 stack pointer by ADJUSTMENT. */ 7190 7191static void 7192aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment) 7193{ 7194 rtx_insn *insn; 7195 machine_mode mode = aarch64_reg_save_mode (regno1); 7196 7197 if (regno2 == INVALID_REGNUM) 7198 return aarch64_pushwb_single_reg (mode, regno1, adjustment); 7199 7200 rtx reg1 = gen_rtx_REG (mode, regno1); 7201 rtx reg2 = gen_rtx_REG (mode, regno2); 7202 7203 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1, 7204 reg2, adjustment)); 7205 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1; 7206 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1; 7207 RTX_FRAME_RELATED_P (insn) = 1; 7208} 7209 7210/* Load the pair of register REG, REG2 of mode MODE from stack location BASE, 7211 adjusting it by ADJUSTMENT afterwards. */ 7212 7213static rtx 7214aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2, 7215 HOST_WIDE_INT adjustment) 7216{ 7217 switch (mode) 7218 { 7219 case E_DImode: 7220 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment), 7221 GEN_INT (UNITS_PER_WORD)); 7222 case E_DFmode: 7223 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment), 7224 GEN_INT (UNITS_PER_WORD)); 7225 case E_TFmode: 7226 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment), 7227 GEN_INT (UNITS_PER_VREG)); 7228 default: 7229 gcc_unreachable (); 7230 } 7231} 7232 7233/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it 7234 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes 7235 into CFI_OPS. 
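   For example, a lone register restore typically becomes a post-indexed
   "ldr reg1, [sp], adjustment", and a pair becomes the corresponding
   "ldp reg1, reg2, [sp], adjustment", so the restore and the stack deallocation
   happen in a single instruction.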
*/ 7236 7237 static void 7238 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment, 7239 rtx *cfi_ops) 7240 { 7241 machine_mode mode = aarch64_reg_save_mode (regno1); 7242 rtx reg1 = gen_rtx_REG (mode, regno1); 7243 7244 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops); 7245 7246 if (regno2 == INVALID_REGNUM) 7247 { 7248 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment); 7249 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem); 7250 emit_move_insn (reg1, gen_frame_mem (mode, mem)); 7251 } 7252 else 7253 { 7254 rtx reg2 = gen_rtx_REG (mode, regno2); 7255 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops); 7256 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1, 7257 reg2, adjustment)); 7258 } 7259 } 7260
7261 /* Generate and return a store pair instruction of mode MODE to store 7262 register REG1 to MEM1 and register REG2 to MEM2. */ 7263 7264 static rtx 7265 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2, 7266 rtx reg2) 7267 { 7268 switch (mode) 7269 { 7270 case E_DImode: 7271 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2); 7272 7273 case E_DFmode: 7274 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2); 7275 7276 case E_TFmode: 7277 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2); 7278 7279 default: 7280 gcc_unreachable (); 7281 } 7282 } 7283
7284 /* Generate and return a load pair instruction of mode MODE to load register 7285 REG1 from MEM1 and register REG2 from MEM2. */ 7286 7287 static rtx 7288 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2, 7289 rtx mem2) 7290 { 7291 switch (mode) 7292 { 7293 case E_DImode: 7294 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2); 7295 7296 case E_DFmode: 7297 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2); 7298 7299 case E_TFmode: 7300 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2); 7301 7302 default: 7303 gcc_unreachable (); 7304 } 7305 } 7306
7307 /* Return TRUE if return address signing should be enabled for the current 7308 function, otherwise return FALSE. */ 7309 7310 bool 7311 aarch64_return_address_signing_enabled (void) 7312 { 7313 /* This function should only be called after the frame has been laid out. */ 7314 gcc_assert (cfun->machine->frame.laid_out); 7315 7316 /* Turn return address signing off in any function that uses 7317 __builtin_eh_return. The address passed to __builtin_eh_return 7318 is not signed so either it has to be signed (with original sp) 7319 or the code path that uses it has to avoid authenticating it. 7320 Currently eh return introduces a return to anywhere gadget, no 7321 matter what we do here since it uses ret with a user-provided 7322 address. An ideal fix for that is to use an indirect branch which 7323 can be protected with BTI j (to some extent). */ 7324 if (crtl->calls_eh_return) 7325 return false; 7326 7327 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function 7328 if its LR is pushed onto the stack. */ 7329 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL 7330 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF 7331 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0))); 7332 } 7333
7334 /* Return TRUE if the Branch Target Identification Mechanism is enabled. */ 7335 bool 7336 aarch64_bti_enabled (void) 7337 { 7338 return (aarch64_enable_bti == 1); 7339 } 7340 7341 /* The caller is going to use ST1D or LD1D to save or restore an SVE 7342 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in 7343 the range [1, 16] * GET_MODE_SIZE (MODE).
Prepare for this by: 7344 7345 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D 7346 or LD1D address 7347 7348 (2) setting PRED to a valid predicate register for the ST1D or LD1D, 7349 if the variable isn't already nonnull 7350 7351 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE). 7352 Handle this case using a temporary base register that is suitable for 7353 all offsets in that range. Use ANCHOR_REG as this base register if it 7354 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */ 7355 7356static inline void 7357aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx, 7358 rtx &anchor_reg, poly_int64 &offset, 7359 rtx &ptrue) 7360{ 7361 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode))) 7362 { 7363 /* This is the maximum valid offset of the anchor from the base. 7364 Lower values would be valid too. */ 7365 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode); 7366 if (!anchor_reg) 7367 { 7368 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM); 7369 emit_insn (gen_add3_insn (anchor_reg, base_rtx, 7370 gen_int_mode (anchor_offset, Pmode))); 7371 } 7372 base_rtx = anchor_reg; 7373 offset -= anchor_offset; 7374 } 7375 if (!ptrue) 7376 { 7377 int pred_reg = cfun->machine->frame.spare_pred_reg; 7378 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg), 7379 CONSTM1_RTX (VNx16BImode)); 7380 ptrue = gen_rtx_REG (VNx2BImode, pred_reg); 7381 } 7382} 7383 7384/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG 7385 is saved at BASE + OFFSET. */ 7386 7387static void 7388aarch64_add_cfa_expression (rtx_insn *insn, rtx reg, 7389 rtx base, poly_int64 offset) 7390{ 7391 rtx mem = gen_frame_mem (GET_MODE (reg), 7392 plus_constant (Pmode, base, offset)); 7393 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg)); 7394} 7395 7396/* Emit code to save the callee-saved registers from register number START 7397 to LIMIT to the stack at the location starting at offset START_OFFSET, 7398 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P 7399 is true if the hard frame pointer has been set up. 
*/ 7400 7401static void 7402aarch64_save_callee_saves (poly_int64 start_offset, 7403 unsigned start, unsigned limit, bool skip_wb, 7404 bool hard_fp_valid_p) 7405{ 7406 rtx_insn *insn; 7407 unsigned regno; 7408 unsigned regno2; 7409 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX; 7410 7411 for (regno = aarch64_next_callee_save (start, limit); 7412 regno <= limit; 7413 regno = aarch64_next_callee_save (regno + 1, limit)) 7414 { 7415 rtx reg, mem; 7416 poly_int64 offset; 7417 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno); 7418 7419 if (skip_wb 7420 && (regno == cfun->machine->frame.wb_candidate1 7421 || regno == cfun->machine->frame.wb_candidate2)) 7422 continue; 7423 7424 if (cfun->machine->reg_is_wrapped_separately[regno]) 7425 continue; 7426 7427 machine_mode mode = aarch64_reg_save_mode (regno); 7428 reg = gen_rtx_REG (mode, regno); 7429 offset = start_offset + cfun->machine->frame.reg_offset[regno]; 7430 rtx base_rtx = stack_pointer_rtx; 7431 poly_int64 sp_offset = offset; 7432 7433 HOST_WIDE_INT const_offset; 7434 if (mode == VNx2DImode && BYTES_BIG_ENDIAN) 7435 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, 7436 offset, ptrue); 7437 else if (GP_REGNUM_P (regno) 7438 && (!offset.is_constant (&const_offset) || const_offset >= 512)) 7439 { 7440 gcc_assert (known_eq (start_offset, 0)); 7441 poly_int64 fp_offset 7442 = cfun->machine->frame.below_hard_fp_saved_regs_size; 7443 if (hard_fp_valid_p) 7444 base_rtx = hard_frame_pointer_rtx; 7445 else 7446 { 7447 if (!anchor_reg) 7448 { 7449 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM); 7450 emit_insn (gen_add3_insn (anchor_reg, base_rtx, 7451 gen_int_mode (fp_offset, Pmode))); 7452 } 7453 base_rtx = anchor_reg; 7454 } 7455 offset -= fp_offset; 7456 } 7457 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset)); 7458 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx); 7459 7460 if (!aarch64_sve_mode_p (mode) 7461 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit 7462 && !cfun->machine->reg_is_wrapped_separately[regno2] 7463 && known_eq (GET_MODE_SIZE (mode), 7464 cfun->machine->frame.reg_offset[regno2] 7465 - cfun->machine->frame.reg_offset[regno])) 7466 { 7467 rtx reg2 = gen_rtx_REG (mode, regno2); 7468 rtx mem2; 7469 7470 offset += GET_MODE_SIZE (mode); 7471 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset)); 7472 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, 7473 reg2)); 7474 7475 /* The first part of a frame-related parallel insn is 7476 always assumed to be relevant to the frame 7477 calculations; subsequent parts, are only 7478 frame-related if explicitly marked. 
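   Hence, when the second register of a store pair also needs CFI, the code below
   either marks element 1 of the PARALLEL explicitly or, if the save is not addressed
   from the stack pointer, describes it with a REG_CFA_EXPRESSION note instead.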
*/ 7479 if (aarch64_emit_cfi_for_reg_p (regno2)) 7480 { 7481 if (need_cfa_note_p) 7482 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx, 7483 sp_offset + GET_MODE_SIZE (mode)); 7484 else 7485 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1; 7486 } 7487 7488 regno = regno2; 7489 } 7490 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN) 7491 { 7492 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg)); 7493 need_cfa_note_p = true; 7494 } 7495 else if (aarch64_sve_mode_p (mode)) 7496 insn = emit_insn (gen_rtx_SET (mem, reg)); 7497 else 7498 insn = emit_move_insn (mem, reg); 7499 7500 RTX_FRAME_RELATED_P (insn) = frame_related_p; 7501 if (frame_related_p && need_cfa_note_p) 7502 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset); 7503 } 7504} 7505 7506/* Emit code to restore the callee registers from register number START 7507 up to and including LIMIT. Restore from the stack offset START_OFFSET, 7508 skipping any write-back candidates if SKIP_WB is true. Write the 7509 appropriate REG_CFA_RESTORE notes into CFI_OPS. */ 7510 7511static void 7512aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, 7513 unsigned limit, bool skip_wb, rtx *cfi_ops) 7514{ 7515 unsigned regno; 7516 unsigned regno2; 7517 poly_int64 offset; 7518 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX; 7519 7520 for (regno = aarch64_next_callee_save (start, limit); 7521 regno <= limit; 7522 regno = aarch64_next_callee_save (regno + 1, limit)) 7523 { 7524 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno); 7525 if (cfun->machine->reg_is_wrapped_separately[regno]) 7526 continue; 7527 7528 rtx reg, mem; 7529 7530 if (skip_wb 7531 && (regno == cfun->machine->frame.wb_candidate1 7532 || regno == cfun->machine->frame.wb_candidate2)) 7533 continue; 7534 7535 machine_mode mode = aarch64_reg_save_mode (regno); 7536 reg = gen_rtx_REG (mode, regno); 7537 offset = start_offset + cfun->machine->frame.reg_offset[regno]; 7538 rtx base_rtx = stack_pointer_rtx; 7539 if (mode == VNx2DImode && BYTES_BIG_ENDIAN) 7540 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, 7541 offset, ptrue); 7542 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset)); 7543 7544 if (!aarch64_sve_mode_p (mode) 7545 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit 7546 && !cfun->machine->reg_is_wrapped_separately[regno2] 7547 && known_eq (GET_MODE_SIZE (mode), 7548 cfun->machine->frame.reg_offset[regno2] 7549 - cfun->machine->frame.reg_offset[regno])) 7550 { 7551 rtx reg2 = gen_rtx_REG (mode, regno2); 7552 rtx mem2; 7553 7554 offset += GET_MODE_SIZE (mode); 7555 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset)); 7556 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2)); 7557 7558 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops); 7559 regno = regno2; 7560 } 7561 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN) 7562 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem)); 7563 else if (aarch64_sve_mode_p (mode)) 7564 emit_insn (gen_rtx_SET (reg, mem)); 7565 else 7566 emit_move_insn (reg, mem); 7567 if (frame_related_p) 7568 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops); 7569 } 7570} 7571 7572/* Return true if OFFSET is a signed 4-bit value multiplied by the size 7573 of MODE. 
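   For example, with an 8-byte element size this accepts byte offsets that are
   multiples of 8 in [-8, 7] * 8, i.e. -64 to +56.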
*/ 7574 7575static inline bool 7576offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset) 7577{ 7578 HOST_WIDE_INT multiple; 7579 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple) 7580 && IN_RANGE (multiple, -8, 7)); 7581} 7582 7583/* Return true if OFFSET is a unsigned 6-bit value multiplied by the size 7584 of MODE. */ 7585 7586static inline bool 7587offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset) 7588{ 7589 HOST_WIDE_INT multiple; 7590 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple) 7591 && IN_RANGE (multiple, 0, 63)); 7592} 7593 7594/* Return true if OFFSET is a signed 7-bit value multiplied by the size 7595 of MODE. */ 7596 7597bool 7598aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset) 7599{ 7600 HOST_WIDE_INT multiple; 7601 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple) 7602 && IN_RANGE (multiple, -64, 63)); 7603} 7604 7605/* Return true if OFFSET is a signed 9-bit value. */ 7606 7607bool 7608aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED, 7609 poly_int64 offset) 7610{ 7611 HOST_WIDE_INT const_offset; 7612 return (offset.is_constant (&const_offset) 7613 && IN_RANGE (const_offset, -256, 255)); 7614} 7615 7616/* Return true if OFFSET is a signed 9-bit value multiplied by the size 7617 of MODE. */ 7618 7619static inline bool 7620offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset) 7621{ 7622 HOST_WIDE_INT multiple; 7623 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple) 7624 && IN_RANGE (multiple, -256, 255)); 7625} 7626 7627/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size 7628 of MODE. */ 7629 7630static inline bool 7631offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset) 7632{ 7633 HOST_WIDE_INT multiple; 7634 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple) 7635 && IN_RANGE (multiple, 0, 4095)); 7636} 7637 7638/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */ 7639 7640static sbitmap 7641aarch64_get_separate_components (void) 7642{ 7643 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1); 7644 bitmap_clear (components); 7645 7646 /* The registers we need saved to the frame. */ 7647 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++) 7648 if (aarch64_register_saved_on_entry (regno)) 7649 { 7650 /* Punt on saves and restores that use ST1D and LD1D. We could 7651 try to be smarter, but it would involve making sure that the 7652 spare predicate register itself is safe to use at the save 7653 and restore points. Also, when a frame pointer is being used, 7654 the slots are often out of reach of ST1D and LD1D anyway. */ 7655 machine_mode mode = aarch64_reg_save_mode (regno); 7656 if (mode == VNx2DImode && BYTES_BIG_ENDIAN) 7657 continue; 7658 7659 poly_int64 offset = cfun->machine->frame.reg_offset[regno]; 7660 7661 /* If the register is saved in the first SVE save slot, we use 7662 it as a stack probe for -fstack-clash-protection. */ 7663 if (flag_stack_clash_protection 7664 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0) 7665 && known_eq (offset, 0)) 7666 continue; 7667 7668 /* Get the offset relative to the register we'll use. */ 7669 if (frame_pointer_needed) 7670 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size; 7671 else 7672 offset += crtl->outgoing_args_size; 7673 7674 /* Check that we can access the stack slot of the register with one 7675 direct load with no adjustments needed. 
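   For example, a GP or FP save slot qualifies if its offset is a multiple of the
   slot size in the range [0, 4095] slots (the unsigned scaled LDR/STR form), while
   an SVE save needs a signed multiple of the mode size in the range [-256, 255]
   slots.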
*/ 7676 if (aarch64_sve_mode_p (mode) 7677 ? offset_9bit_signed_scaled_p (mode, offset) 7678 : offset_12bit_unsigned_scaled_p (mode, offset)) 7679 bitmap_set_bit (components, regno); 7680 } 7681 7682 /* Don't mess with the hard frame pointer. */ 7683 if (frame_pointer_needed) 7684 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM); 7685 7686 /* If the spare predicate register used by big-endian SVE code 7687 is call-preserved, it must be saved in the main prologue 7688 before any saves that use it. */ 7689 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM) 7690 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg); 7691 7692 unsigned reg1 = cfun->machine->frame.wb_candidate1; 7693 unsigned reg2 = cfun->machine->frame.wb_candidate2; 7694 /* If registers have been chosen to be stored/restored with 7695 writeback don't interfere with them to avoid having to output explicit 7696 stack adjustment instructions. */ 7697 if (reg2 != INVALID_REGNUM) 7698 bitmap_clear_bit (components, reg2); 7699 if (reg1 != INVALID_REGNUM) 7700 bitmap_clear_bit (components, reg1); 7701 7702 bitmap_clear_bit (components, LR_REGNUM); 7703 bitmap_clear_bit (components, SP_REGNUM); 7704 7705 return components; 7706} 7707 7708/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */ 7709 7710static sbitmap 7711aarch64_components_for_bb (basic_block bb) 7712{ 7713 bitmap in = DF_LIVE_IN (bb); 7714 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen; 7715 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill; 7716 7717 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1); 7718 bitmap_clear (components); 7719 7720 /* Clobbered registers don't generate values in any meaningful sense, 7721 since nothing after the clobber can rely on their value. And we can't 7722 say that partially-clobbered registers are unconditionally killed, 7723 because whether they're killed or not depends on the mode of the 7724 value they're holding. Thus partially call-clobbered registers 7725 appear in neither the kill set nor the gen set. 7726 7727 Check manually for any calls that clobber more of a register than the 7728 current function can. */ 7729 function_abi_aggregator callee_abis; 7730 rtx_insn *insn; 7731 FOR_BB_INSNS (bb, insn) 7732 if (CALL_P (insn)) 7733 callee_abis.note_callee_abi (insn_callee_abi (insn)); 7734 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi); 7735 7736 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */ 7737 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++) 7738 if (!fixed_regs[regno] 7739 && !crtl->abi->clobbers_full_reg_p (regno) 7740 && (TEST_HARD_REG_BIT (extra_caller_saves, regno) 7741 || bitmap_bit_p (in, regno) 7742 || bitmap_bit_p (gen, regno) 7743 || bitmap_bit_p (kill, regno))) 7744 { 7745 bitmap_set_bit (components, regno); 7746 7747 /* If there is a callee-save at an adjacent offset, add it too 7748 to increase the use of LDP/STP. */ 7749 poly_int64 offset = cfun->machine->frame.reg_offset[regno]; 7750 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1; 7751 7752 if (regno2 <= LAST_SAVED_REGNUM) 7753 { 7754 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2]; 7755 if (regno < regno2 7756 ? known_eq (offset + 8, offset2) 7757 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset)) 7758 bitmap_set_bit (components, regno2); 7759 } 7760 } 7761 7762 return components; 7763} 7764 7765/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS. 7766 Nothing to do for aarch64. 
*/ 7767 7768static void 7769aarch64_disqualify_components (sbitmap, edge, sbitmap, bool) 7770{ 7771} 7772 7773/* Return the next set bit in BMP from START onwards. Return the total number 7774 of bits in BMP if no set bit is found at or after START. */ 7775 7776static unsigned int 7777aarch64_get_next_set_bit (sbitmap bmp, unsigned int start) 7778{ 7779 unsigned int nbits = SBITMAP_SIZE (bmp); 7780 if (start == nbits) 7781 return start; 7782 7783 gcc_assert (start < nbits); 7784 for (unsigned int i = start; i < nbits; i++) 7785 if (bitmap_bit_p (bmp, i)) 7786 return i; 7787 7788 return nbits; 7789} 7790 7791/* Do the work for aarch64_emit_prologue_components and 7792 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers 7793 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence 7794 for these components or the epilogue sequence. That is, it determines 7795 whether we should emit stores or loads and what kind of CFA notes to attach 7796 to the insns. Otherwise the logic for the two sequences is very 7797 similar. */ 7798 7799static void 7800aarch64_process_components (sbitmap components, bool prologue_p) 7801{ 7802 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed 7803 ? HARD_FRAME_POINTER_REGNUM 7804 : STACK_POINTER_REGNUM); 7805 7806 unsigned last_regno = SBITMAP_SIZE (components); 7807 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM); 7808 rtx_insn *insn = NULL; 7809 7810 while (regno != last_regno) 7811 { 7812 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno); 7813 machine_mode mode = aarch64_reg_save_mode (regno); 7814 7815 rtx reg = gen_rtx_REG (mode, regno); 7816 poly_int64 offset = cfun->machine->frame.reg_offset[regno]; 7817 if (frame_pointer_needed) 7818 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size; 7819 else 7820 offset += crtl->outgoing_args_size; 7821 7822 rtx addr = plus_constant (Pmode, ptr_reg, offset); 7823 rtx mem = gen_frame_mem (mode, addr); 7824 7825 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem); 7826 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1); 7827 /* No more registers to handle after REGNO. 7828 Emit a single save/restore and exit. */ 7829 if (regno2 == last_regno) 7830 { 7831 insn = emit_insn (set); 7832 if (frame_related_p) 7833 { 7834 RTX_FRAME_RELATED_P (insn) = 1; 7835 if (prologue_p) 7836 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set)); 7837 else 7838 add_reg_note (insn, REG_CFA_RESTORE, reg); 7839 } 7840 break; 7841 } 7842 7843 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2]; 7844 /* The next register is not of the same class or its offset is not 7845 mergeable with the current one into a pair. */ 7846 if (aarch64_sve_mode_p (mode) 7847 || !satisfies_constraint_Ump (mem) 7848 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2) 7849 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno)) 7850 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]), 7851 GET_MODE_SIZE (mode))) 7852 { 7853 insn = emit_insn (set); 7854 if (frame_related_p) 7855 { 7856 RTX_FRAME_RELATED_P (insn) = 1; 7857 if (prologue_p) 7858 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set)); 7859 else 7860 add_reg_note (insn, REG_CFA_RESTORE, reg); 7861 } 7862 7863 regno = regno2; 7864 continue; 7865 } 7866 7867 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2); 7868 7869 /* REGNO2 can be saved/restored in a pair with REGNO. 
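   For example, two 8-byte saves at offsets O and O + 8 are combined here into a
   single "stp reg, reg2, [base, O]" (or the corresponding LDP in the epilogue path).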
*/ 7870 rtx reg2 = gen_rtx_REG (mode, regno2); 7871 if (frame_pointer_needed) 7872 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size; 7873 else 7874 offset2 += crtl->outgoing_args_size; 7875 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2); 7876 rtx mem2 = gen_frame_mem (mode, addr2); 7877 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2) 7878 : gen_rtx_SET (reg2, mem2); 7879 7880 if (prologue_p) 7881 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2)); 7882 else 7883 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2)); 7884 7885 if (frame_related_p || frame_related2_p) 7886 { 7887 RTX_FRAME_RELATED_P (insn) = 1; 7888 if (prologue_p) 7889 { 7890 if (frame_related_p) 7891 add_reg_note (insn, REG_CFA_OFFSET, set); 7892 if (frame_related2_p) 7893 add_reg_note (insn, REG_CFA_OFFSET, set2); 7894 } 7895 else 7896 { 7897 if (frame_related_p) 7898 add_reg_note (insn, REG_CFA_RESTORE, reg); 7899 if (frame_related2_p) 7900 add_reg_note (insn, REG_CFA_RESTORE, reg2); 7901 } 7902 } 7903 7904 regno = aarch64_get_next_set_bit (components, regno2 + 1); 7905 } 7906} 7907 7908/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */ 7909 7910static void 7911aarch64_emit_prologue_components (sbitmap components) 7912{ 7913 aarch64_process_components (components, true); 7914} 7915 7916/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */ 7917 7918static void 7919aarch64_emit_epilogue_components (sbitmap components) 7920{ 7921 aarch64_process_components (components, false); 7922} 7923 7924/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */ 7925 7926static void 7927aarch64_set_handled_components (sbitmap components) 7928{ 7929 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++) 7930 if (bitmap_bit_p (components, regno)) 7931 cfun->machine->reg_is_wrapped_separately[regno] = true; 7932} 7933 7934/* On AArch64 we have an ABI defined safe buffer. This constant is used to 7935 determining the probe offset for alloca. */ 7936 7937static HOST_WIDE_INT 7938aarch64_stack_clash_protection_alloca_probe_range (void) 7939{ 7940 return STACK_CLASH_CALLER_GUARD; 7941} 7942 7943 7944/* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch 7945 registers. If POLY_SIZE is not large enough to require a probe this function 7946 will only adjust the stack. When allocating the stack space 7947 FRAME_RELATED_P is then used to indicate if the allocation is frame related. 7948 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing 7949 arguments. If we are then we ensure that any allocation larger than the ABI 7950 defined buffer needs a probe so that the invariant of having a 1KB buffer is 7951 maintained. 7952 7953 We emit barriers after each stack adjustment to prevent optimizations from 7954 breaking the invariant that we never drop the stack more than a page. This 7955 invariant is needed to make it easier to correctly handle asynchronous 7956 events, e.g. if we were to allow the stack to be dropped by more than a page 7957 and then have multiple probes up and we take a signal somewhere in between 7958 then the signal handler doesn't know the state of the stack and can make no 7959 assumptions about which pages have been probed. 
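   As a rough worked example, assuming the default 64KB guard and the 1KB buffer
   reserved for the caller's outgoing arguments: an initial allocation only needs
   probing once it reaches 64KB - 1KB = 63KB, whereas a final (outgoing-argument)
   allocation needs probing once it reaches roughly the 1KB buffer size.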
*/ 7960 7961static void 7962aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, 7963 poly_int64 poly_size, 7964 bool frame_related_p, 7965 bool final_adjustment_p) 7966{ 7967 HOST_WIDE_INT guard_size 7968 = 1 << param_stack_clash_protection_guard_size; 7969 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD; 7970 HOST_WIDE_INT min_probe_threshold 7971 = (final_adjustment_p 7972 ? guard_used_by_caller 7973 : guard_size - guard_used_by_caller); 7974 /* When doing the final adjustment for the outgoing arguments, take into 7975 account any unprobed space there is above the current SP. There are 7976 two cases: 7977 7978 - When saving SVE registers below the hard frame pointer, we force 7979 the lowest save to take place in the prologue before doing the final 7980 adjustment (i.e. we don't allow the save to be shrink-wrapped). 7981 This acts as a probe at SP, so there is no unprobed space. 7982 7983 - When there are no SVE register saves, we use the store of the link 7984 register as a probe. We can't assume that LR was saved at position 0 7985 though, so treat any space below it as unprobed. */ 7986 if (final_adjustment_p 7987 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)) 7988 { 7989 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM]; 7990 if (known_ge (lr_offset, 0)) 7991 min_probe_threshold -= lr_offset.to_constant (); 7992 else 7993 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0)); 7994 } 7995 7996 poly_int64 frame_size = cfun->machine->frame.frame_size; 7997 7998 /* We should always have a positive probe threshold. */ 7999 gcc_assert (min_probe_threshold > 0); 8000 8001 if (flag_stack_clash_protection && !final_adjustment_p) 8002 { 8003 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; 8004 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; 8005 poly_int64 final_adjust = cfun->machine->frame.final_adjust; 8006 8007 if (known_eq (frame_size, 0)) 8008 { 8009 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false); 8010 } 8011 else if (known_lt (initial_adjust + sve_callee_adjust, 8012 guard_size - guard_used_by_caller) 8013 && known_lt (final_adjust, guard_used_by_caller)) 8014 { 8015 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); 8016 } 8017 } 8018 8019 /* If SIZE is not large enough to require probing, just adjust the stack and 8020 exit. */ 8021 if (known_lt (poly_size, min_probe_threshold) 8022 || !flag_stack_clash_protection) 8023 { 8024 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p); 8025 return; 8026 } 8027 8028 HOST_WIDE_INT size; 8029 /* Handle the SVE non-constant case first. */ 8030 if (!poly_size.is_constant (&size)) 8031 { 8032 if (dump_file) 8033 { 8034 fprintf (dump_file, "Stack clash SVE prologue: "); 8035 print_dec (poly_size, dump_file); 8036 fprintf (dump_file, " bytes, dynamic probing will be required.\n"); 8037 } 8038 8039 /* First calculate the amount of bytes we're actually spilling. */ 8040 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode), 8041 poly_size, temp1, temp2, false, true); 8042 8043 rtx_insn *insn = get_last_insn (); 8044 8045 if (frame_related_p) 8046 { 8047 /* This is done to provide unwinding information for the stack 8048 adjustments we're about to do, however to prevent the optimizers 8049 from removing the R11 move and leaving the CFA note (which would be 8050 very wrong) we tie the old and new stack pointer together. 8051 The tie will expand to nothing but the optimizers will not touch 8052 the instruction. 
*/ 8053 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM); 8054 emit_move_insn (stack_ptr_copy, stack_pointer_rtx); 8055 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx)); 8056 8057 /* We want the CFA independent of the stack pointer for the 8058 duration of the loop. */ 8059 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy); 8060 RTX_FRAME_RELATED_P (insn) = 1; 8061 } 8062 8063 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode); 8064 rtx guard_const = gen_int_mode (guard_size, Pmode); 8065 8066 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx, 8067 stack_pointer_rtx, temp1, 8068 probe_const, guard_const)); 8069 8070 /* Now reset the CFA register if needed. */ 8071 if (frame_related_p) 8072 { 8073 add_reg_note (insn, REG_CFA_DEF_CFA, 8074 gen_rtx_PLUS (Pmode, stack_pointer_rtx, 8075 gen_int_mode (poly_size, Pmode))); 8076 RTX_FRAME_RELATED_P (insn) = 1; 8077 } 8078 8079 return; 8080 } 8081 8082 if (dump_file) 8083 fprintf (dump_file, 8084 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC 8085 " bytes, probing will be required.\n", size); 8086 8087 /* Round size to the nearest multiple of guard_size, and calculate the 8088 residual as the difference between the original size and the rounded 8089 size. */ 8090 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size); 8091 HOST_WIDE_INT residual = size - rounded_size; 8092 8093 /* We can handle a small number of allocations/probes inline. Otherwise 8094 punt to a loop. */ 8095 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size) 8096 { 8097 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size) 8098 { 8099 aarch64_sub_sp (NULL, temp2, guard_size, true); 8100 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, 8101 guard_used_by_caller)); 8102 emit_insn (gen_blockage ()); 8103 } 8104 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size); 8105 } 8106 else 8107 { 8108 /* Compute the ending address. */ 8109 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size, 8110 temp1, NULL, false, true); 8111 rtx_insn *insn = get_last_insn (); 8112 8113 /* For the initial allocation, we don't have a frame pointer 8114 set up, so we always need CFI notes. If we're doing the 8115 final allocation, then we may have a frame pointer, in which 8116 case it is the CFA, otherwise we need CFI notes. 8117 8118 We can determine which allocation we are doing by looking at 8119 the value of FRAME_RELATED_P since the final allocations are not 8120 frame related. */ 8121 if (frame_related_p) 8122 { 8123 /* We want the CFA independent of the stack pointer for the 8124 duration of the loop. */ 8125 add_reg_note (insn, REG_CFA_DEF_CFA, 8126 plus_constant (Pmode, temp1, rounded_size)); 8127 RTX_FRAME_RELATED_P (insn) = 1; 8128 } 8129 8130 /* This allocates and probes the stack. Note that this re-uses some of 8131 the existing Ada stack protection code. However we are guaranteed not 8132 to enter the non loop or residual branches of that code. 8133 8134 The non-loop part won't be entered because if our allocation amount 8135 doesn't require a loop, the case above would handle it. 8136 8137 The residual amount won't be entered because TEMP1 is a mutliple of 8138 the allocation size. The residual will always be 0. As such, the only 8139 part we are actually using from that code is the loop setup. The 8140 actual probing is done in aarch64_output_probe_stack_range. 
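   As an illustration, with the default 64KB guard any rounded allocation of more
   than STACK_CLASH_MAX_UNROLL_PAGES pages is handled by this loop, which repeatedly
   drops SP by one guard-sized page and probes it, rather than emitting one unrolled
   adjust-and-probe block per page as the inline case above does.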
*/ 8141 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx, 8142 stack_pointer_rtx, temp1)); 8143 8144 /* Now reset the CFA register if needed. */ 8145 if (frame_related_p) 8146 { 8147 add_reg_note (insn, REG_CFA_DEF_CFA, 8148 plus_constant (Pmode, stack_pointer_rtx, rounded_size)); 8149 RTX_FRAME_RELATED_P (insn) = 1; 8150 } 8151 8152 emit_insn (gen_blockage ()); 8153 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size); 8154 } 8155
8156 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to 8157 be probed. This maintains the requirement that each page is probed at 8158 least once. For initial probing we probe only if the allocation is 8159 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe 8160 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer == 8161 GUARD_SIZE. This ensures that for any allocation that is large enough to 8162 trigger a probe here, we'll have at least one, and if they're not large 8163 enough for this code to emit anything for them, the page would have been 8164 probed by the saving of FP/LR either by this function or any callees. If 8165 we don't have any callees then we won't have more stack adjustments and so 8166 are still safe. */ 8167 if (residual) 8168 { 8169 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller; 8170 /* If we're doing final adjustments, and we've done any full page 8171 allocations then any residual needs to be probed. */ 8172 if (final_adjustment_p && rounded_size != 0) 8173 min_probe_threshold = 0; 8174 /* If doing a small final adjustment, we always probe at offset 0. 8175 This is done to avoid issues when LR is not at position 0 or when 8176 the final adjustment is smaller than the probing offset. */ 8177 else if (final_adjustment_p && rounded_size == 0) 8178 residual_probe_offset = 0; 8179
8180 aarch64_sub_sp (temp1, temp2, residual, frame_related_p); 8181 if (residual >= min_probe_threshold) 8182 { 8183 if (dump_file) 8184 fprintf (dump_file, 8185 "Stack clash AArch64 prologue residuals: " 8186 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required." 8187 "\n", residual); 8188 8189 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, 8190 residual_probe_offset)); 8191 emit_insn (gen_blockage ()); 8192 } 8193 } 8194 } 8195
8196 /* Return 1 if the register is used by the epilogue. We need to say the 8197 return register is used, but only after epilogue generation is complete. 8198 Note that in the case of sibcalls, the values "used by the epilogue" are 8199 considered live at the start of the called function. 8200 8201 For SIMD functions we need to return 1 for FP registers that are saved and 8202 restored by a function but are not zero in call_used_regs. If we do not do 8203 this, optimizations may remove the restore of the register.
*/ 8204 8205int 8206aarch64_epilogue_uses (int regno) 8207{ 8208 if (epilogue_completed) 8209 { 8210 if (regno == LR_REGNUM) 8211 return 1; 8212 } 8213 return 0; 8214} 8215 8216/* AArch64 stack frames generated by this compiler look like: 8217 8218 +-------------------------------+ 8219 | | 8220 | incoming stack arguments | 8221 | | 8222 +-------------------------------+ 8223 | | <-- incoming stack pointer (aligned) 8224 | callee-allocated save area | 8225 | for register varargs | 8226 | | 8227 +-------------------------------+ 8228 | local variables | <-- frame_pointer_rtx 8229 | | 8230 +-------------------------------+ 8231 | padding | \ 8232 +-------------------------------+ | 8233 | callee-saved registers | | frame.saved_regs_size 8234 +-------------------------------+ | 8235 | LR' | | 8236 +-------------------------------+ | 8237 | FP' | | 8238 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned) 8239 | SVE vector registers | | \ 8240 +-------------------------------+ | | below_hard_fp_saved_regs_size 8241 | SVE predicate registers | / / 8242 +-------------------------------+ 8243 | dynamic allocation | 8244 +-------------------------------+ 8245 | padding | 8246 +-------------------------------+ 8247 | outgoing stack arguments | <-- arg_pointer 8248 | | 8249 +-------------------------------+ 8250 | | <-- stack_pointer_rtx (aligned) 8251 8252 Dynamic stack allocations via alloca() decrease stack_pointer_rtx 8253 but leave frame_pointer_rtx and hard_frame_pointer_rtx 8254 unchanged. 8255 8256 By default for stack-clash we assume the guard is at least 64KB, but this 8257 value is configurable to either 4KB or 64KB. We also force the guard size to 8258 be the same as the probing interval and both values are kept in sync. 8259 8260 With those assumptions the callee can allocate up to 63KB (or 3KB depending 8261 on the guard size) of stack space without probing. 8262 8263 When probing is needed, we emit a probe at the start of the prologue 8264 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter. 8265 8266 We have to track how much space has been allocated and the only stores 8267 to the stack we track as implicit probes are the FP/LR stores. 8268 8269 For outgoing arguments we probe if the size is larger than 1KB, such that 8270 the ABI specified buffer is maintained for the next callee. 8271 8272 The following registers are reserved during frame layout and should not be 8273 used for any other purpose: 8274 8275 - r11: Used by stack clash protection when SVE is enabled, and also 8276 as an anchor register when saving and restoring registers 8277 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment. 8278 - r14 and r15: Used for speculation tracking. 8279 - r16(IP0), r17(IP1): Used by indirect tailcalls. 8280 - r30(LR), r29(FP): Used by standard frame layout. 8281 8282 These registers must be avoided in frame layout related code unless the 8283 explicit intention is to interact with one of the features listed above. */ 8284 8285/* Generate the prologue instructions for entry into a function. 8286 Establish the stack frame by decreasing the stack pointer with a 8287 properly calculated size and, if necessary, create a frame record 8288 filled with the values of LR and previous frame pointer. The 8289 current FP is also set up if it is in use. 
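   As a hedged sketch, a small non-SVE frame that needs a frame record and a
   16-byte outgoing-argument area might expand to something like:

	stp	x29, x30, [sp, -32]!
	mov	x29, sp
	str	x19, [sp, 16]
	sub	sp, sp, 16

   although the exact sequence depends on which of the frame layouts described
   above applies.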
*/ 8290 8291void 8292aarch64_expand_prologue (void) 8293{ 8294 poly_int64 frame_size = cfun->machine->frame.frame_size; 8295 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; 8296 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; 8297 poly_int64 final_adjust = cfun->machine->frame.final_adjust; 8298 poly_int64 callee_offset = cfun->machine->frame.callee_offset; 8299 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; 8300 poly_int64 below_hard_fp_saved_regs_size 8301 = cfun->machine->frame.below_hard_fp_saved_regs_size; 8302 unsigned reg1 = cfun->machine->frame.wb_candidate1; 8303 unsigned reg2 = cfun->machine->frame.wb_candidate2; 8304 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain; 8305 rtx_insn *insn; 8306 8307 if (flag_stack_clash_protection && known_eq (callee_adjust, 0)) 8308 { 8309 /* Fold the SVE allocation into the initial allocation. 8310 We don't do this in aarch64_layout_arg to avoid pessimizing 8311 the epilogue code. */ 8312 initial_adjust += sve_callee_adjust; 8313 sve_callee_adjust = 0; 8314 } 8315 8316 /* Sign return address for functions. */ 8317 if (aarch64_return_address_signing_enabled ()) 8318 { 8319 switch (aarch64_ra_sign_key) 8320 { 8321 case AARCH64_KEY_A: 8322 insn = emit_insn (gen_paciasp ()); 8323 break; 8324 case AARCH64_KEY_B: 8325 insn = emit_insn (gen_pacibsp ()); 8326 break; 8327 default: 8328 gcc_unreachable (); 8329 } 8330 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx); 8331 RTX_FRAME_RELATED_P (insn) = 1; 8332 } 8333 8334 if (flag_stack_usage_info) 8335 current_function_static_stack_size = constant_lower_bound (frame_size); 8336 8337 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK) 8338 { 8339 if (crtl->is_leaf && !cfun->calls_alloca) 8340 { 8341 if (maybe_gt (frame_size, PROBE_INTERVAL) 8342 && maybe_gt (frame_size, get_stack_check_protect ())) 8343 aarch64_emit_probe_stack_range (get_stack_check_protect (), 8344 (frame_size 8345 - get_stack_check_protect ())); 8346 } 8347 else if (maybe_gt (frame_size, 0)) 8348 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size); 8349 } 8350 8351 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM); 8352 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM); 8353 8354 /* In theory we should never have both an initial adjustment 8355 and a callee save adjustment. Verify that is the case since the 8356 code below does not handle it for -fstack-clash-protection. */ 8357 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0); 8358 8359 /* Will only probe if the initial adjustment is larger than the guard 8360 less the amount of the guard reserved for use by the caller's 8361 outgoing args. */ 8362 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust, 8363 true, false); 8364 8365 if (callee_adjust != 0) 8366 aarch64_push_regs (reg1, reg2, callee_adjust); 8367 8368 /* The offset of the frame chain record (if any) from the current SP. */ 8369 poly_int64 chain_offset = (initial_adjust + callee_adjust 8370 - cfun->machine->frame.hard_fp_offset); 8371 gcc_assert (known_ge (chain_offset, 0)); 8372 8373 /* The offset of the bottom of the save area from the current SP. 
*/ 8374 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size; 8375 8376 if (emit_frame_chain) 8377 { 8378 if (callee_adjust == 0) 8379 { 8380 reg1 = R29_REGNUM; 8381 reg2 = R30_REGNUM; 8382 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2, 8383 false, false); 8384 } 8385 else 8386 gcc_assert (known_eq (chain_offset, 0)); 8387 aarch64_add_offset (Pmode, hard_frame_pointer_rtx, 8388 stack_pointer_rtx, chain_offset, 8389 tmp1_rtx, tmp0_rtx, frame_pointer_needed); 8390 if (frame_pointer_needed && !frame_size.is_constant ()) 8391 { 8392 /* Variable-sized frames need to describe the save slot 8393 address using DW_CFA_expression rather than DW_CFA_offset. 8394 This means that, without taking further action, the 8395 locations of the registers that we've already saved would 8396 remain based on the stack pointer even after we redefine 8397 the CFA based on the frame pointer. We therefore need new 8398 DW_CFA_expressions to re-express the save slots with addresses 8399 based on the frame pointer. */ 8400 rtx_insn *insn = get_last_insn (); 8401 gcc_assert (RTX_FRAME_RELATED_P (insn)); 8402 8403 /* Add an explicit CFA definition if this was previously 8404 implicit. */ 8405 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX)) 8406 { 8407 rtx src = plus_constant (Pmode, stack_pointer_rtx, 8408 callee_offset); 8409 add_reg_note (insn, REG_CFA_ADJUST_CFA, 8410 gen_rtx_SET (hard_frame_pointer_rtx, src)); 8411 } 8412 8413 /* Change the save slot expressions for the registers that 8414 we've already saved. */ 8415 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2], 8416 hard_frame_pointer_rtx, UNITS_PER_WORD); 8417 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1], 8418 hard_frame_pointer_rtx, 0); 8419 } 8420 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx)); 8421 } 8422 8423 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM, 8424 callee_adjust != 0 || emit_frame_chain, 8425 emit_frame_chain); 8426 if (maybe_ne (sve_callee_adjust, 0)) 8427 { 8428 gcc_assert (!flag_stack_clash_protection 8429 || known_eq (initial_adjust, 0)); 8430 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, 8431 sve_callee_adjust, 8432 !frame_pointer_needed, false); 8433 saved_regs_offset += sve_callee_adjust; 8434 } 8435 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM, 8436 false, emit_frame_chain); 8437 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM, 8438 callee_adjust != 0 || emit_frame_chain, 8439 emit_frame_chain); 8440 8441 /* We may need to probe the final adjustment if it is larger than the guard 8442 that is assumed by the called. */ 8443 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust, 8444 !frame_pointer_needed, true); 8445} 8446 8447/* Return TRUE if we can use a simple_return insn. 8448 8449 This function checks whether the callee saved stack is empty, which 8450 means no restore actions are need. The pro_and_epilogue will use 8451 this to check whether shrink-wrapping opt is feasible. */ 8452 8453bool 8454aarch64_use_return_insn_p (void) 8455{ 8456 if (!reload_completed) 8457 return false; 8458 8459 if (crtl->profile) 8460 return false; 8461 8462 return known_eq (cfun->machine->frame.frame_size, 0); 8463} 8464 8465/* Generate the epilogue instructions for returning from a function. 
8466 This is almost exactly the reverse of the prolog sequence, except 8467 that we need to insert barriers to avoid scheduling loads that read 8468 from a deallocated stack, and we optimize the unwind records by 8469 emitting them all together if possible. */ 8470void 8471aarch64_expand_epilogue (bool for_sibcall) 8472{ 8473 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; 8474 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; 8475 poly_int64 final_adjust = cfun->machine->frame.final_adjust; 8476 poly_int64 callee_offset = cfun->machine->frame.callee_offset; 8477 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; 8478 poly_int64 below_hard_fp_saved_regs_size 8479 = cfun->machine->frame.below_hard_fp_saved_regs_size; 8480 unsigned reg1 = cfun->machine->frame.wb_candidate1; 8481 unsigned reg2 = cfun->machine->frame.wb_candidate2; 8482 rtx cfi_ops = NULL; 8483 rtx_insn *insn; 8484 /* A stack clash protection prologue may not have left EP0_REGNUM or 8485 EP1_REGNUM in a usable state. The same is true for allocations 8486 with an SVE component, since we then need both temporary registers 8487 for each allocation. For stack clash we are in a usable state if 8488 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */ 8489 HOST_WIDE_INT guard_size 8490 = 1 << param_stack_clash_protection_guard_size; 8491 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD; 8492 8493 /* We can re-use the registers when: 8494 8495 (a) the deallocation amount is the same as the corresponding 8496 allocation amount (which is false if we combine the initial 8497 and SVE callee save allocations in the prologue); and 8498 8499 (b) the allocation amount doesn't need a probe (which is false 8500 if the amount is guard_size - guard_used_by_caller or greater). 8501 8502 In such situations the register should remain live with the correct 8503 value. */ 8504 bool can_inherit_p = (initial_adjust.is_constant () 8505 && final_adjust.is_constant () 8506 && (!flag_stack_clash_protection 8507 || (known_lt (initial_adjust, 8508 guard_size - guard_used_by_caller) 8509 && known_eq (sve_callee_adjust, 0)))); 8510 8511 /* We need to add memory barrier to prevent read from deallocated stack. */ 8512 bool need_barrier_p 8513 = maybe_ne (get_frame_size () 8514 + cfun->machine->frame.saved_varargs_size, 0); 8515 8516 /* Emit a barrier to prevent loads from a deallocated stack. */ 8517 if (maybe_gt (final_adjust, crtl->outgoing_args_size) 8518 || cfun->calls_alloca 8519 || crtl->calls_eh_return) 8520 { 8521 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); 8522 need_barrier_p = false; 8523 } 8524 8525 /* Restore the stack pointer from the frame pointer if it may not 8526 be the same as the stack pointer. */ 8527 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM); 8528 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM); 8529 if (frame_pointer_needed 8530 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca)) 8531 /* If writeback is used when restoring callee-saves, the CFA 8532 is restored on the instruction doing the writeback. */ 8533 aarch64_add_offset (Pmode, stack_pointer_rtx, 8534 hard_frame_pointer_rtx, 8535 -callee_offset - below_hard_fp_saved_regs_size, 8536 tmp1_rtx, tmp0_rtx, callee_adjust == 0); 8537 else 8538 /* The case where we need to re-use the register here is very rare, so 8539 avoid the complicated condition and just always emit a move if the 8540 immediate doesn't fit. 
*/ 8541 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true); 8542 8543 /* Restore the vector registers before the predicate registers, 8544 so that we can use P4 as a temporary for big-endian SVE frames. */ 8545 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM, 8546 callee_adjust != 0, &cfi_ops); 8547 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM, 8548 false, &cfi_ops); 8549 if (maybe_ne (sve_callee_adjust, 0)) 8550 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true); 8551 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust, 8552 R0_REGNUM, R30_REGNUM, 8553 callee_adjust != 0, &cfi_ops); 8554 8555 if (need_barrier_p) 8556 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); 8557 8558 if (callee_adjust != 0) 8559 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops); 8560 8561 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))) 8562 { 8563 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */ 8564 insn = get_last_insn (); 8565 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust); 8566 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops); 8567 RTX_FRAME_RELATED_P (insn) = 1; 8568 cfi_ops = NULL; 8569 } 8570 8571 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so 8572 add restriction on emit_move optimization to leaf functions. */ 8573 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, 8574 (!can_inherit_p || !crtl->is_leaf 8575 || df_regs_ever_live_p (EP0_REGNUM))); 8576 8577 if (cfi_ops) 8578 { 8579 /* Emit delayed restores and reset the CFA to be SP. */ 8580 insn = get_last_insn (); 8581 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops); 8582 REG_NOTES (insn) = cfi_ops; 8583 RTX_FRAME_RELATED_P (insn) = 1; 8584 } 8585 8586 /* We prefer to emit the combined return/authenticate instruction RETAA, 8587 however there are three cases in which we must instead emit an explicit 8588 authentication instruction. 8589 8590 1) Sibcalls don't return in a normal way, so if we're about to call one 8591 we must authenticate. 8592 8593 2) The RETAA instruction is not available before ARMv8.3-A, so if we are 8594 generating code for !TARGET_ARMV8_3 we can't use it and must 8595 explicitly authenticate. 8596 8597 3) On an eh_return path we make extra stack adjustments to update the 8598 canonical frame address to be the exception handler's CFA. We want 8599 to authenticate using the CFA of the function which calls eh_return. 8600 */ 8601 if (aarch64_return_address_signing_enabled () 8602 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return)) 8603 { 8604 switch (aarch64_ra_sign_key) 8605 { 8606 case AARCH64_KEY_A: 8607 insn = emit_insn (gen_autiasp ()); 8608 break; 8609 case AARCH64_KEY_B: 8610 insn = emit_insn (gen_autibsp ()); 8611 break; 8612 default: 8613 gcc_unreachable (); 8614 } 8615 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx); 8616 RTX_FRAME_RELATED_P (insn) = 1; 8617 } 8618 8619 /* Stack adjustment for exception handler. */ 8620 if (crtl->calls_eh_return && !for_sibcall) 8621 { 8622 /* We need to unwind the stack by the offset computed by 8623 EH_RETURN_STACKADJ_RTX. We have already reset the CFA 8624 to be SP; letting the CFA move during this adjustment 8625 is just as correct as retaining the CFA from the body 8626 of the function. Therefore, do nothing special. 
*/
8627      emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
8628    }
8629
8630  emit_use (gen_rtx_REG (DImode, LR_REGNUM));
8631  if (!for_sibcall)
8632    emit_jump_insn (ret_rtx);
8633}
8634
8635/* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
8636   normally or return to a previous frame after unwinding.
8637
8638   An EH return uses a single shared return sequence.  The epilogue is
8639   exactly like a normal epilogue except that it has an extra input
8640   register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
8641   that must be applied after the frame has been destroyed.  An extra label
8642   is inserted before the epilogue which initializes this register to zero,
8643   and this is the entry point for a normal return.
8644
8645   An actual EH return updates the return address, initializes the stack
8646   adjustment and jumps directly into the epilogue (bypassing the zeroing
8647   of the adjustment).  Since the return address is typically saved on the
8648   stack when a function makes a call, the saved LR must be updated outside
8649   the epilogue.
8650
8651   This poses problems as the store is generated well before the epilogue,
8652   so the offset of LR is not known yet.  Also optimizations will remove the
8653   store as it appears dead, even after the epilogue is generated (as the
8654   base or offset for loading LR is different in many cases).
8655
8656   To avoid these problems this implementation forces the frame pointer
8657   in eh_return functions so that the location of LR is fixed and known early.
8658   It also marks the store volatile, so no optimization is permitted to
8659   remove the store.  */
8660rtx
8661aarch64_eh_return_handler_rtx (void)
8662{
8663  rtx tmp = gen_frame_mem (Pmode,
8664    plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
8665
8666  /* Mark the store volatile, so no optimization is permitted to remove it.  */
8667  MEM_VOLATILE_P (tmp) = true;
8668  return tmp;
8669}
8670
8671/* Output code to add DELTA to the first argument, and then jump
8672   to FUNCTION.  Used for C++ multiple inheritance.  */
8673static void
8674aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
8675                         HOST_WIDE_INT delta,
8676                         HOST_WIDE_INT vcall_offset,
8677                         tree function)
8678{
8679  /* The this pointer is always in x0.  Note that this differs from
8680     Arm where the this pointer may be bumped to r1 if r0 is required
8681     to return a pointer to an aggregate.  On AArch64 a result value
8682     pointer will be in x8.
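     As a rough illustration (the exact sequence depends on DELTA and
     VCALL_OFFSET): for a thunk with delta = 8 and vcall_offset = 0 the
     code below emits little more than "add x0, x0, 8" followed by a
     tail call to FUNCTION, leaving the adjusted this pointer in x0.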
*/ 8683 int this_regno = R0_REGNUM; 8684 rtx this_rtx, temp0, temp1, addr, funexp; 8685 rtx_insn *insn; 8686 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk)); 8687 8688 if (aarch64_bti_enabled ()) 8689 emit_insn (gen_bti_c()); 8690 8691 reload_completed = 1; 8692 emit_note (NOTE_INSN_PROLOGUE_END); 8693 8694 this_rtx = gen_rtx_REG (Pmode, this_regno); 8695 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM); 8696 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM); 8697 8698 if (vcall_offset == 0) 8699 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false); 8700 else 8701 { 8702 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0); 8703 8704 addr = this_rtx; 8705 if (delta != 0) 8706 { 8707 if (delta >= -256 && delta < 256) 8708 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx, 8709 plus_constant (Pmode, this_rtx, delta)); 8710 else 8711 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, 8712 temp1, temp0, false); 8713 } 8714 8715 if (Pmode == ptr_mode) 8716 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr)); 8717 else 8718 aarch64_emit_move (temp0, 8719 gen_rtx_ZERO_EXTEND (Pmode, 8720 gen_rtx_MEM (ptr_mode, addr))); 8721 8722 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES) 8723 addr = plus_constant (Pmode, temp0, vcall_offset); 8724 else 8725 { 8726 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true, 8727 Pmode); 8728 addr = gen_rtx_PLUS (Pmode, temp0, temp1); 8729 } 8730 8731 if (Pmode == ptr_mode) 8732 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr)); 8733 else 8734 aarch64_emit_move (temp1, 8735 gen_rtx_SIGN_EXTEND (Pmode, 8736 gen_rtx_MEM (ptr_mode, addr))); 8737 8738 emit_insn (gen_add2_insn (this_rtx, temp1)); 8739 } 8740 8741 /* Generate a tail call to the target function. */ 8742 if (!TREE_USED (function)) 8743 { 8744 assemble_external (function); 8745 TREE_USED (function) = 1; 8746 } 8747 funexp = XEXP (DECL_RTL (function), 0); 8748 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp); 8749 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode); 8750 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi)); 8751 SIBLING_CALL_P (insn) = 1; 8752 8753 insn = get_insns (); 8754 shorten_branches (insn); 8755 8756 assemble_start_function (thunk, fnname); 8757 final_start_function (insn, file, 1); 8758 final (insn, file, 1); 8759 final_end_function (); 8760 assemble_end_function (thunk, fnname); 8761 8762 /* Stop pretending to be a post-reload pass. */ 8763 reload_completed = 0; 8764} 8765 8766static bool 8767aarch64_tls_referenced_p (rtx x) 8768{ 8769 if (!TARGET_HAVE_TLS) 8770 return false; 8771 subrtx_iterator::array_type array; 8772 FOR_EACH_SUBRTX (iter, array, x, ALL) 8773 { 8774 const_rtx x = *iter; 8775 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0) 8776 return true; 8777 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are 8778 TLS offsets, not real symbol references. */ 8779 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS) 8780 iter.skip_subrtxes (); 8781 } 8782 return false; 8783} 8784 8785 8786/* Return true if val can be encoded as a 12-bit unsigned immediate with 8787 a left shift of 0 or 12 bits. */ 8788bool 8789aarch64_uimm12_shift (HOST_WIDE_INT val) 8790{ 8791 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val 8792 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val 8793 ); 8794} 8795 8796/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate 8797 that can be created with a left shift of 0 or 12. 
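   For example, VAL = 0x123456 (which fits in 24 bits) is clamped to
   0x123000: the low 12 bits are dropped so that the result is encodable
   as a single ADD/SUB #imm12, LSL #12, and the remaining 0x456 can be
   handled by a second 12-bit immediate.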
*/ 8798static HOST_WIDE_INT 8799aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val) 8800{ 8801 /* Check to see if the value fits in 24 bits, as that is the maximum we can 8802 handle correctly. */ 8803 gcc_assert ((val & 0xffffff) == val); 8804 8805 if (((val & 0xfff) << 0) == val) 8806 return val; 8807 8808 return val & (0xfff << 12); 8809} 8810 8811/* Return true if val is an immediate that can be loaded into a 8812 register by a MOVZ instruction. */ 8813static bool 8814aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode) 8815{ 8816 if (GET_MODE_SIZE (mode) > 4) 8817 { 8818 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val 8819 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val) 8820 return 1; 8821 } 8822 else 8823 { 8824 /* Ignore sign extension. */ 8825 val &= (HOST_WIDE_INT) 0xffffffff; 8826 } 8827 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val 8828 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val); 8829} 8830 8831/* Test whether: 8832 8833 X = (X & AND_VAL) | IOR_VAL; 8834 8835 can be implemented using: 8836 8837 MOVK X, #(IOR_VAL >> shift), LSL #shift 8838 8839 Return the shift if so, otherwise return -1. */ 8840int 8841aarch64_movk_shift (const wide_int_ref &and_val, 8842 const wide_int_ref &ior_val) 8843{ 8844 unsigned int precision = and_val.get_precision (); 8845 unsigned HOST_WIDE_INT mask = 0xffff; 8846 for (unsigned int shift = 0; shift < precision; shift += 16) 8847 { 8848 if (and_val == ~mask && (ior_val & mask) == ior_val) 8849 return shift; 8850 mask <<= 16; 8851 } 8852 return -1; 8853} 8854 8855/* VAL is a value with the inner mode of MODE. Replicate it to fill a 8856 64-bit (DImode) integer. */ 8857 8858static unsigned HOST_WIDE_INT 8859aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode) 8860{ 8861 unsigned int size = GET_MODE_UNIT_PRECISION (mode); 8862 while (size < 64) 8863 { 8864 val &= (HOST_WIDE_INT_1U << size) - 1; 8865 val |= val << size; 8866 size *= 2; 8867 } 8868 return val; 8869} 8870 8871/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */ 8872 8873static const unsigned HOST_WIDE_INT bitmask_imm_mul[] = 8874 { 8875 0x0000000100000001ull, 8876 0x0001000100010001ull, 8877 0x0101010101010101ull, 8878 0x1111111111111111ull, 8879 0x5555555555555555ull, 8880 }; 8881 8882 8883/* Return true if val is a valid bitmask immediate. */ 8884 8885bool 8886aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode) 8887{ 8888 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one; 8889 int bits; 8890 8891 /* Check for a single sequence of one bits and return quickly if so. 8892 The special cases of all ones and all zeroes returns false. */ 8893 val = aarch64_replicate_bitmask_imm (val_in, mode); 8894 tmp = val + (val & -val); 8895 8896 if (tmp == (tmp & -tmp)) 8897 return (val + 1) > 1; 8898 8899 /* Replicate 32-bit immediates so we can treat them as 64-bit. */ 8900 if (mode == SImode) 8901 val = (val << 32) | (val & 0xffffffff); 8902 8903 /* Invert if the immediate doesn't start with a zero bit - this means we 8904 only need to search for sequences of one bits. */ 8905 if (val & 1) 8906 val = ~val; 8907 8908 /* Find the first set bit and set tmp to val with the first sequence of one 8909 bits removed. Return success if there is a single sequence of ones. */ 8910 first_one = val & -val; 8911 tmp = val & (val + first_one); 8912 8913 if (tmp == 0) 8914 return true; 8915 8916 /* Find the next set bit and compute the difference in bit position. 
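     As a worked example: for val_in = 0x00ff00ff00ff00ff the value was
     inverted above to 0xff00ff00ff00ff00, so first_one is bit 8 and
     next_one below is bit 24, giving bits = 16 and mask = 0xff00; the
     final check then confirms 0xff00 * 0x0001000100010001 == val, so
     the constant is a valid bitmask immediate.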
*/ 8917 next_one = tmp & -tmp; 8918 bits = clz_hwi (first_one) - clz_hwi (next_one); 8919 mask = val ^ tmp; 8920 8921 /* Check the bit position difference is a power of 2, and that the first 8922 sequence of one bits fits within 'bits' bits. */ 8923 if ((mask >> bits) != 0 || bits != (bits & -bits)) 8924 return false; 8925 8926 /* Check the sequence of one bits is repeated 64/bits times. */ 8927 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26]; 8928} 8929 8930/* Create mask of ones, covering the lowest to highest bits set in VAL_IN. 8931 Assumed precondition: VAL_IN Is not zero. */ 8932 8933unsigned HOST_WIDE_INT 8934aarch64_and_split_imm1 (HOST_WIDE_INT val_in) 8935{ 8936 int lowest_bit_set = ctz_hwi (val_in); 8937 int highest_bit_set = floor_log2 (val_in); 8938 gcc_assert (val_in != 0); 8939 8940 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) - 8941 (HOST_WIDE_INT_1U << lowest_bit_set)); 8942} 8943 8944/* Create constant where bits outside of lowest bit set to highest bit set 8945 are set to 1. */ 8946 8947unsigned HOST_WIDE_INT 8948aarch64_and_split_imm2 (HOST_WIDE_INT val_in) 8949{ 8950 return val_in | ~aarch64_and_split_imm1 (val_in); 8951} 8952 8953/* Return true if VAL_IN is a valid 'and' bitmask immediate. */ 8954 8955bool 8956aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode) 8957{ 8958 scalar_int_mode int_mode; 8959 if (!is_a <scalar_int_mode> (mode, &int_mode)) 8960 return false; 8961 8962 if (aarch64_bitmask_imm (val_in, int_mode)) 8963 return false; 8964 8965 if (aarch64_move_imm (val_in, int_mode)) 8966 return false; 8967 8968 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in); 8969 8970 return aarch64_bitmask_imm (imm2, int_mode); 8971} 8972 8973/* Return true if val is an immediate that can be loaded into a 8974 register in a single instruction. */ 8975bool 8976aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode) 8977{ 8978 scalar_int_mode int_mode; 8979 if (!is_a <scalar_int_mode> (mode, &int_mode)) 8980 return false; 8981 8982 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode)) 8983 return 1; 8984 return aarch64_bitmask_imm (val, int_mode); 8985} 8986 8987static bool 8988aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x) 8989{ 8990 if (GET_CODE (x) == HIGH) 8991 return true; 8992 8993 /* There's no way to calculate VL-based values using relocations. */ 8994 subrtx_iterator::array_type array; 8995 FOR_EACH_SUBRTX (iter, array, x, ALL) 8996 if (GET_CODE (*iter) == CONST_POLY_INT) 8997 return true; 8998 8999 poly_int64 offset; 9000 rtx base = strip_offset_and_salt (x, &offset); 9001 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF) 9002 { 9003 /* We checked for POLY_INT_CST offsets above. */ 9004 if (aarch64_classify_symbol (base, offset.to_constant ()) 9005 != SYMBOL_FORCE_TO_MEM) 9006 return true; 9007 else 9008 /* Avoid generating a 64-bit relocation in ILP32; leave 9009 to aarch64_expand_mov_immediate to handle it properly. */ 9010 return mode != ptr_mode; 9011 } 9012 9013 return aarch64_tls_referenced_p (x); 9014} 9015 9016/* Implement TARGET_CASE_VALUES_THRESHOLD. 9017 The expansion for a table switch is quite expensive due to the number 9018 of instructions, the table lookup and hard to predict indirect jump. 9019 When optimizing for speed, and -O3 enabled, use the per-core tuning if 9020 set, otherwise use tables for > 16 cases as a tradeoff between size and 9021 performance. When optimizing for size, use the default setting. 
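   Concretely: at -O3 a nonzero per-core max_case_values from the tuning
   tables is used directly; otherwise a switch needs roughly 17 or more
   cases before a jump table is emitted, unless we are optimizing for
   size, in which case the generic default threshold applies.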
*/ 9022 9023static unsigned int 9024aarch64_case_values_threshold (void) 9025{ 9026 /* Use the specified limit for the number of cases before using jump 9027 tables at higher optimization levels. */ 9028 if (optimize > 2 9029 && selected_cpu->tune->max_case_values != 0) 9030 return selected_cpu->tune->max_case_values; 9031 else 9032 return optimize_size ? default_case_values_threshold () : 17; 9033} 9034 9035/* Return true if register REGNO is a valid index register. 9036 STRICT_P is true if REG_OK_STRICT is in effect. */ 9037 9038bool 9039aarch64_regno_ok_for_index_p (int regno, bool strict_p) 9040{ 9041 if (!HARD_REGISTER_NUM_P (regno)) 9042 { 9043 if (!strict_p) 9044 return true; 9045 9046 if (!reg_renumber) 9047 return false; 9048 9049 regno = reg_renumber[regno]; 9050 } 9051 return GP_REGNUM_P (regno); 9052} 9053 9054/* Return true if register REGNO is a valid base register for mode MODE. 9055 STRICT_P is true if REG_OK_STRICT is in effect. */ 9056 9057bool 9058aarch64_regno_ok_for_base_p (int regno, bool strict_p) 9059{ 9060 if (!HARD_REGISTER_NUM_P (regno)) 9061 { 9062 if (!strict_p) 9063 return true; 9064 9065 if (!reg_renumber) 9066 return false; 9067 9068 regno = reg_renumber[regno]; 9069 } 9070 9071 /* The fake registers will be eliminated to either the stack or 9072 hard frame pointer, both of which are usually valid base registers. 9073 Reload deals with the cases where the eliminated form isn't valid. */ 9074 return (GP_REGNUM_P (regno) 9075 || regno == SP_REGNUM 9076 || regno == FRAME_POINTER_REGNUM 9077 || regno == ARG_POINTER_REGNUM); 9078} 9079 9080/* Return true if X is a valid base register for mode MODE. 9081 STRICT_P is true if REG_OK_STRICT is in effect. */ 9082 9083static bool 9084aarch64_base_register_rtx_p (rtx x, bool strict_p) 9085{ 9086 if (!strict_p 9087 && GET_CODE (x) == SUBREG 9088 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))]) 9089 x = SUBREG_REG (x); 9090 9091 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p)); 9092} 9093 9094/* Return true if address offset is a valid index. If it is, fill in INFO 9095 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */ 9096 9097static bool 9098aarch64_classify_index (struct aarch64_address_info *info, rtx x, 9099 machine_mode mode, bool strict_p) 9100{ 9101 enum aarch64_address_type type; 9102 rtx index; 9103 int shift; 9104 9105 /* (reg:P) */ 9106 if ((REG_P (x) || GET_CODE (x) == SUBREG) 9107 && GET_MODE (x) == Pmode) 9108 { 9109 type = ADDRESS_REG_REG; 9110 index = x; 9111 shift = 0; 9112 } 9113 /* (sign_extend:DI (reg:SI)) */ 9114 else if ((GET_CODE (x) == SIGN_EXTEND 9115 || GET_CODE (x) == ZERO_EXTEND) 9116 && GET_MODE (x) == DImode 9117 && GET_MODE (XEXP (x, 0)) == SImode) 9118 { 9119 type = (GET_CODE (x) == SIGN_EXTEND) 9120 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW; 9121 index = XEXP (x, 0); 9122 shift = 0; 9123 } 9124 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */ 9125 else if (GET_CODE (x) == MULT 9126 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND 9127 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND) 9128 && GET_MODE (XEXP (x, 0)) == DImode 9129 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode 9130 && CONST_INT_P (XEXP (x, 1))) 9131 { 9132 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND) 9133 ? 
ADDRESS_REG_SXTW : ADDRESS_REG_UXTW; 9134 index = XEXP (XEXP (x, 0), 0); 9135 shift = exact_log2 (INTVAL (XEXP (x, 1))); 9136 } 9137 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */ 9138 else if (GET_CODE (x) == ASHIFT 9139 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND 9140 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND) 9141 && GET_MODE (XEXP (x, 0)) == DImode 9142 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode 9143 && CONST_INT_P (XEXP (x, 1))) 9144 { 9145 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND) 9146 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW; 9147 index = XEXP (XEXP (x, 0), 0); 9148 shift = INTVAL (XEXP (x, 1)); 9149 } 9150 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */ 9151 else if ((GET_CODE (x) == SIGN_EXTRACT 9152 || GET_CODE (x) == ZERO_EXTRACT) 9153 && GET_MODE (x) == DImode 9154 && GET_CODE (XEXP (x, 0)) == MULT 9155 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode 9156 && CONST_INT_P (XEXP (XEXP (x, 0), 1))) 9157 { 9158 type = (GET_CODE (x) == SIGN_EXTRACT) 9159 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW; 9160 index = XEXP (XEXP (x, 0), 0); 9161 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1))); 9162 if (INTVAL (XEXP (x, 1)) != 32 + shift 9163 || INTVAL (XEXP (x, 2)) != 0) 9164 shift = -1; 9165 } 9166 /* (and:DI (mult:DI (reg:DI) (const_int scale)) 9167 (const_int 0xffffffff<<shift)) */ 9168 else if (GET_CODE (x) == AND 9169 && GET_MODE (x) == DImode 9170 && GET_CODE (XEXP (x, 0)) == MULT 9171 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode 9172 && CONST_INT_P (XEXP (XEXP (x, 0), 1)) 9173 && CONST_INT_P (XEXP (x, 1))) 9174 { 9175 type = ADDRESS_REG_UXTW; 9176 index = XEXP (XEXP (x, 0), 0); 9177 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1))); 9178 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift) 9179 shift = -1; 9180 } 9181 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */ 9182 else if ((GET_CODE (x) == SIGN_EXTRACT 9183 || GET_CODE (x) == ZERO_EXTRACT) 9184 && GET_MODE (x) == DImode 9185 && GET_CODE (XEXP (x, 0)) == ASHIFT 9186 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode 9187 && CONST_INT_P (XEXP (XEXP (x, 0), 1))) 9188 { 9189 type = (GET_CODE (x) == SIGN_EXTRACT) 9190 ? 
ADDRESS_REG_SXTW : ADDRESS_REG_UXTW; 9191 index = XEXP (XEXP (x, 0), 0); 9192 shift = INTVAL (XEXP (XEXP (x, 0), 1)); 9193 if (INTVAL (XEXP (x, 1)) != 32 + shift 9194 || INTVAL (XEXP (x, 2)) != 0) 9195 shift = -1; 9196 } 9197 /* (and:DI (ashift:DI (reg:DI) (const_int shift)) 9198 (const_int 0xffffffff<<shift)) */ 9199 else if (GET_CODE (x) == AND 9200 && GET_MODE (x) == DImode 9201 && GET_CODE (XEXP (x, 0)) == ASHIFT 9202 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode 9203 && CONST_INT_P (XEXP (XEXP (x, 0), 1)) 9204 && CONST_INT_P (XEXP (x, 1))) 9205 { 9206 type = ADDRESS_REG_UXTW; 9207 index = XEXP (XEXP (x, 0), 0); 9208 shift = INTVAL (XEXP (XEXP (x, 0), 1)); 9209 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift) 9210 shift = -1; 9211 } 9212 /* (mult:P (reg:P) (const_int scale)) */ 9213 else if (GET_CODE (x) == MULT 9214 && GET_MODE (x) == Pmode 9215 && GET_MODE (XEXP (x, 0)) == Pmode 9216 && CONST_INT_P (XEXP (x, 1))) 9217 { 9218 type = ADDRESS_REG_REG; 9219 index = XEXP (x, 0); 9220 shift = exact_log2 (INTVAL (XEXP (x, 1))); 9221 } 9222 /* (ashift:P (reg:P) (const_int shift)) */ 9223 else if (GET_CODE (x) == ASHIFT 9224 && GET_MODE (x) == Pmode 9225 && GET_MODE (XEXP (x, 0)) == Pmode 9226 && CONST_INT_P (XEXP (x, 1))) 9227 { 9228 type = ADDRESS_REG_REG; 9229 index = XEXP (x, 0); 9230 shift = INTVAL (XEXP (x, 1)); 9231 } 9232 else 9233 return false; 9234 9235 if (!strict_p 9236 && GET_CODE (index) == SUBREG 9237 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))]) 9238 index = SUBREG_REG (index); 9239 9240 if (aarch64_sve_data_mode_p (mode)) 9241 { 9242 if (type != ADDRESS_REG_REG 9243 || (1 << shift) != GET_MODE_UNIT_SIZE (mode)) 9244 return false; 9245 } 9246 else 9247 { 9248 if (shift != 0 9249 && !(IN_RANGE (shift, 1, 3) 9250 && known_eq (1 << shift, GET_MODE_SIZE (mode)))) 9251 return false; 9252 } 9253 9254 if (REG_P (index) 9255 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p)) 9256 { 9257 info->type = type; 9258 info->offset = index; 9259 info->shift = shift; 9260 return true; 9261 } 9262 9263 return false; 9264} 9265 9266/* Return true if MODE is one of the modes for which we 9267 support LDP/STP operations. */ 9268 9269static bool 9270aarch64_mode_valid_for_sched_fusion_p (machine_mode mode) 9271{ 9272 return mode == SImode || mode == DImode 9273 || mode == SFmode || mode == DFmode 9274 || (aarch64_vector_mode_supported_p (mode) 9275 && (known_eq (GET_MODE_SIZE (mode), 8) 9276 || (known_eq (GET_MODE_SIZE (mode), 16) 9277 && (aarch64_tune_params.extra_tuning_flags 9278 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0))); 9279} 9280 9281/* Return true if REGNO is a virtual pointer register, or an eliminable 9282 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't 9283 include stack_pointer or hard_frame_pointer. */ 9284static bool 9285virt_or_elim_regno_p (unsigned regno) 9286{ 9287 return ((regno >= FIRST_VIRTUAL_REGISTER 9288 && regno <= LAST_VIRTUAL_POINTER_REGISTER) 9289 || regno == FRAME_POINTER_REGNUM 9290 || regno == ARG_POINTER_REGNUM); 9291} 9292 9293/* Return true if X is a valid address of type TYPE for machine mode MODE. 9294 If it is, fill in INFO appropriately. STRICT_P is true if 9295 REG_OK_STRICT is in effect. 
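   Informally, the address forms accepted below (depending on MODE and
   TYPE) include: [Xn], [Xn, #imm], [Xn, Xm{, LSL #s}],
   [Xn, Wm, (S|U)XTW {#s}], the pre/post-indexed and pre/post-modified
   writeback forms, PC-relative literal references, and LO_SUM
   (ADRP + low-12-bit) symbol addresses.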
*/ 9296 9297bool 9298aarch64_classify_address (struct aarch64_address_info *info, 9299 rtx x, machine_mode mode, bool strict_p, 9300 aarch64_addr_query_type type) 9301{ 9302 enum rtx_code code = GET_CODE (x); 9303 rtx op0, op1; 9304 poly_int64 offset; 9305 9306 HOST_WIDE_INT const_size; 9307 9308 /* Whether a vector mode is partial doesn't affect address legitimacy. 9309 Partial vectors like VNx8QImode allow the same indexed addressing 9310 mode and MUL VL addressing mode as full vectors like VNx16QImode; 9311 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */ 9312 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 9313 vec_flags &= ~VEC_PARTIAL; 9314 9315 /* On BE, we use load/store pair for all large int mode load/stores. 9316 TI/TFmode may also use a load/store pair. */ 9317 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT)); 9318 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP 9319 || type == ADDR_QUERY_LDP_STP_N 9320 || mode == TImode 9321 || mode == TFmode 9322 || (BYTES_BIG_ENDIAN && advsimd_struct_p)); 9323 9324 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode 9325 corresponds to the actual size of the memory being loaded/stored and the 9326 mode of the corresponding addressing mode is half of that. */ 9327 if (type == ADDR_QUERY_LDP_STP_N 9328 && known_eq (GET_MODE_SIZE (mode), 16)) 9329 mode = DFmode; 9330 9331 bool allow_reg_index_p = (!load_store_pair_p 9332 && (known_lt (GET_MODE_SIZE (mode), 16) 9333 || vec_flags == VEC_ADVSIMD 9334 || vec_flags & VEC_SVE_DATA)); 9335 9336 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and 9337 [Rn, #offset, MUL VL]. */ 9338 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0 9339 && (code != REG && code != PLUS)) 9340 return false; 9341 9342 /* On LE, for AdvSIMD, don't support anything other than POST_INC or 9343 REG addressing. */ 9344 if (advsimd_struct_p 9345 && !BYTES_BIG_ENDIAN 9346 && (code != POST_INC && code != REG)) 9347 return false; 9348 9349 gcc_checking_assert (GET_MODE (x) == VOIDmode 9350 || SCALAR_INT_MODE_P (GET_MODE (x))); 9351 9352 switch (code) 9353 { 9354 case REG: 9355 case SUBREG: 9356 info->type = ADDRESS_REG_IMM; 9357 info->base = x; 9358 info->offset = const0_rtx; 9359 info->const_offset = 0; 9360 return aarch64_base_register_rtx_p (x, strict_p); 9361 9362 case PLUS: 9363 op0 = XEXP (x, 0); 9364 op1 = XEXP (x, 1); 9365 9366 if (! strict_p 9367 && REG_P (op0) 9368 && virt_or_elim_regno_p (REGNO (op0)) 9369 && poly_int_rtx_p (op1, &offset)) 9370 { 9371 info->type = ADDRESS_REG_IMM; 9372 info->base = op0; 9373 info->offset = op1; 9374 info->const_offset = offset; 9375 9376 return true; 9377 } 9378 9379 if (maybe_ne (GET_MODE_SIZE (mode), 0) 9380 && aarch64_base_register_rtx_p (op0, strict_p) 9381 && poly_int_rtx_p (op1, &offset)) 9382 { 9383 info->type = ADDRESS_REG_IMM; 9384 info->base = op0; 9385 info->offset = op1; 9386 info->const_offset = offset; 9387 9388 /* TImode and TFmode values are allowed in both pairs of X 9389 registers and individual Q registers. The available 9390 address modes are: 9391 X,X: 7-bit signed scaled offset 9392 Q: 9-bit signed offset 9393 We conservatively require an offset representable in either mode. 9394 When performing the check for pairs of X registers i.e. LDP/STP 9395 pass down DImode since that is the natural size of the LDP/STP 9396 instruction memory accesses. 
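	     Concretely, this means the offset must be a multiple of 8 in
	     the signed 7-bit scaled LDP/STP range [-512, 504] and must
	     additionally fit either the signed 9-bit unscaled LDUR/STUR
	     range [-256, 255] or the unsigned, scaled 12-bit LDR/STR
	     range for the Q-register form.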
*/
9397	      if (mode == TImode || mode == TFmode)
9398		return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
9399			&& (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9400			    || offset_12bit_unsigned_scaled_p (mode, offset)));
9401
9402	      /* A 7-bit offset check because OImode will emit an ldp/stp
9403		 instruction (only big endian will get here).
9404		 For ldp/stp instructions, the offset is scaled for the size of a
9405		 single element of the pair.  */
9406	      if (mode == OImode)
9407		return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
9408
9409	      /* Three 9/12-bit offset checks because CImode will emit three
9410		 ldr/str instructions (only big endian will get here).  */
9411	      if (mode == CImode)
9412		return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9413			&& (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
9414								   offset + 32)
9415			    || offset_12bit_unsigned_scaled_p (V16QImode,
9416							       offset + 32)));
9417
9418	      /* Two 7-bit offset checks because XImode will emit two ldp/stp
9419		 instructions (only big endian will get here).  */
9420	      if (mode == XImode)
9421		return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9422			&& aarch64_offset_7bit_signed_scaled_p (TImode,
9423								offset + 32));
9424
9425	      /* Make "m" use the LD1 offset range for SVE data modes, so
9426		 that pre-RTL optimizers like ivopts will work to that range
9427		 instead of the wider LDR/STR range.  */
9428	      if (vec_flags == VEC_SVE_DATA)
9429		return (type == ADDR_QUERY_M
9430			? offset_4bit_signed_scaled_p (mode, offset)
9431			: offset_9bit_signed_scaled_p (mode, offset));
9432
9433	      if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
9434		{
9435		  poly_int64 end_offset = (offset
9436					   + GET_MODE_SIZE (mode)
9437					   - BYTES_PER_SVE_VECTOR);
9438		  return (type == ADDR_QUERY_M
9439			  ? offset_4bit_signed_scaled_p (mode, offset)
9440			  : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
9441			     && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
9442							     end_offset)));
9443		}
9444
9445	      if (vec_flags == VEC_SVE_PRED)
9446		return offset_9bit_signed_scaled_p (mode, offset);
9447
9448	      if (load_store_pair_p)
9449		return ((known_eq (GET_MODE_SIZE (mode), 4)
9450			 || known_eq (GET_MODE_SIZE (mode), 8)
9451			 || known_eq (GET_MODE_SIZE (mode), 16))
9452			&& aarch64_offset_7bit_signed_scaled_p (mode, offset));
9453	      else
9454		return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9455			|| offset_12bit_unsigned_scaled_p (mode, offset));
9456	    }
9457
9458	  if (allow_reg_index_p)
9459	    {
9460	      /* Look for base + (scaled/extended) index register.
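		 For example [x0, x1], [x0, x1, lsl #3] for a DImode access,
		 or [x0, w1, sxtw #2] for an SImode access; either operand of
		 the PLUS may act as the base, so both orders are tried below.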
*/ 9461 if (aarch64_base_register_rtx_p (op0, strict_p) 9462 && aarch64_classify_index (info, op1, mode, strict_p)) 9463 { 9464 info->base = op0; 9465 return true; 9466 } 9467 if (aarch64_base_register_rtx_p (op1, strict_p) 9468 && aarch64_classify_index (info, op0, mode, strict_p)) 9469 { 9470 info->base = op1; 9471 return true; 9472 } 9473 } 9474 9475 return false; 9476 9477 case POST_INC: 9478 case POST_DEC: 9479 case PRE_INC: 9480 case PRE_DEC: 9481 info->type = ADDRESS_REG_WB; 9482 info->base = XEXP (x, 0); 9483 info->offset = NULL_RTX; 9484 return aarch64_base_register_rtx_p (info->base, strict_p); 9485 9486 case POST_MODIFY: 9487 case PRE_MODIFY: 9488 info->type = ADDRESS_REG_WB; 9489 info->base = XEXP (x, 0); 9490 if (GET_CODE (XEXP (x, 1)) == PLUS 9491 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset) 9492 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base) 9493 && aarch64_base_register_rtx_p (info->base, strict_p)) 9494 { 9495 info->offset = XEXP (XEXP (x, 1), 1); 9496 info->const_offset = offset; 9497 9498 /* TImode and TFmode values are allowed in both pairs of X 9499 registers and individual Q registers. The available 9500 address modes are: 9501 X,X: 7-bit signed scaled offset 9502 Q: 9-bit signed offset 9503 We conservatively require an offset representable in either mode. 9504 */ 9505 if (mode == TImode || mode == TFmode) 9506 return (aarch64_offset_7bit_signed_scaled_p (mode, offset) 9507 && aarch64_offset_9bit_signed_unscaled_p (mode, offset)); 9508 9509 if (load_store_pair_p) 9510 return ((known_eq (GET_MODE_SIZE (mode), 4) 9511 || known_eq (GET_MODE_SIZE (mode), 8) 9512 || known_eq (GET_MODE_SIZE (mode), 16)) 9513 && aarch64_offset_7bit_signed_scaled_p (mode, offset)); 9514 else 9515 return aarch64_offset_9bit_signed_unscaled_p (mode, offset); 9516 } 9517 return false; 9518 9519 case CONST: 9520 case SYMBOL_REF: 9521 case LABEL_REF: 9522 /* load literal: pc-relative constant pool entry. Only supported 9523 for SI mode or larger. */ 9524 info->type = ADDRESS_SYMBOLIC; 9525 9526 if (!load_store_pair_p 9527 && GET_MODE_SIZE (mode).is_constant (&const_size) 9528 && const_size >= 4) 9529 { 9530 poly_int64 offset; 9531 rtx sym = strip_offset_and_salt (x, &offset); 9532 return ((GET_CODE (sym) == LABEL_REF 9533 || (GET_CODE (sym) == SYMBOL_REF 9534 && CONSTANT_POOL_ADDRESS_P (sym) 9535 && aarch64_pcrelative_literal_loads))); 9536 } 9537 return false; 9538 9539 case LO_SUM: 9540 info->type = ADDRESS_LO_SUM; 9541 info->base = XEXP (x, 0); 9542 info->offset = XEXP (x, 1); 9543 if (allow_reg_index_p 9544 && aarch64_base_register_rtx_p (info->base, strict_p)) 9545 { 9546 poly_int64 offset; 9547 HOST_WIDE_INT const_offset; 9548 rtx sym = strip_offset_and_salt (info->offset, &offset); 9549 if (GET_CODE (sym) == SYMBOL_REF 9550 && offset.is_constant (&const_offset) 9551 && (aarch64_classify_symbol (sym, const_offset) 9552 == SYMBOL_SMALL_ABSOLUTE)) 9553 { 9554 /* The symbol and offset must be aligned to the access size. 
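		     For example, an SImode load of the form

			 adrp x0, sym
			 ldr  w1, [x0, #:lo12:sym]

		     requires sym (plus any constant offset) to be 4-byte
		     aligned, because the :lo12: immediate of the LDR is
		     scaled by the access size; the checks below derive the
		     known alignment of the symbol and verify this.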
*/ 9555 unsigned int align; 9556 9557 if (CONSTANT_POOL_ADDRESS_P (sym)) 9558 align = GET_MODE_ALIGNMENT (get_pool_mode (sym)); 9559 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym)) 9560 { 9561 tree exp = SYMBOL_REF_DECL (sym); 9562 align = TYPE_ALIGN (TREE_TYPE (exp)); 9563 align = aarch64_constant_alignment (exp, align); 9564 } 9565 else if (SYMBOL_REF_DECL (sym)) 9566 align = DECL_ALIGN (SYMBOL_REF_DECL (sym)); 9567 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym) 9568 && SYMBOL_REF_BLOCK (sym) != NULL) 9569 align = SYMBOL_REF_BLOCK (sym)->alignment; 9570 else 9571 align = BITS_PER_UNIT; 9572 9573 poly_int64 ref_size = GET_MODE_SIZE (mode); 9574 if (known_eq (ref_size, 0)) 9575 ref_size = GET_MODE_SIZE (DImode); 9576 9577 return (multiple_p (const_offset, ref_size) 9578 && multiple_p (align / BITS_PER_UNIT, ref_size)); 9579 } 9580 } 9581 return false; 9582 9583 default: 9584 return false; 9585 } 9586} 9587 9588/* Return true if the address X is valid for a PRFM instruction. 9589 STRICT_P is true if we should do strict checking with 9590 aarch64_classify_address. */ 9591 9592bool 9593aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p) 9594{ 9595 struct aarch64_address_info addr; 9596 9597 /* PRFM accepts the same addresses as DImode... */ 9598 bool res = aarch64_classify_address (&addr, x, DImode, strict_p); 9599 if (!res) 9600 return false; 9601 9602 /* ... except writeback forms. */ 9603 return addr.type != ADDRESS_REG_WB; 9604} 9605 9606bool 9607aarch64_symbolic_address_p (rtx x) 9608{ 9609 poly_int64 offset; 9610 x = strip_offset_and_salt (x, &offset); 9611 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF; 9612} 9613 9614/* Classify the base of symbolic expression X. */ 9615 9616enum aarch64_symbol_type 9617aarch64_classify_symbolic_expression (rtx x) 9618{ 9619 rtx offset; 9620 9621 split_const (x, &x, &offset); 9622 return aarch64_classify_symbol (x, INTVAL (offset)); 9623} 9624 9625 9626/* Return TRUE if X is a legitimate address for accessing memory in 9627 mode MODE. */ 9628static bool 9629aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p) 9630{ 9631 struct aarch64_address_info addr; 9632 9633 return aarch64_classify_address (&addr, x, mode, strict_p); 9634} 9635 9636/* Return TRUE if X is a legitimate address of type TYPE for accessing 9637 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */ 9638bool 9639aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p, 9640 aarch64_addr_query_type type) 9641{ 9642 struct aarch64_address_info addr; 9643 9644 return aarch64_classify_address (&addr, x, mode, strict_p, type); 9645} 9646 9647/* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */ 9648 9649static bool 9650aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2, 9651 poly_int64 orig_offset, 9652 machine_mode mode) 9653{ 9654 HOST_WIDE_INT size; 9655 if (GET_MODE_SIZE (mode).is_constant (&size)) 9656 { 9657 HOST_WIDE_INT const_offset, second_offset; 9658 9659 /* A general SVE offset is A * VQ + B. Remove the A component from 9660 coefficient 0 in order to get the constant B. */ 9661 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1]; 9662 9663 /* Split an out-of-range address displacement into a base and 9664 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB 9665 range otherwise to increase opportunities for sharing the base 9666 address of different sizes. 
Unaligned accesses use the signed 9667 9-bit range, TImode/TFmode use the intersection of signed 9668 scaled 7-bit and signed 9-bit offset. */ 9669 if (mode == TImode || mode == TFmode) 9670 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100; 9671 else if ((const_offset & (size - 1)) != 0) 9672 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100; 9673 else 9674 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc); 9675 9676 if (second_offset == 0 || known_eq (orig_offset, second_offset)) 9677 return false; 9678 9679 /* Split the offset into second_offset and the rest. */ 9680 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode); 9681 *offset2 = gen_int_mode (second_offset, Pmode); 9682 return true; 9683 } 9684 else 9685 { 9686 /* Get the mode we should use as the basis of the range. For structure 9687 modes this is the mode of one vector. */ 9688 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 9689 machine_mode step_mode 9690 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode; 9691 9692 /* Get the "mul vl" multiplier we'd like to use. */ 9693 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1]; 9694 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor; 9695 if (vec_flags & VEC_SVE_DATA) 9696 /* LDR supports a 9-bit range, but the move patterns for 9697 structure modes require all vectors to be in range of the 9698 same base. The simplest way of accomodating that while still 9699 promoting reuse of anchor points between different modes is 9700 to use an 8-bit range unconditionally. */ 9701 vnum = ((vnum + 128) & 255) - 128; 9702 else 9703 /* Predicates are only handled singly, so we might as well use 9704 the full range. */ 9705 vnum = ((vnum + 256) & 511) - 256; 9706 if (vnum == 0) 9707 return false; 9708 9709 /* Convert the "mul vl" multiplier into a byte offset. */ 9710 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum; 9711 if (known_eq (second_offset, orig_offset)) 9712 return false; 9713 9714 /* Split the offset into second_offset and the rest. */ 9715 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode); 9716 *offset2 = gen_int_mode (second_offset, Pmode); 9717 return true; 9718 } 9719} 9720 9721/* Return the binary representation of floating point constant VALUE in INTVAL. 9722 If the value cannot be converted, return false without setting INTVAL. 9723 The conversion is done in the given MODE. */ 9724bool 9725aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval) 9726{ 9727 9728 /* We make a general exception for 0. */ 9729 if (aarch64_float_const_zero_rtx_p (value)) 9730 { 9731 *intval = 0; 9732 return true; 9733 } 9734 9735 scalar_float_mode mode; 9736 if (GET_CODE (value) != CONST_DOUBLE 9737 || !is_a <scalar_float_mode> (GET_MODE (value), &mode) 9738 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT 9739 /* Only support up to DF mode. */ 9740 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode)) 9741 return false; 9742 9743 unsigned HOST_WIDE_INT ival = 0; 9744 9745 long res[2]; 9746 real_to_target (res, 9747 CONST_DOUBLE_REAL_VALUE (value), 9748 REAL_MODE_FORMAT (mode)); 9749 9750 if (mode == DFmode) 9751 { 9752 int order = BYTES_BIG_ENDIAN ? 1 : 0; 9753 ival = zext_hwi (res[order], 32); 9754 ival |= (zext_hwi (res[1 - order], 32) << 32); 9755 } 9756 else 9757 ival = zext_hwi (res[0], 32); 9758 9759 *intval = ival; 9760 return true; 9761} 9762 9763/* Return TRUE if rtx X is an immediate constant that can be moved using a 9764 single MOV(+MOVK) followed by an FMOV. 
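   For example, the DFmode constant 1.0 has bit pattern 0x3ff0000000000000,
   which one MOVZ of the top 16 bits can build, and a pattern such as
   0x3ff0000000000001 needs MOVZ + MOVK; both pass the "fewer than three
   integer moves" test used below.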
*/ 9765bool 9766aarch64_float_const_rtx_p (rtx x) 9767{ 9768 machine_mode mode = GET_MODE (x); 9769 if (mode == VOIDmode) 9770 return false; 9771 9772 /* Determine whether it's cheaper to write float constants as 9773 mov/movk pairs over ldr/adrp pairs. */ 9774 unsigned HOST_WIDE_INT ival; 9775 9776 if (GET_CODE (x) == CONST_DOUBLE 9777 && SCALAR_FLOAT_MODE_P (mode) 9778 && aarch64_reinterpret_float_as_int (x, &ival)) 9779 { 9780 scalar_int_mode imode = (mode == HFmode 9781 ? SImode 9782 : int_mode_for_mode (mode).require ()); 9783 int num_instr = aarch64_internal_mov_immediate 9784 (NULL_RTX, gen_int_mode (ival, imode), false, imode); 9785 return num_instr < 3; 9786 } 9787 9788 return false; 9789} 9790 9791/* Return TRUE if rtx X is immediate constant 0.0 */ 9792bool 9793aarch64_float_const_zero_rtx_p (rtx x) 9794{ 9795 if (GET_MODE (x) == VOIDmode) 9796 return false; 9797 9798 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x))) 9799 return !HONOR_SIGNED_ZEROS (GET_MODE (x)); 9800 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0); 9801} 9802 9803/* Return TRUE if rtx X is immediate constant that fits in a single 9804 MOVI immediate operation. */ 9805bool 9806aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode) 9807{ 9808 if (!TARGET_SIMD) 9809 return false; 9810 9811 machine_mode vmode; 9812 scalar_int_mode imode; 9813 unsigned HOST_WIDE_INT ival; 9814 9815 if (GET_CODE (x) == CONST_DOUBLE 9816 && SCALAR_FLOAT_MODE_P (mode)) 9817 { 9818 if (!aarch64_reinterpret_float_as_int (x, &ival)) 9819 return false; 9820 9821 /* We make a general exception for 0. */ 9822 if (aarch64_float_const_zero_rtx_p (x)) 9823 return true; 9824 9825 imode = int_mode_for_mode (mode).require (); 9826 } 9827 else if (GET_CODE (x) == CONST_INT 9828 && is_a <scalar_int_mode> (mode, &imode)) 9829 ival = INTVAL (x); 9830 else 9831 return false; 9832 9833 /* use a 64 bit mode for everything except for DI/DF mode, where we use 9834 a 128 bit vector mode. */ 9835 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64; 9836 9837 vmode = aarch64_simd_container_mode (imode, width); 9838 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival); 9839 9840 return aarch64_simd_valid_immediate (v_op, NULL); 9841} 9842 9843 9844/* Return the fixed registers used for condition codes. */ 9845 9846static bool 9847aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2) 9848{ 9849 *p1 = CC_REGNUM; 9850 *p2 = INVALID_REGNUM; 9851 return true; 9852} 9853 9854/* This function is used by the call expanders of the machine description. 9855 RESULT is the register in which the result is returned. It's NULL for 9856 "call" and "sibcall". 9857 MEM is the location of the function call. 9858 CALLEE_ABI is a const_int that gives the arm_pcs of the callee. 9859 SIBCALL indicates whether this function call is normal call or sibling call. 9860 It will generate different pattern accordingly. */ 9861 9862void 9863aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall) 9864{ 9865 rtx call, callee, tmp; 9866 rtvec vec; 9867 machine_mode mode; 9868 9869 gcc_assert (MEM_P (mem)); 9870 callee = XEXP (mem, 0); 9871 mode = GET_MODE (callee); 9872 gcc_assert (mode == Pmode); 9873 9874 /* Decide if we should generate indirect calls by loading the 9875 address of the callee into a register before performing 9876 the branch-and-link. */ 9877 if (SYMBOL_REF_P (callee) 9878 ? 
(aarch64_is_long_call_p (callee) 9879 || aarch64_is_noplt_call_p (callee)) 9880 : !REG_P (callee)) 9881 XEXP (mem, 0) = force_reg (mode, callee); 9882 9883 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx); 9884 9885 if (result != NULL_RTX) 9886 call = gen_rtx_SET (result, call); 9887 9888 if (sibcall) 9889 tmp = ret_rtx; 9890 else 9891 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM)); 9892 9893 gcc_assert (CONST_INT_P (callee_abi)); 9894 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi), 9895 UNSPEC_CALLEE_ABI); 9896 9897 vec = gen_rtvec (3, call, callee_abi, tmp); 9898 call = gen_rtx_PARALLEL (VOIDmode, vec); 9899 9900 aarch64_emit_call_insn (call); 9901} 9902 9903/* Emit call insn with PAT and do aarch64-specific handling. */ 9904 9905void 9906aarch64_emit_call_insn (rtx pat) 9907{ 9908 rtx insn = emit_call_insn (pat); 9909 9910 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn); 9911 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM)); 9912 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM)); 9913} 9914 9915machine_mode 9916aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y) 9917{ 9918 machine_mode mode_x = GET_MODE (x); 9919 rtx_code code_x = GET_CODE (x); 9920 9921 /* All floating point compares return CCFP if it is an equality 9922 comparison, and CCFPE otherwise. */ 9923 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT) 9924 { 9925 switch (code) 9926 { 9927 case EQ: 9928 case NE: 9929 case UNORDERED: 9930 case ORDERED: 9931 case UNLT: 9932 case UNLE: 9933 case UNGT: 9934 case UNGE: 9935 case UNEQ: 9936 return CCFPmode; 9937 9938 case LT: 9939 case LE: 9940 case GT: 9941 case GE: 9942 case LTGT: 9943 return CCFPEmode; 9944 9945 default: 9946 gcc_unreachable (); 9947 } 9948 } 9949 9950 /* Equality comparisons of short modes against zero can be performed 9951 using the TST instruction with the appropriate bitmask. */ 9952 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x)) 9953 && (code == EQ || code == NE) 9954 && (mode_x == HImode || mode_x == QImode)) 9955 return CC_NZmode; 9956 9957 /* Similarly, comparisons of zero_extends from shorter modes can 9958 be performed using an ANDS with an immediate mask. */ 9959 if (y == const0_rtx && code_x == ZERO_EXTEND 9960 && (mode_x == SImode || mode_x == DImode) 9961 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode) 9962 && (code == EQ || code == NE)) 9963 return CC_NZmode; 9964 9965 if ((mode_x == SImode || mode_x == DImode) 9966 && y == const0_rtx 9967 && (code == EQ || code == NE || code == LT || code == GE) 9968 && (code_x == PLUS || code_x == MINUS || code_x == AND 9969 || code_x == NEG 9970 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1)) 9971 && CONST_INT_P (XEXP (x, 2))))) 9972 return CC_NZmode; 9973 9974 /* A compare with a shifted operand. Because of canonicalization, 9975 the comparison will have to be swapped when we emit the assembly 9976 code. */ 9977 if ((mode_x == SImode || mode_x == DImode) 9978 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx) 9979 && (code_x == ASHIFT || code_x == ASHIFTRT 9980 || code_x == LSHIFTRT 9981 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND)) 9982 return CC_SWPmode; 9983 9984 /* Similarly for a negated operand, but we can only do this for 9985 equalities. */ 9986 if ((mode_x == SImode || mode_x == DImode) 9987 && (REG_P (y) || GET_CODE (y) == SUBREG) 9988 && (code == EQ || code == NE) 9989 && code_x == NEG) 9990 return CC_Zmode; 9991 9992 /* A test for unsigned overflow from an addition. 
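     That is, a comparison of the form (x + y) <u x (or >=u x), as generated
     for unsigned __builtin_add_overflow-style checks, where the carry flag
     alone determines the result.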
*/ 9993 if ((mode_x == DImode || mode_x == TImode) 9994 && (code == LTU || code == GEU) 9995 && code_x == PLUS 9996 && rtx_equal_p (XEXP (x, 0), y)) 9997 return CC_Cmode; 9998 9999 /* A test for unsigned overflow from an add with carry. */ 10000 if ((mode_x == DImode || mode_x == TImode) 10001 && (code == LTU || code == GEU) 10002 && code_x == PLUS 10003 && CONST_SCALAR_INT_P (y) 10004 && (rtx_mode_t (y, mode_x) 10005 == (wi::shwi (1, mode_x) 10006 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2)))) 10007 return CC_ADCmode; 10008 10009 /* A test for signed overflow. */ 10010 if ((mode_x == DImode || mode_x == TImode) 10011 && code == NE 10012 && code_x == PLUS 10013 && GET_CODE (y) == SIGN_EXTEND) 10014 return CC_Vmode; 10015 10016 /* For everything else, return CCmode. */ 10017 return CCmode; 10018} 10019 10020static int 10021aarch64_get_condition_code_1 (machine_mode, enum rtx_code); 10022 10023int 10024aarch64_get_condition_code (rtx x) 10025{ 10026 machine_mode mode = GET_MODE (XEXP (x, 0)); 10027 enum rtx_code comp_code = GET_CODE (x); 10028 10029 if (GET_MODE_CLASS (mode) != MODE_CC) 10030 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1)); 10031 return aarch64_get_condition_code_1 (mode, comp_code); 10032} 10033 10034static int 10035aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code) 10036{ 10037 switch (mode) 10038 { 10039 case E_CCFPmode: 10040 case E_CCFPEmode: 10041 switch (comp_code) 10042 { 10043 case GE: return AARCH64_GE; 10044 case GT: return AARCH64_GT; 10045 case LE: return AARCH64_LS; 10046 case LT: return AARCH64_MI; 10047 case NE: return AARCH64_NE; 10048 case EQ: return AARCH64_EQ; 10049 case ORDERED: return AARCH64_VC; 10050 case UNORDERED: return AARCH64_VS; 10051 case UNLT: return AARCH64_LT; 10052 case UNLE: return AARCH64_LE; 10053 case UNGT: return AARCH64_HI; 10054 case UNGE: return AARCH64_PL; 10055 default: return -1; 10056 } 10057 break; 10058 10059 case E_CCmode: 10060 switch (comp_code) 10061 { 10062 case NE: return AARCH64_NE; 10063 case EQ: return AARCH64_EQ; 10064 case GE: return AARCH64_GE; 10065 case GT: return AARCH64_GT; 10066 case LE: return AARCH64_LE; 10067 case LT: return AARCH64_LT; 10068 case GEU: return AARCH64_CS; 10069 case GTU: return AARCH64_HI; 10070 case LEU: return AARCH64_LS; 10071 case LTU: return AARCH64_CC; 10072 default: return -1; 10073 } 10074 break; 10075 10076 case E_CC_SWPmode: 10077 switch (comp_code) 10078 { 10079 case NE: return AARCH64_NE; 10080 case EQ: return AARCH64_EQ; 10081 case GE: return AARCH64_LE; 10082 case GT: return AARCH64_LT; 10083 case LE: return AARCH64_GE; 10084 case LT: return AARCH64_GT; 10085 case GEU: return AARCH64_LS; 10086 case GTU: return AARCH64_CC; 10087 case LEU: return AARCH64_CS; 10088 case LTU: return AARCH64_HI; 10089 default: return -1; 10090 } 10091 break; 10092 10093 case E_CC_NZCmode: 10094 switch (comp_code) 10095 { 10096 case NE: return AARCH64_NE; /* = any */ 10097 case EQ: return AARCH64_EQ; /* = none */ 10098 case GE: return AARCH64_PL; /* = nfrst */ 10099 case LT: return AARCH64_MI; /* = first */ 10100 case GEU: return AARCH64_CS; /* = nlast */ 10101 case GTU: return AARCH64_HI; /* = pmore */ 10102 case LEU: return AARCH64_LS; /* = plast */ 10103 case LTU: return AARCH64_CC; /* = last */ 10104 default: return -1; 10105 } 10106 break; 10107 10108 case E_CC_NZmode: 10109 switch (comp_code) 10110 { 10111 case NE: return AARCH64_NE; 10112 case EQ: return AARCH64_EQ; 10113 case GE: return AARCH64_PL; 10114 case LT: return AARCH64_MI; 10115 default: 
return -1; 10116 } 10117 break; 10118 10119 case E_CC_Zmode: 10120 switch (comp_code) 10121 { 10122 case NE: return AARCH64_NE; 10123 case EQ: return AARCH64_EQ; 10124 default: return -1; 10125 } 10126 break; 10127 10128 case E_CC_Cmode: 10129 switch (comp_code) 10130 { 10131 case LTU: return AARCH64_CS; 10132 case GEU: return AARCH64_CC; 10133 default: return -1; 10134 } 10135 break; 10136 10137 case E_CC_ADCmode: 10138 switch (comp_code) 10139 { 10140 case GEU: return AARCH64_CS; 10141 case LTU: return AARCH64_CC; 10142 default: return -1; 10143 } 10144 break; 10145 10146 case E_CC_Vmode: 10147 switch (comp_code) 10148 { 10149 case NE: return AARCH64_VS; 10150 case EQ: return AARCH64_VC; 10151 default: return -1; 10152 } 10153 break; 10154 10155 default: 10156 return -1; 10157 } 10158 10159 return -1; 10160} 10161 10162bool 10163aarch64_const_vec_all_same_in_range_p (rtx x, 10164 HOST_WIDE_INT minval, 10165 HOST_WIDE_INT maxval) 10166{ 10167 rtx elt; 10168 return (const_vec_duplicate_p (x, &elt) 10169 && CONST_INT_P (elt) 10170 && IN_RANGE (INTVAL (elt), minval, maxval)); 10171} 10172 10173bool 10174aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val) 10175{ 10176 return aarch64_const_vec_all_same_in_range_p (x, val, val); 10177} 10178 10179/* Return true if VEC is a constant in which every element is in the range 10180 [MINVAL, MAXVAL]. The elements do not need to have the same value. */ 10181 10182static bool 10183aarch64_const_vec_all_in_range_p (rtx vec, 10184 HOST_WIDE_INT minval, 10185 HOST_WIDE_INT maxval) 10186{ 10187 if (GET_CODE (vec) != CONST_VECTOR 10188 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT) 10189 return false; 10190 10191 int nunits; 10192 if (!CONST_VECTOR_STEPPED_P (vec)) 10193 nunits = const_vector_encoded_nelts (vec); 10194 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits)) 10195 return false; 10196 10197 for (int i = 0; i < nunits; i++) 10198 { 10199 rtx vec_elem = CONST_VECTOR_ELT (vec, i); 10200 if (!CONST_INT_P (vec_elem) 10201 || !IN_RANGE (INTVAL (vec_elem), minval, maxval)) 10202 return false; 10203 } 10204 return true; 10205} 10206 10207/* N Z C V. */ 10208#define AARCH64_CC_V 1 10209#define AARCH64_CC_C (1 << 1) 10210#define AARCH64_CC_Z (1 << 2) 10211#define AARCH64_CC_N (1 << 3) 10212 10213/* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */ 10214static const int aarch64_nzcv_codes[] = 10215{ 10216 0, /* EQ, Z == 1. */ 10217 AARCH64_CC_Z, /* NE, Z == 0. */ 10218 0, /* CS, C == 1. */ 10219 AARCH64_CC_C, /* CC, C == 0. */ 10220 0, /* MI, N == 1. */ 10221 AARCH64_CC_N, /* PL, N == 0. */ 10222 0, /* VS, V == 1. */ 10223 AARCH64_CC_V, /* VC, V == 0. */ 10224 0, /* HI, C ==1 && Z == 0. */ 10225 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */ 10226 AARCH64_CC_V, /* GE, N == V. */ 10227 0, /* LT, N != V. */ 10228 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */ 10229 0, /* LE, !(Z == 0 && N == V). */ 10230 0, /* AL, Any. */ 10231 0 /* NV, Any. */ 10232}; 10233 10234/* Print floating-point vector immediate operand X to F, negating it 10235 first if NEGATE is true. Return true on success, false if it isn't 10236 a constant we can handle. 
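   For example, a vector duplicate of 0.5 is printed as "0.5", and a
   duplicate of -2.0 with NEGATE set is printed as "2.0"; values other than
   the special SVE single-bit immediates go through
   real_to_decimal_for_mode.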
*/ 10237 10238static bool 10239aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate) 10240{ 10241 rtx elt; 10242 10243 if (!const_vec_duplicate_p (x, &elt)) 10244 return false; 10245 10246 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt); 10247 if (negate) 10248 r = real_value_negate (&r); 10249 10250 /* Handle the SVE single-bit immediates specially, since they have a 10251 fixed form in the assembly syntax. */ 10252 if (real_equal (&r, &dconst0)) 10253 asm_fprintf (f, "0.0"); 10254 else if (real_equal (&r, &dconst2)) 10255 asm_fprintf (f, "2.0"); 10256 else if (real_equal (&r, &dconst1)) 10257 asm_fprintf (f, "1.0"); 10258 else if (real_equal (&r, &dconsthalf)) 10259 asm_fprintf (f, "0.5"); 10260 else 10261 { 10262 const int buf_size = 20; 10263 char float_buf[buf_size] = {'\0'}; 10264 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 10265 1, GET_MODE (elt)); 10266 asm_fprintf (f, "%s", float_buf); 10267 } 10268 10269 return true; 10270} 10271 10272/* Return the equivalent letter for size. */ 10273static char 10274sizetochar (int size) 10275{ 10276 switch (size) 10277 { 10278 case 64: return 'd'; 10279 case 32: return 's'; 10280 case 16: return 'h'; 10281 case 8 : return 'b'; 10282 default: gcc_unreachable (); 10283 } 10284} 10285 10286/* Print operand X to file F in a target specific manner according to CODE. 10287 The acceptable formatting commands given by CODE are: 10288 'c': An integer or symbol address without a preceding # 10289 sign. 10290 'C': Take the duplicated element in a vector constant 10291 and print it in hex. 10292 'D': Take the duplicated element in a vector constant 10293 and print it as an unsigned integer, in decimal. 10294 'e': Print the sign/zero-extend size as a character 8->b, 10295 16->h, 32->w. Can also be used for masks: 10296 0xff->b, 0xffff->h, 0xffffffff->w. 10297 'I': If the operand is a duplicated vector constant, 10298 replace it with the duplicated scalar. If the 10299 operand is then a floating-point constant, replace 10300 it with the integer bit representation. Print the 10301 transformed constant as a signed decimal number. 10302 'p': Prints N such that 2^N == X (X must be power of 2 and 10303 const int). 10304 'P': Print the number of non-zero bits in X (a const_int). 10305 'H': Print the higher numbered register of a pair (TImode) 10306 of regs. 10307 'm': Print a condition (eq, ne, etc). 10308 'M': Same as 'm', but invert condition. 10309 'N': Take the duplicated element in a vector constant 10310 and print the negative of it in decimal. 10311 'b/h/s/d/q': Print a scalar FP/SIMD register name. 10312 'S/T/U/V': Print a FP/SIMD register name for a register list. 10313 The register printed is the FP/SIMD register name 10314 of X + 0/1/2/3 for S/T/U/V. 10315 'R': Print a scalar Integer/FP/SIMD register name + 1. 10316 'X': Print bottom 16 bits of integer constant in hex. 10317 'w/x': Print a general register name or the zero register 10318 (32-bit or 64-bit). 10319 '0': Print a normal operand, if it's a general register, 10320 then we assume DImode. 10321 'k': Print NZCV for conditional compare instructions. 10322 'A': Output address constant representing the first 10323 argument of X, specifying a relocation offset 10324 if appropriate. 10325 'L': Output constant address specified by X 10326 with a relocation offset if appropriate. 10327 'G': Prints address of X, specifying a PC relative 10328 relocation mode if appropriate. 
10329 'y': Output address of LDP or STP - this is used for 10330 some LDP/STPs which don't use a PARALLEL in their 10331 pattern (so the mode needs to be adjusted). 10332 'z': Output address of a typical LDP or STP. */ 10333 10334static void 10335aarch64_print_operand (FILE *f, rtx x, int code) 10336{ 10337 rtx elt; 10338 switch (code) 10339 { 10340 case 'c': 10341 if (CONST_INT_P (x)) 10342 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); 10343 else 10344 { 10345 poly_int64 offset; 10346 rtx base = strip_offset_and_salt (x, &offset); 10347 if (SYMBOL_REF_P (base)) 10348 output_addr_const (f, x); 10349 else 10350 output_operand_lossage ("unsupported operand for code '%c'", code); 10351 } 10352 break; 10353 10354 case 'e': 10355 { 10356 x = unwrap_const_vec_duplicate (x); 10357 if (!CONST_INT_P (x)) 10358 { 10359 output_operand_lossage ("invalid operand for '%%%c'", code); 10360 return; 10361 } 10362 10363 HOST_WIDE_INT val = INTVAL (x); 10364 if ((val & ~7) == 8 || val == 0xff) 10365 fputc ('b', f); 10366 else if ((val & ~7) == 16 || val == 0xffff) 10367 fputc ('h', f); 10368 else if ((val & ~7) == 32 || val == 0xffffffff) 10369 fputc ('w', f); 10370 else 10371 { 10372 output_operand_lossage ("invalid operand for '%%%c'", code); 10373 return; 10374 } 10375 } 10376 break; 10377 10378 case 'p': 10379 { 10380 int n; 10381 10382 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0) 10383 { 10384 output_operand_lossage ("invalid operand for '%%%c'", code); 10385 return; 10386 } 10387 10388 asm_fprintf (f, "%d", n); 10389 } 10390 break; 10391 10392 case 'P': 10393 if (!CONST_INT_P (x)) 10394 { 10395 output_operand_lossage ("invalid operand for '%%%c'", code); 10396 return; 10397 } 10398 10399 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x))); 10400 break; 10401 10402 case 'H': 10403 if (x == const0_rtx) 10404 { 10405 asm_fprintf (f, "xzr"); 10406 break; 10407 } 10408 10409 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1)) 10410 { 10411 output_operand_lossage ("invalid operand for '%%%c'", code); 10412 return; 10413 } 10414 10415 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]); 10416 break; 10417 10418 case 'I': 10419 { 10420 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x)); 10421 if (CONST_INT_P (x)) 10422 asm_fprintf (f, "%wd", INTVAL (x)); 10423 else 10424 { 10425 output_operand_lossage ("invalid operand for '%%%c'", code); 10426 return; 10427 } 10428 break; 10429 } 10430 10431 case 'M': 10432 case 'm': 10433 { 10434 int cond_code; 10435 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). 
*/ 10436 if (x == const_true_rtx) 10437 { 10438 if (code == 'M') 10439 fputs ("nv", f); 10440 return; 10441 } 10442 10443 if (!COMPARISON_P (x)) 10444 { 10445 output_operand_lossage ("invalid operand for '%%%c'", code); 10446 return; 10447 } 10448 10449 cond_code = aarch64_get_condition_code (x); 10450 gcc_assert (cond_code >= 0); 10451 if (code == 'M') 10452 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code); 10453 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode) 10454 fputs (aarch64_sve_condition_codes[cond_code], f); 10455 else 10456 fputs (aarch64_condition_codes[cond_code], f); 10457 } 10458 break; 10459 10460 case 'N': 10461 if (!const_vec_duplicate_p (x, &elt)) 10462 { 10463 output_operand_lossage ("invalid vector constant"); 10464 return; 10465 } 10466 10467 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT) 10468 asm_fprintf (f, "%wd", -INTVAL (elt)); 10469 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT 10470 && aarch64_print_vector_float_operand (f, x, true)) 10471 ; 10472 else 10473 { 10474 output_operand_lossage ("invalid vector constant"); 10475 return; 10476 } 10477 break; 10478 10479 case 'b': 10480 case 'h': 10481 case 's': 10482 case 'd': 10483 case 'q': 10484 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x))) 10485 { 10486 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code); 10487 return; 10488 } 10489 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM); 10490 break; 10491 10492 case 'S': 10493 case 'T': 10494 case 'U': 10495 case 'V': 10496 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x))) 10497 { 10498 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code); 10499 return; 10500 } 10501 asm_fprintf (f, "%c%d", 10502 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v', 10503 REGNO (x) - V0_REGNUM + (code - 'S')); 10504 break; 10505 10506 case 'R': 10507 if (REG_P (x) && FP_REGNUM_P (REGNO (x))) 10508 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1); 10509 else if (REG_P (x) && GP_REGNUM_P (REGNO (x))) 10510 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1); 10511 else 10512 output_operand_lossage ("incompatible register operand for '%%%c'", 10513 code); 10514 break; 10515 10516 case 'X': 10517 if (!CONST_INT_P (x)) 10518 { 10519 output_operand_lossage ("invalid operand for '%%%c'", code); 10520 return; 10521 } 10522 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff); 10523 break; 10524 10525 case 'C': 10526 { 10527 /* Print a replicated constant in hex. */ 10528 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt)) 10529 { 10530 output_operand_lossage ("invalid operand for '%%%c'", code); 10531 return; 10532 } 10533 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x)); 10534 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode)); 10535 } 10536 break; 10537 10538 case 'D': 10539 { 10540 /* Print a replicated constant in decimal, treating it as 10541 unsigned. 
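For example, a V4HImode constant with every element equal to -1 is printed as 65535.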
*/ 10542 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt)) 10543 { 10544 output_operand_lossage ("invalid operand for '%%%c'", code); 10545 return; 10546 } 10547 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x)); 10548 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode)); 10549 } 10550 break; 10551 10552 case 'w': 10553 case 'x': 10554 if (x == const0_rtx 10555 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x))) 10556 { 10557 asm_fprintf (f, "%czr", code); 10558 break; 10559 } 10560 10561 if (REG_P (x) && GP_REGNUM_P (REGNO (x))) 10562 { 10563 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM); 10564 break; 10565 } 10566 10567 if (REG_P (x) && REGNO (x) == SP_REGNUM) 10568 { 10569 asm_fprintf (f, "%ssp", code == 'w' ? "w" : ""); 10570 break; 10571 } 10572 10573 /* Fall through */ 10574 10575 case 0: 10576 if (x == NULL) 10577 { 10578 output_operand_lossage ("missing operand"); 10579 return; 10580 } 10581 10582 switch (GET_CODE (x)) 10583 { 10584 case REG: 10585 if (aarch64_sve_data_mode_p (GET_MODE (x))) 10586 { 10587 if (REG_NREGS (x) == 1) 10588 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM); 10589 else 10590 { 10591 char suffix 10592 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x))); 10593 asm_fprintf (f, "{z%d.%c - z%d.%c}", 10594 REGNO (x) - V0_REGNUM, suffix, 10595 END_REGNO (x) - V0_REGNUM - 1, suffix); 10596 } 10597 } 10598 else 10599 asm_fprintf (f, "%s", reg_names [REGNO (x)]); 10600 break; 10601 10602 case MEM: 10603 output_address (GET_MODE (x), XEXP (x, 0)); 10604 break; 10605 10606 case LABEL_REF: 10607 case SYMBOL_REF: 10608 output_addr_const (asm_out_file, x); 10609 break; 10610 10611 case CONST_INT: 10612 asm_fprintf (f, "%wd", INTVAL (x)); 10613 break; 10614 10615 case CONST: 10616 if (!VECTOR_MODE_P (GET_MODE (x))) 10617 { 10618 output_addr_const (asm_out_file, x); 10619 break; 10620 } 10621 /* fall through */ 10622 10623 case CONST_VECTOR: 10624 if (!const_vec_duplicate_p (x, &elt)) 10625 { 10626 output_operand_lossage ("invalid vector constant"); 10627 return; 10628 } 10629 10630 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT) 10631 asm_fprintf (f, "%wd", INTVAL (elt)); 10632 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT 10633 && aarch64_print_vector_float_operand (f, x, false)) 10634 ; 10635 else 10636 { 10637 output_operand_lossage ("invalid vector constant"); 10638 return; 10639 } 10640 break; 10641 10642 case CONST_DOUBLE: 10643 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever 10644 be getting CONST_DOUBLEs holding integers. 
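Any CONST_DOUBLE that reaches this point should therefore carry a floating-point mode, which the assert below checks: zero prints as a bare 0, values accepted by aarch64_float_const_representable_p (the FMOV immediate test) print in decimal, and anything else is an error.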
*/ 10645 gcc_assert (GET_MODE (x) != VOIDmode); 10646 if (aarch64_float_const_zero_rtx_p (x)) 10647 { 10648 fputc ('0', f); 10649 break; 10650 } 10651 else if (aarch64_float_const_representable_p (x)) 10652 { 10653#define buf_size 20 10654 char float_buf[buf_size] = {'\0'}; 10655 real_to_decimal_for_mode (float_buf, 10656 CONST_DOUBLE_REAL_VALUE (x), 10657 buf_size, buf_size, 10658 1, GET_MODE (x)); 10659 asm_fprintf (asm_out_file, "%s", float_buf); 10660 break; 10661#undef buf_size 10662 } 10663 output_operand_lossage ("invalid constant"); 10664 return; 10665 default: 10666 output_operand_lossage ("invalid operand"); 10667 return; 10668 } 10669 break; 10670 10671 case 'A': 10672 if (GET_CODE (x) == HIGH) 10673 x = XEXP (x, 0); 10674 10675 switch (aarch64_classify_symbolic_expression (x)) 10676 { 10677 case SYMBOL_SMALL_GOT_4G: 10678 asm_fprintf (asm_out_file, ":got:"); 10679 break; 10680 10681 case SYMBOL_SMALL_TLSGD: 10682 asm_fprintf (asm_out_file, ":tlsgd:"); 10683 break; 10684 10685 case SYMBOL_SMALL_TLSDESC: 10686 asm_fprintf (asm_out_file, ":tlsdesc:"); 10687 break; 10688 10689 case SYMBOL_SMALL_TLSIE: 10690 asm_fprintf (asm_out_file, ":gottprel:"); 10691 break; 10692 10693 case SYMBOL_TLSLE24: 10694 asm_fprintf (asm_out_file, ":tprel:"); 10695 break; 10696 10697 case SYMBOL_TINY_GOT: 10698 gcc_unreachable (); 10699 break; 10700 10701 default: 10702 break; 10703 } 10704 output_addr_const (asm_out_file, x); 10705 break; 10706 10707 case 'L': 10708 switch (aarch64_classify_symbolic_expression (x)) 10709 { 10710 case SYMBOL_SMALL_GOT_4G: 10711 asm_fprintf (asm_out_file, ":lo12:"); 10712 break; 10713 10714 case SYMBOL_SMALL_TLSGD: 10715 asm_fprintf (asm_out_file, ":tlsgd_lo12:"); 10716 break; 10717 10718 case SYMBOL_SMALL_TLSDESC: 10719 asm_fprintf (asm_out_file, ":tlsdesc_lo12:"); 10720 break; 10721 10722 case SYMBOL_SMALL_TLSIE: 10723 asm_fprintf (asm_out_file, ":gottprel_lo12:"); 10724 break; 10725 10726 case SYMBOL_TLSLE12: 10727 asm_fprintf (asm_out_file, ":tprel_lo12:"); 10728 break; 10729 10730 case SYMBOL_TLSLE24: 10731 asm_fprintf (asm_out_file, ":tprel_lo12_nc:"); 10732 break; 10733 10734 case SYMBOL_TINY_GOT: 10735 asm_fprintf (asm_out_file, ":got:"); 10736 break; 10737 10738 case SYMBOL_TINY_TLSIE: 10739 asm_fprintf (asm_out_file, ":gottprel:"); 10740 break; 10741 10742 default: 10743 break; 10744 } 10745 output_addr_const (asm_out_file, x); 10746 break; 10747 10748 case 'G': 10749 switch (aarch64_classify_symbolic_expression (x)) 10750 { 10751 case SYMBOL_TLSLE24: 10752 asm_fprintf (asm_out_file, ":tprel_hi12:"); 10753 break; 10754 default: 10755 break; 10756 } 10757 output_addr_const (asm_out_file, x); 10758 break; 10759 10760 case 'k': 10761 { 10762 HOST_WIDE_INT cond_code; 10763 10764 if (!CONST_INT_P (x)) 10765 { 10766 output_operand_lossage ("invalid operand for '%%%c'", code); 10767 return; 10768 } 10769 10770 cond_code = INTVAL (x); 10771 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV); 10772 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]); 10773 } 10774 break; 10775 10776 case 'y': 10777 case 'z': 10778 { 10779 machine_mode mode = GET_MODE (x); 10780 10781 if (GET_CODE (x) != MEM 10782 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16))) 10783 { 10784 output_operand_lossage ("invalid operand for '%%%c'", code); 10785 return; 10786 } 10787 10788 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0), 10789 code == 'y' 10790 ? 
ADDR_QUERY_LDP_STP_N 10791 : ADDR_QUERY_LDP_STP)) 10792 output_operand_lossage ("invalid operand prefix '%%%c'", code); 10793 } 10794 break; 10795 10796 default: 10797 output_operand_lossage ("invalid operand prefix '%%%c'", code); 10798 return; 10799 } 10800} 10801 10802/* Print address 'x' of a memory access with mode 'mode'. 10803 'op' is the context required by aarch64_classify_address. It can either be 10804 MEM for a normal memory access or PARALLEL for LDP/STP. */ 10805static bool 10806aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x, 10807 aarch64_addr_query_type type) 10808{ 10809 struct aarch64_address_info addr; 10810 unsigned int size, vec_flags; 10811 10812 /* Check all addresses are Pmode - including ILP32. */ 10813 if (GET_MODE (x) != Pmode 10814 && (!CONST_INT_P (x) 10815 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x))) 10816 { 10817 output_operand_lossage ("invalid address mode"); 10818 return false; 10819 } 10820 10821 if (aarch64_classify_address (&addr, x, mode, true, type)) 10822 switch (addr.type) 10823 { 10824 case ADDRESS_REG_IMM: 10825 if (known_eq (addr.const_offset, 0)) 10826 { 10827 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]); 10828 return true; 10829 } 10830 10831 vec_flags = aarch64_classify_vector_mode (mode); 10832 if (vec_flags & VEC_ANY_SVE) 10833 { 10834 HOST_WIDE_INT vnum 10835 = exact_div (addr.const_offset, 10836 aarch64_vl_bytes (mode, vec_flags)).to_constant (); 10837 asm_fprintf (f, "[%s, #%wd, mul vl]", 10838 reg_names[REGNO (addr.base)], vnum); 10839 return true; 10840 } 10841 10842 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)], 10843 INTVAL (addr.offset)); 10844 return true; 10845 10846 case ADDRESS_REG_REG: 10847 if (addr.shift == 0) 10848 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)], 10849 reg_names [REGNO (addr.offset)]); 10850 else 10851 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)], 10852 reg_names [REGNO (addr.offset)], addr.shift); 10853 return true; 10854 10855 case ADDRESS_REG_UXTW: 10856 if (addr.shift == 0) 10857 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)], 10858 REGNO (addr.offset) - R0_REGNUM); 10859 else 10860 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)], 10861 REGNO (addr.offset) - R0_REGNUM, addr.shift); 10862 return true; 10863 10864 case ADDRESS_REG_SXTW: 10865 if (addr.shift == 0) 10866 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)], 10867 REGNO (addr.offset) - R0_REGNUM); 10868 else 10869 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)], 10870 REGNO (addr.offset) - R0_REGNUM, addr.shift); 10871 return true; 10872 10873 case ADDRESS_REG_WB: 10874 /* Writeback is only supported for fixed-width modes. 
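For example, a DImode post-increment access with base register x0 is printed as [x0], 8.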
*/ 10875 size = GET_MODE_SIZE (mode).to_constant (); 10876 switch (GET_CODE (x)) 10877 { 10878 case PRE_INC: 10879 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size); 10880 return true; 10881 case POST_INC: 10882 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size); 10883 return true; 10884 case PRE_DEC: 10885 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size); 10886 return true; 10887 case POST_DEC: 10888 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size); 10889 return true; 10890 case PRE_MODIFY: 10891 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)], 10892 INTVAL (addr.offset)); 10893 return true; 10894 case POST_MODIFY: 10895 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)], 10896 INTVAL (addr.offset)); 10897 return true; 10898 default: 10899 break; 10900 } 10901 break; 10902 10903 case ADDRESS_LO_SUM: 10904 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]); 10905 output_addr_const (f, addr.offset); 10906 asm_fprintf (f, "]"); 10907 return true; 10908 10909 case ADDRESS_SYMBOLIC: 10910 output_addr_const (f, x); 10911 return true; 10912 } 10913 10914 return false; 10915} 10916 10917/* Print address 'x' of a memory access with mode 'mode'. */ 10918static void 10919aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x) 10920{ 10921 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY)) 10922 output_addr_const (f, x); 10923} 10924 10925/* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */ 10926 10927static bool 10928aarch64_output_addr_const_extra (FILE *file, rtx x) 10929{ 10930 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR) 10931 { 10932 output_addr_const (file, XVECEXP (x, 0, 0)); 10933 return true; 10934 } 10935 return false; 10936} 10937 10938bool 10939aarch64_label_mentioned_p (rtx x) 10940{ 10941 const char *fmt; 10942 int i; 10943 10944 if (GET_CODE (x) == LABEL_REF) 10945 return true; 10946 10947 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the 10948 referencing instruction, but they are constant offsets, not 10949 symbols. */ 10950 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS) 10951 return false; 10952 10953 fmt = GET_RTX_FORMAT (GET_CODE (x)); 10954 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--) 10955 { 10956 if (fmt[i] == 'E') 10957 { 10958 int j; 10959 10960 for (j = XVECLEN (x, i) - 1; j >= 0; j--) 10961 if (aarch64_label_mentioned_p (XVECEXP (x, i, j))) 10962 return 1; 10963 } 10964 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i))) 10965 return 1; 10966 } 10967 10968 return 0; 10969} 10970 10971/* Implement REGNO_REG_CLASS. */ 10972 10973enum reg_class 10974aarch64_regno_regclass (unsigned regno) 10975{ 10976 if (STUB_REGNUM_P (regno)) 10977 return STUB_REGS; 10978 10979 if (GP_REGNUM_P (regno)) 10980 return GENERAL_REGS; 10981 10982 if (regno == SP_REGNUM) 10983 return STACK_REG; 10984 10985 if (regno == FRAME_POINTER_REGNUM 10986 || regno == ARG_POINTER_REGNUM) 10987 return POINTER_REGS; 10988 10989 if (FP_REGNUM_P (regno)) 10990 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS 10991 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS); 10992 10993 if (PR_REGNUM_P (regno)) 10994 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS; 10995 10996 if (regno == FFR_REGNUM || regno == FFRT_REGNUM) 10997 return FFR_REGS; 10998 10999 return NO_REGS; 11000} 11001 11002/* OFFSET is an address offset for mode MODE, which has SIZE bytes. 
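(For a DImode access, for example, SIZE is 8 and the directly encodable unsigned offsets are multiples of 8 up to 32760.)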
11003 If OFFSET is out of range, return an offset of an anchor point 11004 that is in range. Return 0 otherwise. */ 11005 11006static HOST_WIDE_INT 11007aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size, 11008 machine_mode mode) 11009{ 11010 /* Does it look like we'll need a 16-byte load/store-pair operation? */ 11011 if (size > 16) 11012 return (offset + 0x400) & ~0x7f0; 11013 11014 /* For offsets that aren't a multiple of the access size, the limit is 11015 -256...255. */ 11016 if (offset & (size - 1)) 11017 { 11018 /* BLKmode typically uses LDP of X-registers. */ 11019 if (mode == BLKmode) 11020 return (offset + 512) & ~0x3ff; 11021 return (offset + 0x100) & ~0x1ff; 11022 } 11023 11024 /* Small negative offsets are supported. */ 11025 if (IN_RANGE (offset, -256, 0)) 11026 return 0; 11027 11028 if (mode == TImode || mode == TFmode) 11029 return (offset + 0x100) & ~0x1ff; 11030 11031 /* Use 12-bit offset by access size. */ 11032 return offset & (~0xfff * size); 11033} 11034 11035static rtx 11036aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode) 11037{ 11038 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask), 11039 where mask is selected by alignment and size of the offset. 11040 We try to pick as large a range for the offset as possible to 11041 maximize the chance of a CSE. However, for aligned addresses 11042 we limit the range to 4k so that structures with different sized 11043 elements are likely to use the same base. We need to be careful 11044 not to split a CONST for some forms of address expression, otherwise 11045 it will generate sub-optimal code. */ 11046 11047 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1))) 11048 { 11049 rtx base = XEXP (x, 0); 11050 rtx offset_rtx = XEXP (x, 1); 11051 HOST_WIDE_INT offset = INTVAL (offset_rtx); 11052 11053 if (GET_CODE (base) == PLUS) 11054 { 11055 rtx op0 = XEXP (base, 0); 11056 rtx op1 = XEXP (base, 1); 11057 11058 /* Force any scaling into a temp for CSE. */ 11059 op0 = force_reg (Pmode, op0); 11060 op1 = force_reg (Pmode, op1); 11061 11062 /* Let the pointer register be in op0. */ 11063 if (REG_POINTER (op1)) 11064 std::swap (op0, op1); 11065 11066 /* If the pointer is virtual or frame related, then we know that 11067 virtual register instantiation or register elimination is going 11068 to apply a second constant. We want the two constants folded 11069 together easily. Therefore, emit as (OP0 + CONST) + OP1. */ 11070 if (virt_or_elim_regno_p (REGNO (op0))) 11071 { 11072 base = expand_binop (Pmode, add_optab, op0, offset_rtx, 11073 NULL_RTX, true, OPTAB_DIRECT); 11074 return gen_rtx_PLUS (Pmode, base, op1); 11075 } 11076 11077 /* Otherwise, in order to encourage CSE (and thence loop strength 11078 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. 
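That way references such as (OP0 + OP1) + 8 and (OP0 + OP1) + 16 can share a single temporary register holding OP0 + OP1.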
*/ 11079 base = expand_binop (Pmode, add_optab, op0, op1, 11080 NULL_RTX, true, OPTAB_DIRECT); 11081 x = gen_rtx_PLUS (Pmode, base, offset_rtx); 11082 } 11083 11084 HOST_WIDE_INT size; 11085 if (GET_MODE_SIZE (mode).is_constant (&size)) 11086 { 11087 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size, 11088 mode); 11089 if (base_offset != 0) 11090 { 11091 base = plus_constant (Pmode, base, base_offset); 11092 base = force_operand (base, NULL_RTX); 11093 return plus_constant (Pmode, base, offset - base_offset); 11094 } 11095 } 11096 } 11097 11098 return x; 11099} 11100 11101static reg_class_t 11102aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x, 11103 reg_class_t rclass, 11104 machine_mode mode, 11105 secondary_reload_info *sri) 11106{ 11107 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use 11108 LDR and STR. See the comment at the head of aarch64-sve.md for 11109 more details about the big-endian handling. */ 11110 if (reg_class_subset_p (rclass, FP_REGS) 11111 && !((REG_P (x) && HARD_REGISTER_P (x)) 11112 || aarch64_simd_valid_immediate (x, NULL)) 11113 && mode != VNx16QImode) 11114 { 11115 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 11116 if ((vec_flags & VEC_SVE_DATA) 11117 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN)) 11118 { 11119 sri->icode = CODE_FOR_aarch64_sve_reload_mem; 11120 return NO_REGS; 11121 } 11122 } 11123 11124 /* If we have to disable direct literal pool loads and stores because the 11125 function is too big, then we need a scratch register. */ 11126 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x) 11127 && (SCALAR_FLOAT_MODE_P (GET_MODE (x)) 11128 || targetm.vector_mode_supported_p (GET_MODE (x))) 11129 && !aarch64_pcrelative_literal_loads) 11130 { 11131 sri->icode = code_for_aarch64_reload_movcp (mode, DImode); 11132 return NO_REGS; 11133 } 11134 11135 /* Without the TARGET_SIMD instructions we cannot move a Q register 11136 to a Q register directly. We need a scratch. */ 11137 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x) 11138 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD 11139 && reg_class_subset_p (rclass, FP_REGS)) 11140 { 11141 sri->icode = code_for_aarch64_reload_mov (mode); 11142 return NO_REGS; 11143 } 11144 11145 /* A TFmode or TImode memory access should be handled via an FP_REGS 11146 because AArch64 has richer addressing modes for LDR/STR instructions 11147 than LDP/STP instructions. */ 11148 if (TARGET_FLOAT && rclass == GENERAL_REGS 11149 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x)) 11150 return FP_REGS; 11151 11152 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x)) 11153 return GENERAL_REGS; 11154 11155 return NO_REGS; 11156} 11157 11158static bool 11159aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to) 11160{ 11161 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM); 11162 11163 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM 11164 can only eliminate to HARD_FRAME_POINTER_REGNUM. 
*/ 11165 if (frame_pointer_needed) 11166 return to == HARD_FRAME_POINTER_REGNUM; 11167 return true; 11168} 11169 11170poly_int64 11171aarch64_initial_elimination_offset (unsigned from, unsigned to) 11172{ 11173 if (to == HARD_FRAME_POINTER_REGNUM) 11174 { 11175 if (from == ARG_POINTER_REGNUM) 11176 return cfun->machine->frame.hard_fp_offset; 11177 11178 if (from == FRAME_POINTER_REGNUM) 11179 return cfun->machine->frame.hard_fp_offset 11180 - cfun->machine->frame.locals_offset; 11181 } 11182 11183 if (to == STACK_POINTER_REGNUM) 11184 { 11185 if (from == FRAME_POINTER_REGNUM) 11186 return cfun->machine->frame.frame_size 11187 - cfun->machine->frame.locals_offset; 11188 } 11189 11190 return cfun->machine->frame.frame_size; 11191} 11192 11193 11194/* Get return address without mangling. */ 11195 11196rtx 11197aarch64_return_addr_rtx (void) 11198{ 11199 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM); 11200 /* Note: aarch64_return_address_signing_enabled only 11201 works after cfun->machine->frame.laid_out is set, 11202 so here we don't know if the return address will 11203 be signed or not. */ 11204 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM); 11205 emit_move_insn (lr, val); 11206 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ()); 11207 return lr; 11208} 11209 11210 11211/* Implement RETURN_ADDR_RTX. We do not support moving back to a 11212 previous frame. */ 11213 11214rtx 11215aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED) 11216{ 11217 if (count != 0) 11218 return const0_rtx; 11219 return aarch64_return_addr_rtx (); 11220} 11221 11222 11223static void 11224aarch64_asm_trampoline_template (FILE *f) 11225{ 11226 int offset1 = 24; 11227 int offset2 = 28; 11228 11229 if (aarch64_bti_enabled ()) 11230 { 11231 asm_fprintf (f, "\thint\t34 // bti c\n"); 11232 offset1 -= 4; 11233 offset2 -= 4; 11234 } 11235 11236 if (TARGET_ILP32) 11237 { 11238 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1); 11239 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM, 11240 offset1); 11241 } 11242 else 11243 { 11244 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1); 11245 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM], 11246 offset2); 11247 } 11248 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]); 11249 11250 /* We always emit a speculation barrier. 11251 This is because the same trampoline template is used for every nested 11252 function. Since nested functions are not particularly common or 11253 performant we don't worry too much about the extra instructions to copy 11254 around. 11255 This is not yet a problem, since we have not yet implemented function 11256 specific attributes to choose between hardening against straight line 11257 speculation or not, but such function specific attributes are likely to 11258 happen in the future. */ 11259 asm_fprintf (f, "\tdsb\tsy\n\tisb\n"); 11260 11261 /* The trampoline needs an extra padding instruction. In case if BTI is 11262 enabled the padding instruction is replaced by the BTI instruction at 11263 the beginning. */ 11264 if (!aarch64_bti_enabled ()) 11265 assemble_aligned_integer (4, const0_rtx); 11266 11267 assemble_aligned_integer (POINTER_BYTES, const0_rtx); 11268 assemble_aligned_integer (POINTER_BYTES, const0_rtx); 11269} 11270 11271static void 11272aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) 11273{ 11274 rtx fnaddr, mem, a_tramp; 11275 const int tramp_code_sz = 24; 11276 11277 /* Don't need to copy the trailing D-words, we fill those in below. 
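The trampoline consists of the code bytes copied from the template, followed by two pointer-sized words holding the target function's address and the static chain value, which are stored explicitly below.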
*/ 11278 /* We create our own memory address in Pmode so that `emit_block_move` can 11279 use parts of the backend which expect Pmode addresses. */ 11280 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0)); 11281 emit_block_move (gen_rtx_MEM (BLKmode, temp), 11282 assemble_trampoline_template (), 11283 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL); 11284 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz); 11285 fnaddr = XEXP (DECL_RTL (fndecl), 0); 11286 if (GET_MODE (fnaddr) != ptr_mode) 11287 fnaddr = convert_memory_address (ptr_mode, fnaddr); 11288 emit_move_insn (mem, fnaddr); 11289 11290 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES); 11291 emit_move_insn (mem, chain_value); 11292 11293 /* XXX We should really define a "clear_cache" pattern and use 11294 gen_clear_cache(). */ 11295 a_tramp = XEXP (m_tramp, 0); 11296 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"), 11297 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode, 11298 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE), 11299 ptr_mode); 11300} 11301 11302static unsigned char 11303aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode) 11304{ 11305 /* ??? Logically we should only need to provide a value when 11306 HARD_REGNO_MODE_OK says that at least one register in REGCLASS 11307 can hold MODE, but at the moment we need to handle all modes. 11308 Just ignore any runtime parts for registers that can't store them. */ 11309 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode)); 11310 unsigned int nregs, vec_flags; 11311 switch (regclass) 11312 { 11313 case STUB_REGS: 11314 case TAILCALL_ADDR_REGS: 11315 case POINTER_REGS: 11316 case GENERAL_REGS: 11317 case ALL_REGS: 11318 case POINTER_AND_FP_REGS: 11319 case FP_REGS: 11320 case FP_LO_REGS: 11321 case FP_LO8_REGS: 11322 vec_flags = aarch64_classify_vector_mode (mode); 11323 if ((vec_flags & VEC_SVE_DATA) 11324 && constant_multiple_p (GET_MODE_SIZE (mode), 11325 aarch64_vl_bytes (mode, vec_flags), &nregs)) 11326 return nregs; 11327 return (vec_flags & VEC_ADVSIMD 11328 ? CEIL (lowest_size, UNITS_PER_VREG) 11329 : CEIL (lowest_size, UNITS_PER_WORD)); 11330 case STACK_REG: 11331 case PR_REGS: 11332 case PR_LO_REGS: 11333 case PR_HI_REGS: 11334 case FFR_REGS: 11335 case PR_AND_FFR_REGS: 11336 return 1; 11337 11338 case NO_REGS: 11339 return 0; 11340 11341 default: 11342 break; 11343 } 11344 gcc_unreachable (); 11345} 11346 11347static reg_class_t 11348aarch64_preferred_reload_class (rtx x, reg_class_t regclass) 11349{ 11350 if (regclass == POINTER_REGS) 11351 return GENERAL_REGS; 11352 11353 if (regclass == STACK_REG) 11354 { 11355 if (REG_P(x) 11356 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS)) 11357 return regclass; 11358 11359 return NO_REGS; 11360 } 11361 11362 /* Register eliminiation can result in a request for 11363 SP+constant->FP_REGS. We cannot support such operations which 11364 use SP as source and an FP_REG as destination, so reject out 11365 right now. */ 11366 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS) 11367 { 11368 rtx lhs = XEXP (x, 0); 11369 11370 /* Look through a possible SUBREG introduced by ILP32. 
*/ 11371 if (GET_CODE (lhs) == SUBREG) 11372 lhs = SUBREG_REG (lhs); 11373 11374 gcc_assert (REG_P (lhs)); 11375 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)), 11376 POINTER_REGS)); 11377 return NO_REGS; 11378 } 11379 11380 return regclass; 11381} 11382 11383void 11384aarch64_asm_output_labelref (FILE* f, const char *name) 11385{ 11386 asm_fprintf (f, "%U%s", name); 11387} 11388 11389static void 11390aarch64_elf_asm_constructor (rtx symbol, int priority) 11391{ 11392 if (priority == DEFAULT_INIT_PRIORITY) 11393 default_ctor_section_asm_out_constructor (symbol, priority); 11394 else 11395 { 11396 section *s; 11397 /* While priority is known to be in range [0, 65535], so 18 bytes 11398 would be enough, the compiler might not know that. To avoid 11399 -Wformat-truncation false positive, use a larger size. */ 11400 char buf[23]; 11401 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority); 11402 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL); 11403 switch_to_section (s); 11404 assemble_align (POINTER_SIZE); 11405 assemble_aligned_integer (POINTER_BYTES, symbol); 11406 } 11407} 11408 11409static void 11410aarch64_elf_asm_destructor (rtx symbol, int priority) 11411{ 11412 if (priority == DEFAULT_INIT_PRIORITY) 11413 default_dtor_section_asm_out_destructor (symbol, priority); 11414 else 11415 { 11416 section *s; 11417 /* While priority is known to be in range [0, 65535], so 18 bytes 11418 would be enough, the compiler might not know that. To avoid 11419 -Wformat-truncation false positive, use a larger size. */ 11420 char buf[23]; 11421 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority); 11422 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL); 11423 switch_to_section (s); 11424 assemble_align (POINTER_SIZE); 11425 assemble_aligned_integer (POINTER_BYTES, symbol); 11426 } 11427} 11428 11429const char* 11430aarch64_output_casesi (rtx *operands) 11431{ 11432 char buf[100]; 11433 char label[100]; 11434 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2]))); 11435 int index; 11436 static const char *const patterns[4][2] = 11437 { 11438 { 11439 "ldrb\t%w3, [%0,%w1,uxtw]", 11440 "add\t%3, %4, %w3, sxtb #2" 11441 }, 11442 { 11443 "ldrh\t%w3, [%0,%w1,uxtw #1]", 11444 "add\t%3, %4, %w3, sxth #2" 11445 }, 11446 { 11447 "ldr\t%w3, [%0,%w1,uxtw #2]", 11448 "add\t%3, %4, %w3, sxtw #2" 11449 }, 11450 /* We assume that DImode is only generated when not optimizing and 11451 that we don't really need 64-bit address offsets. That would 11452 imply an object file with 8GB of code in a single function! */ 11453 { 11454 "ldr\t%w3, [%0,%w1,uxtw #2]", 11455 "add\t%3, %4, %w3, sxtw #2" 11456 } 11457 }; 11458 11459 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC); 11460 11461 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec)); 11462 index = exact_log2 (GET_MODE_SIZE (mode)); 11463 11464 gcc_assert (index >= 0 && index <= 3); 11465 11466 /* Need to implement table size reduction, by chaning the code below. 
*/ 11467 output_asm_insn (patterns[index][0], operands); 11468 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2])); 11469 snprintf (buf, sizeof (buf), 11470 "adr\t%%4, %s", targetm.strip_name_encoding (label)); 11471 output_asm_insn (buf, operands); 11472 output_asm_insn (patterns[index][1], operands); 11473 output_asm_insn ("br\t%3", operands); 11474 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()), 11475 operands); 11476 assemble_label (asm_out_file, label); 11477 return ""; 11478} 11479 11480 11481/* Return size in bits of an arithmetic operand which is shifted/scaled and 11482 masked such that it is suitable for a UXTB, UXTH, or UXTW extend 11483 operator. */ 11484 11485int 11486aarch64_uxt_size (int shift, HOST_WIDE_INT mask) 11487{ 11488 if (shift >= 0 && shift <= 3) 11489 { 11490 int size; 11491 for (size = 8; size <= 32; size *= 2) 11492 { 11493 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1; 11494 if (mask == bits << shift) 11495 return size; 11496 } 11497 } 11498 return 0; 11499} 11500 11501/* Constant pools are per function only when PC relative 11502 literal loads are true or we are in the large memory 11503 model. */ 11504 11505static inline bool 11506aarch64_can_use_per_function_literal_pools_p (void) 11507{ 11508 return (aarch64_pcrelative_literal_loads 11509 || aarch64_cmodel == AARCH64_CMODEL_LARGE); 11510} 11511 11512static bool 11513aarch64_use_blocks_for_constant_p (machine_mode, const_rtx) 11514{ 11515 /* We can't use blocks for constants when we're using a per-function 11516 constant pool. */ 11517 return !aarch64_can_use_per_function_literal_pools_p (); 11518} 11519 11520/* Select appropriate section for constants depending 11521 on where we place literal pools. */ 11522 11523static section * 11524aarch64_select_rtx_section (machine_mode mode, 11525 rtx x, 11526 unsigned HOST_WIDE_INT align) 11527{ 11528 if (aarch64_can_use_per_function_literal_pools_p ()) 11529 return function_section (current_function_decl); 11530 11531 return default_elf_select_rtx_section (mode, x, align); 11532} 11533 11534/* Implement ASM_OUTPUT_POOL_EPILOGUE. */ 11535void 11536aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree, 11537 HOST_WIDE_INT offset) 11538{ 11539 /* When using per-function literal pools, we must ensure that any code 11540 section is aligned to the minimal instruction length, lest we get 11541 errors from the assembler re "unaligned instructions". */ 11542 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ()) 11543 ASM_OUTPUT_ALIGN (f, 2); 11544} 11545 11546/* Costs. */ 11547 11548/* Helper function for rtx cost calculation. Strip a shift expression 11549 from X. Returns the inner operand if successful, or the original 11550 expression on failure. */ 11551static rtx 11552aarch64_strip_shift (rtx x) 11553{ 11554 rtx op = x; 11555 11556 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant 11557 we can convert both to ROR during final output. */ 11558 if ((GET_CODE (op) == ASHIFT 11559 || GET_CODE (op) == ASHIFTRT 11560 || GET_CODE (op) == LSHIFTRT 11561 || GET_CODE (op) == ROTATERT 11562 || GET_CODE (op) == ROTATE) 11563 && CONST_INT_P (XEXP (op, 1))) 11564 return XEXP (op, 0); 11565 11566 if (GET_CODE (op) == MULT 11567 && CONST_INT_P (XEXP (op, 1)) 11568 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64) 11569 return XEXP (op, 0); 11570 11571 return x; 11572} 11573 11574/* Helper function for rtx cost calculation. Strip an extend 11575 expression from X. 
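For example, a (zero_extend:DI ...) of an SImode register strips down to that register.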
Returns the inner operand if successful, or the 11576 original expression on failure. We deal with a number of possible 11577 canonicalization variations here. If STRIP_SHIFT is true, then 11578 we can strip off a shift also. */ 11579static rtx 11580aarch64_strip_extend (rtx x, bool strip_shift) 11581{ 11582 scalar_int_mode mode; 11583 rtx op = x; 11584 11585 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode)) 11586 return op; 11587 11588 /* Zero and sign extraction of a widened value. */ 11589 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT) 11590 && XEXP (op, 2) == const0_rtx 11591 && GET_CODE (XEXP (op, 0)) == MULT 11592 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1), 11593 XEXP (op, 1))) 11594 return XEXP (XEXP (op, 0), 0); 11595 11596 /* It can also be represented (for zero-extend) as an AND with an 11597 immediate. */ 11598 if (GET_CODE (op) == AND 11599 && GET_CODE (XEXP (op, 0)) == MULT 11600 && CONST_INT_P (XEXP (XEXP (op, 0), 1)) 11601 && CONST_INT_P (XEXP (op, 1)) 11602 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))), 11603 INTVAL (XEXP (op, 1))) != 0) 11604 return XEXP (XEXP (op, 0), 0); 11605 11606 /* Now handle extended register, as this may also have an optional 11607 left shift by 1..4. */ 11608 if (strip_shift 11609 && GET_CODE (op) == ASHIFT 11610 && CONST_INT_P (XEXP (op, 1)) 11611 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4) 11612 op = XEXP (op, 0); 11613 11614 if (GET_CODE (op) == ZERO_EXTEND 11615 || GET_CODE (op) == SIGN_EXTEND) 11616 op = XEXP (op, 0); 11617 11618 if (op != x) 11619 return op; 11620 11621 return x; 11622} 11623 11624/* Return true iff CODE is a shift supported in combination 11625 with arithmetic instructions. */ 11626 11627static bool 11628aarch64_shift_p (enum rtx_code code) 11629{ 11630 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT; 11631} 11632 11633 11634/* Return true iff X is a cheap shift without a sign extend. */ 11635 11636static bool 11637aarch64_cheap_mult_shift_p (rtx x) 11638{ 11639 rtx op0, op1; 11640 11641 op0 = XEXP (x, 0); 11642 op1 = XEXP (x, 1); 11643 11644 if (!(aarch64_tune_params.extra_tuning_flags 11645 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND)) 11646 return false; 11647 11648 if (GET_CODE (op0) == SIGN_EXTEND) 11649 return false; 11650 11651 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1) 11652 && UINTVAL (op1) <= 4) 11653 return true; 11654 11655 if (GET_CODE (x) != MULT || !CONST_INT_P (op1)) 11656 return false; 11657 11658 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1)); 11659 11660 if (l2 > 0 && l2 <= 4) 11661 return true; 11662 11663 return false; 11664} 11665 11666/* Helper function for rtx cost calculation. Calculate the cost of 11667 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx. 11668 Return the calculated cost of the expression, recursing manually in to 11669 operands where needed. */ 11670 11671static int 11672aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed) 11673{ 11674 rtx op0, op1; 11675 const struct cpu_cost_table *extra_cost 11676 = aarch64_tune_params.insn_extra_cost; 11677 int cost = 0; 11678 bool compound_p = (outer == PLUS || outer == MINUS); 11679 machine_mode mode = GET_MODE (x); 11680 11681 gcc_checking_assert (code == MULT); 11682 11683 op0 = XEXP (x, 0); 11684 op1 = XEXP (x, 1); 11685 11686 if (VECTOR_MODE_P (mode)) 11687 mode = GET_MODE_INNER (mode); 11688 11689 /* Integer multiply/fma. 
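A multiplication by a power of two, e.g. (mult x 8), is costed as the shift it will become; a multiply feeding a PLUS or MINUS is costed as MADD or MSUB; and a multiply of two sign- or zero-extended operands is costed as SMULL/UMULL or their accumulating forms.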
*/ 11690 if (GET_MODE_CLASS (mode) == MODE_INT) 11691 { 11692 /* The multiply will be canonicalized as a shift, cost it as such. */ 11693 if (aarch64_shift_p (GET_CODE (x)) 11694 || (CONST_INT_P (op1) 11695 && exact_log2 (INTVAL (op1)) > 0)) 11696 { 11697 bool is_extend = GET_CODE (op0) == ZERO_EXTEND 11698 || GET_CODE (op0) == SIGN_EXTEND; 11699 if (speed) 11700 { 11701 if (compound_p) 11702 { 11703 /* If the shift is considered cheap, 11704 then don't add any cost. */ 11705 if (aarch64_cheap_mult_shift_p (x)) 11706 ; 11707 else if (REG_P (op1)) 11708 /* ARITH + shift-by-register. */ 11709 cost += extra_cost->alu.arith_shift_reg; 11710 else if (is_extend) 11711 /* ARITH + extended register. We don't have a cost field 11712 for ARITH+EXTEND+SHIFT, so use extend_arith here. */ 11713 cost += extra_cost->alu.extend_arith; 11714 else 11715 /* ARITH + shift-by-immediate. */ 11716 cost += extra_cost->alu.arith_shift; 11717 } 11718 else 11719 /* LSL (immediate). */ 11720 cost += extra_cost->alu.shift; 11721 11722 } 11723 /* Strip extends as we will have costed them in the case above. */ 11724 if (is_extend) 11725 op0 = aarch64_strip_extend (op0, true); 11726 11727 cost += rtx_cost (op0, VOIDmode, code, 0, speed); 11728 11729 return cost; 11730 } 11731 11732 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a 11733 compound and let the below cases handle it. After all, MNEG is a 11734 special-case alias of MSUB. */ 11735 if (GET_CODE (op0) == NEG) 11736 { 11737 op0 = XEXP (op0, 0); 11738 compound_p = true; 11739 } 11740 11741 /* Integer multiplies or FMAs have zero/sign extending variants. */ 11742 if ((GET_CODE (op0) == ZERO_EXTEND 11743 && GET_CODE (op1) == ZERO_EXTEND) 11744 || (GET_CODE (op0) == SIGN_EXTEND 11745 && GET_CODE (op1) == SIGN_EXTEND)) 11746 { 11747 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed); 11748 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed); 11749 11750 if (speed) 11751 { 11752 if (compound_p) 11753 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */ 11754 cost += extra_cost->mult[0].extend_add; 11755 else 11756 /* MUL/SMULL/UMULL. */ 11757 cost += extra_cost->mult[0].extend; 11758 } 11759 11760 return cost; 11761 } 11762 11763 /* This is either an integer multiply or a MADD. In both cases 11764 we want to recurse and cost the operands. */ 11765 cost += rtx_cost (op0, mode, MULT, 0, speed); 11766 cost += rtx_cost (op1, mode, MULT, 1, speed); 11767 11768 if (speed) 11769 { 11770 if (compound_p) 11771 /* MADD/MSUB. */ 11772 cost += extra_cost->mult[mode == DImode].add; 11773 else 11774 /* MUL. */ 11775 cost += extra_cost->mult[mode == DImode].simple; 11776 } 11777 11778 return cost; 11779 } 11780 else 11781 { 11782 if (speed) 11783 { 11784 /* Floating-point FMA/FMUL can also support negations of the 11785 operands, unless the rounding mode is upward or downward in 11786 which case FNMUL is different than FMUL with operand negation. */ 11787 bool neg0 = GET_CODE (op0) == NEG; 11788 bool neg1 = GET_CODE (op1) == NEG; 11789 if (compound_p || !flag_rounding_math || (neg0 && neg1)) 11790 { 11791 if (neg0) 11792 op0 = XEXP (op0, 0); 11793 if (neg1) 11794 op1 = XEXP (op1, 0); 11795 } 11796 11797 if (compound_p) 11798 /* FMADD/FNMADD/FNMSUB/FMSUB. */ 11799 cost += extra_cost->fp[mode == DFmode].fma; 11800 else 11801 /* FMUL/FNMUL. 
*/ 11802 cost += extra_cost->fp[mode == DFmode].mult; 11803 } 11804 11805 cost += rtx_cost (op0, mode, MULT, 0, speed); 11806 cost += rtx_cost (op1, mode, MULT, 1, speed); 11807 return cost; 11808 } 11809} 11810 11811static int 11812aarch64_address_cost (rtx x, 11813 machine_mode mode, 11814 addr_space_t as ATTRIBUTE_UNUSED, 11815 bool speed) 11816{ 11817 enum rtx_code c = GET_CODE (x); 11818 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost; 11819 struct aarch64_address_info info; 11820 int cost = 0; 11821 info.shift = 0; 11822 11823 if (!aarch64_classify_address (&info, x, mode, false)) 11824 { 11825 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF) 11826 { 11827 /* This is a CONST or SYMBOL ref which will be split 11828 in a different way depending on the code model in use. 11829 Cost it through the generic infrastructure. */ 11830 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed); 11831 /* Divide through by the cost of one instruction to 11832 bring it to the same units as the address costs. */ 11833 cost_symbol_ref /= COSTS_N_INSNS (1); 11834 /* The cost is then the cost of preparing the address, 11835 followed by an immediate (possibly 0) offset. */ 11836 return cost_symbol_ref + addr_cost->imm_offset; 11837 } 11838 else 11839 { 11840 /* This is most likely a jump table from a case 11841 statement. */ 11842 return addr_cost->register_offset; 11843 } 11844 } 11845 11846 switch (info.type) 11847 { 11848 case ADDRESS_LO_SUM: 11849 case ADDRESS_SYMBOLIC: 11850 case ADDRESS_REG_IMM: 11851 cost += addr_cost->imm_offset; 11852 break; 11853 11854 case ADDRESS_REG_WB: 11855 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY) 11856 cost += addr_cost->pre_modify; 11857 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY) 11858 cost += addr_cost->post_modify; 11859 else 11860 gcc_unreachable (); 11861 11862 break; 11863 11864 case ADDRESS_REG_REG: 11865 cost += addr_cost->register_offset; 11866 break; 11867 11868 case ADDRESS_REG_SXTW: 11869 cost += addr_cost->register_sextend; 11870 break; 11871 11872 case ADDRESS_REG_UXTW: 11873 cost += addr_cost->register_zextend; 11874 break; 11875 11876 default: 11877 gcc_unreachable (); 11878 } 11879 11880 11881 if (info.shift > 0) 11882 { 11883 /* For the sake of calculating the cost of the shifted register 11884 component, we can treat same sized modes in the same way. */ 11885 if (known_eq (GET_MODE_BITSIZE (mode), 16)) 11886 cost += addr_cost->addr_scale_costs.hi; 11887 else if (known_eq (GET_MODE_BITSIZE (mode), 32)) 11888 cost += addr_cost->addr_scale_costs.si; 11889 else if (known_eq (GET_MODE_BITSIZE (mode), 64)) 11890 cost += addr_cost->addr_scale_costs.di; 11891 else 11892 /* We can't tell, or this is a 128-bit vector. */ 11893 cost += addr_cost->addr_scale_costs.ti; 11894 } 11895 11896 return cost; 11897} 11898 11899/* Return the cost of a branch. If SPEED_P is true then the compiler is 11900 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted 11901 to be taken. */ 11902 11903int 11904aarch64_branch_cost (bool speed_p, bool predictable_p) 11905{ 11906 /* When optimizing for speed, use the cost of unpredictable branches. 
*/ 11907 const struct cpu_branch_cost *branch_costs = 11908 aarch64_tune_params.branch_costs; 11909 11910 if (!speed_p || predictable_p) 11911 return branch_costs->predictable; 11912 else 11913 return branch_costs->unpredictable; 11914} 11915 11916/* Return true if the RTX X in mode MODE is a zero or sign extract 11917 usable in an ADD or SUB (extended register) instruction. */ 11918static bool 11919aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode) 11920{ 11921 /* Catch add with a sign extract. 11922 This is add_<optab><mode>_multp2. */ 11923 if (GET_CODE (x) == SIGN_EXTRACT 11924 || GET_CODE (x) == ZERO_EXTRACT) 11925 { 11926 rtx op0 = XEXP (x, 0); 11927 rtx op1 = XEXP (x, 1); 11928 rtx op2 = XEXP (x, 2); 11929 11930 if (GET_CODE (op0) == MULT 11931 && CONST_INT_P (op1) 11932 && op2 == const0_rtx 11933 && CONST_INT_P (XEXP (op0, 1)) 11934 && aarch64_is_extend_from_extract (mode, 11935 XEXP (op0, 1), 11936 op1)) 11937 { 11938 return true; 11939 } 11940 } 11941 /* The simple case <ARITH>, XD, XN, XM, [us]xt. 11942 No shift. */ 11943 else if (GET_CODE (x) == SIGN_EXTEND 11944 || GET_CODE (x) == ZERO_EXTEND) 11945 return REG_P (XEXP (x, 0)); 11946 11947 return false; 11948} 11949 11950static bool 11951aarch64_frint_unspec_p (unsigned int u) 11952{ 11953 switch (u) 11954 { 11955 case UNSPEC_FRINTZ: 11956 case UNSPEC_FRINTP: 11957 case UNSPEC_FRINTM: 11958 case UNSPEC_FRINTA: 11959 case UNSPEC_FRINTN: 11960 case UNSPEC_FRINTX: 11961 case UNSPEC_FRINTI: 11962 return true; 11963 11964 default: 11965 return false; 11966 } 11967} 11968 11969/* Return true iff X is an rtx that will match an extr instruction 11970 i.e. as described in the *extr<mode>5_insn family of patterns. 11971 OP0 and OP1 will be set to the operands of the shifts involved 11972 on success and will be NULL_RTX otherwise. */ 11973 11974static bool 11975aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1) 11976{ 11977 rtx op0, op1; 11978 scalar_int_mode mode; 11979 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode)) 11980 return false; 11981 11982 *res_op0 = NULL_RTX; 11983 *res_op1 = NULL_RTX; 11984 11985 if (GET_CODE (x) != IOR) 11986 return false; 11987 11988 op0 = XEXP (x, 0); 11989 op1 = XEXP (x, 1); 11990 11991 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT) 11992 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT)) 11993 { 11994 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */ 11995 if (GET_CODE (op1) == ASHIFT) 11996 std::swap (op0, op1); 11997 11998 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1))) 11999 return false; 12000 12001 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1)); 12002 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1)); 12003 12004 if (shft_amnt_0 < GET_MODE_BITSIZE (mode) 12005 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode)) 12006 { 12007 *res_op0 = XEXP (op0, 0); 12008 *res_op1 = XEXP (op1, 0); 12009 return true; 12010 } 12011 } 12012 12013 return false; 12014} 12015 12016/* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)), 12017 storing it in *COST. Result is true if the total cost of the operation 12018 has now been calculated. 
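OP0 is the condition; if OP1 or OP2 is pc_rtx the construct is a conditional branch (CBZ/TBZ/B.cond and the like), otherwise it is typically a CCMP or one of the CSEL family of conditional selects.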
*/ 12019static bool 12020aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed) 12021{ 12022 rtx inner; 12023 rtx comparator; 12024 enum rtx_code cmpcode; 12025 const struct cpu_cost_table *extra_cost 12026 = aarch64_tune_params.insn_extra_cost; 12027 12028 if (COMPARISON_P (op0)) 12029 { 12030 inner = XEXP (op0, 0); 12031 comparator = XEXP (op0, 1); 12032 cmpcode = GET_CODE (op0); 12033 } 12034 else 12035 { 12036 inner = op0; 12037 comparator = const0_rtx; 12038 cmpcode = NE; 12039 } 12040 12041 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC) 12042 { 12043 /* Conditional branch. */ 12044 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC) 12045 return true; 12046 else 12047 { 12048 if (cmpcode == NE || cmpcode == EQ) 12049 { 12050 if (comparator == const0_rtx) 12051 { 12052 /* TBZ/TBNZ/CBZ/CBNZ. */ 12053 if (GET_CODE (inner) == ZERO_EXTRACT) 12054 /* TBZ/TBNZ. */ 12055 *cost += rtx_cost (XEXP (inner, 0), VOIDmode, 12056 ZERO_EXTRACT, 0, speed); 12057 else 12058 /* CBZ/CBNZ. */ 12059 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed); 12060 12061 return true; 12062 } 12063 if (register_operand (inner, VOIDmode) 12064 && aarch64_imm24 (comparator, VOIDmode)) 12065 { 12066 /* SUB and SUBS. */ 12067 *cost += COSTS_N_INSNS (2); 12068 if (speed) 12069 *cost += extra_cost->alu.arith * 2; 12070 return true; 12071 } 12072 } 12073 else if (cmpcode == LT || cmpcode == GE) 12074 { 12075 /* TBZ/TBNZ. */ 12076 if (comparator == const0_rtx) 12077 return true; 12078 } 12079 } 12080 } 12081 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC) 12082 { 12083 /* CCMP. */ 12084 if (GET_CODE (op1) == COMPARE) 12085 { 12086 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */ 12087 if (XEXP (op1, 1) == const0_rtx) 12088 *cost += 1; 12089 if (speed) 12090 { 12091 machine_mode mode = GET_MODE (XEXP (op1, 0)); 12092 const struct cpu_cost_table *extra_cost 12093 = aarch64_tune_params.insn_extra_cost; 12094 12095 if (GET_MODE_CLASS (mode) == MODE_INT) 12096 *cost += extra_cost->alu.arith; 12097 else 12098 *cost += extra_cost->fp[mode == DFmode].compare; 12099 } 12100 return true; 12101 } 12102 12103 /* It's a conditional operation based on the status flags, 12104 so it must be some flavor of CSEL. */ 12105 12106 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */ 12107 if (GET_CODE (op1) == NEG 12108 || GET_CODE (op1) == NOT 12109 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx)) 12110 op1 = XEXP (op1, 0); 12111 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND) 12112 { 12113 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */ 12114 op1 = XEXP (op1, 0); 12115 op2 = XEXP (op2, 0); 12116 } 12117 12118 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed); 12119 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed); 12120 return true; 12121 } 12122 12123 /* We don't know what this is, cost all operands. */ 12124 return false; 12125} 12126 12127/* Check whether X is a bitfield operation of the form shift + extend that 12128 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the 12129 operand to which the bitfield operation is applied. Otherwise return 12130 NULL_RTX. 
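For example, (zero_extend:SI (lshiftrt:HI X (const_int 3))) matches the UBFX form and X, the operand being shifted, is returned.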
*/ 12131 12132static rtx 12133aarch64_extend_bitfield_pattern_p (rtx x) 12134{ 12135 rtx_code outer_code = GET_CODE (x); 12136 machine_mode outer_mode = GET_MODE (x); 12137 12138 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND 12139 && outer_mode != SImode && outer_mode != DImode) 12140 return NULL_RTX; 12141 12142 rtx inner = XEXP (x, 0); 12143 rtx_code inner_code = GET_CODE (inner); 12144 machine_mode inner_mode = GET_MODE (inner); 12145 rtx op = NULL_RTX; 12146 12147 switch (inner_code) 12148 { 12149 case ASHIFT: 12150 if (CONST_INT_P (XEXP (inner, 1)) 12151 && (inner_mode == QImode || inner_mode == HImode)) 12152 op = XEXP (inner, 0); 12153 break; 12154 case LSHIFTRT: 12155 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1)) 12156 && (inner_mode == QImode || inner_mode == HImode)) 12157 op = XEXP (inner, 0); 12158 break; 12159 case ASHIFTRT: 12160 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1)) 12161 && (inner_mode == QImode || inner_mode == HImode)) 12162 op = XEXP (inner, 0); 12163 break; 12164 default: 12165 break; 12166 } 12167 12168 return op; 12169} 12170 12171/* Return true if the mask and a shift amount from an RTX of the form 12172 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of 12173 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */ 12174 12175bool 12176aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask, 12177 rtx shft_amnt) 12178{ 12179 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt) 12180 && INTVAL (mask) > 0 12181 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode) 12182 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0 12183 && (UINTVAL (mask) 12184 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0; 12185} 12186 12187/* Return true if the masks and a shift amount from an RTX of the form 12188 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into 12189 a BFI instruction of mode MODE. See *arch64_bfi patterns. */ 12190 12191bool 12192aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode, 12193 unsigned HOST_WIDE_INT mask1, 12194 unsigned HOST_WIDE_INT shft_amnt, 12195 unsigned HOST_WIDE_INT mask2) 12196{ 12197 unsigned HOST_WIDE_INT t; 12198 12199 /* Verify that there is no overlap in what bits are set in the two masks. */ 12200 if (mask1 != ~mask2) 12201 return false; 12202 12203 /* Verify that mask2 is not all zeros or ones. */ 12204 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U) 12205 return false; 12206 12207 /* The shift amount should always be less than the mode size. */ 12208 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode)); 12209 12210 /* Verify that the mask being shifted is contiguous and would be in the 12211 least significant bits after shifting by shft_amnt. */ 12212 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt); 12213 return (t == (t & -t)); 12214} 12215 12216/* Calculate the cost of calculating X, storing it in *COST. Result 12217 is true if the total cost of the operation has now been calculated. */ 12218static bool 12219aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED, 12220 int param ATTRIBUTE_UNUSED, int *cost, bool speed) 12221{ 12222 rtx op0, op1, op2; 12223 const struct cpu_cost_table *extra_cost 12224 = aarch64_tune_params.insn_extra_cost; 12225 int code = GET_CODE (x); 12226 scalar_int_mode int_mode; 12227 12228 /* By default, assume that everything has equivalent cost to the 12229 cheapest instruction. Any additional costs are applied as a delta 12230 above this default. 
*/ 12231 *cost = COSTS_N_INSNS (1); 12232 12233 switch (code) 12234 { 12235 case SET: 12236 /* The cost depends entirely on the operands to SET. */ 12237 *cost = 0; 12238 op0 = SET_DEST (x); 12239 op1 = SET_SRC (x); 12240 12241 switch (GET_CODE (op0)) 12242 { 12243 case MEM: 12244 if (speed) 12245 { 12246 rtx address = XEXP (op0, 0); 12247 if (VECTOR_MODE_P (mode)) 12248 *cost += extra_cost->ldst.storev; 12249 else if (GET_MODE_CLASS (mode) == MODE_INT) 12250 *cost += extra_cost->ldst.store; 12251 else if (mode == SFmode) 12252 *cost += extra_cost->ldst.storef; 12253 else if (mode == DFmode) 12254 *cost += extra_cost->ldst.stored; 12255 12256 *cost += 12257 COSTS_N_INSNS (aarch64_address_cost (address, mode, 12258 0, speed)); 12259 } 12260 12261 *cost += rtx_cost (op1, mode, SET, 1, speed); 12262 return true; 12263 12264 case SUBREG: 12265 if (! REG_P (SUBREG_REG (op0))) 12266 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed); 12267 12268 /* Fall through. */ 12269 case REG: 12270 /* The cost is one per vector-register copied. */ 12271 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1)) 12272 { 12273 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0)); 12274 *cost = COSTS_N_INSNS (nregs); 12275 } 12276 /* const0_rtx is in general free, but we will use an 12277 instruction to set a register to 0. */ 12278 else if (REG_P (op1) || op1 == const0_rtx) 12279 { 12280 /* The cost is 1 per register copied. */ 12281 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0)); 12282 *cost = COSTS_N_INSNS (nregs); 12283 } 12284 else 12285 /* Cost is just the cost of the RHS of the set. */ 12286 *cost += rtx_cost (op1, mode, SET, 1, speed); 12287 return true; 12288 12289 case ZERO_EXTRACT: 12290 case SIGN_EXTRACT: 12291 /* Bit-field insertion. Strip any redundant widening of 12292 the RHS to meet the width of the target. */ 12293 if (GET_CODE (op1) == SUBREG) 12294 op1 = SUBREG_REG (op1); 12295 if ((GET_CODE (op1) == ZERO_EXTEND 12296 || GET_CODE (op1) == SIGN_EXTEND) 12297 && CONST_INT_P (XEXP (op0, 1)) 12298 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode) 12299 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1))) 12300 op1 = XEXP (op1, 0); 12301 12302 if (CONST_INT_P (op1)) 12303 { 12304 /* MOV immediate is assumed to always be cheap. */ 12305 *cost = COSTS_N_INSNS (1); 12306 } 12307 else 12308 { 12309 /* BFM. */ 12310 if (speed) 12311 *cost += extra_cost->alu.bfi; 12312 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed); 12313 } 12314 12315 return true; 12316 12317 default: 12318 /* We can't make sense of this, assume default cost. */ 12319 *cost = COSTS_N_INSNS (1); 12320 return false; 12321 } 12322 return false; 12323 12324 case CONST_INT: 12325 /* If an instruction can incorporate a constant within the 12326 instruction, the instruction's expression avoids calling 12327 rtx_cost() on the constant. If rtx_cost() is called on a 12328 constant, then it is usually because the constant must be 12329 moved into a register by one or more instructions. 12330 12331 The exception is constant 0, which can be expressed 12332 as XZR/WZR and is therefore free. The exception to this is 12333 if we have (set (reg) (const0_rtx)) in which case we must cost 12334 the move. However, we can catch that when we cost the SET, so 12335 we don't need to consider that here. 
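Building any other constant is costed below in proportion to the number of instructions needed to synthesize it; 0x123456789abc, for instance, needs a MOVZ plus two MOVKs and is therefore costed as three instructions.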
*/ 12336 if (x == const0_rtx) 12337 *cost = 0; 12338 else 12339 { 12340 /* To an approximation, building any other constant is 12341 proportionally expensive to the number of instructions 12342 required to build that constant. This is true whether we 12343 are compiling for SPEED or otherwise. */ 12344 if (!is_a <scalar_int_mode> (mode, &int_mode)) 12345 int_mode = word_mode; 12346 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate 12347 (NULL_RTX, x, false, int_mode)); 12348 } 12349 return true; 12350 12351 case CONST_DOUBLE: 12352 12353 /* First determine number of instructions to do the move 12354 as an integer constant. */ 12355 if (!aarch64_float_const_representable_p (x) 12356 && !aarch64_can_const_movi_rtx_p (x, mode) 12357 && aarch64_float_const_rtx_p (x)) 12358 { 12359 unsigned HOST_WIDE_INT ival; 12360 bool succeed = aarch64_reinterpret_float_as_int (x, &ival); 12361 gcc_assert (succeed); 12362 12363 scalar_int_mode imode = (mode == HFmode 12364 ? SImode 12365 : int_mode_for_mode (mode).require ()); 12366 int ncost = aarch64_internal_mov_immediate 12367 (NULL_RTX, gen_int_mode (ival, imode), false, imode); 12368 *cost += COSTS_N_INSNS (ncost); 12369 return true; 12370 } 12371 12372 if (speed) 12373 { 12374 /* mov[df,sf]_aarch64. */ 12375 if (aarch64_float_const_representable_p (x)) 12376 /* FMOV (scalar immediate). */ 12377 *cost += extra_cost->fp[mode == DFmode].fpconst; 12378 else if (!aarch64_float_const_zero_rtx_p (x)) 12379 { 12380 /* This will be a load from memory. */ 12381 if (mode == DFmode) 12382 *cost += extra_cost->ldst.loadd; 12383 else 12384 *cost += extra_cost->ldst.loadf; 12385 } 12386 else 12387 /* Otherwise this is +0.0. We get this using MOVI d0, #0 12388 or MOV v0.s[0], wzr - neither of which are modeled by the 12389 cost tables. Just use the default cost. */ 12390 { 12391 } 12392 } 12393 12394 return true; 12395 12396 case MEM: 12397 if (speed) 12398 { 12399 /* For loads we want the base cost of a load, plus an 12400 approximation for the additional cost of the addressing 12401 mode. */ 12402 rtx address = XEXP (x, 0); 12403 if (VECTOR_MODE_P (mode)) 12404 *cost += extra_cost->ldst.loadv; 12405 else if (GET_MODE_CLASS (mode) == MODE_INT) 12406 *cost += extra_cost->ldst.load; 12407 else if (mode == SFmode) 12408 *cost += extra_cost->ldst.loadf; 12409 else if (mode == DFmode) 12410 *cost += extra_cost->ldst.loadd; 12411 12412 *cost += 12413 COSTS_N_INSNS (aarch64_address_cost (address, mode, 12414 0, speed)); 12415 } 12416 12417 return true; 12418 12419 case NEG: 12420 op0 = XEXP (x, 0); 12421 12422 if (VECTOR_MODE_P (mode)) 12423 { 12424 if (speed) 12425 { 12426 /* FNEG. */ 12427 *cost += extra_cost->vect.alu; 12428 } 12429 return false; 12430 } 12431 12432 if (GET_MODE_CLASS (mode) == MODE_INT) 12433 { 12434 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE 12435 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE) 12436 { 12437 /* CSETM. */ 12438 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed); 12439 return true; 12440 } 12441 12442 /* Cost this as SUB wzr, X. */ 12443 op0 = CONST0_RTX (mode); 12444 op1 = XEXP (x, 0); 12445 goto cost_minus; 12446 } 12447 12448 if (GET_MODE_CLASS (mode) == MODE_FLOAT) 12449 { 12450 /* Support (neg(fma...)) as a single instruction only if 12451 sign of zeros is unimportant. This matches the decision 12452 making in aarch64.md. */ 12453 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0))) 12454 { 12455 /* FNMADD. 
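   For example, (neg:DF (fma:DF a b c)) maps onto a single FNMADD/FNMSUB, so only the inner FMA is costed here.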
*/ 12456 *cost = rtx_cost (op0, mode, NEG, 0, speed); 12457 return true; 12458 } 12459 if (GET_CODE (op0) == MULT) 12460 { 12461 /* FNMUL. */ 12462 *cost = rtx_cost (op0, mode, NEG, 0, speed); 12463 return true; 12464 } 12465 if (speed) 12466 /* FNEG. */ 12467 *cost += extra_cost->fp[mode == DFmode].neg; 12468 return false; 12469 } 12470 12471 return false; 12472 12473 case CLRSB: 12474 case CLZ: 12475 if (speed) 12476 { 12477 if (VECTOR_MODE_P (mode)) 12478 *cost += extra_cost->vect.alu; 12479 else 12480 *cost += extra_cost->alu.clz; 12481 } 12482 12483 return false; 12484 12485 case CTZ: 12486 *cost = COSTS_N_INSNS (2); 12487 12488 if (speed) 12489 *cost += extra_cost->alu.clz + extra_cost->alu.rev; 12490 return false; 12491 12492 case COMPARE: 12493 op0 = XEXP (x, 0); 12494 op1 = XEXP (x, 1); 12495 12496 if (op1 == const0_rtx 12497 && GET_CODE (op0) == AND) 12498 { 12499 x = op0; 12500 mode = GET_MODE (op0); 12501 goto cost_logic; 12502 } 12503 12504 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT) 12505 { 12506 /* TODO: A write to the CC flags possibly costs extra, this 12507 needs encoding in the cost tables. */ 12508 12509 mode = GET_MODE (op0); 12510 /* ANDS. */ 12511 if (GET_CODE (op0) == AND) 12512 { 12513 x = op0; 12514 goto cost_logic; 12515 } 12516 12517 if (GET_CODE (op0) == PLUS) 12518 { 12519 /* ADDS (and CMN alias). */ 12520 x = op0; 12521 goto cost_plus; 12522 } 12523 12524 if (GET_CODE (op0) == MINUS) 12525 { 12526 /* SUBS. */ 12527 x = op0; 12528 goto cost_minus; 12529 } 12530 12531 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx 12532 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1)) 12533 && CONST_INT_P (XEXP (op0, 2))) 12534 { 12535 /* COMPARE of ZERO_EXTRACT form of TST-immediate. 12536 Handle it here directly rather than going to cost_logic 12537 since we know the immediate generated for the TST is valid 12538 so we can avoid creating an intermediate rtx for it only 12539 for costing purposes. */ 12540 if (speed) 12541 *cost += extra_cost->alu.logical; 12542 12543 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0), 12544 ZERO_EXTRACT, 0, speed); 12545 return true; 12546 } 12547 12548 if (GET_CODE (op1) == NEG) 12549 { 12550 /* CMN. */ 12551 if (speed) 12552 *cost += extra_cost->alu.arith; 12553 12554 *cost += rtx_cost (op0, mode, COMPARE, 0, speed); 12555 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed); 12556 return true; 12557 } 12558 12559 /* CMP. 12560 12561 Compare can freely swap the order of operands, and 12562 canonicalization puts the more complex operation first. 12563 But the integer MINUS logic expects the shift/extend 12564 operation in op1. */ 12565 if (! (REG_P (op0) 12566 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0))))) 12567 { 12568 op0 = XEXP (x, 1); 12569 op1 = XEXP (x, 0); 12570 } 12571 goto cost_minus; 12572 } 12573 12574 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT) 12575 { 12576 /* FCMP. */ 12577 if (speed) 12578 *cost += extra_cost->fp[mode == DFmode].compare; 12579 12580 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1)) 12581 { 12582 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed); 12583 /* FCMP supports constant 0.0 for no extra cost. */ 12584 return true; 12585 } 12586 return false; 12587 } 12588 12589 if (VECTOR_MODE_P (mode)) 12590 { 12591 /* Vector compare. */ 12592 if (speed) 12593 *cost += extra_cost->vect.alu; 12594 12595 if (aarch64_float_const_zero_rtx_p (op1)) 12596 { 12597 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra 12598 cost. 
*/ 12599 return true; 12600 } 12601 return false; 12602 } 12603 return false; 12604 12605 case MINUS: 12606 { 12607 op0 = XEXP (x, 0); 12608 op1 = XEXP (x, 1); 12609 12610cost_minus: 12611 *cost += rtx_cost (op0, mode, MINUS, 0, speed); 12612 12613 /* Detect valid immediates. */ 12614 if ((GET_MODE_CLASS (mode) == MODE_INT 12615 || (GET_MODE_CLASS (mode) == MODE_CC 12616 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)) 12617 && CONST_INT_P (op1) 12618 && aarch64_uimm12_shift (INTVAL (op1))) 12619 { 12620 if (speed) 12621 /* SUB(S) (immediate). */ 12622 *cost += extra_cost->alu.arith; 12623 return true; 12624 } 12625 12626 /* Look for SUB (extended register). */ 12627 if (is_a <scalar_int_mode> (mode, &int_mode) 12628 && aarch64_rtx_arith_op_extract_p (op1, int_mode)) 12629 { 12630 if (speed) 12631 *cost += extra_cost->alu.extend_arith; 12632 12633 op1 = aarch64_strip_extend (op1, true); 12634 *cost += rtx_cost (op1, VOIDmode, 12635 (enum rtx_code) GET_CODE (op1), 0, speed); 12636 return true; 12637 } 12638 12639 rtx new_op1 = aarch64_strip_extend (op1, false); 12640 12641 /* Cost this as an FMA-alike operation. */ 12642 if ((GET_CODE (new_op1) == MULT 12643 || aarch64_shift_p (GET_CODE (new_op1))) 12644 && code != COMPARE) 12645 { 12646 *cost += aarch64_rtx_mult_cost (new_op1, MULT, 12647 (enum rtx_code) code, 12648 speed); 12649 return true; 12650 } 12651 12652 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed); 12653 12654 if (speed) 12655 { 12656 if (VECTOR_MODE_P (mode)) 12657 { 12658 /* Vector SUB. */ 12659 *cost += extra_cost->vect.alu; 12660 } 12661 else if (GET_MODE_CLASS (mode) == MODE_INT) 12662 { 12663 /* SUB(S). */ 12664 *cost += extra_cost->alu.arith; 12665 } 12666 else if (GET_MODE_CLASS (mode) == MODE_FLOAT) 12667 { 12668 /* FSUB. */ 12669 *cost += extra_cost->fp[mode == DFmode].addsub; 12670 } 12671 } 12672 return true; 12673 } 12674 12675 case PLUS: 12676 { 12677 rtx new_op0; 12678 12679 op0 = XEXP (x, 0); 12680 op1 = XEXP (x, 1); 12681 12682cost_plus: 12683 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE 12684 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE) 12685 { 12686 /* CSINC. */ 12687 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed); 12688 *cost += rtx_cost (op1, mode, PLUS, 1, speed); 12689 return true; 12690 } 12691 12692 if (GET_MODE_CLASS (mode) == MODE_INT 12693 && (aarch64_plus_immediate (op1, mode) 12694 || aarch64_sve_addvl_addpl_immediate (op1, mode))) 12695 { 12696 *cost += rtx_cost (op0, mode, PLUS, 0, speed); 12697 12698 if (speed) 12699 { 12700 /* ADD (immediate). */ 12701 *cost += extra_cost->alu.arith; 12702 12703 /* Some tunings prefer to not use the VL-based scalar ops. 12704 Increase the cost of the poly immediate to prevent their 12705 formation. */ 12706 if (GET_CODE (op1) == CONST_POLY_INT 12707 && (aarch64_tune_params.extra_tuning_flags 12708 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS)) 12709 *cost += COSTS_N_INSNS (1); 12710 } 12711 return true; 12712 } 12713 12714 *cost += rtx_cost (op1, mode, PLUS, 1, speed); 12715 12716 /* Look for ADD (extended register). */ 12717 if (is_a <scalar_int_mode> (mode, &int_mode) 12718 && aarch64_rtx_arith_op_extract_p (op0, int_mode)) 12719 { 12720 if (speed) 12721 *cost += extra_cost->alu.extend_arith; 12722 12723 op0 = aarch64_strip_extend (op0, true); 12724 *cost += rtx_cost (op0, VOIDmode, 12725 (enum rtx_code) GET_CODE (op0), 0, speed); 12726 return true; 12727 } 12728 12729 /* Strip any extend, leave shifts behind as we will 12730 cost them through mult_cost. 
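   For instance, (plus:DI (ashift:DI (reg:DI) (const_int 2)) (reg:DI)) keeps its shift and is priced through aarch64_rtx_mult_cost below, whereas an operand that is just a zero or sign extension of a register is handled by the ADD (extended register) check above.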
*/ 12731 new_op0 = aarch64_strip_extend (op0, false); 12732 12733 if (GET_CODE (new_op0) == MULT 12734 || aarch64_shift_p (GET_CODE (new_op0))) 12735 { 12736 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS, 12737 speed); 12738 return true; 12739 } 12740 12741 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed); 12742 12743 if (speed) 12744 { 12745 if (VECTOR_MODE_P (mode)) 12746 { 12747 /* Vector ADD. */ 12748 *cost += extra_cost->vect.alu; 12749 } 12750 else if (GET_MODE_CLASS (mode) == MODE_INT) 12751 { 12752 /* ADD. */ 12753 *cost += extra_cost->alu.arith; 12754 } 12755 else if (GET_MODE_CLASS (mode) == MODE_FLOAT) 12756 { 12757 /* FADD. */ 12758 *cost += extra_cost->fp[mode == DFmode].addsub; 12759 } 12760 } 12761 return true; 12762 } 12763 12764 case BSWAP: 12765 *cost = COSTS_N_INSNS (1); 12766 12767 if (speed) 12768 { 12769 if (VECTOR_MODE_P (mode)) 12770 *cost += extra_cost->vect.alu; 12771 else 12772 *cost += extra_cost->alu.rev; 12773 } 12774 return false; 12775 12776 case IOR: 12777 if (aarch_rev16_p (x)) 12778 { 12779 *cost = COSTS_N_INSNS (1); 12780 12781 if (speed) 12782 { 12783 if (VECTOR_MODE_P (mode)) 12784 *cost += extra_cost->vect.alu; 12785 else 12786 *cost += extra_cost->alu.rev; 12787 } 12788 return true; 12789 } 12790 12791 if (aarch64_extr_rtx_p (x, &op0, &op1)) 12792 { 12793 *cost += rtx_cost (op0, mode, IOR, 0, speed); 12794 *cost += rtx_cost (op1, mode, IOR, 1, speed); 12795 if (speed) 12796 *cost += extra_cost->alu.shift; 12797 12798 return true; 12799 } 12800 /* Fall through. */ 12801 case XOR: 12802 case AND: 12803 cost_logic: 12804 op0 = XEXP (x, 0); 12805 op1 = XEXP (x, 1); 12806 12807 if (VECTOR_MODE_P (mode)) 12808 { 12809 if (speed) 12810 *cost += extra_cost->vect.alu; 12811 return true; 12812 } 12813 12814 if (code == AND 12815 && GET_CODE (op0) == MULT 12816 && CONST_INT_P (XEXP (op0, 1)) 12817 && CONST_INT_P (op1) 12818 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))), 12819 INTVAL (op1)) != 0) 12820 { 12821 /* This is a UBFM/SBFM. */ 12822 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed); 12823 if (speed) 12824 *cost += extra_cost->alu.bfx; 12825 return true; 12826 } 12827 12828 if (is_int_mode (mode, &int_mode)) 12829 { 12830 if (CONST_INT_P (op1)) 12831 { 12832 /* We have a mask + shift version of a UBFIZ 12833 i.e. the *andim_ashift<mode>_bfiz pattern. */ 12834 if (GET_CODE (op0) == ASHIFT 12835 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1, 12836 XEXP (op0, 1))) 12837 { 12838 *cost += rtx_cost (XEXP (op0, 0), int_mode, 12839 (enum rtx_code) code, 0, speed); 12840 if (speed) 12841 *cost += extra_cost->alu.bfx; 12842 12843 return true; 12844 } 12845 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode)) 12846 { 12847 /* We possibly get the immediate for free, this is not 12848 modelled. */ 12849 *cost += rtx_cost (op0, int_mode, 12850 (enum rtx_code) code, 0, speed); 12851 if (speed) 12852 *cost += extra_cost->alu.logical; 12853 12854 return true; 12855 } 12856 } 12857 else 12858 { 12859 rtx new_op0 = op0; 12860 12861 /* Handle ORN, EON, or BIC. */ 12862 if (GET_CODE (op0) == NOT) 12863 op0 = XEXP (op0, 0); 12864 12865 new_op0 = aarch64_strip_shift (op0); 12866 12867 /* If we had a shift on op0 then this is a logical-shift- 12868 by-register/immediate operation. Otherwise, this is just 12869 a logical operation. */ 12870 if (speed) 12871 { 12872 if (new_op0 != op0) 12873 { 12874 /* Shift by immediate. 
*/ 12875 if (CONST_INT_P (XEXP (op0, 1))) 12876 *cost += extra_cost->alu.log_shift; 12877 else 12878 *cost += extra_cost->alu.log_shift_reg; 12879 } 12880 else 12881 *cost += extra_cost->alu.logical; 12882 } 12883 12884 /* In both cases we want to cost both operands. */ 12885 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code, 12886 0, speed); 12887 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code, 12888 1, speed); 12889 12890 return true; 12891 } 12892 } 12893 return false; 12894 12895 case NOT: 12896 x = XEXP (x, 0); 12897 op0 = aarch64_strip_shift (x); 12898 12899 if (VECTOR_MODE_P (mode)) 12900 { 12901 /* Vector NOT. */ 12902 *cost += extra_cost->vect.alu; 12903 return false; 12904 } 12905 12906 /* MVN-shifted-reg. */ 12907 if (op0 != x) 12908 { 12909 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed); 12910 12911 if (speed) 12912 *cost += extra_cost->alu.log_shift; 12913 12914 return true; 12915 } 12916 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)). 12917 Handle the second form here taking care that 'a' in the above can 12918 be a shift. */ 12919 else if (GET_CODE (op0) == XOR) 12920 { 12921 rtx newop0 = XEXP (op0, 0); 12922 rtx newop1 = XEXP (op0, 1); 12923 rtx op0_stripped = aarch64_strip_shift (newop0); 12924 12925 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed); 12926 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed); 12927 12928 if (speed) 12929 { 12930 if (op0_stripped != newop0) 12931 *cost += extra_cost->alu.log_shift; 12932 else 12933 *cost += extra_cost->alu.logical; 12934 } 12935 12936 return true; 12937 } 12938 /* MVN. */ 12939 if (speed) 12940 *cost += extra_cost->alu.logical; 12941 12942 return false; 12943 12944 case ZERO_EXTEND: 12945 12946 op0 = XEXP (x, 0); 12947 /* If a value is written in SI mode, then zero extended to DI 12948 mode, the operation will in general be free as a write to 12949 a 'w' register implicitly zeroes the upper bits of an 'x' 12950 register. However, if this is 12951 12952 (set (reg) (zero_extend (reg))) 12953 12954 we must cost the explicit register move. */ 12955 if (mode == DImode 12956 && GET_MODE (op0) == SImode 12957 && outer == SET) 12958 { 12959 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed); 12960 12961 /* If OP_COST is non-zero, then the cost of the zero extend 12962 is effectively the cost of the inner operation. Otherwise 12963 we have a MOV instruction and we take the cost from the MOV 12964 itself. This is true independently of whether we are 12965 optimizing for space or time. */ 12966 if (op_cost) 12967 *cost = op_cost; 12968 12969 return true; 12970 } 12971 else if (MEM_P (op0)) 12972 { 12973 /* All loads can zero extend to any size for free. */ 12974 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed); 12975 return true; 12976 } 12977 12978 op0 = aarch64_extend_bitfield_pattern_p (x); 12979 if (op0) 12980 { 12981 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed); 12982 if (speed) 12983 *cost += extra_cost->alu.bfx; 12984 return true; 12985 } 12986 12987 if (speed) 12988 { 12989 if (VECTOR_MODE_P (mode)) 12990 { 12991 /* UMOV. */ 12992 *cost += extra_cost->vect.alu; 12993 } 12994 else 12995 { 12996 /* We generate an AND instead of UXTB/UXTH. */ 12997 *cost += extra_cost->alu.logical; 12998 } 12999 } 13000 return false; 13001 13002 case SIGN_EXTEND: 13003 if (MEM_P (XEXP (x, 0))) 13004 { 13005 /* LDRSH. 
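   For example, (sign_extend:SI (mem:HI addr)) is a single sign-extending load, so we charge ldst.load_sign_extend plus the usual addressing-mode delta rather than a separate load and SXTH.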
*/ 13006 if (speed) 13007 { 13008 rtx address = XEXP (XEXP (x, 0), 0); 13009 *cost += extra_cost->ldst.load_sign_extend; 13010 13011 *cost += 13012 COSTS_N_INSNS (aarch64_address_cost (address, mode, 13013 0, speed)); 13014 } 13015 return true; 13016 } 13017 13018 op0 = aarch64_extend_bitfield_pattern_p (x); 13019 if (op0) 13020 { 13021 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed); 13022 if (speed) 13023 *cost += extra_cost->alu.bfx; 13024 return true; 13025 } 13026 13027 if (speed) 13028 { 13029 if (VECTOR_MODE_P (mode)) 13030 *cost += extra_cost->vect.alu; 13031 else 13032 *cost += extra_cost->alu.extend; 13033 } 13034 return false; 13035 13036 case ASHIFT: 13037 op0 = XEXP (x, 0); 13038 op1 = XEXP (x, 1); 13039 13040 if (CONST_INT_P (op1)) 13041 { 13042 if (speed) 13043 { 13044 if (VECTOR_MODE_P (mode)) 13045 { 13046 /* Vector shift (immediate). */ 13047 *cost += extra_cost->vect.alu; 13048 } 13049 else 13050 { 13051 /* LSL (immediate), UBMF, UBFIZ and friends. These are all 13052 aliases. */ 13053 *cost += extra_cost->alu.shift; 13054 } 13055 } 13056 13057 /* We can incorporate zero/sign extend for free. */ 13058 if (GET_CODE (op0) == ZERO_EXTEND 13059 || GET_CODE (op0) == SIGN_EXTEND) 13060 op0 = XEXP (op0, 0); 13061 13062 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed); 13063 return true; 13064 } 13065 else 13066 { 13067 if (VECTOR_MODE_P (mode)) 13068 { 13069 if (speed) 13070 /* Vector shift (register). */ 13071 *cost += extra_cost->vect.alu; 13072 } 13073 else 13074 { 13075 if (speed) 13076 /* LSLV. */ 13077 *cost += extra_cost->alu.shift_reg; 13078 13079 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0)) 13080 && CONST_INT_P (XEXP (op1, 1)) 13081 && known_eq (INTVAL (XEXP (op1, 1)), 13082 GET_MODE_BITSIZE (mode) - 1)) 13083 { 13084 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed); 13085 /* We already demanded XEXP (op1, 0) to be REG_P, so 13086 don't recurse into it. */ 13087 return true; 13088 } 13089 } 13090 return false; /* All arguments need to be in registers. */ 13091 } 13092 13093 case ROTATE: 13094 case ROTATERT: 13095 case LSHIFTRT: 13096 case ASHIFTRT: 13097 op0 = XEXP (x, 0); 13098 op1 = XEXP (x, 1); 13099 13100 if (CONST_INT_P (op1)) 13101 { 13102 /* ASR (immediate) and friends. */ 13103 if (speed) 13104 { 13105 if (VECTOR_MODE_P (mode)) 13106 *cost += extra_cost->vect.alu; 13107 else 13108 *cost += extra_cost->alu.shift; 13109 } 13110 13111 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed); 13112 return true; 13113 } 13114 else 13115 { 13116 if (VECTOR_MODE_P (mode)) 13117 { 13118 if (speed) 13119 /* Vector shift (register). */ 13120 *cost += extra_cost->vect.alu; 13121 } 13122 else 13123 { 13124 if (speed) 13125 /* ASR (register) and friends. */ 13126 *cost += extra_cost->alu.shift_reg; 13127 13128 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0)) 13129 && CONST_INT_P (XEXP (op1, 1)) 13130 && known_eq (INTVAL (XEXP (op1, 1)), 13131 GET_MODE_BITSIZE (mode) - 1)) 13132 { 13133 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed); 13134 /* We already demanded XEXP (op1, 0) to be REG_P, so 13135 don't recurse into it. */ 13136 return true; 13137 } 13138 } 13139 return false; /* All arguments need to be in registers. */ 13140 } 13141 13142 case SYMBOL_REF: 13143 13144 if (aarch64_cmodel == AARCH64_CMODEL_LARGE 13145 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC) 13146 { 13147 /* LDR. 
*/ 13148 if (speed) 13149 *cost += extra_cost->ldst.load; 13150 } 13151 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL 13152 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC) 13153 { 13154 /* ADRP, followed by ADD. */ 13155 *cost += COSTS_N_INSNS (1); 13156 if (speed) 13157 *cost += 2 * extra_cost->alu.arith; 13158 } 13159 else if (aarch64_cmodel == AARCH64_CMODEL_TINY 13160 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC) 13161 { 13162 /* ADR. */ 13163 if (speed) 13164 *cost += extra_cost->alu.arith; 13165 } 13166 13167 if (flag_pic) 13168 { 13169 /* One extra load instruction, after accessing the GOT. */ 13170 *cost += COSTS_N_INSNS (1); 13171 if (speed) 13172 *cost += extra_cost->ldst.load; 13173 } 13174 return true; 13175 13176 case HIGH: 13177 case LO_SUM: 13178 /* ADRP/ADD (immediate). */ 13179 if (speed) 13180 *cost += extra_cost->alu.arith; 13181 return true; 13182 13183 case ZERO_EXTRACT: 13184 case SIGN_EXTRACT: 13185 /* UBFX/SBFX. */ 13186 if (speed) 13187 { 13188 if (VECTOR_MODE_P (mode)) 13189 *cost += extra_cost->vect.alu; 13190 else 13191 *cost += extra_cost->alu.bfx; 13192 } 13193 13194 /* We can trust that the immediates used will be correct (there 13195 are no by-register forms), so we need only cost op0. */ 13196 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed); 13197 return true; 13198 13199 case MULT: 13200 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed); 13201 /* aarch64_rtx_mult_cost always handles recursion to its 13202 operands. */ 13203 return true; 13204 13205 case MOD: 13206 /* We can expand signed mod by power of 2 using a NEGS, two parallel 13207 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of 13208 an unconditional negate. This case should only ever be reached through 13209 the set_smod_pow2_cheap check in expmed.c. */ 13210 if (CONST_INT_P (XEXP (x, 1)) 13211 && exact_log2 (INTVAL (XEXP (x, 1))) > 0 13212 && (mode == SImode || mode == DImode)) 13213 { 13214 /* We expand to 4 instructions. Reset the baseline. */ 13215 *cost = COSTS_N_INSNS (4); 13216 13217 if (speed) 13218 *cost += 2 * extra_cost->alu.logical 13219 + 2 * extra_cost->alu.arith; 13220 13221 return true; 13222 } 13223 13224 /* Fall-through. */ 13225 case UMOD: 13226 if (speed) 13227 { 13228 /* Slightly prefer UMOD over SMOD. */ 13229 if (VECTOR_MODE_P (mode)) 13230 *cost += extra_cost->vect.alu; 13231 else if (GET_MODE_CLASS (mode) == MODE_INT) 13232 *cost += (extra_cost->mult[mode == DImode].add 13233 + extra_cost->mult[mode == DImode].idiv 13234 + (code == MOD ? 1 : 0)); 13235 } 13236 return false; /* All arguments need to be in registers. */ 13237 13238 case DIV: 13239 case UDIV: 13240 case SQRT: 13241 if (speed) 13242 { 13243 if (VECTOR_MODE_P (mode)) 13244 *cost += extra_cost->vect.alu; 13245 else if (GET_MODE_CLASS (mode) == MODE_INT) 13246 /* There is no integer SQRT, so only DIV and UDIV can get 13247 here. */ 13248 *cost += (extra_cost->mult[mode == DImode].idiv 13249 /* Slightly prefer UDIV over SDIV. */ 13250 + (code == DIV ? 1 : 0)); 13251 else 13252 *cost += extra_cost->fp[mode == DFmode].div; 13253 } 13254 return false; /* All arguments need to be in registers. */ 13255 13256 case IF_THEN_ELSE: 13257 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1), 13258 XEXP (x, 2), cost, speed); 13259 13260 case EQ: 13261 case NE: 13262 case GT: 13263 case GTU: 13264 case LT: 13265 case LTU: 13266 case GE: 13267 case GEU: 13268 case LE: 13269 case LEU: 13270 13271 return false; /* All arguments must be in registers. 
*/ 13272 13273 case FMA: 13274 op0 = XEXP (x, 0); 13275 op1 = XEXP (x, 1); 13276 op2 = XEXP (x, 2); 13277 13278 if (speed) 13279 { 13280 if (VECTOR_MODE_P (mode)) 13281 *cost += extra_cost->vect.alu; 13282 else 13283 *cost += extra_cost->fp[mode == DFmode].fma; 13284 } 13285 13286 /* FMSUB, FNMADD, and FNMSUB are free. */ 13287 if (GET_CODE (op0) == NEG) 13288 op0 = XEXP (op0, 0); 13289 13290 if (GET_CODE (op2) == NEG) 13291 op2 = XEXP (op2, 0); 13292 13293 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1, 13294 and the by-element operand as operand 0. */ 13295 if (GET_CODE (op1) == NEG) 13296 op1 = XEXP (op1, 0); 13297 13298 /* Catch vector-by-element operations. The by-element operand can 13299 either be (vec_duplicate (vec_select (x))) or just 13300 (vec_select (x)), depending on whether we are multiplying by 13301 a vector or a scalar. 13302 13303 Canonicalization is not very good in these cases, FMA4 will put the 13304 by-element operand as operand 0, FNMA4 will have it as operand 1. */ 13305 if (GET_CODE (op0) == VEC_DUPLICATE) 13306 op0 = XEXP (op0, 0); 13307 else if (GET_CODE (op1) == VEC_DUPLICATE) 13308 op1 = XEXP (op1, 0); 13309 13310 if (GET_CODE (op0) == VEC_SELECT) 13311 op0 = XEXP (op0, 0); 13312 else if (GET_CODE (op1) == VEC_SELECT) 13313 op1 = XEXP (op1, 0); 13314 13315 /* If the remaining parameters are not registers, 13316 get the cost to put them into registers. */ 13317 *cost += rtx_cost (op0, mode, FMA, 0, speed); 13318 *cost += rtx_cost (op1, mode, FMA, 1, speed); 13319 *cost += rtx_cost (op2, mode, FMA, 2, speed); 13320 return true; 13321 13322 case FLOAT: 13323 case UNSIGNED_FLOAT: 13324 if (speed) 13325 *cost += extra_cost->fp[mode == DFmode].fromint; 13326 return false; 13327 13328 case FLOAT_EXTEND: 13329 if (speed) 13330 { 13331 if (VECTOR_MODE_P (mode)) 13332 { 13333 /*Vector truncate. */ 13334 *cost += extra_cost->vect.alu; 13335 } 13336 else 13337 *cost += extra_cost->fp[mode == DFmode].widen; 13338 } 13339 return false; 13340 13341 case FLOAT_TRUNCATE: 13342 if (speed) 13343 { 13344 if (VECTOR_MODE_P (mode)) 13345 { 13346 /*Vector conversion. */ 13347 *cost += extra_cost->vect.alu; 13348 } 13349 else 13350 *cost += extra_cost->fp[mode == DFmode].narrow; 13351 } 13352 return false; 13353 13354 case FIX: 13355 case UNSIGNED_FIX: 13356 x = XEXP (x, 0); 13357 /* Strip the rounding part. They will all be implemented 13358 by the fcvt* family of instructions anyway. */ 13359 if (GET_CODE (x) == UNSPEC) 13360 { 13361 unsigned int uns_code = XINT (x, 1); 13362 13363 if (uns_code == UNSPEC_FRINTA 13364 || uns_code == UNSPEC_FRINTM 13365 || uns_code == UNSPEC_FRINTN 13366 || uns_code == UNSPEC_FRINTP 13367 || uns_code == UNSPEC_FRINTZ) 13368 x = XVECEXP (x, 0, 0); 13369 } 13370 13371 if (speed) 13372 { 13373 if (VECTOR_MODE_P (mode)) 13374 *cost += extra_cost->vect.alu; 13375 else 13376 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint; 13377 } 13378 13379 /* We can combine fmul by a power of 2 followed by a fcvt into a single 13380 fixed-point fcvt. */ 13381 if (GET_CODE (x) == MULT 13382 && ((VECTOR_MODE_P (mode) 13383 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0) 13384 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0)) 13385 { 13386 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code, 13387 0, speed); 13388 return true; 13389 } 13390 13391 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed); 13392 return true; 13393 13394 case ABS: 13395 if (VECTOR_MODE_P (mode)) 13396 { 13397 /* ABS (vector). 
*/ 13398 if (speed) 13399 *cost += extra_cost->vect.alu; 13400 } 13401 else if (GET_MODE_CLASS (mode) == MODE_FLOAT) 13402 { 13403 op0 = XEXP (x, 0); 13404 13405 /* FABD, which is analogous to FADD. */ 13406 if (GET_CODE (op0) == MINUS) 13407 { 13408 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed); 13409 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed); 13410 if (speed) 13411 *cost += extra_cost->fp[mode == DFmode].addsub; 13412 13413 return true; 13414 } 13415 /* Simple FABS is analogous to FNEG. */ 13416 if (speed) 13417 *cost += extra_cost->fp[mode == DFmode].neg; 13418 } 13419 else 13420 { 13421 /* Integer ABS will either be split to 13422 two arithmetic instructions, or will be an ABS 13423 (scalar), which we don't model. */ 13424 *cost = COSTS_N_INSNS (2); 13425 if (speed) 13426 *cost += 2 * extra_cost->alu.arith; 13427 } 13428 return false; 13429 13430 case SMAX: 13431 case SMIN: 13432 if (speed) 13433 { 13434 if (VECTOR_MODE_P (mode)) 13435 *cost += extra_cost->vect.alu; 13436 else 13437 { 13438 /* FMAXNM/FMINNM/FMAX/FMIN. 13439 TODO: This may not be accurate for all implementations, but 13440 we do not model this in the cost tables. */ 13441 *cost += extra_cost->fp[mode == DFmode].addsub; 13442 } 13443 } 13444 return false; 13445 13446 case UNSPEC: 13447 /* The floating point round to integer frint* instructions. */ 13448 if (aarch64_frint_unspec_p (XINT (x, 1))) 13449 { 13450 if (speed) 13451 *cost += extra_cost->fp[mode == DFmode].roundint; 13452 13453 return false; 13454 } 13455 13456 if (XINT (x, 1) == UNSPEC_RBIT) 13457 { 13458 if (speed) 13459 *cost += extra_cost->alu.rev; 13460 13461 return false; 13462 } 13463 break; 13464 13465 case TRUNCATE: 13466 13467 /* Decompose <su>muldi3_highpart. */ 13468 if (/* (truncate:DI */ 13469 mode == DImode 13470 /* (lshiftrt:TI */ 13471 && GET_MODE (XEXP (x, 0)) == TImode 13472 && GET_CODE (XEXP (x, 0)) == LSHIFTRT 13473 /* (mult:TI */ 13474 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT 13475 /* (ANY_EXTEND:TI (reg:DI)) 13476 (ANY_EXTEND:TI (reg:DI))) */ 13477 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND 13478 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND) 13479 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND 13480 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND)) 13481 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode 13482 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode 13483 /* (const_int 64) */ 13484 && CONST_INT_P (XEXP (XEXP (x, 0), 1)) 13485 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64) 13486 { 13487 /* UMULH/SMULH. */ 13488 if (speed) 13489 *cost += extra_cost->mult[mode == DImode].extend; 13490 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0), 13491 mode, MULT, 0, speed); 13492 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0), 13493 mode, MULT, 1, speed); 13494 return true; 13495 } 13496 13497 /* Fall through. */ 13498 default: 13499 break; 13500 } 13501 13502 if (dump_file 13503 && flag_aarch64_verbose_cost) 13504 fprintf (dump_file, 13505 "\nFailed to cost RTX. Assuming default cost.\n"); 13506 13507 return true; 13508} 13509 13510/* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost 13511 calculated for X. This cost is stored in *COST. Returns true 13512 if the total cost of X was calculated. 
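   When dump_file is active and flag_aarch64_verbose_cost is set, the dump shows the rtx followed by a line of the form "Hot cost: 8 (final)" (speed costing, fully handled) or "Cold cost: 4 (partial)" (size costing, partial cost only); the numbers are purely illustrative.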
*/ 13513static bool 13514aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer, 13515 int param, int *cost, bool speed) 13516{ 13517 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed); 13518 13519 if (dump_file 13520 && flag_aarch64_verbose_cost) 13521 { 13522 print_rtl_single (dump_file, x); 13523 fprintf (dump_file, "\n%s cost: %d (%s)\n", 13524 speed ? "Hot" : "Cold", 13525 *cost, result ? "final" : "partial"); 13526 } 13527 13528 return result; 13529} 13530 13531static int 13532aarch64_register_move_cost (machine_mode mode, 13533 reg_class_t from_i, reg_class_t to_i) 13534{ 13535 enum reg_class from = (enum reg_class) from_i; 13536 enum reg_class to = (enum reg_class) to_i; 13537 const struct cpu_regmove_cost *regmove_cost 13538 = aarch64_tune_params.regmove_cost; 13539 13540 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */ 13541 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS 13542 || to == STUB_REGS) 13543 to = GENERAL_REGS; 13544 13545 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS 13546 || from == STUB_REGS) 13547 from = GENERAL_REGS; 13548 13549 /* Make RDFFR very expensive. In particular, if we know that the FFR 13550 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR 13551 as a way of obtaining a PTRUE. */ 13552 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL 13553 && hard_reg_set_subset_p (reg_class_contents[from_i], 13554 reg_class_contents[FFR_REGS])) 13555 return 80; 13556 13557 /* Moving between GPR and stack cost is the same as GP2GP. */ 13558 if ((from == GENERAL_REGS && to == STACK_REG) 13559 || (to == GENERAL_REGS && from == STACK_REG)) 13560 return regmove_cost->GP2GP; 13561 13562 /* To/From the stack register, we move via the gprs. */ 13563 if (to == STACK_REG || from == STACK_REG) 13564 return aarch64_register_move_cost (mode, from, GENERAL_REGS) 13565 + aarch64_register_move_cost (mode, GENERAL_REGS, to); 13566 13567 if (known_eq (GET_MODE_SIZE (mode), 16)) 13568 { 13569 /* 128-bit operations on general registers require 2 instructions. */ 13570 if (from == GENERAL_REGS && to == GENERAL_REGS) 13571 return regmove_cost->GP2GP * 2; 13572 else if (from == GENERAL_REGS) 13573 return regmove_cost->GP2FP * 2; 13574 else if (to == GENERAL_REGS) 13575 return regmove_cost->FP2GP * 2; 13576 13577 /* When AdvSIMD instructions are disabled it is not possible to move 13578 a 128-bit value directly between Q registers. This is handled in 13579 secondary reload. A general register is used as a scratch to move 13580 the upper DI value and the lower DI value is moved directly, 13581 hence the cost is the sum of three moves. */ 13582 if (! TARGET_SIMD) 13583 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP; 13584 13585 return regmove_cost->FP2FP; 13586 } 13587 13588 if (from == GENERAL_REGS && to == GENERAL_REGS) 13589 return regmove_cost->GP2GP; 13590 else if (from == GENERAL_REGS) 13591 return regmove_cost->GP2FP; 13592 else if (to == GENERAL_REGS) 13593 return regmove_cost->FP2GP; 13594 13595 return regmove_cost->FP2FP; 13596} 13597 13598static int 13599aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED, 13600 reg_class_t rclass ATTRIBUTE_UNUSED, 13601 bool in ATTRIBUTE_UNUSED) 13602{ 13603 return aarch64_tune_params.memmov_cost; 13604} 13605 13606/* Implement TARGET_INIT_BUILTINS. */ 13607static void 13608aarch64_init_builtins () 13609{ 13610 aarch64_general_init_builtins (); 13611 aarch64_sve::init_builtins (); 13612} 13613 13614/* Implement TARGET_FOLD_BUILTIN. 
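   The MD function code packs the builtin class into the low bits and the class-specific subcode above AARCH64_BUILTIN_SHIFT, so the handlers below recover them with (code & AARCH64_BUILTIN_CLASS) and (code >> AARCH64_BUILTIN_SHIFT) before dispatching to the general or SVE implementation.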
*/ 13615static tree 13616aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool) 13617{ 13618 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl); 13619 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; 13620 tree type = TREE_TYPE (TREE_TYPE (fndecl)); 13621 switch (code & AARCH64_BUILTIN_CLASS) 13622 { 13623 case AARCH64_BUILTIN_GENERAL: 13624 return aarch64_general_fold_builtin (subcode, type, nargs, args); 13625 13626 case AARCH64_BUILTIN_SVE: 13627 return NULL_TREE; 13628 } 13629 gcc_unreachable (); 13630} 13631 13632/* Implement TARGET_GIMPLE_FOLD_BUILTIN. */ 13633static bool 13634aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi) 13635{ 13636 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi)); 13637 tree fndecl = gimple_call_fndecl (stmt); 13638 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl); 13639 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; 13640 gimple *new_stmt = NULL; 13641 switch (code & AARCH64_BUILTIN_CLASS) 13642 { 13643 case AARCH64_BUILTIN_GENERAL: 13644 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt); 13645 break; 13646 13647 case AARCH64_BUILTIN_SVE: 13648 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt); 13649 break; 13650 } 13651 13652 if (!new_stmt) 13653 return false; 13654 13655 gsi_replace (gsi, new_stmt, true); 13656 return true; 13657} 13658 13659/* Implement TARGET_EXPAND_BUILTIN. */ 13660static rtx 13661aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore) 13662{ 13663 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); 13664 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl); 13665 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; 13666 switch (code & AARCH64_BUILTIN_CLASS) 13667 { 13668 case AARCH64_BUILTIN_GENERAL: 13669 return aarch64_general_expand_builtin (subcode, exp, target, ignore); 13670 13671 case AARCH64_BUILTIN_SVE: 13672 return aarch64_sve::expand_builtin (subcode, exp, target); 13673 } 13674 gcc_unreachable (); 13675} 13676 13677/* Implement TARGET_BUILTIN_DECL. */ 13678static tree 13679aarch64_builtin_decl (unsigned int code, bool initialize_p) 13680{ 13681 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; 13682 switch (code & AARCH64_BUILTIN_CLASS) 13683 { 13684 case AARCH64_BUILTIN_GENERAL: 13685 return aarch64_general_builtin_decl (subcode, initialize_p); 13686 13687 case AARCH64_BUILTIN_SVE: 13688 return aarch64_sve::builtin_decl (subcode, initialize_p); 13689 } 13690 gcc_unreachable (); 13691} 13692 13693/* Return true if it is safe and beneficial to use the approximate rsqrt optabs 13694 to optimize 1.0/sqrt. */ 13695 13696static bool 13697use_rsqrt_p (machine_mode mode) 13698{ 13699 return (!flag_trapping_math 13700 && flag_unsafe_math_optimizations 13701 && ((aarch64_tune_params.approx_modes->recip_sqrt 13702 & AARCH64_APPROX_MODE (mode)) 13703 || flag_mrecip_low_precision_sqrt)); 13704} 13705 13706/* Function to decide when to use the approximate reciprocal square root 13707 builtin. 
*/ 13708 13709static tree 13710aarch64_builtin_reciprocal (tree fndecl) 13711{ 13712 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl)); 13713 13714 if (!use_rsqrt_p (mode)) 13715 return NULL_TREE; 13716 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl); 13717 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; 13718 switch (code & AARCH64_BUILTIN_CLASS) 13719 { 13720 case AARCH64_BUILTIN_GENERAL: 13721 return aarch64_general_builtin_rsqrt (subcode); 13722 13723 case AARCH64_BUILTIN_SVE: 13724 return NULL_TREE; 13725 } 13726 gcc_unreachable (); 13727} 13728 13729/* Emit code to perform the floating-point operation: 13730 13731 DST = SRC1 * SRC2 13732 13733 where all three operands are already known to be registers. 13734 If the operation is an SVE one, PTRUE is a suitable all-true 13735 predicate. */ 13736 13737static void 13738aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2) 13739{ 13740 if (ptrue) 13741 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst), 13742 dst, ptrue, src1, src2, 13743 gen_int_mode (SVE_RELAXED_GP, SImode))); 13744 else 13745 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2)); 13746} 13747 13748/* Emit instruction sequence to compute either the approximate square root 13749 or its approximate reciprocal, depending on the flag RECP, and return 13750 whether the sequence was emitted or not. */ 13751 13752bool 13753aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp) 13754{ 13755 machine_mode mode = GET_MODE (dst); 13756 13757 if (GET_MODE_INNER (mode) == HFmode) 13758 { 13759 gcc_assert (!recp); 13760 return false; 13761 } 13762 13763 if (!recp) 13764 { 13765 if (!(flag_mlow_precision_sqrt 13766 || (aarch64_tune_params.approx_modes->sqrt 13767 & AARCH64_APPROX_MODE (mode)))) 13768 return false; 13769 13770 if (!flag_finite_math_only 13771 || flag_trapping_math 13772 || !flag_unsafe_math_optimizations 13773 || optimize_function_for_size_p (cfun)) 13774 return false; 13775 } 13776 else 13777 /* Caller assumes we cannot fail. */ 13778 gcc_assert (use_rsqrt_p (mode)); 13779 13780 rtx pg = NULL_RTX; 13781 if (aarch64_sve_mode_p (mode)) 13782 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode)); 13783 machine_mode mmsk = (VECTOR_MODE_P (mode) 13784 ? related_int_vector_mode (mode).require () 13785 : int_mode_for_mode (mode).require ()); 13786 rtx xmsk = NULL_RTX; 13787 if (!recp) 13788 { 13789 /* When calculating the approximate square root, compare the 13790 argument with 0.0 and create a mask. */ 13791 rtx zero = CONST0_RTX (mode); 13792 if (pg) 13793 { 13794 xmsk = gen_reg_rtx (GET_MODE (pg)); 13795 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode); 13796 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode, 13797 xmsk, pg, hint, src, zero)); 13798 } 13799 else 13800 { 13801 xmsk = gen_reg_rtx (mmsk); 13802 emit_insn (gen_rtx_SET (xmsk, 13803 gen_rtx_NEG (mmsk, 13804 gen_rtx_EQ (mmsk, src, zero)))); 13805 } 13806 } 13807 13808 /* Estimate the approximate reciprocal square root. */ 13809 rtx xdst = gen_reg_rtx (mode); 13810 emit_insn (gen_aarch64_rsqrte (mode, xdst, src)); 13811 13812 /* Iterate over the series twice for SF and thrice for DF. */ 13813 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2; 13814 13815 /* Optionally iterate over the series once less for faster performance 13816 while sacrificing the accuracy. 
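   Each FRSQRTS step in the loop below performs the Newton-Raphson refinement x_{n+1} = x_n * (3 - d * x_n * x_n) / 2 for 1/sqrt(d): x2 holds x_n * x_n, FRSQRTS returns (3 - d * x2) / 2, and the following multiply folds that correction back into the estimate.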
*/ 13817 if ((recp && flag_mrecip_low_precision_sqrt) 13818 || (!recp && flag_mlow_precision_sqrt)) 13819 iterations--; 13820 13821 /* Iterate over the series to calculate the approximate reciprocal square 13822 root. */ 13823 rtx x1 = gen_reg_rtx (mode); 13824 while (iterations--) 13825 { 13826 rtx x2 = gen_reg_rtx (mode); 13827 aarch64_emit_mult (x2, pg, xdst, xdst); 13828 13829 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2)); 13830 13831 if (iterations > 0) 13832 aarch64_emit_mult (xdst, pg, xdst, x1); 13833 } 13834 13835 if (!recp) 13836 { 13837 if (pg) 13838 /* Multiply nonzero source values by the corresponding intermediate 13839 result elements, so that the final calculation is the approximate 13840 square root rather than its reciprocal. Select a zero result for 13841 zero source values, to avoid the Inf * 0 -> NaN that we'd get 13842 otherwise. */ 13843 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode, 13844 xdst, xmsk, xdst, src, CONST0_RTX (mode))); 13845 else 13846 { 13847 /* Qualify the approximate reciprocal square root when the 13848 argument is 0.0 by squashing the intermediary result to 0.0. */ 13849 rtx xtmp = gen_reg_rtx (mmsk); 13850 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk), 13851 gen_rtx_SUBREG (mmsk, xdst, 0))); 13852 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0)); 13853 13854 /* Calculate the approximate square root. */ 13855 aarch64_emit_mult (xdst, pg, xdst, src); 13856 } 13857 } 13858 13859 /* Finalize the approximation. */ 13860 aarch64_emit_mult (dst, pg, xdst, x1); 13861 13862 return true; 13863} 13864 13865/* Emit the instruction sequence to compute the approximation for the division 13866 of NUM by DEN in QUO and return whether the sequence was emitted or not. */ 13867 13868bool 13869aarch64_emit_approx_div (rtx quo, rtx num, rtx den) 13870{ 13871 machine_mode mode = GET_MODE (quo); 13872 13873 if (GET_MODE_INNER (mode) == HFmode) 13874 return false; 13875 13876 bool use_approx_division_p = (flag_mlow_precision_div 13877 || (aarch64_tune_params.approx_modes->division 13878 & AARCH64_APPROX_MODE (mode))); 13879 13880 if (!flag_finite_math_only 13881 || flag_trapping_math 13882 || !flag_unsafe_math_optimizations 13883 || optimize_function_for_size_p (cfun) 13884 || !use_approx_division_p) 13885 return false; 13886 13887 if (!TARGET_SIMD && VECTOR_MODE_P (mode)) 13888 return false; 13889 13890 rtx pg = NULL_RTX; 13891 if (aarch64_sve_mode_p (mode)) 13892 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode)); 13893 13894 /* Estimate the approximate reciprocal. */ 13895 rtx xrcp = gen_reg_rtx (mode); 13896 emit_insn (gen_aarch64_frecpe (mode, xrcp, den)); 13897 13898 /* Iterate over the series twice for SF and thrice for DF. */ 13899 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2; 13900 13901 /* Optionally iterate over the series less for faster performance, 13902 while sacrificing the accuracy. The default is 2 for DF and 1 for SF. */ 13903 if (flag_mlow_precision_div) 13904 iterations = (GET_MODE_INNER (mode) == DFmode 13905 ? aarch64_double_recp_precision 13906 : aarch64_float_recp_precision); 13907 13908 /* Iterate over the series to calculate the approximate reciprocal. 
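   Each FRECPS step performs the Newton-Raphson refinement x_{n+1} = x_n * (2 - d * x_n) for 1/d: FRECPS returns (2 - den * xrcp) in xtmp and the following multiply folds that correction back into xrcp, with the last correction applied when QUO is formed.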
*/ 13909 rtx xtmp = gen_reg_rtx (mode); 13910 while (iterations--) 13911 { 13912 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den)); 13913 13914 if (iterations > 0) 13915 aarch64_emit_mult (xrcp, pg, xrcp, xtmp); 13916 } 13917 13918 if (num != CONST1_RTX (mode)) 13919 { 13920 /* As the approximate reciprocal of DEN is already calculated, only 13921 calculate the approximate division when NUM is not 1.0. */ 13922 rtx xnum = force_reg (mode, num); 13923 aarch64_emit_mult (xrcp, pg, xrcp, xnum); 13924 } 13925 13926 /* Finalize the approximation. */ 13927 aarch64_emit_mult (quo, pg, xrcp, xtmp); 13928 return true; 13929} 13930 13931/* Return the number of instructions that can be issued per cycle. */ 13932static int 13933aarch64_sched_issue_rate (void) 13934{ 13935 return aarch64_tune_params.issue_rate; 13936} 13937 13938/* Implement TARGET_SCHED_VARIABLE_ISSUE. */ 13939static int 13940aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more) 13941{ 13942 if (DEBUG_INSN_P (insn)) 13943 return more; 13944 13945 rtx_code code = GET_CODE (PATTERN (insn)); 13946 if (code == USE || code == CLOBBER) 13947 return more; 13948 13949 if (get_attr_type (insn) == TYPE_NO_INSN) 13950 return more; 13951 13952 return more - 1; 13953} 13954 13955static int 13956aarch64_sched_first_cycle_multipass_dfa_lookahead (void) 13957{ 13958 int issue_rate = aarch64_sched_issue_rate (); 13959 13960 return issue_rate > 1 && !sched_fusion ? issue_rate : 0; 13961} 13962 13963 13964/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as 13965 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only 13966 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */ 13967 13968static int 13969aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn, 13970 int ready_index) 13971{ 13972 return autopref_multipass_dfa_lookahead_guard (insn, ready_index); 13973} 13974 13975 13976/* Vectorizer cost model target hooks. */ 13977 13978/* Implement targetm.vectorize.builtin_vectorization_cost. */ 13979static int 13980aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, 13981 tree vectype, 13982 int misalign ATTRIBUTE_UNUSED) 13983{ 13984 unsigned elements; 13985 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs; 13986 bool fp = false; 13987 13988 if (vectype != NULL) 13989 fp = FLOAT_TYPE_P (vectype); 13990 13991 switch (type_of_cost) 13992 { 13993 case scalar_stmt: 13994 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost; 13995 13996 case scalar_load: 13997 return costs->scalar_load_cost; 13998 13999 case scalar_store: 14000 return costs->scalar_store_cost; 14001 14002 case vector_stmt: 14003 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost; 14004 14005 case vector_load: 14006 return costs->vec_align_load_cost; 14007 14008 case vector_store: 14009 return costs->vec_store_cost; 14010 14011 case vec_to_scalar: 14012 return costs->vec_to_scalar_cost; 14013 14014 case scalar_to_vec: 14015 return costs->scalar_to_vec_cost; 14016 14017 case unaligned_load: 14018 case vector_gather_load: 14019 return costs->vec_unalign_load_cost; 14020 14021 case unaligned_store: 14022 case vector_scatter_store: 14023 return costs->vec_unalign_store_cost; 14024 14025 case cond_branch_taken: 14026 return costs->cond_taken_branch_cost; 14027 14028 case cond_branch_not_taken: 14029 return costs->cond_not_taken_branch_cost; 14030 14031 case vec_perm: 14032 return costs->vec_permute_cost; 14033 14034 case vec_promote_demote: 14035 return fp ? 
costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost; 14036 14037 case vec_construct: 14038 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)); 14039 return elements / 2 + 1; 14040 14041 default: 14042 gcc_unreachable (); 14043 } 14044} 14045 14046/* Return true if creating multiple copies of STMT_INFO for Advanced SIMD 14047 vectors would produce a series of LDP or STP operations. KIND is the 14048 kind of statement that STMT_INFO represents. */ 14049static bool 14050aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind, 14051 stmt_vec_info stmt_info) 14052{ 14053 switch (kind) 14054 { 14055 case vector_load: 14056 case vector_store: 14057 case unaligned_load: 14058 case unaligned_store: 14059 break; 14060 14061 default: 14062 return false; 14063 } 14064 14065 if (aarch64_tune_params.extra_tuning_flags 14066 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) 14067 return false; 14068 14069 return is_gimple_assign (stmt_info->stmt); 14070} 14071 14072/* Return true if STMT_INFO extends the result of a load. */ 14073static bool 14074aarch64_extending_load_p (stmt_vec_info stmt_info) 14075{ 14076 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt); 14077 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))) 14078 return false; 14079 14080 tree rhs = gimple_assign_rhs1 (stmt_info->stmt); 14081 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign)); 14082 tree rhs_type = TREE_TYPE (rhs); 14083 if (!INTEGRAL_TYPE_P (lhs_type) 14084 || !INTEGRAL_TYPE_P (rhs_type) 14085 || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type)) 14086 return false; 14087 14088 stmt_vec_info def_stmt_info = stmt_info->vinfo->lookup_def (rhs); 14089 return (def_stmt_info 14090 && STMT_VINFO_DATA_REF (def_stmt_info) 14091 && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info))); 14092} 14093 14094/* Return true if STMT_INFO is an integer truncation. */ 14095static bool 14096aarch64_integer_truncation_p (stmt_vec_info stmt_info) 14097{ 14098 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt); 14099 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))) 14100 return false; 14101 14102 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign)); 14103 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign)); 14104 return (INTEGRAL_TYPE_P (lhs_type) 14105 && INTEGRAL_TYPE_P (rhs_type) 14106 && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type)); 14107} 14108 14109/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost 14110 for STMT_INFO, which has cost kind KIND and which when vectorized would 14111 operate on vector type VECTYPE. Adjust the cost as necessary for SVE 14112 targets. */ 14113static unsigned int 14114aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind, 14115 stmt_vec_info stmt_info, tree vectype, 14116 unsigned int stmt_cost) 14117{ 14118 /* Unlike vec_promote_demote, vector_stmt conversions do not change the 14119 vector register size or number of units. Integer promotions of this 14120 type therefore map to SXT[BHW] or UXT[BHW]. 14121 14122 Most loads have extending forms that can do the sign or zero extension 14123 on the fly. Optimistically assume that a load followed by an extension 14124 will fold to this form during combine, and that the extension therefore 14125 comes for free. */ 14126 if (kind == vector_stmt && aarch64_extending_load_p (stmt_info)) 14127 stmt_cost = 0; 14128 14129 /* For similar reasons, vector_stmt integer truncations are a no-op, 14130 because we can just ignore the unused upper bits of the source. 
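   For example, narrowing a vector of ints to shorts only needs the low 16 bits of each 32-bit container, so no instruction is required and the statement is costed as free below.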
*/ 14131 if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info)) 14132 stmt_cost = 0; 14133 14134 /* Advanced SIMD can load and store pairs of registers using LDP and STP, 14135 but there are no equivalent instructions for SVE. This means that 14136 (all other things being equal) 128-bit SVE needs twice as many load 14137 and store instructions as Advanced SIMD in order to process vector pairs. 14138 14139 Also, scalar code can often use LDP and STP to access pairs of values, 14140 so it is too simplistic to say that one SVE load or store replaces 14141 VF scalar loads and stores. 14142 14143 Ideally we would account for this in the scalar and Advanced SIMD 14144 costs by making suitable load/store pairs as cheap as a single 14145 load/store. However, that would be a very invasive change and in 14146 practice it tends to stress other parts of the cost model too much. 14147 E.g. stores of scalar constants currently count just a store, 14148 whereas stores of vector constants count a store and a vec_init. 14149 This is an artificial distinction for AArch64, where stores of 14150 nonzero scalar constants need the same kind of register invariant 14151 as vector stores. 14152 14153 An alternative would be to double the cost of any SVE loads and stores 14154 that could be paired in Advanced SIMD (and possibly also paired in 14155 scalar code). But this tends to stress other parts of the cost model 14156 in the same way. It also means that we can fall back to Advanced SIMD 14157 even if full-loop predication would have been useful. 14158 14159 Here we go for a more conservative version: double the costs of SVE 14160 loads and stores if one iteration of the scalar loop processes enough 14161 elements for it to use a whole number of Advanced SIMD LDP or STP 14162 instructions. This makes it very likely that the VF would be 1 for 14163 Advanced SIMD, and so no epilogue should be needed. */ 14164 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 14165 { 14166 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info); 14167 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first); 14168 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)); 14169 if (multiple_p (count * elt_bits, 256) 14170 && aarch64_advsimd_ldp_stp_p (kind, stmt_info)) 14171 stmt_cost *= 2; 14172 } 14173 14174 return stmt_cost; 14175} 14176 14177/* Implement targetm.vectorize.add_stmt_cost. */ 14178static unsigned 14179aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind, 14180 struct _stmt_vec_info *stmt_info, int misalign, 14181 enum vect_cost_model_location where) 14182{ 14183 unsigned *cost = (unsigned *) data; 14184 unsigned retval = 0; 14185 14186 if (flag_vect_cost_model) 14187 { 14188 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE; 14189 int stmt_cost = 14190 aarch64_builtin_vectorization_cost (kind, vectype, misalign); 14191 14192 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype))) 14193 stmt_cost = aarch64_sve_adjust_stmt_cost (kind, stmt_info, vectype, 14194 stmt_cost); 14195 14196 /* Statements in an inner loop relative to the loop being 14197 vectorized are weighted more heavily. The value here is 14198 arbitrary and could potentially be improved with analysis. 
*/ 14199 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info)) 14200 count *= 50; /* FIXME */ 14201 14202 retval = (unsigned) (count * stmt_cost); 14203 cost[where] += retval; 14204 } 14205 14206 return retval; 14207} 14208 14209static void initialize_aarch64_code_model (struct gcc_options *); 14210 14211/* Parse the TO_PARSE string and put the architecture struct that it 14212 selects into RES and the architectural features into ISA_FLAGS. 14213 Return an aarch64_parse_opt_result describing the parse result. 14214 If there is an error parsing, RES and ISA_FLAGS are left unchanged. 14215 When the TO_PARSE string contains an invalid extension, 14216 a copy of the string is created and stored to INVALID_EXTENSION. */ 14217 14218static enum aarch64_parse_opt_result 14219aarch64_parse_arch (const char *to_parse, const struct processor **res, 14220 uint64_t *isa_flags, std::string *invalid_extension) 14221{ 14222 const char *ext; 14223 const struct processor *arch; 14224 size_t len; 14225 14226 ext = strchr (to_parse, '+'); 14227 14228 if (ext != NULL) 14229 len = ext - to_parse; 14230 else 14231 len = strlen (to_parse); 14232 14233 if (len == 0) 14234 return AARCH64_PARSE_MISSING_ARG; 14235 14236 14237 /* Loop through the list of supported ARCHes to find a match. */ 14238 for (arch = all_architectures; arch->name != NULL; arch++) 14239 { 14240 if (strlen (arch->name) == len 14241 && strncmp (arch->name, to_parse, len) == 0) 14242 { 14243 uint64_t isa_temp = arch->flags; 14244 14245 if (ext != NULL) 14246 { 14247 /* TO_PARSE string contains at least one extension. */ 14248 enum aarch64_parse_opt_result ext_res 14249 = aarch64_parse_extension (ext, &isa_temp, invalid_extension); 14250 14251 if (ext_res != AARCH64_PARSE_OK) 14252 return ext_res; 14253 } 14254 /* Extension parsing was successful. Confirm the result 14255 arch and ISA flags. */ 14256 *res = arch; 14257 *isa_flags = isa_temp; 14258 return AARCH64_PARSE_OK; 14259 } 14260 } 14261 14262 /* ARCH name not found in list. */ 14263 return AARCH64_PARSE_INVALID_ARG; 14264} 14265 14266/* Parse the TO_PARSE string and put the result tuning in RES and the 14267 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result 14268 describing the parse result. If there is an error parsing, RES and 14269 ISA_FLAGS are left unchanged. 14270 When the TO_PARSE string contains an invalid extension, 14271 a copy of the string is created and stored to INVALID_EXTENSION. */ 14272 14273static enum aarch64_parse_opt_result 14274aarch64_parse_cpu (const char *to_parse, const struct processor **res, 14275 uint64_t *isa_flags, std::string *invalid_extension) 14276{ 14277 const char *ext; 14278 const struct processor *cpu; 14279 size_t len; 14280 14281 ext = strchr (to_parse, '+'); 14282 14283 if (ext != NULL) 14284 len = ext - to_parse; 14285 else 14286 len = strlen (to_parse); 14287 14288 if (len == 0) 14289 return AARCH64_PARSE_MISSING_ARG; 14290 14291 14292 /* Loop through the list of supported CPUs to find a match. */ 14293 for (cpu = all_cores; cpu->name != NULL; cpu++) 14294 { 14295 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0) 14296 { 14297 uint64_t isa_temp = cpu->flags; 14298 14299 14300 if (ext != NULL) 14301 { 14302 /* TO_PARSE string contains at least one extension. */ 14303 enum aarch64_parse_opt_result ext_res 14304 = aarch64_parse_extension (ext, &isa_temp, invalid_extension); 14305 14306 if (ext_res != AARCH64_PARSE_OK) 14307 return ext_res; 14308 } 14309 /* Extension parsing was successful. 
Confirm the result 14310 cpu and ISA flags. */ 14311 *res = cpu; 14312 *isa_flags = isa_temp; 14313 return AARCH64_PARSE_OK; 14314 } 14315 } 14316 14317 /* CPU name not found in list. */ 14318 return AARCH64_PARSE_INVALID_ARG; 14319} 14320 14321/* Parse the TO_PARSE string and put the cpu it selects into RES. 14322 Return an aarch64_parse_opt_result describing the parse result. 14323 If the parsing fails the RES does not change. */ 14324 14325static enum aarch64_parse_opt_result 14326aarch64_parse_tune (const char *to_parse, const struct processor **res) 14327{ 14328 const struct processor *cpu; 14329 14330 /* Loop through the list of supported CPUs to find a match. */ 14331 for (cpu = all_cores; cpu->name != NULL; cpu++) 14332 { 14333 if (strcmp (cpu->name, to_parse) == 0) 14334 { 14335 *res = cpu; 14336 return AARCH64_PARSE_OK; 14337 } 14338 } 14339 14340 /* CPU name not found in list. */ 14341 return AARCH64_PARSE_INVALID_ARG; 14342} 14343 14344/* Parse TOKEN, which has length LENGTH to see if it is an option 14345 described in FLAG. If it is, return the index bit for that fusion type. 14346 If not, error (printing OPTION_NAME) and return zero. */ 14347 14348static unsigned int 14349aarch64_parse_one_option_token (const char *token, 14350 size_t length, 14351 const struct aarch64_flag_desc *flag, 14352 const char *option_name) 14353{ 14354 for (; flag->name != NULL; flag++) 14355 { 14356 if (length == strlen (flag->name) 14357 && !strncmp (flag->name, token, length)) 14358 return flag->flag; 14359 } 14360 14361 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token); 14362 return 0; 14363} 14364 14365/* Parse OPTION which is a comma-separated list of flags to enable. 14366 FLAGS gives the list of flags we understand, INITIAL_STATE gives any 14367 default state we inherit from the CPU tuning structures. OPTION_NAME 14368 gives the top-level option we are parsing in the -moverride string, 14369 for use in error messages. */ 14370 14371static unsigned int 14372aarch64_parse_boolean_options (const char *option, 14373 const struct aarch64_flag_desc *flags, 14374 unsigned int initial_state, 14375 const char *option_name) 14376{ 14377 const char separator = '.'; 14378 const char* specs = option; 14379 const char* ntoken = option; 14380 unsigned int found_flags = initial_state; 14381 14382 while ((ntoken = strchr (specs, separator))) 14383 { 14384 size_t token_length = ntoken - specs; 14385 unsigned token_ops = aarch64_parse_one_option_token (specs, 14386 token_length, 14387 flags, 14388 option_name); 14389 /* If we find "none" (or, for simplicity's sake, an error) anywhere 14390 in the token stream, reset the supported operations. So: 14391 14392 adrp+add.cmp+branch.none.adrp+add 14393 14394 would have the result of turning on only adrp+add fusion. */ 14395 if (!token_ops) 14396 found_flags = 0; 14397 14398 found_flags |= token_ops; 14399 specs = ++ntoken; 14400 } 14401 14402 /* We ended with a comma, print something. */ 14403 if (!(*specs)) 14404 { 14405 error ("%s string ill-formed\n", option_name); 14406 return 0; 14407 } 14408 14409 /* We still have one more token to parse. */ 14410 size_t token_length = strlen (specs); 14411 unsigned token_ops = aarch64_parse_one_option_token (specs, 14412 token_length, 14413 flags, 14414 option_name); 14415 if (!token_ops) 14416 found_flags = 0; 14417 14418 found_flags |= token_ops; 14419 return found_flags; 14420} 14421 14422/* Support for overriding instruction fusion. 
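   For example, -moverride=fuse=adrp+add.cmp+branch enables those two fusion
   pairs on top of whatever the selected tuning already allows; starting the
   list with "none." first clears the inherited set, as described above.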
*/ 14423 14424static void 14425aarch64_parse_fuse_string (const char *fuse_string, 14426 struct tune_params *tune) 14427{ 14428 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string, 14429 aarch64_fusible_pairs, 14430 tune->fusible_ops, 14431 "fuse="); 14432} 14433 14434/* Support for overriding other tuning flags. */ 14435 14436static void 14437aarch64_parse_tune_string (const char *tune_string, 14438 struct tune_params *tune) 14439{ 14440 tune->extra_tuning_flags 14441 = aarch64_parse_boolean_options (tune_string, 14442 aarch64_tuning_flags, 14443 tune->extra_tuning_flags, 14444 "tune="); 14445} 14446 14447/* Parse the sve_width tuning moverride string in TUNE_STRING. 14448 Accept the valid SVE vector widths allowed by 14449 aarch64_sve_vector_bits_enum and use it to override sve_width 14450 in TUNE. */ 14451 14452static void 14453aarch64_parse_sve_width_string (const char *tune_string, 14454 struct tune_params *tune) 14455{ 14456 int width = -1; 14457 14458 int n = sscanf (tune_string, "%d", &width); 14459 if (n == EOF) 14460 { 14461 error ("invalid format for sve_width"); 14462 return; 14463 } 14464 switch (width) 14465 { 14466 case SVE_128: 14467 case SVE_256: 14468 case SVE_512: 14469 case SVE_1024: 14470 case SVE_2048: 14471 break; 14472 default: 14473 error ("invalid sve_width value: %d", width); 14474 } 14475 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width; 14476} 14477 14478/* Parse TOKEN, which has length LENGTH to see if it is a tuning option 14479 we understand. If it is, extract the option string and handoff to 14480 the appropriate function. */ 14481 14482void 14483aarch64_parse_one_override_token (const char* token, 14484 size_t length, 14485 struct tune_params *tune) 14486{ 14487 const struct aarch64_tuning_override_function *fn 14488 = aarch64_tuning_override_functions; 14489 14490 const char *option_part = strchr (token, '='); 14491 if (!option_part) 14492 { 14493 error ("tuning string missing in option (%s)", token); 14494 return; 14495 } 14496 14497 /* Get the length of the option name. */ 14498 length = option_part - token; 14499 /* Skip the '=' to get to the option string. */ 14500 option_part++; 14501 14502 for (; fn->name != NULL; fn++) 14503 { 14504 if (!strncmp (fn->name, token, length)) 14505 { 14506 fn->parse_override (option_part, tune); 14507 return; 14508 } 14509 } 14510 14511 error ("unknown tuning option (%s)",token); 14512 return; 14513} 14514 14515/* A checking mechanism for the implementation of the tls size. */ 14516 14517static void 14518initialize_aarch64_tls_size (struct gcc_options *opts) 14519{ 14520 if (aarch64_tls_size == 0) 14521 aarch64_tls_size = 24; 14522 14523 switch (opts->x_aarch64_cmodel_var) 14524 { 14525 case AARCH64_CMODEL_TINY: 14526 /* Both the default and maximum TLS size allowed under tiny is 1M which 14527 needs two instructions to address, so we clamp the size to 24. */ 14528 if (aarch64_tls_size > 24) 14529 aarch64_tls_size = 24; 14530 break; 14531 case AARCH64_CMODEL_SMALL: 14532 /* The maximum TLS size allowed under small is 4G. */ 14533 if (aarch64_tls_size > 32) 14534 aarch64_tls_size = 32; 14535 break; 14536 case AARCH64_CMODEL_LARGE: 14537 /* The maximum TLS size allowed under large is 16E. 14538 FIXME: 16E should be 64bit, we only support 48bit offset now. 
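   Hence anything larger than 48 is clamped to 48 just below, mirroring the
   way the small model clamps -mtls-size values above 32 and the tiny model
   clamps values above 24.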
*/ 14539 if (aarch64_tls_size > 48) 14540 aarch64_tls_size = 48; 14541 break; 14542 default: 14543 gcc_unreachable (); 14544 } 14545 14546 return; 14547} 14548 14549/* Parse STRING looking for options in the format: 14550 string :: option:string 14551 option :: name=substring 14552 name :: {a-z} 14553 substring :: defined by option. */ 14554 14555static void 14556aarch64_parse_override_string (const char* input_string, 14557 struct tune_params* tune) 14558{ 14559 const char separator = ':'; 14560 size_t string_length = strlen (input_string) + 1; 14561 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length); 14562 char *string = string_root; 14563 strncpy (string, input_string, string_length); 14564 string[string_length - 1] = '\0'; 14565 14566 char* ntoken = string; 14567 14568 while ((ntoken = strchr (string, separator))) 14569 { 14570 size_t token_length = ntoken - string; 14571 /* Make this substring look like a string. */ 14572 *ntoken = '\0'; 14573 aarch64_parse_one_override_token (string, token_length, tune); 14574 string = ++ntoken; 14575 } 14576 14577 /* One last option to parse. */ 14578 aarch64_parse_one_override_token (string, strlen (string), tune); 14579 free (string_root); 14580} 14581 14582/* Adjust CURRENT_TUNE (a generic tuning struct) with settings that 14583 are best for a generic target with the currently-enabled architecture 14584 extensions. */ 14585static void 14586aarch64_adjust_generic_arch_tuning (struct tune_params ¤t_tune) 14587{ 14588 /* Neoverse V1 is the only core that is known to benefit from 14589 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no 14590 point enabling it for SVE2 and above. */ 14591 if (TARGET_SVE2) 14592 current_tune.extra_tuning_flags 14593 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS; 14594} 14595 14596static void 14597aarch64_override_options_after_change_1 (struct gcc_options *opts) 14598{ 14599 if (accepted_branch_protection_string) 14600 { 14601 opts->x_aarch64_branch_protection_string 14602 = xstrdup (accepted_branch_protection_string); 14603 } 14604 14605 /* PR 70044: We have to be careful about being called multiple times for the 14606 same function. This means all changes should be repeatable. */ 14607 14608 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer. 14609 Disable the frame pointer flag so the mid-end will not use a frame 14610 pointer in leaf functions in order to support -fomit-leaf-frame-pointer. 14611 Set x_flag_omit_frame_pointer to the special value 2 to differentiate 14612 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */ 14613 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1; 14614 if (opts->x_flag_omit_frame_pointer == 0) 14615 opts->x_flag_omit_frame_pointer = 2; 14616 14617 /* If not optimizing for size, set the default 14618 alignment to what the target wants. */ 14619 if (!opts->x_optimize_size) 14620 { 14621 if (opts->x_flag_align_loops && !opts->x_str_align_loops) 14622 opts->x_str_align_loops = aarch64_tune_params.loop_align; 14623 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps) 14624 opts->x_str_align_jumps = aarch64_tune_params.jump_align; 14625 if (opts->x_flag_align_functions && !opts->x_str_align_functions) 14626 opts->x_str_align_functions = aarch64_tune_params.function_align; 14627 } 14628 14629 /* We default to no pc-relative literal loads. 
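   They are re-enabled below when -mpc-relative-literal-loads is given
   explicitly, or when the tiny code model is in use and literal pools are
   within PC-relative reach anyway.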
*/ 14630 14631 aarch64_pcrelative_literal_loads = false; 14632 14633 /* If -mpc-relative-literal-loads is set on the command line, this 14634 implies that the user asked for PC relative literal loads. */ 14635 if (opts->x_pcrelative_literal_loads == 1) 14636 aarch64_pcrelative_literal_loads = true; 14637 14638 /* In the tiny memory model it makes no sense to disallow PC relative 14639 literal pool loads. */ 14640 if (aarch64_cmodel == AARCH64_CMODEL_TINY 14641 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC) 14642 aarch64_pcrelative_literal_loads = true; 14643 14644 /* When enabling the lower precision Newton series for the square root, also 14645 enable it for the reciprocal square root, since the latter is an 14646 intermediary step for the former. */ 14647 if (flag_mlow_precision_sqrt) 14648 flag_mrecip_low_precision_sqrt = true; 14649} 14650 14651/* 'Unpack' up the internal tuning structs and update the options 14652 in OPTS. The caller must have set up selected_tune and selected_arch 14653 as all the other target-specific codegen decisions are 14654 derived from them. */ 14655 14656void 14657aarch64_override_options_internal (struct gcc_options *opts) 14658{ 14659 aarch64_tune_flags = selected_tune->flags; 14660 aarch64_tune = selected_tune->sched_core; 14661 /* Make a copy of the tuning parameters attached to the core, which 14662 we may later overwrite. */ 14663 aarch64_tune_params = *(selected_tune->tune); 14664 aarch64_architecture_version = selected_arch->architecture_version; 14665 if (selected_tune->tune == &generic_tunings) 14666 aarch64_adjust_generic_arch_tuning (aarch64_tune_params); 14667 14668 if (opts->x_aarch64_override_tune_string) 14669 aarch64_parse_override_string (opts->x_aarch64_override_tune_string, 14670 &aarch64_tune_params); 14671 14672 /* This target defaults to strict volatile bitfields. 
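   That is, unless -fno-strict-volatile-bitfields is given, an access to a
   field such as

	struct io_reg { volatile unsigned int ctrl : 8; };

   is performed with the width of the declared container type (a 32-bit
   access here), which is what memory-mapped device registers typically
   expect.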
*/ 14673 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2)) 14674 opts->x_flag_strict_volatile_bitfields = 1; 14675 14676 if (aarch64_stack_protector_guard == SSP_GLOBAL 14677 && opts->x_aarch64_stack_protector_guard_offset_str) 14678 { 14679 error ("incompatible options %<-mstack-protector-guard=global%> and " 14680 "%<-mstack-protector-guard-offset=%s%>", 14681 aarch64_stack_protector_guard_offset_str); 14682 } 14683 14684 if (aarch64_stack_protector_guard == SSP_SYSREG 14685 && !(opts->x_aarch64_stack_protector_guard_offset_str 14686 && opts->x_aarch64_stack_protector_guard_reg_str)) 14687 { 14688 error ("both %<-mstack-protector-guard-offset%> and " 14689 "%<-mstack-protector-guard-reg%> must be used " 14690 "with %<-mstack-protector-guard=sysreg%>"); 14691 } 14692 14693 if (opts->x_aarch64_stack_protector_guard_reg_str) 14694 { 14695 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100) 14696 error ("specify a system register with a small string length."); 14697 } 14698 14699 if (opts->x_aarch64_stack_protector_guard_offset_str) 14700 { 14701 char *end; 14702 const char *str = aarch64_stack_protector_guard_offset_str; 14703 errno = 0; 14704 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0); 14705 if (!*str || *end || errno) 14706 error ("%qs is not a valid offset in %qs", str, 14707 "-mstack-protector-guard-offset="); 14708 aarch64_stack_protector_guard_offset = offs; 14709 } 14710 14711 initialize_aarch64_code_model (opts); 14712 initialize_aarch64_tls_size (opts); 14713 14714 int queue_depth = 0; 14715 switch (aarch64_tune_params.autoprefetcher_model) 14716 { 14717 case tune_params::AUTOPREFETCHER_OFF: 14718 queue_depth = -1; 14719 break; 14720 case tune_params::AUTOPREFETCHER_WEAK: 14721 queue_depth = 0; 14722 break; 14723 case tune_params::AUTOPREFETCHER_STRONG: 14724 queue_depth = max_insn_queue_index + 1; 14725 break; 14726 default: 14727 gcc_unreachable (); 14728 } 14729 14730 /* We don't mind passing in global_options_set here as we don't use 14731 the *options_set structs anyway. */ 14732 SET_OPTION_IF_UNSET (opts, &global_options_set, 14733 param_sched_autopref_queue_depth, queue_depth); 14734 14735 /* If the core wants only AdvancedSIMD autovectorization, do this through 14736 aarch64_autovec_preference. If the user set it explicitly, they should 14737 know what they want. */ 14738 if (aarch64_tune_params.extra_tuning_flags 14739 & AARCH64_EXTRA_TUNE_PREFER_ADVSIMD_AUTOVEC) 14740 SET_OPTION_IF_UNSET (opts, &global_options_set, 14741 aarch64_autovec_preference, 1); 14742 14743 /* If using Advanced SIMD only for autovectorization disable SVE vector costs 14744 comparison. */ 14745 if (aarch64_autovec_preference == 1) 14746 SET_OPTION_IF_UNSET (opts, &global_options_set, 14747 aarch64_sve_compare_costs, 0); 14748 14749 /* Set up parameters to be used in prefetching algorithm. Do not 14750 override the defaults unless we are tuning for a core we have 14751 researched values for. 
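   The per-core prefetch description feeds --param simultaneous-prefetches,
   --param l1-cache-size, --param l1-cache-line-size, --param l2-cache-size
   and the stride-related parameters below; SET_OPTION_IF_UNSET ensures an
   explicit --param on the command line always wins.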
*/ 14752 if (aarch64_tune_params.prefetch->num_slots > 0) 14753 SET_OPTION_IF_UNSET (opts, &global_options_set, 14754 param_simultaneous_prefetches, 14755 aarch64_tune_params.prefetch->num_slots); 14756 if (aarch64_tune_params.prefetch->l1_cache_size >= 0) 14757 SET_OPTION_IF_UNSET (opts, &global_options_set, 14758 param_l1_cache_size, 14759 aarch64_tune_params.prefetch->l1_cache_size); 14760 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0) 14761 SET_OPTION_IF_UNSET (opts, &global_options_set, 14762 param_l1_cache_line_size, 14763 aarch64_tune_params.prefetch->l1_cache_line_size); 14764 if (aarch64_tune_params.prefetch->l2_cache_size >= 0) 14765 SET_OPTION_IF_UNSET (opts, &global_options_set, 14766 param_l2_cache_size, 14767 aarch64_tune_params.prefetch->l2_cache_size); 14768 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides) 14769 SET_OPTION_IF_UNSET (opts, &global_options_set, 14770 param_prefetch_dynamic_strides, 0); 14771 if (aarch64_tune_params.prefetch->minimum_stride >= 0) 14772 SET_OPTION_IF_UNSET (opts, &global_options_set, 14773 param_prefetch_minimum_stride, 14774 aarch64_tune_params.prefetch->minimum_stride); 14775 14776 /* Use the alternative scheduling-pressure algorithm by default. */ 14777 SET_OPTION_IF_UNSET (opts, &global_options_set, 14778 param_sched_pressure_algorithm, 14779 SCHED_PRESSURE_MODEL); 14780 14781 /* Validate the guard size. */ 14782 int guard_size = param_stack_clash_protection_guard_size; 14783 14784 if (guard_size != 12 && guard_size != 16) 14785 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard " 14786 "size. Given value %d (%llu KB) is out of range", 14787 guard_size, (1ULL << guard_size) / 1024ULL); 14788 14789 /* Enforce that interval is the same size as size so the mid-end does the 14790 right thing. */ 14791 SET_OPTION_IF_UNSET (opts, &global_options_set, 14792 param_stack_clash_protection_probe_interval, 14793 guard_size); 14794 14795 /* The maybe_set calls won't update the value if the user has explicitly set 14796 one. Which means we need to validate that probing interval and guard size 14797 are equal. */ 14798 int probe_interval 14799 = param_stack_clash_protection_probe_interval; 14800 if (guard_size != probe_interval) 14801 error ("stack clash guard size %<%d%> must be equal to probing interval " 14802 "%<%d%>", guard_size, probe_interval); 14803 14804 /* Enable sw prefetching at specified optimization level for 14805 CPUS that have prefetch. Lower optimization level threshold by 1 14806 when profiling is enabled. */ 14807 if (opts->x_flag_prefetch_loop_arrays < 0 14808 && !opts->x_optimize_size 14809 && aarch64_tune_params.prefetch->default_opt_level >= 0 14810 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level) 14811 opts->x_flag_prefetch_loop_arrays = 1; 14812 14813 if (opts->x_aarch64_arch_string == NULL) 14814 opts->x_aarch64_arch_string = selected_arch->name; 14815 if (opts->x_aarch64_cpu_string == NULL) 14816 opts->x_aarch64_cpu_string = selected_cpu->name; 14817 if (opts->x_aarch64_tune_string == NULL) 14818 opts->x_aarch64_tune_string = selected_tune->name; 14819 14820 aarch64_override_options_after_change_1 (opts); 14821} 14822 14823/* Print a hint with a suggestion for a core or architecture name that 14824 most closely resembles what the user passed in STR. ARCH is true if 14825 the user is asking for an architecture name. ARCH is false if the user 14826 is asking for a core name. 
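   For example, after the 'unknown value' error for a misspelt
   -mcpu=cortex-a72x, this prints something like:
   valid arguments are: cortex-a35 ... cortex-a72 ...; did you mean
   'cortex-a72'?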
*/ 14827 14828static void 14829aarch64_print_hint_for_core_or_arch (const char *str, bool arch) 14830{ 14831 auto_vec<const char *> candidates; 14832 const struct processor *entry = arch ? all_architectures : all_cores; 14833 for (; entry->name != NULL; entry++) 14834 candidates.safe_push (entry->name); 14835 14836#ifdef HAVE_LOCAL_CPU_DETECT 14837 /* Add also "native" as possible value. */ 14838 if (arch) 14839 candidates.safe_push ("native"); 14840#endif 14841 14842 char *s; 14843 const char *hint = candidates_list_and_hint (str, s, candidates); 14844 if (hint) 14845 inform (input_location, "valid arguments are: %s;" 14846 " did you mean %qs?", s, hint); 14847 else 14848 inform (input_location, "valid arguments are: %s", s); 14849 14850 XDELETEVEC (s); 14851} 14852 14853/* Print a hint with a suggestion for a core name that most closely resembles 14854 what the user passed in STR. */ 14855 14856inline static void 14857aarch64_print_hint_for_core (const char *str) 14858{ 14859 aarch64_print_hint_for_core_or_arch (str, false); 14860} 14861 14862/* Print a hint with a suggestion for an architecture name that most closely 14863 resembles what the user passed in STR. */ 14864 14865inline static void 14866aarch64_print_hint_for_arch (const char *str) 14867{ 14868 aarch64_print_hint_for_core_or_arch (str, true); 14869} 14870 14871 14872/* Print a hint with a suggestion for an extension name 14873 that most closely resembles what the user passed in STR. */ 14874 14875void 14876aarch64_print_hint_for_extensions (const std::string &str) 14877{ 14878 auto_vec<const char *> candidates; 14879 aarch64_get_all_extension_candidates (&candidates); 14880 char *s; 14881 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates); 14882 if (hint) 14883 inform (input_location, "valid arguments are: %s;" 14884 " did you mean %qs?", s, hint); 14885 else 14886 inform (input_location, "valid arguments are: %s;", s); 14887 14888 XDELETEVEC (s); 14889} 14890 14891/* Validate a command-line -mcpu option. Parse the cpu and extensions (if any) 14892 specified in STR and throw errors if appropriate. Put the results if 14893 they are valid in RES and ISA_FLAGS. Return whether the option is 14894 valid. */ 14895 14896static bool 14897aarch64_validate_mcpu (const char *str, const struct processor **res, 14898 uint64_t *isa_flags) 14899{ 14900 std::string invalid_extension; 14901 enum aarch64_parse_opt_result parse_res 14902 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension); 14903 14904 if (parse_res == AARCH64_PARSE_OK) 14905 return true; 14906 14907 switch (parse_res) 14908 { 14909 case AARCH64_PARSE_MISSING_ARG: 14910 error ("missing cpu name in %<-mcpu=%s%>", str); 14911 break; 14912 case AARCH64_PARSE_INVALID_ARG: 14913 error ("unknown value %qs for %<-mcpu%>", str); 14914 aarch64_print_hint_for_core (str); 14915 break; 14916 case AARCH64_PARSE_INVALID_FEATURE: 14917 error ("invalid feature modifier %qs in %<-mcpu=%s%>", 14918 invalid_extension.c_str (), str); 14919 aarch64_print_hint_for_extensions (invalid_extension); 14920 break; 14921 default: 14922 gcc_unreachable (); 14923 } 14924 14925 return false; 14926} 14927 14928/* Straight line speculation indicators. */ 14929enum aarch64_sls_hardening_type 14930{ 14931 SLS_NONE = 0, 14932 SLS_RETBR = 1, 14933 SLS_BLR = 2, 14934 SLS_ALL = 3, 14935}; 14936static enum aarch64_sls_hardening_type aarch64_sls_hardening; 14937 14938/* Return whether we should mitigatate Straight Line Speculation for the RET 14939 and BR instructions. 
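   This is true for -mharden-sls=retbr, for -mharden-sls=all, and for any
   comma-separated list that includes "retbr" (for example
   -mharden-sls=retbr,blr).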
*/ 14940bool 14941aarch64_harden_sls_retbr_p (void) 14942{ 14943 return aarch64_sls_hardening & SLS_RETBR; 14944} 14945 14946/* Return whether we should mitigatate Straight Line Speculation for the BLR 14947 instruction. */ 14948bool 14949aarch64_harden_sls_blr_p (void) 14950{ 14951 return aarch64_sls_hardening & SLS_BLR; 14952} 14953 14954/* As of yet we only allow setting these options globally, in the future we may 14955 allow setting them per function. */ 14956static void 14957aarch64_validate_sls_mitigation (const char *const_str) 14958{ 14959 char *token_save = NULL; 14960 char *str = NULL; 14961 14962 if (strcmp (const_str, "none") == 0) 14963 { 14964 aarch64_sls_hardening = SLS_NONE; 14965 return; 14966 } 14967 if (strcmp (const_str, "all") == 0) 14968 { 14969 aarch64_sls_hardening = SLS_ALL; 14970 return; 14971 } 14972 14973 char *str_root = xstrdup (const_str); 14974 str = strtok_r (str_root, ",", &token_save); 14975 if (!str) 14976 error ("invalid argument given to %<-mharden-sls=%>"); 14977 14978 int temp = SLS_NONE; 14979 while (str) 14980 { 14981 if (strcmp (str, "blr") == 0) 14982 temp |= SLS_BLR; 14983 else if (strcmp (str, "retbr") == 0) 14984 temp |= SLS_RETBR; 14985 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0) 14986 { 14987 error ("%<%s%> must be by itself for %<-mharden-sls=%>", str); 14988 break; 14989 } 14990 else 14991 { 14992 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str); 14993 break; 14994 } 14995 str = strtok_r (NULL, ",", &token_save); 14996 } 14997 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp; 14998 free (str_root); 14999} 15000 15001/* Parses CONST_STR for branch protection features specified in 15002 aarch64_branch_protect_types, and set any global variables required. Returns 15003 the parsing result and assigns LAST_STR to the last processed token from 15004 CONST_STR so that it can be used for error reporting. */ 15005 15006static enum 15007aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str, 15008 char** last_str) 15009{ 15010 char *str_root = xstrdup (const_str); 15011 char* token_save = NULL; 15012 char *str = strtok_r (str_root, "+", &token_save); 15013 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK; 15014 if (!str) 15015 res = AARCH64_PARSE_MISSING_ARG; 15016 else 15017 { 15018 char *next_str = strtok_r (NULL, "+", &token_save); 15019 /* Reset the branch protection features to their defaults. */ 15020 aarch64_handle_no_branch_protection (NULL, NULL); 15021 15022 while (str && res == AARCH64_PARSE_OK) 15023 { 15024 const aarch64_branch_protect_type* type = aarch64_branch_protect_types; 15025 bool found = false; 15026 /* Search for this type. */ 15027 while (type && type->name && !found && res == AARCH64_PARSE_OK) 15028 { 15029 if (strcmp (str, type->name) == 0) 15030 { 15031 found = true; 15032 res = type->handler (str, next_str); 15033 str = next_str; 15034 next_str = strtok_r (NULL, "+", &token_save); 15035 } 15036 else 15037 type++; 15038 } 15039 if (found && res == AARCH64_PARSE_OK) 15040 { 15041 bool found_subtype = true; 15042 /* Loop through each token until we find one that isn't a 15043 subtype. */ 15044 while (found_subtype) 15045 { 15046 found_subtype = false; 15047 const aarch64_branch_protect_type *subtype = type->subtypes; 15048 /* Search for the subtype. 
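   For example, with -mbranch-protection=pac-ret+leaf+bti, "pac-ret" is
   matched as a top-level type, "leaf" is consumed here as one of its
   subtypes, and "bti" then drops back to the outer loop, where it matches
   another top-level type.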
*/ 15049 while (str && subtype && subtype->name && !found_subtype 15050 && res == AARCH64_PARSE_OK) 15051 { 15052 if (strcmp (str, subtype->name) == 0) 15053 { 15054 found_subtype = true; 15055 res = subtype->handler (str, next_str); 15056 str = next_str; 15057 next_str = strtok_r (NULL, "+", &token_save); 15058 } 15059 else 15060 subtype++; 15061 } 15062 } 15063 } 15064 else if (!found) 15065 res = AARCH64_PARSE_INVALID_ARG; 15066 } 15067 } 15068 /* Copy the last processed token into the argument to pass it back. 15069 Used by option and attribute validation to print the offending token. */ 15070 if (last_str) 15071 { 15072 if (str) strcpy (*last_str, str); 15073 else *last_str = NULL; 15074 } 15075 if (res == AARCH64_PARSE_OK) 15076 { 15077 /* If needed, alloc the accepted string then copy in const_str. 15078 Used by override_option_after_change_1. */ 15079 if (!accepted_branch_protection_string) 15080 accepted_branch_protection_string = (char *) xmalloc ( 15081 BRANCH_PROTECT_STR_MAX 15082 + 1); 15083 strncpy (accepted_branch_protection_string, const_str, 15084 BRANCH_PROTECT_STR_MAX + 1); 15085 /* Forcibly null-terminate. */ 15086 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0'; 15087 } 15088 return res; 15089} 15090 15091static bool 15092aarch64_validate_mbranch_protection (const char *const_str) 15093{ 15094 char *str = (char *) xmalloc (strlen (const_str)); 15095 enum aarch64_parse_opt_result res = 15096 aarch64_parse_branch_protection (const_str, &str); 15097 if (res == AARCH64_PARSE_INVALID_ARG) 15098 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str); 15099 else if (res == AARCH64_PARSE_MISSING_ARG) 15100 error ("missing argument for %<-mbranch-protection=%>"); 15101 free (str); 15102 return res == AARCH64_PARSE_OK; 15103} 15104 15105/* Validate a command-line -march option. Parse the arch and extensions 15106 (if any) specified in STR and throw errors if appropriate. Put the 15107 results, if they are valid, in RES and ISA_FLAGS. Return whether the 15108 option is valid. */ 15109 15110static bool 15111aarch64_validate_march (const char *str, const struct processor **res, 15112 uint64_t *isa_flags) 15113{ 15114 std::string invalid_extension; 15115 enum aarch64_parse_opt_result parse_res 15116 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension); 15117 15118 if (parse_res == AARCH64_PARSE_OK) 15119 return true; 15120 15121 switch (parse_res) 15122 { 15123 case AARCH64_PARSE_MISSING_ARG: 15124 error ("missing arch name in %<-march=%s%>", str); 15125 break; 15126 case AARCH64_PARSE_INVALID_ARG: 15127 error ("unknown value %qs for %<-march%>", str); 15128 aarch64_print_hint_for_arch (str); 15129 break; 15130 case AARCH64_PARSE_INVALID_FEATURE: 15131 error ("invalid feature modifier %qs in %<-march=%s%>", 15132 invalid_extension.c_str (), str); 15133 aarch64_print_hint_for_extensions (invalid_extension); 15134 break; 15135 default: 15136 gcc_unreachable (); 15137 } 15138 15139 return false; 15140} 15141 15142/* Validate a command-line -mtune option. Parse the cpu 15143 specified in STR and throw errors if appropriate. Put the 15144 result, if it is valid, in RES. Return whether the option is 15145 valid. 
*/ 15146 15147static bool 15148aarch64_validate_mtune (const char *str, const struct processor **res) 15149{ 15150 enum aarch64_parse_opt_result parse_res 15151 = aarch64_parse_tune (str, res); 15152 15153 if (parse_res == AARCH64_PARSE_OK) 15154 return true; 15155 15156 switch (parse_res) 15157 { 15158 case AARCH64_PARSE_MISSING_ARG: 15159 error ("missing cpu name in %<-mtune=%s%>", str); 15160 break; 15161 case AARCH64_PARSE_INVALID_ARG: 15162 error ("unknown value %qs for %<-mtune%>", str); 15163 aarch64_print_hint_for_core (str); 15164 break; 15165 default: 15166 gcc_unreachable (); 15167 } 15168 return false; 15169} 15170 15171/* Return the CPU corresponding to the enum CPU. 15172 If it doesn't specify a cpu, return the default. */ 15173 15174static const struct processor * 15175aarch64_get_tune_cpu (enum aarch64_processor cpu) 15176{ 15177 if (cpu != aarch64_none) 15178 return &all_cores[cpu]; 15179 15180 /* The & 0x3f is to extract the bottom 6 bits that encode the 15181 default cpu as selected by the --with-cpu GCC configure option 15182 in config.gcc. 15183 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS 15184 flags mechanism should be reworked to make it more sane. */ 15185 return &all_cores[TARGET_CPU_DEFAULT & 0x3f]; 15186} 15187 15188/* Return the architecture corresponding to the enum ARCH. 15189 If it doesn't specify a valid architecture, return the default. */ 15190 15191static const struct processor * 15192aarch64_get_arch (enum aarch64_arch arch) 15193{ 15194 if (arch != aarch64_no_arch) 15195 return &all_architectures[arch]; 15196 15197 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f]; 15198 15199 return &all_architectures[cpu->arch]; 15200} 15201 15202/* Return the VG value associated with -msve-vector-bits= value VALUE. */ 15203 15204static poly_uint16 15205aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value) 15206{ 15207 /* 128-bit SVE and Advanced SIMD modes use different register layouts 15208 on big-endian targets, so we would need to forbid subregs that convert 15209 from one to the other. By default a reinterpret sequence would then 15210 involve a store to memory in one mode and a load back in the other. 15211 Even if we optimize that sequence using reverse instructions, 15212 it would still be a significant potential overhead. 15213 15214 For now, it seems better to generate length-agnostic code for that 15215 case instead. */ 15216 if (value == SVE_SCALABLE 15217 || (value == SVE_128 && BYTES_BIG_ENDIAN)) 15218 return poly_uint16 (2, 2); 15219 else 15220 return (int) value / 64; 15221} 15222 15223/* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning 15224 and is used to parse the -m{cpu,tune,arch} strings and setup the initial 15225 tuning structs. In particular it must set selected_tune and 15226 aarch64_isa_flags that define the available ISA features and tuning 15227 decisions. It must also set selected_arch as this will be used to 15228 output the .arch asm tags for each function. 
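   For example, -mcpu=cortex-a53 -march=armv8.2-a keeps the Cortex-A53 tuning
   but takes the ISA flags and the .arch value from -march (after warning
   that the two switches conflict), whereas plain -mcpu=cortex-a53 derives
   both tuning and architecture from the CPU entry.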
*/ 15229 15230static void 15231aarch64_override_options (void) 15232{ 15233 uint64_t cpu_isa = 0; 15234 uint64_t arch_isa = 0; 15235 aarch64_isa_flags = 0; 15236 15237 bool valid_cpu = true; 15238 bool valid_tune = true; 15239 bool valid_arch = true; 15240 15241 selected_cpu = NULL; 15242 selected_arch = NULL; 15243 selected_tune = NULL; 15244 15245 if (aarch64_harden_sls_string) 15246 aarch64_validate_sls_mitigation (aarch64_harden_sls_string); 15247 15248 if (aarch64_branch_protection_string) 15249 aarch64_validate_mbranch_protection (aarch64_branch_protection_string); 15250 15251 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU. 15252 If either of -march or -mtune is given, they override their 15253 respective component of -mcpu. */ 15254 if (aarch64_cpu_string) 15255 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu, 15256 &cpu_isa); 15257 15258 if (aarch64_arch_string) 15259 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch, 15260 &arch_isa); 15261 15262 if (aarch64_tune_string) 15263 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune); 15264 15265#ifdef SUBTARGET_OVERRIDE_OPTIONS 15266 SUBTARGET_OVERRIDE_OPTIONS; 15267#endif 15268 15269 /* If the user did not specify a processor, choose the default 15270 one for them. This will be the CPU set during configuration using 15271 --with-cpu, otherwise it is "generic". */ 15272 if (!selected_cpu) 15273 { 15274 if (selected_arch) 15275 { 15276 selected_cpu = &all_cores[selected_arch->ident]; 15277 aarch64_isa_flags = arch_isa; 15278 explicit_arch = selected_arch->arch; 15279 } 15280 else 15281 { 15282 /* Get default configure-time CPU. */ 15283 selected_cpu = aarch64_get_tune_cpu (aarch64_none); 15284 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6; 15285 } 15286 15287 if (selected_tune) 15288 explicit_tune_core = selected_tune->ident; 15289 } 15290 /* If both -mcpu and -march are specified check that they are architecturally 15291 compatible, warn if they're not and prefer the -march ISA flags. */ 15292 else if (selected_arch) 15293 { 15294 if (selected_arch->arch != selected_cpu->arch) 15295 { 15296 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch", 15297 aarch64_cpu_string, 15298 aarch64_arch_string); 15299 } 15300 aarch64_isa_flags = arch_isa; 15301 explicit_arch = selected_arch->arch; 15302 explicit_tune_core = selected_tune ? selected_tune->ident 15303 : selected_cpu->ident; 15304 } 15305 else 15306 { 15307 /* -mcpu but no -march. */ 15308 aarch64_isa_flags = cpu_isa; 15309 explicit_tune_core = selected_tune ? selected_tune->ident 15310 : selected_cpu->ident; 15311 gcc_assert (selected_cpu); 15312 selected_arch = &all_architectures[selected_cpu->arch]; 15313 explicit_arch = selected_arch->arch; 15314 } 15315 15316 /* Set the arch as well as we will need it when outputing 15317 the .arch directive in assembly. */ 15318 if (!selected_arch) 15319 { 15320 gcc_assert (selected_cpu); 15321 selected_arch = &all_architectures[selected_cpu->arch]; 15322 } 15323 15324 if (!selected_tune) 15325 selected_tune = selected_cpu; 15326 15327 if (aarch64_enable_bti == 2) 15328 { 15329#ifdef TARGET_ENABLE_BTI 15330 aarch64_enable_bti = 1; 15331#else 15332 aarch64_enable_bti = 0; 15333#endif 15334 } 15335 15336 /* Return address signing is currently not supported for ILP32 targets. For 15337 LP64 targets use the configured option in the absence of a command-line 15338 option for -mbranch-protection. 
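   For instance, a toolchain configured with --enable-standard-branch-protection
   defines TARGET_ENABLE_PAC_RET, so non-leaf functions get their return
   addresses signed by default unless -mbranch-protection says otherwise.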
*/ 15339 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL) 15340 { 15341#ifdef TARGET_ENABLE_PAC_RET 15342 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF; 15343#else 15344 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE; 15345#endif 15346 } 15347 15348#ifndef HAVE_AS_MABI_OPTION 15349 /* The compiler may have been configured with 2.23.* binutils, which does 15350 not have support for ILP32. */ 15351 if (TARGET_ILP32) 15352 error ("assembler does not support %<-mabi=ilp32%>"); 15353#endif 15354 15355 /* Convert -msve-vector-bits to a VG count. */ 15356 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits); 15357 15358 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32) 15359 sorry ("return address signing is only supported for %<-mabi=lp64%>"); 15360 15361 /* Make sure we properly set up the explicit options. */ 15362 if ((aarch64_cpu_string && valid_cpu) 15363 || (aarch64_tune_string && valid_tune)) 15364 gcc_assert (explicit_tune_core != aarch64_none); 15365 15366 if ((aarch64_cpu_string && valid_cpu) 15367 || (aarch64_arch_string && valid_arch)) 15368 gcc_assert (explicit_arch != aarch64_no_arch); 15369 15370 /* The pass to insert speculation tracking runs before 15371 shrink-wrapping and the latter does not know how to update the 15372 tracking status. So disable it in this case. */ 15373 if (aarch64_track_speculation) 15374 flag_shrink_wrap = 0; 15375 15376 aarch64_override_options_internal (&global_options); 15377 15378 /* Save these options as the default ones in case we push and pop them later 15379 while processing functions with potential target attributes. */ 15380 target_option_default_node = target_option_current_node 15381 = build_target_option_node (&global_options); 15382} 15383 15384/* Implement targetm.override_options_after_change. */ 15385 15386static void 15387aarch64_override_options_after_change (void) 15388{ 15389 aarch64_override_options_after_change_1 (&global_options); 15390} 15391 15392static struct machine_function * 15393aarch64_init_machine_status (void) 15394{ 15395 struct machine_function *machine; 15396 machine = ggc_cleared_alloc<machine_function> (); 15397 return machine; 15398} 15399 15400void 15401aarch64_init_expanders (void) 15402{ 15403 init_machine_status = aarch64_init_machine_status; 15404} 15405 15406/* A checking mechanism for the implementation of the various code models. */ 15407static void 15408initialize_aarch64_code_model (struct gcc_options *opts) 15409{ 15410 aarch64_cmodel = opts->x_aarch64_cmodel_var; 15411 switch (opts->x_aarch64_cmodel_var) 15412 { 15413 case AARCH64_CMODEL_TINY: 15414 if (opts->x_flag_pic) 15415 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC; 15416 break; 15417 case AARCH64_CMODEL_SMALL: 15418 if (opts->x_flag_pic) 15419 { 15420#ifdef HAVE_AS_SMALL_PIC_RELOCS 15421 aarch64_cmodel = (flag_pic == 2 15422 ? AARCH64_CMODEL_SMALL_PIC 15423 : AARCH64_CMODEL_SMALL_SPIC); 15424#else 15425 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC; 15426#endif 15427 } 15428 break; 15429 case AARCH64_CMODEL_LARGE: 15430 if (opts->x_flag_pic) 15431 sorry ("code model %qs with %<-f%s%>", "large", 15432 opts->x_flag_pic > 1 ? "PIC" : "pic"); 15433 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32) 15434 sorry ("code model %qs not supported in ilp32 mode", "large"); 15435 break; 15436 case AARCH64_CMODEL_TINY_PIC: 15437 case AARCH64_CMODEL_SMALL_PIC: 15438 case AARCH64_CMODEL_SMALL_SPIC: 15439 gcc_unreachable (); 15440 } 15441} 15442 15443/* Implement TARGET_OPTION_SAVE. 
*/ 15444 15445static void 15446aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts) 15447{ 15448 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string; 15449 ptr->x_aarch64_branch_protection_string 15450 = opts->x_aarch64_branch_protection_string; 15451} 15452 15453/* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions 15454 using the information saved in PTR. */ 15455 15456static void 15457aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr) 15458{ 15459 opts->x_explicit_tune_core = ptr->x_explicit_tune_core; 15460 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core); 15461 opts->x_explicit_arch = ptr->x_explicit_arch; 15462 selected_arch = aarch64_get_arch (ptr->x_explicit_arch); 15463 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string; 15464 opts->x_aarch64_branch_protection_string 15465 = ptr->x_aarch64_branch_protection_string; 15466 if (opts->x_aarch64_branch_protection_string) 15467 { 15468 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string, 15469 NULL); 15470 } 15471 15472 aarch64_override_options_internal (opts); 15473} 15474 15475/* Implement TARGET_OPTION_PRINT. */ 15476 15477static void 15478aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr) 15479{ 15480 const struct processor *cpu 15481 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core); 15482 uint64_t isa_flags = ptr->x_aarch64_isa_flags; 15483 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch); 15484 std::string extension 15485 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags); 15486 15487 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name); 15488 fprintf (file, "%*sselected arch = %s%s\n", indent, "", 15489 arch->name, extension.c_str ()); 15490} 15491 15492static GTY(()) tree aarch64_previous_fndecl; 15493 15494void 15495aarch64_reset_previous_fndecl (void) 15496{ 15497 aarch64_previous_fndecl = NULL; 15498} 15499 15500/* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE. 15501 Used by aarch64_set_current_function and aarch64_pragma_target_parse to 15502 make sure optab availability predicates are recomputed when necessary. */ 15503 15504void 15505aarch64_save_restore_target_globals (tree new_tree) 15506{ 15507 if (TREE_TARGET_GLOBALS (new_tree)) 15508 restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); 15509 else if (new_tree == target_option_default_node) 15510 restore_target_globals (&default_target_globals); 15511 else 15512 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts (); 15513} 15514 15515/* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions 15516 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET 15517 of the function, if such exists. This function may be called multiple 15518 times on a single function so use aarch64_previous_fndecl to avoid 15519 setting up identical state. */ 15520 15521static void 15522aarch64_set_current_function (tree fndecl) 15523{ 15524 if (!fndecl || fndecl == aarch64_previous_fndecl) 15525 return; 15526 15527 tree old_tree = (aarch64_previous_fndecl 15528 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl) 15529 : NULL_TREE); 15530 15531 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl); 15532 15533 /* If current function has no attributes but the previous one did, 15534 use the default node. 
*/ 15535 if (!new_tree && old_tree) 15536 new_tree = target_option_default_node; 15537 15538 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to 15539 the default have been handled by aarch64_save_restore_target_globals from 15540 aarch64_pragma_target_parse. */ 15541 if (old_tree == new_tree) 15542 return; 15543 15544 aarch64_previous_fndecl = fndecl; 15545 15546 /* First set the target options. */ 15547 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); 15548 15549 aarch64_save_restore_target_globals (new_tree); 15550} 15551 15552/* Enum describing the various ways we can handle attributes. 15553 In many cases we can reuse the generic option handling machinery. */ 15554 15555enum aarch64_attr_opt_type 15556{ 15557 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */ 15558 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */ 15559 aarch64_attr_enum, /* Attribute sets an enum variable. */ 15560 aarch64_attr_custom /* Attribute requires a custom handling function. */ 15561}; 15562 15563/* All the information needed to handle a target attribute. 15564 NAME is the name of the attribute. 15565 ATTR_TYPE specifies the type of behavior of the attribute as described 15566 in the definition of enum aarch64_attr_opt_type. 15567 ALLOW_NEG is true if the attribute supports a "no-" form. 15568 HANDLER is the function that takes the attribute string as an argument 15569 It is needed only when the ATTR_TYPE is aarch64_attr_custom. 15570 OPT_NUM is the enum specifying the option that the attribute modifies. 15571 This is needed for attributes that mirror the behavior of a command-line 15572 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or 15573 aarch64_attr_enum. */ 15574 15575struct aarch64_attribute_info 15576{ 15577 const char *name; 15578 enum aarch64_attr_opt_type attr_type; 15579 bool allow_neg; 15580 bool (*handler) (const char *); 15581 enum opt_code opt_num; 15582}; 15583 15584/* Handle the ARCH_STR argument to the arch= target attribute. */ 15585 15586static bool 15587aarch64_handle_attr_arch (const char *str) 15588{ 15589 const struct processor *tmp_arch = NULL; 15590 std::string invalid_extension; 15591 enum aarch64_parse_opt_result parse_res 15592 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension); 15593 15594 if (parse_res == AARCH64_PARSE_OK) 15595 { 15596 gcc_assert (tmp_arch); 15597 selected_arch = tmp_arch; 15598 explicit_arch = selected_arch->arch; 15599 return true; 15600 } 15601 15602 switch (parse_res) 15603 { 15604 case AARCH64_PARSE_MISSING_ARG: 15605 error ("missing name in %<target(\"arch=\")%> pragma or attribute"); 15606 break; 15607 case AARCH64_PARSE_INVALID_ARG: 15608 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str); 15609 aarch64_print_hint_for_arch (str); 15610 break; 15611 case AARCH64_PARSE_INVALID_FEATURE: 15612 error ("invalid feature modifier %s of value (\"%s\") in " 15613 "%<target()%> pragma or attribute", invalid_extension.c_str (), str); 15614 aarch64_print_hint_for_extensions (invalid_extension); 15615 break; 15616 default: 15617 gcc_unreachable (); 15618 } 15619 15620 return false; 15621} 15622 15623/* Handle the argument CPU_STR to the cpu= target attribute. 
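   For example

	__attribute__ ((target ("cpu=cortex-a75+crypto")))
	int fast_path (void);

   switches tuning to Cortex-A75, selects the matching architecture and adds
   the +crypto ISA bit for that one function only (fast_path here is just an
   illustrative name).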
*/ 15624 15625static bool 15626aarch64_handle_attr_cpu (const char *str) 15627{ 15628 const struct processor *tmp_cpu = NULL; 15629 std::string invalid_extension; 15630 enum aarch64_parse_opt_result parse_res 15631 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension); 15632 15633 if (parse_res == AARCH64_PARSE_OK) 15634 { 15635 gcc_assert (tmp_cpu); 15636 selected_tune = tmp_cpu; 15637 explicit_tune_core = selected_tune->ident; 15638 15639 selected_arch = &all_architectures[tmp_cpu->arch]; 15640 explicit_arch = selected_arch->arch; 15641 return true; 15642 } 15643 15644 switch (parse_res) 15645 { 15646 case AARCH64_PARSE_MISSING_ARG: 15647 error ("missing name in %<target(\"cpu=\")%> pragma or attribute"); 15648 break; 15649 case AARCH64_PARSE_INVALID_ARG: 15650 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str); 15651 aarch64_print_hint_for_core (str); 15652 break; 15653 case AARCH64_PARSE_INVALID_FEATURE: 15654 error ("invalid feature modifier %s of value (\"%s\") in " 15655 "%<target()%> pragma or attribute", invalid_extension.c_str (), str); 15656 aarch64_print_hint_for_extensions (invalid_extension); 15657 break; 15658 default: 15659 gcc_unreachable (); 15660 } 15661 15662 return false; 15663} 15664 15665/* Handle the argument STR to the branch-protection= attribute. */ 15666 15667 static bool 15668 aarch64_handle_attr_branch_protection (const char* str) 15669 { 15670 char *err_str = (char *) xmalloc (strlen (str) + 1); 15671 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str, 15672 &err_str); 15673 bool success = false; 15674 switch (res) 15675 { 15676 case AARCH64_PARSE_MISSING_ARG: 15677 error ("missing argument to %<target(\"branch-protection=\")%> pragma or" 15678 " attribute"); 15679 break; 15680 case AARCH64_PARSE_INVALID_ARG: 15681 error ("invalid protection type (\"%s\") in %<target(\"branch-protection" 15682 "=\")%> pragma or attribute", err_str); 15683 break; 15684 case AARCH64_PARSE_OK: 15685 success = true; 15686 /* Fall through. */ 15687 case AARCH64_PARSE_INVALID_FEATURE: 15688 break; 15689 default: 15690 gcc_unreachable (); 15691 } 15692 free (err_str); 15693 return success; 15694 } 15695 15696/* Handle the argument STR to the tune= target attribute. */ 15697 15698static bool 15699aarch64_handle_attr_tune (const char *str) 15700{ 15701 const struct processor *tmp_tune = NULL; 15702 enum aarch64_parse_opt_result parse_res 15703 = aarch64_parse_tune (str, &tmp_tune); 15704 15705 if (parse_res == AARCH64_PARSE_OK) 15706 { 15707 gcc_assert (tmp_tune); 15708 selected_tune = tmp_tune; 15709 explicit_tune_core = selected_tune->ident; 15710 return true; 15711 } 15712 15713 switch (parse_res) 15714 { 15715 case AARCH64_PARSE_INVALID_ARG: 15716 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str); 15717 aarch64_print_hint_for_core (str); 15718 break; 15719 default: 15720 gcc_unreachable (); 15721 } 15722 15723 return false; 15724} 15725 15726/* Parse an architecture extensions target attribute string specified in STR. 15727 For example "+fp+nosimd". Show any errors if needed. Return TRUE 15728 if successful. Update aarch64_isa_flags to reflect the ISA features 15729 modified. 
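   A leading "+nothing" clears the current set first, so target
   ("+nothing+simd") leaves the function with only the features that +simd
   itself implies.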
*/ 15730 15731static bool 15732aarch64_handle_attr_isa_flags (char *str) 15733{ 15734 enum aarch64_parse_opt_result parse_res; 15735 uint64_t isa_flags = aarch64_isa_flags; 15736 15737 /* We allow "+nothing" in the beginning to clear out all architectural 15738 features if the user wants to handpick specific features. */ 15739 if (strncmp ("+nothing", str, 8) == 0) 15740 { 15741 isa_flags = 0; 15742 str += 8; 15743 } 15744 15745 std::string invalid_extension; 15746 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension); 15747 15748 if (parse_res == AARCH64_PARSE_OK) 15749 { 15750 aarch64_isa_flags = isa_flags; 15751 return true; 15752 } 15753 15754 switch (parse_res) 15755 { 15756 case AARCH64_PARSE_MISSING_ARG: 15757 error ("missing value in %<target()%> pragma or attribute"); 15758 break; 15759 15760 case AARCH64_PARSE_INVALID_FEATURE: 15761 error ("invalid feature modifier %s of value (\"%s\") in " 15762 "%<target()%> pragma or attribute", invalid_extension.c_str (), str); 15763 break; 15764 15765 default: 15766 gcc_unreachable (); 15767 } 15768 15769 return false; 15770} 15771 15772/* The target attributes that we support. On top of these we also support just 15773 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is 15774 handled explicitly in aarch64_process_one_target_attr. */ 15775 15776static const struct aarch64_attribute_info aarch64_attributes[] = 15777{ 15778 { "general-regs-only", aarch64_attr_mask, false, NULL, 15779 OPT_mgeneral_regs_only }, 15780 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL, 15781 OPT_mfix_cortex_a53_835769 }, 15782 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL, 15783 OPT_mfix_cortex_a53_843419 }, 15784 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ }, 15785 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align }, 15786 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL, 15787 OPT_momit_leaf_frame_pointer }, 15788 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ }, 15789 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch, 15790 OPT_march_ }, 15791 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ }, 15792 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune, 15793 OPT_mtune_ }, 15794 { "branch-protection", aarch64_attr_custom, false, 15795 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ }, 15796 { "sign-return-address", aarch64_attr_enum, false, NULL, 15797 OPT_msign_return_address_ }, 15798 { "outline-atomics", aarch64_attr_bool, true, NULL, 15799 OPT_moutline_atomics}, 15800 { NULL, aarch64_attr_custom, false, NULL, OPT____ } 15801}; 15802 15803/* Parse ARG_STR which contains the definition of one target attribute. 15804 Show appropriate errors if any or return true if the attribute is valid. */ 15805 15806static bool 15807aarch64_process_one_target_attr (char *arg_str) 15808{ 15809 bool invert = false; 15810 15811 size_t len = strlen (arg_str); 15812 15813 if (len == 0) 15814 { 15815 error ("malformed %<target()%> pragma or attribute"); 15816 return false; 15817 } 15818 15819 char *str_to_check = (char *) alloca (len + 1); 15820 strcpy (str_to_check, arg_str); 15821 15822 /* We have something like __attribute__ ((target ("+fp+nosimd"))). 15823 It is easier to detect and handle it explicitly here rather than going 15824 through the machinery for the rest of the target attributes in this 15825 function. 
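   The remaining forms are handled by the table-driven code below, e.g.
   "no-strict-align" (a mask attribute used in its negated form) and
   "tune=cortex-a57" (a custom attribute that takes an '=' argument).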
*/ 15826 if (*str_to_check == '+') 15827 return aarch64_handle_attr_isa_flags (str_to_check); 15828 15829 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0) 15830 { 15831 invert = true; 15832 str_to_check += 3; 15833 } 15834 char *arg = strchr (str_to_check, '='); 15835 15836 /* If we found opt=foo then terminate STR_TO_CHECK at the '=' 15837 and point ARG to "foo". */ 15838 if (arg) 15839 { 15840 *arg = '\0'; 15841 arg++; 15842 } 15843 const struct aarch64_attribute_info *p_attr; 15844 bool found = false; 15845 for (p_attr = aarch64_attributes; p_attr->name; p_attr++) 15846 { 15847 /* If the names don't match up, or the user has given an argument 15848 to an attribute that doesn't accept one, or didn't give an argument 15849 to an attribute that expects one, fail to match. */ 15850 if (strcmp (str_to_check, p_attr->name) != 0) 15851 continue; 15852 15853 found = true; 15854 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom 15855 || p_attr->attr_type == aarch64_attr_enum; 15856 15857 if (attr_need_arg_p ^ (arg != NULL)) 15858 { 15859 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check); 15860 return false; 15861 } 15862 15863 /* If the name matches but the attribute does not allow "no-" versions 15864 then we can't match. */ 15865 if (invert && !p_attr->allow_neg) 15866 { 15867 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check); 15868 return false; 15869 } 15870 15871 switch (p_attr->attr_type) 15872 { 15873 /* Has a custom handler registered. 15874 For example, cpu=, arch=, tune=. */ 15875 case aarch64_attr_custom: 15876 gcc_assert (p_attr->handler); 15877 if (!p_attr->handler (arg)) 15878 return false; 15879 break; 15880 15881 /* Either set or unset a boolean option. */ 15882 case aarch64_attr_bool: 15883 { 15884 struct cl_decoded_option decoded; 15885 15886 generate_option (p_attr->opt_num, NULL, !invert, 15887 CL_TARGET, &decoded); 15888 aarch64_handle_option (&global_options, &global_options_set, 15889 &decoded, input_location); 15890 break; 15891 } 15892 /* Set or unset a bit in the target_flags. aarch64_handle_option 15893 should know what mask to apply given the option number. */ 15894 case aarch64_attr_mask: 15895 { 15896 struct cl_decoded_option decoded; 15897 /* We only need to specify the option number. 15898 aarch64_handle_option will know which mask to apply. */ 15899 decoded.opt_index = p_attr->opt_num; 15900 decoded.value = !invert; 15901 aarch64_handle_option (&global_options, &global_options_set, 15902 &decoded, input_location); 15903 break; 15904 } 15905 /* Use the option setting machinery to set an option to an enum. */ 15906 case aarch64_attr_enum: 15907 { 15908 gcc_assert (arg); 15909 bool valid; 15910 int value; 15911 valid = opt_enum_arg_to_value (p_attr->opt_num, arg, 15912 &value, CL_TARGET); 15913 if (valid) 15914 { 15915 set_option (&global_options, NULL, p_attr->opt_num, value, 15916 NULL, DK_UNSPECIFIED, input_location, 15917 global_dc); 15918 } 15919 else 15920 { 15921 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg); 15922 } 15923 break; 15924 } 15925 default: 15926 gcc_unreachable (); 15927 } 15928 } 15929 15930 /* If we reached here we either have found an attribute and validated 15931 it or didn't match any. If we matched an attribute but its arguments 15932 were malformed we will have returned false already. 
*/ 15933 return found; 15934} 15935 15936/* Count how many times the character C appears in 15937 NULL-terminated string STR. */ 15938 15939static unsigned int 15940num_occurences_in_str (char c, char *str) 15941{ 15942 unsigned int res = 0; 15943 while (*str != '\0') 15944 { 15945 if (*str == c) 15946 res++; 15947 15948 str++; 15949 } 15950 15951 return res; 15952} 15953 15954/* Parse the tree in ARGS that contains the target attribute information 15955 and update the global target options space. */ 15956 15957bool 15958aarch64_process_target_attr (tree args) 15959{ 15960 if (TREE_CODE (args) == TREE_LIST) 15961 { 15962 do 15963 { 15964 tree head = TREE_VALUE (args); 15965 if (head) 15966 { 15967 if (!aarch64_process_target_attr (head)) 15968 return false; 15969 } 15970 args = TREE_CHAIN (args); 15971 } while (args); 15972 15973 return true; 15974 } 15975 15976 if (TREE_CODE (args) != STRING_CST) 15977 { 15978 error ("attribute %<target%> argument not a string"); 15979 return false; 15980 } 15981 15982 size_t len = strlen (TREE_STRING_POINTER (args)); 15983 char *str_to_check = (char *) alloca (len + 1); 15984 strcpy (str_to_check, TREE_STRING_POINTER (args)); 15985 15986 if (len == 0) 15987 { 15988 error ("malformed %<target()%> pragma or attribute"); 15989 return false; 15990 } 15991 15992 /* Used to catch empty spaces between commas i.e. 15993 attribute ((target ("attr1,,attr2"))). */ 15994 unsigned int num_commas = num_occurences_in_str (',', str_to_check); 15995 15996 /* Handle multiple target attributes separated by ','. */ 15997 char *token = strtok_r (str_to_check, ",", &str_to_check); 15998 15999 unsigned int num_attrs = 0; 16000 while (token) 16001 { 16002 num_attrs++; 16003 if (!aarch64_process_one_target_attr (token)) 16004 { 16005 error ("pragma or attribute %<target(\"%s\")%> is not valid", token); 16006 return false; 16007 } 16008 16009 token = strtok_r (NULL, ",", &str_to_check); 16010 } 16011 16012 if (num_attrs != num_commas + 1) 16013 { 16014 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args)); 16015 return false; 16016 } 16017 16018 return true; 16019} 16020 16021/* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to 16022 process attribute ((target ("..."))). */ 16023 16024static bool 16025aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int) 16026{ 16027 struct cl_target_option cur_target; 16028 bool ret; 16029 tree old_optimize; 16030 tree new_target, new_optimize; 16031 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl); 16032 16033 /* If what we're processing is the current pragma string then the 16034 target option node is already stored in target_option_current_node 16035 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid 16036 having to re-parse the string. This is especially useful to keep 16037 arm_neon.h compile times down since that header contains a lot 16038 of intrinsics enclosed in pragmas. */ 16039 if (!existing_target && args == current_target_pragma) 16040 { 16041 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node; 16042 return true; 16043 } 16044 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); 16045 16046 old_optimize = build_optimization_node (&global_options); 16047 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); 16048 16049 /* If the function changed the optimization levels as well as setting 16050 target options, start with the optimizations specified. 
*/ 16051 if (func_optimize && func_optimize != old_optimize) 16052 cl_optimization_restore (&global_options, 16053 TREE_OPTIMIZATION (func_optimize)); 16054 16055 /* Save the current target options to restore at the end. */ 16056 cl_target_option_save (&cur_target, &global_options); 16057 16058 /* If fndecl already has some target attributes applied to it, unpack 16059 them so that we add this attribute on top of them, rather than 16060 overwriting them. */ 16061 if (existing_target) 16062 { 16063 struct cl_target_option *existing_options 16064 = TREE_TARGET_OPTION (existing_target); 16065 16066 if (existing_options) 16067 cl_target_option_restore (&global_options, existing_options); 16068 } 16069 else 16070 cl_target_option_restore (&global_options, 16071 TREE_TARGET_OPTION (target_option_current_node)); 16072 16073 ret = aarch64_process_target_attr (args); 16074 16075 /* Set up any additional state. */ 16076 if (ret) 16077 { 16078 aarch64_override_options_internal (&global_options); 16079 /* Initialize SIMD builtins if we haven't already. 16080 Set current_target_pragma to NULL for the duration so that 16081 the builtin initialization code doesn't try to tag the functions 16082 being built with the attributes specified by any current pragma, thus 16083 going into an infinite recursion. */ 16084 if (TARGET_SIMD) 16085 { 16086 tree saved_current_target_pragma = current_target_pragma; 16087 current_target_pragma = NULL; 16088 aarch64_init_simd_builtins (); 16089 current_target_pragma = saved_current_target_pragma; 16090 } 16091 new_target = build_target_option_node (&global_options); 16092 } 16093 else 16094 new_target = NULL; 16095 16096 new_optimize = build_optimization_node (&global_options); 16097 16098 if (fndecl && ret) 16099 { 16100 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target; 16101 16102 if (old_optimize != new_optimize) 16103 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize; 16104 } 16105 16106 cl_target_option_restore (&global_options, &cur_target); 16107 16108 if (old_optimize != new_optimize) 16109 cl_optimization_restore (&global_options, 16110 TREE_OPTIMIZATION (old_optimize)); 16111 return ret; 16112} 16113 16114/* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are 16115 tri-bool options (yes, no, don't care) and the default value is 16116 DEF, determine whether to reject inlining. */ 16117 16118static bool 16119aarch64_tribools_ok_for_inlining_p (int caller, int callee, 16120 int dont_care, int def) 16121{ 16122 /* If the callee doesn't care, always allow inlining. */ 16123 if (callee == dont_care) 16124 return true; 16125 16126 /* If the caller doesn't care, always allow inlining. */ 16127 if (caller == dont_care) 16128 return true; 16129 16130 /* Otherwise, allow inlining if either the callee and caller values 16131 agree, or if the callee is using the default value. */ 16132 return (callee == caller || callee == def); 16133} 16134 16135/* Implement TARGET_CAN_INLINE_P. Decide whether it is valid 16136 to inline CALLEE into CALLER based on target-specific info. 16137 Make sure that the caller and callee have compatible architectural 16138 features. Then go through the other possible target attributes 16139 and see if they can block inlining. Try not to reject always_inline 16140 callees unless they are incompatible architecturally. 
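   For instance, a callee marked __attribute__ ((target ("+sve"))) cannot
   be inlined into a caller built for plain Armv8-A, because the callee's
   ISA flags are not a subset of the caller's, whereas inlining in the
   opposite direction is allowed.  (Illustration of the subset rule
   below only.)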
*/ 16141 16142static bool 16143aarch64_can_inline_p (tree caller, tree callee) 16144{ 16145 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller); 16146 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee); 16147 16148 struct cl_target_option *caller_opts 16149 = TREE_TARGET_OPTION (caller_tree ? caller_tree 16150 : target_option_default_node); 16151 16152 struct cl_target_option *callee_opts 16153 = TREE_TARGET_OPTION (callee_tree ? callee_tree 16154 : target_option_default_node); 16155 16156 /* Callee's ISA flags should be a subset of the caller's. */ 16157 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags) 16158 != callee_opts->x_aarch64_isa_flags) 16159 return false; 16160 16161 /* Allow non-strict aligned functions inlining into strict 16162 aligned ones. */ 16163 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags) 16164 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)) 16165 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags) 16166 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags))) 16167 return false; 16168 16169 bool always_inline = lookup_attribute ("always_inline", 16170 DECL_ATTRIBUTES (callee)); 16171 16172 /* If the architectural features match up and the callee is always_inline 16173 then the other attributes don't matter. */ 16174 if (always_inline) 16175 return true; 16176 16177 if (caller_opts->x_aarch64_cmodel_var 16178 != callee_opts->x_aarch64_cmodel_var) 16179 return false; 16180 16181 if (caller_opts->x_aarch64_tls_dialect 16182 != callee_opts->x_aarch64_tls_dialect) 16183 return false; 16184 16185 /* Honour explicit requests to workaround errata. */ 16186 if (!aarch64_tribools_ok_for_inlining_p ( 16187 caller_opts->x_aarch64_fix_a53_err835769, 16188 callee_opts->x_aarch64_fix_a53_err835769, 16189 2, TARGET_FIX_ERR_A53_835769_DEFAULT)) 16190 return false; 16191 16192 if (!aarch64_tribools_ok_for_inlining_p ( 16193 caller_opts->x_aarch64_fix_a53_err843419, 16194 callee_opts->x_aarch64_fix_a53_err843419, 16195 2, TARGET_FIX_ERR_A53_843419)) 16196 return false; 16197 16198 /* If the user explicitly specified -momit-leaf-frame-pointer for the 16199 caller and calle and they don't match up, reject inlining. */ 16200 if (!aarch64_tribools_ok_for_inlining_p ( 16201 caller_opts->x_flag_omit_leaf_frame_pointer, 16202 callee_opts->x_flag_omit_leaf_frame_pointer, 16203 2, 1)) 16204 return false; 16205 16206 /* If the callee has specific tuning overrides, respect them. */ 16207 if (callee_opts->x_aarch64_override_tune_string != NULL 16208 && caller_opts->x_aarch64_override_tune_string == NULL) 16209 return false; 16210 16211 /* If the user specified tuning override strings for the 16212 caller and callee and they don't match up, reject inlining. 16213 We just do a string compare here, we don't analyze the meaning 16214 of the string, as it would be too costly for little gain. */ 16215 if (callee_opts->x_aarch64_override_tune_string 16216 && caller_opts->x_aarch64_override_tune_string 16217 && (strcmp (callee_opts->x_aarch64_override_tune_string, 16218 caller_opts->x_aarch64_override_tune_string) != 0)) 16219 return false; 16220 16221 return true; 16222} 16223 16224/* Return the ID of the TLDESC ABI, initializing the descriptor if hasn't 16225 been already. 
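   As modelled below, such a call clobbers only x0, the condition flags
   and the SVE predicate registers, so, for example, values that are live
   in x1 or q0 across the call do not need to be spilled, unlike for an
   ordinary call.  (Illustrative consequence of the clobber set chosen
   below.)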
*/ 16226 16227unsigned int 16228aarch64_tlsdesc_abi_id () 16229{ 16230 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC]; 16231 if (!tlsdesc_abi.initialized_p ()) 16232 { 16233 HARD_REG_SET full_reg_clobbers; 16234 CLEAR_HARD_REG_SET (full_reg_clobbers); 16235 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM); 16236 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM); 16237 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno) 16238 SET_HARD_REG_BIT (full_reg_clobbers, regno); 16239 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers); 16240 } 16241 return tlsdesc_abi.id (); 16242} 16243 16244/* Return true if SYMBOL_REF X binds locally. */ 16245 16246static bool 16247aarch64_symbol_binds_local_p (const_rtx x) 16248{ 16249 return (SYMBOL_REF_DECL (x) 16250 ? targetm.binds_local_p (SYMBOL_REF_DECL (x)) 16251 : SYMBOL_REF_LOCAL_P (x)); 16252} 16253 16254/* Return true if SYMBOL_REF X is thread local */ 16255static bool 16256aarch64_tls_symbol_p (rtx x) 16257{ 16258 if (! TARGET_HAVE_TLS) 16259 return false; 16260 16261 x = strip_salt (x); 16262 if (GET_CODE (x) != SYMBOL_REF) 16263 return false; 16264 16265 return SYMBOL_REF_TLS_MODEL (x) != 0; 16266} 16267 16268/* Classify a TLS symbol into one of the TLS kinds. */ 16269enum aarch64_symbol_type 16270aarch64_classify_tls_symbol (rtx x) 16271{ 16272 enum tls_model tls_kind = tls_symbolic_operand_type (x); 16273 16274 switch (tls_kind) 16275 { 16276 case TLS_MODEL_GLOBAL_DYNAMIC: 16277 case TLS_MODEL_LOCAL_DYNAMIC: 16278 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD; 16279 16280 case TLS_MODEL_INITIAL_EXEC: 16281 switch (aarch64_cmodel) 16282 { 16283 case AARCH64_CMODEL_TINY: 16284 case AARCH64_CMODEL_TINY_PIC: 16285 return SYMBOL_TINY_TLSIE; 16286 default: 16287 return SYMBOL_SMALL_TLSIE; 16288 } 16289 16290 case TLS_MODEL_LOCAL_EXEC: 16291 if (aarch64_tls_size == 12) 16292 return SYMBOL_TLSLE12; 16293 else if (aarch64_tls_size == 24) 16294 return SYMBOL_TLSLE24; 16295 else if (aarch64_tls_size == 32) 16296 return SYMBOL_TLSLE32; 16297 else if (aarch64_tls_size == 48) 16298 return SYMBOL_TLSLE48; 16299 else 16300 gcc_unreachable (); 16301 16302 case TLS_MODEL_EMULATED: 16303 case TLS_MODEL_NONE: 16304 return SYMBOL_FORCE_TO_MEM; 16305 16306 default: 16307 gcc_unreachable (); 16308 } 16309} 16310 16311/* Return the correct method for accessing X + OFFSET, where X is either 16312 a SYMBOL_REF or LABEL_REF. */ 16313 16314enum aarch64_symbol_type 16315aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset) 16316{ 16317 x = strip_salt (x); 16318 16319 if (GET_CODE (x) == LABEL_REF) 16320 { 16321 switch (aarch64_cmodel) 16322 { 16323 case AARCH64_CMODEL_LARGE: 16324 return SYMBOL_FORCE_TO_MEM; 16325 16326 case AARCH64_CMODEL_TINY_PIC: 16327 case AARCH64_CMODEL_TINY: 16328 return SYMBOL_TINY_ABSOLUTE; 16329 16330 case AARCH64_CMODEL_SMALL_SPIC: 16331 case AARCH64_CMODEL_SMALL_PIC: 16332 case AARCH64_CMODEL_SMALL: 16333 return SYMBOL_SMALL_ABSOLUTE; 16334 16335 default: 16336 gcc_unreachable (); 16337 } 16338 } 16339 16340 if (GET_CODE (x) == SYMBOL_REF) 16341 { 16342 if (aarch64_tls_symbol_p (x)) 16343 return aarch64_classify_tls_symbol (x); 16344 16345 switch (aarch64_cmodel) 16346 { 16347 case AARCH64_CMODEL_TINY: 16348 /* When we retrieve symbol + offset address, we have to make sure 16349 the offset does not cause overflow of the final address. 
But 16350 we have no way of knowing the address of symbol at compile time 16351 so we can't accurately say if the distance between the PC and 16352 symbol + offset is outside the addressible range of +/-1MB in the 16353 TINY code model. So we limit the maximum offset to +/-64KB and 16354 assume the offset to the symbol is not larger than +/-(1MB - 64KB). 16355 If offset_within_block_p is true we allow larger offsets. 16356 Furthermore force to memory if the symbol is a weak reference to 16357 something that doesn't resolve to a symbol in this module. */ 16358 16359 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x)) 16360 return SYMBOL_FORCE_TO_MEM; 16361 if (!(IN_RANGE (offset, -0x10000, 0x10000) 16362 || offset_within_block_p (x, offset))) 16363 return SYMBOL_FORCE_TO_MEM; 16364 16365 return SYMBOL_TINY_ABSOLUTE; 16366 16367 case AARCH64_CMODEL_SMALL: 16368 /* Same reasoning as the tiny code model, but the offset cap here is 16369 1MB, allowing +/-3.9GB for the offset to the symbol. */ 16370 16371 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x)) 16372 return SYMBOL_FORCE_TO_MEM; 16373 if (!(IN_RANGE (offset, -0x100000, 0x100000) 16374 || offset_within_block_p (x, offset))) 16375 return SYMBOL_FORCE_TO_MEM; 16376 16377 return SYMBOL_SMALL_ABSOLUTE; 16378 16379 case AARCH64_CMODEL_TINY_PIC: 16380 if (!aarch64_symbol_binds_local_p (x)) 16381 return SYMBOL_TINY_GOT; 16382 return SYMBOL_TINY_ABSOLUTE; 16383 16384 case AARCH64_CMODEL_SMALL_SPIC: 16385 case AARCH64_CMODEL_SMALL_PIC: 16386 if (!aarch64_symbol_binds_local_p (x)) 16387 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC 16388 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G); 16389 return SYMBOL_SMALL_ABSOLUTE; 16390 16391 case AARCH64_CMODEL_LARGE: 16392 /* This is alright even in PIC code as the constant 16393 pool reference is always PC relative and within 16394 the same translation unit. */ 16395 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x)) 16396 return SYMBOL_SMALL_ABSOLUTE; 16397 else 16398 return SYMBOL_FORCE_TO_MEM; 16399 16400 default: 16401 gcc_unreachable (); 16402 } 16403 } 16404 16405 /* By default push everything into the constant pool. */ 16406 return SYMBOL_FORCE_TO_MEM; 16407} 16408 16409bool 16410aarch64_constant_address_p (rtx x) 16411{ 16412 return (CONSTANT_P (x) && memory_address_p (DImode, x)); 16413} 16414 16415bool 16416aarch64_legitimate_pic_operand_p (rtx x) 16417{ 16418 poly_int64 offset; 16419 x = strip_offset_and_salt (x, &offset); 16420 if (GET_CODE (x) == SYMBOL_REF) 16421 return false; 16422 16423 return true; 16424} 16425 16426/* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants 16427 that should be rematerialized rather than spilled. */ 16428 16429static bool 16430aarch64_legitimate_constant_p (machine_mode mode, rtx x) 16431{ 16432 /* Support CSE and rematerialization of common constants. */ 16433 if (CONST_INT_P (x) 16434 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)) 16435 return true; 16436 16437 /* Only accept variable-length vector constants if they can be 16438 handled directly. 16439 16440 ??? It would be possible (but complex) to handle rematerialization 16441 of other constants via secondary reloads. */ 16442 if (!GET_MODE_SIZE (mode).is_constant ()) 16443 return aarch64_simd_valid_immediate (x, NULL); 16444 16445 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at 16446 least be forced to memory and loaded from there. 
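   For example, an arbitrary V4SImode constant with no single-instruction
   encoding is still treated as legitimate here, because it can be placed
   in the constant pool and loaded from there.  (Illustration only.)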
*/ 16447 if (GET_CODE (x) == CONST_VECTOR) 16448 return !targetm.cannot_force_const_mem (mode, x); 16449 16450 /* Do not allow vector struct mode constants for Advanced SIMD. 16451 We could support 0 and -1 easily, but they need support in 16452 aarch64-simd.md. */ 16453 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 16454 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT)) 16455 return false; 16456 16457 if (GET_CODE (x) == HIGH) 16458 x = XEXP (x, 0); 16459 16460 /* Accept polynomial constants that can be calculated by using the 16461 destination of a move as the sole temporary. Constants that 16462 require a second temporary cannot be rematerialized (they can't be 16463 forced to memory and also aren't legitimate constants). */ 16464 poly_int64 offset; 16465 if (poly_int_rtx_p (x, &offset)) 16466 return aarch64_offset_temporaries (false, offset) <= 1; 16467 16468 /* If an offset is being added to something else, we need to allow the 16469 base to be moved into the destination register, meaning that there 16470 are no free temporaries for the offset. */ 16471 x = strip_offset_and_salt (x, &offset); 16472 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0) 16473 return false; 16474 16475 /* Do not allow const (plus (anchor_symbol, const_int)). */ 16476 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x)) 16477 return false; 16478 16479 /* Treat symbols as constants. Avoid TLS symbols as they are complex, 16480 so spilling them is better than rematerialization. */ 16481 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x)) 16482 return true; 16483 16484 /* Label references are always constant. */ 16485 if (GET_CODE (x) == LABEL_REF) 16486 return true; 16487 16488 return false; 16489} 16490 16491rtx 16492aarch64_load_tp (rtx target) 16493{ 16494 if (!target 16495 || GET_MODE (target) != Pmode 16496 || !register_operand (target, Pmode)) 16497 target = gen_reg_rtx (Pmode); 16498 16499 /* Can return in any reg. */ 16500 emit_insn (gen_aarch64_load_tp_hard (target)); 16501 return target; 16502} 16503 16504/* On AAPCS systems, this is the "struct __va_list". */ 16505static GTY(()) tree va_list_type; 16506 16507/* Implement TARGET_BUILD_BUILTIN_VA_LIST. 16508 Return the type to use as __builtin_va_list. 16509 16510 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as: 16511 16512 struct __va_list 16513 { 16514 void *__stack; 16515 void *__gr_top; 16516 void *__vr_top; 16517 int __gr_offs; 16518 int __vr_offs; 16519 }; */ 16520 16521static tree 16522aarch64_build_builtin_va_list (void) 16523{ 16524 tree va_list_name; 16525 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff; 16526 16527 /* Create the type. */ 16528 va_list_type = lang_hooks.types.make_type (RECORD_TYPE); 16529 /* Give it the required name. */ 16530 va_list_name = build_decl (BUILTINS_LOCATION, 16531 TYPE_DECL, 16532 get_identifier ("__va_list"), 16533 va_list_type); 16534 DECL_ARTIFICIAL (va_list_name) = 1; 16535 TYPE_NAME (va_list_type) = va_list_name; 16536 TYPE_STUB_DECL (va_list_type) = va_list_name; 16537 16538 /* Create the fields. 
*/ 16539 f_stack = build_decl (BUILTINS_LOCATION, 16540 FIELD_DECL, get_identifier ("__stack"), 16541 ptr_type_node); 16542 f_grtop = build_decl (BUILTINS_LOCATION, 16543 FIELD_DECL, get_identifier ("__gr_top"), 16544 ptr_type_node); 16545 f_vrtop = build_decl (BUILTINS_LOCATION, 16546 FIELD_DECL, get_identifier ("__vr_top"), 16547 ptr_type_node); 16548 f_groff = build_decl (BUILTINS_LOCATION, 16549 FIELD_DECL, get_identifier ("__gr_offs"), 16550 integer_type_node); 16551 f_vroff = build_decl (BUILTINS_LOCATION, 16552 FIELD_DECL, get_identifier ("__vr_offs"), 16553 integer_type_node); 16554 16555 /* Tell tree-stdarg pass about our internal offset fields. 16556 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision 16557 purpose to identify whether the code is updating va_list internal 16558 offset fields through irregular way. */ 16559 va_list_gpr_counter_field = f_groff; 16560 va_list_fpr_counter_field = f_vroff; 16561 16562 DECL_ARTIFICIAL (f_stack) = 1; 16563 DECL_ARTIFICIAL (f_grtop) = 1; 16564 DECL_ARTIFICIAL (f_vrtop) = 1; 16565 DECL_ARTIFICIAL (f_groff) = 1; 16566 DECL_ARTIFICIAL (f_vroff) = 1; 16567 16568 DECL_FIELD_CONTEXT (f_stack) = va_list_type; 16569 DECL_FIELD_CONTEXT (f_grtop) = va_list_type; 16570 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type; 16571 DECL_FIELD_CONTEXT (f_groff) = va_list_type; 16572 DECL_FIELD_CONTEXT (f_vroff) = va_list_type; 16573 16574 TYPE_FIELDS (va_list_type) = f_stack; 16575 DECL_CHAIN (f_stack) = f_grtop; 16576 DECL_CHAIN (f_grtop) = f_vrtop; 16577 DECL_CHAIN (f_vrtop) = f_groff; 16578 DECL_CHAIN (f_groff) = f_vroff; 16579 16580 /* Compute its layout. */ 16581 layout_type (va_list_type); 16582 16583 return va_list_type; 16584} 16585 16586/* Implement TARGET_EXPAND_BUILTIN_VA_START. */ 16587static void 16588aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED) 16589{ 16590 const CUMULATIVE_ARGS *cum; 16591 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff; 16592 tree stack, grtop, vrtop, groff, vroff; 16593 tree t; 16594 int gr_save_area_size = cfun->va_list_gpr_size; 16595 int vr_save_area_size = cfun->va_list_fpr_size; 16596 int vr_offset; 16597 16598 cum = &crtl->args.info; 16599 if (cfun->va_list_gpr_size) 16600 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD, 16601 cfun->va_list_gpr_size); 16602 if (cfun->va_list_fpr_size) 16603 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn) 16604 * UNITS_PER_VREG, cfun->va_list_fpr_size); 16605 16606 if (!TARGET_FLOAT) 16607 { 16608 gcc_assert (cum->aapcs_nvrn == 0); 16609 vr_save_area_size = 0; 16610 } 16611 16612 f_stack = TYPE_FIELDS (va_list_type_node); 16613 f_grtop = DECL_CHAIN (f_stack); 16614 f_vrtop = DECL_CHAIN (f_grtop); 16615 f_groff = DECL_CHAIN (f_vrtop); 16616 f_vroff = DECL_CHAIN (f_groff); 16617 16618 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack, 16619 NULL_TREE); 16620 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop, 16621 NULL_TREE); 16622 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop, 16623 NULL_TREE); 16624 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff, 16625 NULL_TREE); 16626 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff, 16627 NULL_TREE); 16628 16629 /* Emit code to initialize STACK, which points to the next varargs stack 16630 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used 16631 by named arguments. STACK is 8-byte aligned. 
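   For example (illustrative), for "int f (int a, int b, ...)" all named
   arguments fit in registers, so CUM->AAPCS_STACK_SIZE is zero and
   __stack simply points at the incoming argument area; had a named
   argument overflowed onto the stack, __stack would be advanced past it
   here.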
*/ 16632 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx); 16633 if (cum->aapcs_stack_size > 0) 16634 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD); 16635 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t); 16636 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); 16637 16638 /* Emit code to initialize GRTOP, the top of the GR save area. 16639 virtual_incoming_args_rtx should have been 16 byte aligned. */ 16640 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx); 16641 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t); 16642 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); 16643 16644 /* Emit code to initialize VRTOP, the top of the VR save area. 16645 This address is gr_save_area_bytes below GRTOP, rounded 16646 down to the next 16-byte boundary. */ 16647 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx); 16648 vr_offset = ROUND_UP (gr_save_area_size, 16649 STACK_BOUNDARY / BITS_PER_UNIT); 16650 16651 if (vr_offset) 16652 t = fold_build_pointer_plus_hwi (t, -vr_offset); 16653 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t); 16654 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); 16655 16656 /* Emit code to initialize GROFF, the offset from GRTOP of the 16657 next GPR argument. */ 16658 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff, 16659 build_int_cst (TREE_TYPE (groff), -gr_save_area_size)); 16660 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); 16661 16662 /* Likewise emit code to initialize VROFF, the offset from FTOP 16663 of the next VR argument. */ 16664 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff, 16665 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size)); 16666 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); 16667} 16668 16669/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */ 16670 16671static tree 16672aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, 16673 gimple_seq *post_p ATTRIBUTE_UNUSED) 16674{ 16675 tree addr; 16676 bool indirect_p; 16677 bool is_ha; /* is HFA or HVA. */ 16678 bool dw_align; /* double-word align. */ 16679 machine_mode ag_mode = VOIDmode; 16680 int nregs; 16681 machine_mode mode; 16682 16683 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff; 16684 tree stack, f_top, f_off, off, arg, roundup, on_stack; 16685 HOST_WIDE_INT size, rsize, adjust, align; 16686 tree t, u, cond1, cond2; 16687 16688 indirect_p = pass_va_arg_by_reference (type); 16689 if (indirect_p) 16690 type = build_pointer_type (type); 16691 16692 mode = TYPE_MODE (type); 16693 16694 f_stack = TYPE_FIELDS (va_list_type_node); 16695 f_grtop = DECL_CHAIN (f_stack); 16696 f_vrtop = DECL_CHAIN (f_grtop); 16697 f_groff = DECL_CHAIN (f_vrtop); 16698 f_vroff = DECL_CHAIN (f_groff); 16699 16700 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist), 16701 f_stack, NULL_TREE); 16702 size = int_size_in_bytes (type); 16703 16704 bool abi_break; 16705 align 16706 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT; 16707 16708 dw_align = false; 16709 adjust = 0; 16710 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs, 16711 &is_ha, false)) 16712 { 16713 /* No frontends can create types with variable-sized modes, so we 16714 shouldn't be asked to pass or return them. */ 16715 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant (); 16716 16717 /* TYPE passed in fp/simd registers. 
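   For example, a homogeneous floating-point aggregate such as
   "struct { float x, y, z; }" reaches this point with AG_MODE == SFmode,
   NREGS == 3 and IS_HA set, so RSIZE below covers three V-register save
   slots under __vr_top.  (Illustration of the HFA case only.)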
*/ 16718 if (!TARGET_FLOAT) 16719 aarch64_err_no_fpadvsimd (mode); 16720 16721 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), 16722 unshare_expr (valist), f_vrtop, NULL_TREE); 16723 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), 16724 unshare_expr (valist), f_vroff, NULL_TREE); 16725 16726 rsize = nregs * UNITS_PER_VREG; 16727 16728 if (is_ha) 16729 { 16730 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG) 16731 adjust = UNITS_PER_VREG - ag_size; 16732 } 16733 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD 16734 && size < UNITS_PER_VREG) 16735 { 16736 adjust = UNITS_PER_VREG - size; 16737 } 16738 } 16739 else 16740 { 16741 /* TYPE passed in general registers. */ 16742 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), 16743 unshare_expr (valist), f_grtop, NULL_TREE); 16744 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff), 16745 unshare_expr (valist), f_groff, NULL_TREE); 16746 rsize = ROUND_UP (size, UNITS_PER_WORD); 16747 nregs = rsize / UNITS_PER_WORD; 16748 16749 if (align > 8) 16750 { 16751 if (abi_break && warn_psabi) 16752 inform (input_location, "parameter passing for argument of type " 16753 "%qT changed in GCC 9.1", type); 16754 dw_align = true; 16755 } 16756 16757 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD 16758 && size < UNITS_PER_WORD) 16759 { 16760 adjust = UNITS_PER_WORD - size; 16761 } 16762 } 16763 16764 /* Get a local temporary for the field value. */ 16765 off = get_initialized_tmp_var (f_off, pre_p, NULL); 16766 16767 /* Emit code to branch if off >= 0. */ 16768 t = build2 (GE_EXPR, boolean_type_node, off, 16769 build_int_cst (TREE_TYPE (off), 0)); 16770 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE); 16771 16772 if (dw_align) 16773 { 16774 /* Emit: offs = (offs + 15) & -16. */ 16775 t = build2 (PLUS_EXPR, TREE_TYPE (off), off, 16776 build_int_cst (TREE_TYPE (off), 15)); 16777 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t, 16778 build_int_cst (TREE_TYPE (off), -16)); 16779 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t); 16780 } 16781 else 16782 roundup = NULL; 16783 16784 /* Update ap.__[g|v]r_offs */ 16785 t = build2 (PLUS_EXPR, TREE_TYPE (off), off, 16786 build_int_cst (TREE_TYPE (off), rsize)); 16787 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t); 16788 16789 /* String up. */ 16790 if (roundup) 16791 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t); 16792 16793 /* [cond2] if (ap.__[g|v]r_offs > 0) */ 16794 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off), 16795 build_int_cst (TREE_TYPE (f_off), 0)); 16796 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE); 16797 16798 /* String up: make sure the assignment happens before the use. */ 16799 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2); 16800 COND_EXPR_ELSE (cond1) = t; 16801 16802 /* Prepare the trees handling the argument that is passed on the stack; 16803 the top level node will store in ON_STACK. 
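   In outline the code below builds (illustrative pseudo-code):
     arg = ap.__stack;
     if (alignof (type) > 8)
       arg = (arg + 15) & -16;
     ap.__stack = (arg + size + 7) & -8;
   with ARG, adjusted for big-endian padding of small types, as the
   resulting address.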
*/ 16804 arg = get_initialized_tmp_var (stack, pre_p, NULL); 16805 if (align > 8) 16806 { 16807 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */ 16808 t = fold_build_pointer_plus_hwi (arg, 15); 16809 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, 16810 build_int_cst (TREE_TYPE (t), -16)); 16811 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t); 16812 } 16813 else 16814 roundup = NULL; 16815 /* Advance ap.__stack */ 16816 t = fold_build_pointer_plus_hwi (arg, size + 7); 16817 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, 16818 build_int_cst (TREE_TYPE (t), -8)); 16819 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t); 16820 /* String up roundup and advance. */ 16821 if (roundup) 16822 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t); 16823 /* String up with arg */ 16824 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg); 16825 /* Big-endianness related address adjustment. */ 16826 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD 16827 && size < UNITS_PER_WORD) 16828 { 16829 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg, 16830 size_int (UNITS_PER_WORD - size)); 16831 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t); 16832 } 16833 16834 COND_EXPR_THEN (cond1) = unshare_expr (on_stack); 16835 COND_EXPR_THEN (cond2) = unshare_expr (on_stack); 16836 16837 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */ 16838 t = off; 16839 if (adjust) 16840 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off, 16841 build_int_cst (TREE_TYPE (off), adjust)); 16842 16843 t = fold_convert (sizetype, t); 16844 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t); 16845 16846 if (is_ha) 16847 { 16848 /* type ha; // treat as "struct {ftype field[n];}" 16849 ... [computing offs] 16850 for (i = 0; i <nregs; ++i, offs += 16) 16851 ha.field[i] = *((ftype *)(ap.__vr_top + offs)); 16852 return ha; */ 16853 int i; 16854 tree tmp_ha, field_t, field_ptr_t; 16855 16856 /* Declare a local variable. */ 16857 tmp_ha = create_tmp_var_raw (type, "ha"); 16858 gimple_add_tmp_var (tmp_ha); 16859 16860 /* Establish the base type. 
*/ 16861 switch (ag_mode) 16862 { 16863 case E_SFmode: 16864 field_t = float_type_node; 16865 field_ptr_t = float_ptr_type_node; 16866 break; 16867 case E_DFmode: 16868 field_t = double_type_node; 16869 field_ptr_t = double_ptr_type_node; 16870 break; 16871 case E_TFmode: 16872 field_t = long_double_type_node; 16873 field_ptr_t = long_double_ptr_type_node; 16874 break; 16875 case E_HFmode: 16876 field_t = aarch64_fp16_type_node; 16877 field_ptr_t = aarch64_fp16_ptr_type_node; 16878 break; 16879 case E_BFmode: 16880 field_t = aarch64_bf16_type_node; 16881 field_ptr_t = aarch64_bf16_ptr_type_node; 16882 break; 16883 case E_V2SImode: 16884 case E_V4SImode: 16885 { 16886 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode)); 16887 field_t = build_vector_type_for_mode (innertype, ag_mode); 16888 field_ptr_t = build_pointer_type (field_t); 16889 } 16890 break; 16891 default: 16892 gcc_assert (0); 16893 } 16894 16895 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */ 16896 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha); 16897 addr = t; 16898 t = fold_convert (field_ptr_t, addr); 16899 t = build2 (MODIFY_EXPR, field_t, 16900 build1 (INDIRECT_REF, field_t, tmp_ha), 16901 build1 (INDIRECT_REF, field_t, t)); 16902 16903 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */ 16904 for (i = 1; i < nregs; ++i) 16905 { 16906 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG); 16907 u = fold_convert (field_ptr_t, addr); 16908 u = build2 (MODIFY_EXPR, field_t, 16909 build2 (MEM_REF, field_t, tmp_ha, 16910 build_int_cst (field_ptr_t, 16911 (i * 16912 int_size_in_bytes (field_t)))), 16913 build1 (INDIRECT_REF, field_t, u)); 16914 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u); 16915 } 16916 16917 u = fold_convert (TREE_TYPE (f_top), tmp_ha); 16918 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u); 16919 } 16920 16921 COND_EXPR_ELSE (cond2) = t; 16922 addr = fold_convert (build_pointer_type (type), cond1); 16923 addr = build_va_arg_indirect_ref (addr); 16924 16925 if (indirect_p) 16926 addr = build_va_arg_indirect_ref (addr); 16927 16928 return addr; 16929} 16930 16931/* Implement TARGET_SETUP_INCOMING_VARARGS. */ 16932 16933static void 16934aarch64_setup_incoming_varargs (cumulative_args_t cum_v, 16935 const function_arg_info &arg, 16936 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl) 16937{ 16938 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); 16939 CUMULATIVE_ARGS local_cum; 16940 int gr_saved = cfun->va_list_gpr_size; 16941 int vr_saved = cfun->va_list_fpr_size; 16942 16943 /* The caller has advanced CUM up to, but not beyond, the last named 16944 argument. Advance a local copy of CUM past the last "real" named 16945 argument, to find out how many registers are left over. */ 16946 local_cum = *cum; 16947 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), arg); 16948 16949 /* Found out how many registers we need to save. 16950 Honor tree-stdvar analysis results. */ 16951 if (cfun->va_list_gpr_size) 16952 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn, 16953 cfun->va_list_gpr_size / UNITS_PER_WORD); 16954 if (cfun->va_list_fpr_size) 16955 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn, 16956 cfun->va_list_fpr_size / UNITS_PER_VREG); 16957 16958 if (!TARGET_FLOAT) 16959 { 16960 gcc_assert (local_cum.aapcs_nvrn == 0); 16961 vr_saved = 0; 16962 } 16963 16964 if (!no_rtl) 16965 { 16966 if (gr_saved > 0) 16967 { 16968 rtx ptr, mem; 16969 16970 /* virtual_incoming_args_rtx should have been 16-byte aligned. 
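   For example (illustrative, assuming the default va_list_gpr/fpr sizes
   and TARGET_FLOAT), for "int f (int a, int b, ...)" the code below
   stores x2-x7 (48 bytes) immediately below the incoming-argument
   pointer and q0-q7 (128 bytes) below that, with the VR save area
   aligned to 16 bytes.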
*/ 16971 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, 16972 - gr_saved * UNITS_PER_WORD); 16973 mem = gen_frame_mem (BLKmode, ptr); 16974 set_mem_alias_set (mem, get_varargs_alias_set ()); 16975 16976 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM, 16977 mem, gr_saved); 16978 } 16979 if (vr_saved > 0) 16980 { 16981 /* We can't use move_block_from_reg, because it will use 16982 the wrong mode, storing D regs only. */ 16983 machine_mode mode = TImode; 16984 int off, i, vr_start; 16985 16986 /* Set OFF to the offset from virtual_incoming_args_rtx of 16987 the first vector register. The VR save area lies below 16988 the GR one, and is aligned to 16 bytes. */ 16989 off = -ROUND_UP (gr_saved * UNITS_PER_WORD, 16990 STACK_BOUNDARY / BITS_PER_UNIT); 16991 off -= vr_saved * UNITS_PER_VREG; 16992 16993 vr_start = V0_REGNUM + local_cum.aapcs_nvrn; 16994 for (i = 0; i < vr_saved; ++i) 16995 { 16996 rtx ptr, mem; 16997 16998 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off); 16999 mem = gen_frame_mem (mode, ptr); 17000 set_mem_alias_set (mem, get_varargs_alias_set ()); 17001 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i)); 17002 off += UNITS_PER_VREG; 17003 } 17004 } 17005 } 17006 17007 /* We don't save the size into *PRETEND_SIZE because we want to avoid 17008 any complication of having crtl->args.pretend_args_size changed. */ 17009 cfun->machine->frame.saved_varargs_size 17010 = (ROUND_UP (gr_saved * UNITS_PER_WORD, 17011 STACK_BOUNDARY / BITS_PER_UNIT) 17012 + vr_saved * UNITS_PER_VREG); 17013} 17014 17015static void 17016aarch64_conditional_register_usage (void) 17017{ 17018 int i; 17019 if (!TARGET_FLOAT) 17020 { 17021 for (i = V0_REGNUM; i <= V31_REGNUM; i++) 17022 { 17023 fixed_regs[i] = 1; 17024 call_used_regs[i] = 1; 17025 } 17026 } 17027 if (!TARGET_SVE) 17028 for (i = P0_REGNUM; i <= P15_REGNUM; i++) 17029 { 17030 fixed_regs[i] = 1; 17031 call_used_regs[i] = 1; 17032 } 17033 17034 /* Only allow the FFR and FFRT to be accessed via special patterns. */ 17035 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM); 17036 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM); 17037 17038 /* When tracking speculation, we need a couple of call-clobbered registers 17039 to track the speculation state. It would be nice to just use 17040 IP0 and IP1, but currently there are numerous places that just 17041 assume these registers are free for other uses (eg pointer 17042 authentication). */ 17043 if (aarch64_track_speculation) 17044 { 17045 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1; 17046 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1; 17047 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1; 17048 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1; 17049 } 17050} 17051 17052/* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */ 17053 17054bool 17055aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode) 17056{ 17057 /* For records we're passed a FIELD_DECL, for arrays we're passed 17058 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */ 17059 const_tree type = TREE_TYPE (field_or_array); 17060 17061 /* Assign BLKmode to anything that contains multiple SVE predicates. 17062 For structures, the "multiple" case is indicated by MODE being 17063 VOIDmode. 
*/ 17064 unsigned int num_zr, num_pr; 17065 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0) 17066 { 17067 if (TREE_CODE (field_or_array) == ARRAY_TYPE) 17068 return !simple_cst_equal (TYPE_SIZE (field_or_array), 17069 TYPE_SIZE (type)); 17070 return mode == VOIDmode; 17071 } 17072 17073 return default_member_type_forces_blk (field_or_array, mode); 17074} 17075 17076/* Bitmasks that indicate whether earlier versions of GCC would have 17077 taken a different path through the ABI logic. This should result in 17078 a -Wpsabi warning if the earlier path led to a different ABI decision. 17079 17080 WARN_PSABI_EMPTY_CXX17_BASE 17081 Indicates that the type includes an artificial empty C++17 base field 17082 that, prior to GCC 10.1, would prevent the type from being treated as 17083 a HFA or HVA. See PR94383 for details. 17084 17085 WARN_PSABI_NO_UNIQUE_ADDRESS 17086 Indicates that the type includes an empty [[no_unique_address]] field 17087 that, prior to GCC 10.1, would prevent the type from being treated as 17088 a HFA or HVA. */ 17089const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0; 17090const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1; 17091 17092/* Walk down the type tree of TYPE counting consecutive base elements. 17093 If *MODEP is VOIDmode, then set it to the first valid floating point 17094 type. If a non-floating point type is found, or if a floating point 17095 type that doesn't match a non-VOIDmode *MODEP is found, then return -1, 17096 otherwise return the count in the sub-tree. 17097 17098 The WARN_PSABI_FLAGS argument allows the caller to check whether this 17099 function has changed its behavior relative to earlier versions of GCC. 17100 Normally the argument should be nonnull and point to a zero-initialized 17101 variable. The function then records whether the ABI decision might 17102 be affected by a known fix to the ABI logic, setting the associated 17103 WARN_PSABI_* bits if so. 17104 17105 When the argument is instead a null pointer, the function tries to 17106 simulate the behavior of GCC before all such ABI fixes were made. 17107 This is useful to check whether the function returns something 17108 different after the ABI fixes. */ 17109static int 17110aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep, 17111 unsigned int *warn_psabi_flags) 17112{ 17113 machine_mode mode; 17114 HOST_WIDE_INT size; 17115 17116 if (aarch64_sve::builtin_type_p (type)) 17117 return -1; 17118 17119 switch (TREE_CODE (type)) 17120 { 17121 case REAL_TYPE: 17122 mode = TYPE_MODE (type); 17123 if (mode != DFmode && mode != SFmode 17124 && mode != TFmode && mode != HFmode) 17125 return -1; 17126 17127 if (*modep == VOIDmode) 17128 *modep = mode; 17129 17130 if (*modep == mode) 17131 return 1; 17132 17133 break; 17134 17135 case COMPLEX_TYPE: 17136 mode = TYPE_MODE (TREE_TYPE (type)); 17137 if (mode != DFmode && mode != SFmode 17138 && mode != TFmode && mode != HFmode) 17139 return -1; 17140 17141 if (*modep == VOIDmode) 17142 *modep = mode; 17143 17144 if (*modep == mode) 17145 return 2; 17146 17147 break; 17148 17149 case VECTOR_TYPE: 17150 /* Use V2SImode and V4SImode as representatives of all 64-bit 17151 and 128-bit vector types. 
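   For example, int32x4_t, float32x4_t and int8x16_t all count as a
   single V4SImode element here, so a structure containing two such
   vectors is a two-element homogeneous short-vector aggregate regardless
   of the element types involved.  (Illustration only.)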
*/ 17152 size = int_size_in_bytes (type); 17153 switch (size) 17154 { 17155 case 8: 17156 mode = V2SImode; 17157 break; 17158 case 16: 17159 mode = V4SImode; 17160 break; 17161 default: 17162 return -1; 17163 } 17164 17165 if (*modep == VOIDmode) 17166 *modep = mode; 17167 17168 /* Vector modes are considered to be opaque: two vectors are 17169 equivalent for the purposes of being homogeneous aggregates 17170 if they are the same size. */ 17171 if (*modep == mode) 17172 return 1; 17173 17174 break; 17175 17176 case ARRAY_TYPE: 17177 { 17178 int count; 17179 tree index = TYPE_DOMAIN (type); 17180 17181 /* Can't handle incomplete types nor sizes that are not 17182 fixed. */ 17183 if (!COMPLETE_TYPE_P (type) 17184 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) 17185 return -1; 17186 17187 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep, 17188 warn_psabi_flags); 17189 if (count == -1 17190 || !index 17191 || !TYPE_MAX_VALUE (index) 17192 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index)) 17193 || !TYPE_MIN_VALUE (index) 17194 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index)) 17195 || count < 0) 17196 return -1; 17197 17198 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index)) 17199 - tree_to_uhwi (TYPE_MIN_VALUE (index))); 17200 17201 /* There must be no padding. */ 17202 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)), 17203 count * GET_MODE_BITSIZE (*modep))) 17204 return -1; 17205 17206 return count; 17207 } 17208 17209 case RECORD_TYPE: 17210 { 17211 int count = 0; 17212 int sub_count; 17213 tree field; 17214 17215 /* Can't handle incomplete types nor sizes that are not 17216 fixed. */ 17217 if (!COMPLETE_TYPE_P (type) 17218 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) 17219 return -1; 17220 17221 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field)) 17222 { 17223 if (TREE_CODE (field) != FIELD_DECL) 17224 continue; 17225 17226 if (DECL_FIELD_ABI_IGNORED (field)) 17227 { 17228 /* See whether this is something that earlier versions of 17229 GCC failed to ignore. */ 17230 unsigned int flag; 17231 if (lookup_attribute ("no_unique_address", 17232 DECL_ATTRIBUTES (field))) 17233 flag = WARN_PSABI_NO_UNIQUE_ADDRESS; 17234 else if (cxx17_empty_base_field_p (field)) 17235 flag = WARN_PSABI_EMPTY_CXX17_BASE; 17236 else 17237 /* No compatibility problem. */ 17238 continue; 17239 17240 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */ 17241 if (warn_psabi_flags) 17242 { 17243 *warn_psabi_flags |= flag; 17244 continue; 17245 } 17246 } 17247 17248 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep, 17249 warn_psabi_flags); 17250 if (sub_count < 0) 17251 return -1; 17252 count += sub_count; 17253 } 17254 17255 /* There must be no padding. */ 17256 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)), 17257 count * GET_MODE_BITSIZE (*modep))) 17258 return -1; 17259 17260 return count; 17261 } 17262 17263 case UNION_TYPE: 17264 case QUAL_UNION_TYPE: 17265 { 17266 /* These aren't very interesting except in a degenerate case. */ 17267 int count = 0; 17268 int sub_count; 17269 tree field; 17270 17271 /* Can't handle incomplete types nor sizes that are not 17272 fixed. 
*/ 17273 if (!COMPLETE_TYPE_P (type) 17274 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) 17275 return -1; 17276 17277 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field)) 17278 { 17279 if (TREE_CODE (field) != FIELD_DECL) 17280 continue; 17281 17282 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep, 17283 warn_psabi_flags); 17284 if (sub_count < 0) 17285 return -1; 17286 count = count > sub_count ? count : sub_count; 17287 } 17288 17289 /* There must be no padding. */ 17290 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)), 17291 count * GET_MODE_BITSIZE (*modep))) 17292 return -1; 17293 17294 return count; 17295 } 17296 17297 default: 17298 break; 17299 } 17300 17301 return -1; 17302} 17303 17304/* Return TRUE if the type, as described by TYPE and MODE, is a short vector 17305 type as described in AAPCS64 \S 4.1.2. 17306 17307 See the comment above aarch64_composite_type_p for the notes on MODE. */ 17308 17309static bool 17310aarch64_short_vector_p (const_tree type, 17311 machine_mode mode) 17312{ 17313 poly_int64 size = -1; 17314 17315 if (type && TREE_CODE (type) == VECTOR_TYPE) 17316 { 17317 if (aarch64_sve::builtin_type_p (type)) 17318 return false; 17319 size = int_size_in_bytes (type); 17320 } 17321 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT 17322 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT) 17323 { 17324 /* Rely only on the type, not the mode, when processing SVE types. */ 17325 if (type && aarch64_some_values_include_pst_objects_p (type)) 17326 gcc_assert (aarch64_sve_mode_p (mode)); 17327 else 17328 size = GET_MODE_SIZE (mode); 17329 } 17330 if (known_eq (size, 8) || known_eq (size, 16)) 17331 { 17332 /* 64-bit and 128-bit vectors should only acquire an SVE mode if 17333 they are being treated as scalable AAPCS64 types. */ 17334 gcc_assert (!aarch64_sve_mode_p (mode)); 17335 return true; 17336 } 17337 return false; 17338} 17339 17340/* Return TRUE if the type, as described by TYPE and MODE, is a composite 17341 type as described in AAPCS64 \S 4.3. This includes aggregate, union and 17342 array types. The C99 floating-point complex types are also considered 17343 as composite types, according to AAPCS64 \S 7.1.1. The complex integer 17344 types, which are GCC extensions and out of the scope of AAPCS64, are 17345 treated as composite types here as well. 17346 17347 Note that MODE itself is not sufficient in determining whether a type 17348 is such a composite type or not. This is because 17349 stor-layout.c:compute_record_mode may have already changed the MODE 17350 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a 17351 structure with only one field may have its MODE set to the mode of the 17352 field. Also an integer mode whose size matches the size of the 17353 RECORD_TYPE type may be used to substitute the original mode 17354 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be 17355 solely relied on. 
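   For example, "struct S { double d; };" may end up with DFmode even
   though it is still a composite type for PCS purposes, which is why the
   function below checks TYPE before falling back to MODE.  (Illustrative
   example.)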
*/ 17356 17357static bool 17358aarch64_composite_type_p (const_tree type, 17359 machine_mode mode) 17360{ 17361 if (aarch64_short_vector_p (type, mode)) 17362 return false; 17363 17364 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE)) 17365 return true; 17366 17367 if (mode == BLKmode 17368 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT 17369 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT) 17370 return true; 17371 17372 return false; 17373} 17374 17375/* Return TRUE if an argument, whose type is described by TYPE and MODE, 17376 shall be passed or returned in simd/fp register(s) (providing these 17377 parameter passing registers are available). 17378 17379 Upon successful return, *COUNT returns the number of needed registers, 17380 *BASE_MODE returns the mode of the individual register and when IS_HAF 17381 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous 17382 floating-point aggregate or a homogeneous short-vector aggregate. 17383 17384 SILENT_P is true if the function should refrain from reporting any 17385 diagnostics. This should only be used if the caller is certain that 17386 any ABI decisions would eventually come through this function with 17387 SILENT_P set to false. */ 17388 17389static bool 17390aarch64_vfp_is_call_or_return_candidate (machine_mode mode, 17391 const_tree type, 17392 machine_mode *base_mode, 17393 int *count, 17394 bool *is_ha, 17395 bool silent_p) 17396{ 17397 if (is_ha != NULL) *is_ha = false; 17398 17399 machine_mode new_mode = VOIDmode; 17400 bool composite_p = aarch64_composite_type_p (type, mode); 17401 17402 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT) 17403 || aarch64_short_vector_p (type, mode)) 17404 { 17405 *count = 1; 17406 new_mode = mode; 17407 } 17408 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT) 17409 { 17410 if (is_ha != NULL) *is_ha = true; 17411 *count = 2; 17412 new_mode = GET_MODE_INNER (mode); 17413 } 17414 else if (type && composite_p) 17415 { 17416 unsigned int warn_psabi_flags = 0; 17417 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode, 17418 &warn_psabi_flags); 17419 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS) 17420 { 17421 static unsigned last_reported_type_uid; 17422 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type)); 17423 int alt; 17424 if (!silent_p 17425 && warn_psabi 17426 && warn_psabi_flags 17427 && uid != last_reported_type_uid 17428 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL)) 17429 != ag_count)) 17430 { 17431 const char *url 17432 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base"; 17433 gcc_assert (alt == -1); 17434 last_reported_type_uid = uid; 17435 /* Use TYPE_MAIN_VARIANT to strip any redundant const 17436 qualification. 
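   (As an illustration of when these notes fire: a hypothetical type such
   as "struct S { float x, y; [[no_unique_address]] Empty e; };" was not
   treated as an HFA before GCC 10.1 but is now, so with -Wpsabi the
   no_unique_address note below is emitted for it.)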
*/ 17437 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS) 17438 inform (input_location, "parameter passing for argument of " 17439 "type %qT with %<[[no_unique_address]]%> members " 17440 "changed %{in GCC 10.1%}", 17441 TYPE_MAIN_VARIANT (type), url); 17442 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE) 17443 inform (input_location, "parameter passing for argument of " 17444 "type %qT when C++17 is enabled changed to match " 17445 "C++14 %{in GCC 10.1%}", 17446 TYPE_MAIN_VARIANT (type), url); 17447 } 17448 17449 if (is_ha != NULL) *is_ha = true; 17450 *count = ag_count; 17451 } 17452 else 17453 return false; 17454 } 17455 else 17456 return false; 17457 17458 gcc_assert (!aarch64_sve_mode_p (new_mode)); 17459 *base_mode = new_mode; 17460 return true; 17461} 17462 17463/* Implement TARGET_STRUCT_VALUE_RTX. */ 17464 17465static rtx 17466aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED, 17467 int incoming ATTRIBUTE_UNUSED) 17468{ 17469 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM); 17470} 17471 17472/* Implements target hook vector_mode_supported_p. */ 17473static bool 17474aarch64_vector_mode_supported_p (machine_mode mode) 17475{ 17476 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 17477 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0; 17478} 17479 17480/* Return the full-width SVE vector mode for element mode MODE, if one 17481 exists. */ 17482opt_machine_mode 17483aarch64_full_sve_mode (scalar_mode mode) 17484{ 17485 switch (mode) 17486 { 17487 case E_DFmode: 17488 return VNx2DFmode; 17489 case E_SFmode: 17490 return VNx4SFmode; 17491 case E_HFmode: 17492 return VNx8HFmode; 17493 case E_BFmode: 17494 return VNx8BFmode; 17495 case E_DImode: 17496 return VNx2DImode; 17497 case E_SImode: 17498 return VNx4SImode; 17499 case E_HImode: 17500 return VNx8HImode; 17501 case E_QImode: 17502 return VNx16QImode; 17503 default: 17504 return opt_machine_mode (); 17505 } 17506} 17507 17508/* Return the 128-bit Advanced SIMD vector mode for element mode MODE, 17509 if it exists. */ 17510opt_machine_mode 17511aarch64_vq_mode (scalar_mode mode) 17512{ 17513 switch (mode) 17514 { 17515 case E_DFmode: 17516 return V2DFmode; 17517 case E_SFmode: 17518 return V4SFmode; 17519 case E_HFmode: 17520 return V8HFmode; 17521 case E_BFmode: 17522 return V8BFmode; 17523 case E_SImode: 17524 return V4SImode; 17525 case E_HImode: 17526 return V8HImode; 17527 case E_QImode: 17528 return V16QImode; 17529 case E_DImode: 17530 return V2DImode; 17531 default: 17532 return opt_machine_mode (); 17533 } 17534} 17535 17536/* Return appropriate SIMD container 17537 for MODE within a vector of WIDTH bits. 
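   For example, SImode in a 128-bit container gives V4SImode and in a
   64-bit container gives V2SImode; when WIDTH instead matches a
   non-128-bit SVE vector length under TARGET_SVE, the full SVE mode
   (VNx4SImode in this example) is chosen.  (Illustrative summary of the
   cases below.)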
*/ 17538static machine_mode 17539aarch64_simd_container_mode (scalar_mode mode, poly_int64 width) 17540{ 17541 if (TARGET_SVE 17542 && maybe_ne (width, 128) 17543 && known_eq (width, BITS_PER_SVE_VECTOR)) 17544 return aarch64_full_sve_mode (mode).else_mode (word_mode); 17545 17546 gcc_assert (known_eq (width, 64) || known_eq (width, 128)); 17547 if (TARGET_SIMD) 17548 { 17549 if (known_eq (width, 128)) 17550 return aarch64_vq_mode (mode).else_mode (word_mode); 17551 else 17552 switch (mode) 17553 { 17554 case E_SFmode: 17555 return V2SFmode; 17556 case E_HFmode: 17557 return V4HFmode; 17558 case E_BFmode: 17559 return V4BFmode; 17560 case E_SImode: 17561 return V2SImode; 17562 case E_HImode: 17563 return V4HImode; 17564 case E_QImode: 17565 return V8QImode; 17566 default: 17567 break; 17568 } 17569 } 17570 return word_mode; 17571} 17572 17573static HOST_WIDE_INT aarch64_estimated_poly_value (poly_int64); 17574 17575/* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M 17576 and return whether the SVE mode should be preferred over the 17577 Advanced SIMD one in aarch64_autovectorize_vector_modes. */ 17578static bool 17579aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m) 17580{ 17581 /* Take into account the aarch64-autovec-preference param if non-zero. */ 17582 bool only_asimd_p = aarch64_autovec_preference == 1; 17583 bool only_sve_p = aarch64_autovec_preference == 2; 17584 17585 if (only_asimd_p) 17586 return false; 17587 if (only_sve_p) 17588 return true; 17589 17590 /* The preference in case of a tie in costs. */ 17591 bool prefer_asimd = aarch64_autovec_preference == 3; 17592 bool prefer_sve = aarch64_autovec_preference == 4; 17593 17594 aarch64_sve_vector_bits_enum tune_width = aarch64_tune_params.sve_width; 17595 17596 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m); 17597 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m); 17598 /* If the CPU information does not have an SVE width registered use the 17599 generic poly_int comparison that prefers SVE. If a preference is 17600 explicitly requested avoid this path. */ 17601 if (tune_width == SVE_SCALABLE 17602 && !prefer_asimd 17603 && !prefer_sve) 17604 return maybe_gt (nunits_sve, nunits_asimd); 17605 17606 /* Otherwise estimate the runtime width of the modes involved. */ 17607 HOST_WIDE_INT est_sve = aarch64_estimated_poly_value (nunits_sve); 17608 HOST_WIDE_INT est_asimd = aarch64_estimated_poly_value (nunits_asimd); 17609 17610 /* Preferring SVE means picking it first unless the Advanced SIMD mode 17611 is clearly wider. */ 17612 if (prefer_sve) 17613 return est_sve >= est_asimd; 17614 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE 17615 is clearly wider. */ 17616 if (prefer_asimd) 17617 return est_sve > est_asimd; 17618 17619 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */ 17620 return est_sve > est_asimd; 17621} 17622 17623/* Return 128-bit container as the preferred SIMD mode for MODE. */ 17624static machine_mode 17625aarch64_preferred_simd_mode (scalar_mode mode) 17626{ 17627 /* Take into account explicit auto-vectorization ISA preferences through 17628 aarch64_cmp_autovec_modes. */ 17629 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode)) 17630 return aarch64_full_sve_mode (mode).else_mode (word_mode); 17631 if (TARGET_SIMD) 17632 return aarch64_vq_mode (mode).else_mode (word_mode); 17633 return word_mode; 17634} 17635 17636/* Return a list of possible vector sizes for the vectorizer 17637 to iterate over. 
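   For example, when aarch64_autovec_preference requests SVE only, just
   the SVE modes below are pushed; when it requests Advanced SIMD only,
   just the Advanced SIMD ones; otherwise the two lists are merged
   according to aarch64_cmp_autovec_modes.  (Illustrative summary of the
   loops below.)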
*/ 17638static unsigned int 17639aarch64_autovectorize_vector_modes (vector_modes *modes, bool) 17640{ 17641 static const machine_mode sve_modes[] = { 17642 /* Try using full vectors for all element types. */ 17643 VNx16QImode, 17644 17645 /* Try using 16-bit containers for 8-bit elements and full vectors 17646 for wider elements. */ 17647 VNx8QImode, 17648 17649 /* Try using 32-bit containers for 8-bit and 16-bit elements and 17650 full vectors for wider elements. */ 17651 VNx4QImode, 17652 17653 /* Try using 64-bit containers for all element types. */ 17654 VNx2QImode 17655 }; 17656 17657 static const machine_mode advsimd_modes[] = { 17658 /* Try using 128-bit vectors for all element types. */ 17659 V16QImode, 17660 17661 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors 17662 for wider elements. */ 17663 V8QImode, 17664 17665 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors 17666 for wider elements. 17667 17668 TODO: We could support a limited form of V4QImode too, so that 17669 we use 32-bit vectors for 8-bit elements. */ 17670 V4HImode, 17671 17672 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors 17673 for 64-bit elements. 17674 17675 TODO: We could similarly support limited forms of V2QImode and V2HImode 17676 for this case. */ 17677 V2SImode 17678 }; 17679 17680 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode. 17681 This is because: 17682 17683 - If we can't use N-byte Advanced SIMD vectors then the placement 17684 doesn't matter; we'll just continue as though the Advanced SIMD 17685 entry didn't exist. 17686 17687 - If an SVE main loop with N bytes ends up being cheaper than an 17688 Advanced SIMD main loop with N bytes then by default we'll replace 17689 the Advanced SIMD version with the SVE one. 17690 17691 - If an Advanced SIMD main loop with N bytes ends up being cheaper 17692 than an SVE main loop with N bytes then by default we'll try to 17693 use the SVE loop to vectorize the epilogue instead. */ 17694 17695 bool only_asimd_p = aarch64_autovec_preference == 1; 17696 bool only_sve_p = aarch64_autovec_preference == 2; 17697 17698 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes); 17699 unsigned int advsimd_i = 0; 17700 17701 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes)) 17702 { 17703 if (sve_i < ARRAY_SIZE (sve_modes) 17704 && aarch64_cmp_autovec_modes (sve_modes[sve_i], 17705 advsimd_modes[advsimd_i])) 17706 modes->safe_push (sve_modes[sve_i++]); 17707 else 17708 modes->safe_push (advsimd_modes[advsimd_i++]); 17709 } 17710 while (sve_i < ARRAY_SIZE (sve_modes)) 17711 modes->safe_push (sve_modes[sve_i++]); 17712 17713 unsigned int flags = 0; 17714 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we 17715 can compare SVE against Advanced SIMD and so that we can compare 17716 multiple SVE vectorization approaches against each other. There's 17717 not really any point doing this for Advanced SIMD only, since the 17718 first mode that works should always be the best. */ 17719 if (TARGET_SVE && aarch64_sve_compare_costs) 17720 flags |= VECT_COMPARE_COSTS; 17721 return flags; 17722} 17723 17724/* Implement TARGET_MANGLE_TYPE. */ 17725 17726static const char * 17727aarch64_mangle_type (const_tree type) 17728{ 17729 /* The AArch64 ABI documents say that "__va_list" has to be 17730 mangled as if it is in the "std" namespace. 
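   For example, "void f (va_list);" mangles as _Z1fSt9__va_list, i.e. as
   if the argument type were std::__va_list.  (Illustrative example.)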
*/ 17731 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type)) 17732 return "St9__va_list"; 17733 17734 /* Half-precision floating point types. */ 17735 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16) 17736 { 17737 if (TYPE_MODE (type) == BFmode) 17738 return "u6__bf16"; 17739 else 17740 return "Dh"; 17741 } 17742 17743 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for 17744 builtin types. */ 17745 if (TYPE_NAME (type) != NULL) 17746 { 17747 const char *res; 17748 if ((res = aarch64_general_mangle_builtin_type (type)) 17749 || (res = aarch64_sve::mangle_builtin_type (type))) 17750 return res; 17751 } 17752 17753 /* Use the default mangling. */ 17754 return NULL; 17755} 17756 17757/* Implement TARGET_VERIFY_TYPE_CONTEXT. */ 17758 17759static bool 17760aarch64_verify_type_context (location_t loc, type_context_kind context, 17761 const_tree type, bool silent_p) 17762{ 17763 return aarch64_sve::verify_type_context (loc, context, type, silent_p); 17764} 17765 17766/* Find the first rtx_insn before insn that will generate an assembly 17767 instruction. */ 17768 17769static rtx_insn * 17770aarch64_prev_real_insn (rtx_insn *insn) 17771{ 17772 if (!insn) 17773 return NULL; 17774 17775 do 17776 { 17777 insn = prev_real_insn (insn); 17778 } 17779 while (insn && recog_memoized (insn) < 0); 17780 17781 return insn; 17782} 17783 17784static bool 17785is_madd_op (enum attr_type t1) 17786{ 17787 unsigned int i; 17788 /* A number of these may be AArch32 only. */ 17789 enum attr_type mlatypes[] = { 17790 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD, 17791 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY, 17792 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD 17793 }; 17794 17795 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++) 17796 { 17797 if (t1 == mlatypes[i]) 17798 return true; 17799 } 17800 17801 return false; 17802} 17803 17804/* Check if there is a register dependency between a load and the insn 17805 for which we hold recog_data. */ 17806 17807static bool 17808dep_between_memop_and_curr (rtx memop) 17809{ 17810 rtx load_reg; 17811 int opno; 17812 17813 gcc_assert (GET_CODE (memop) == SET); 17814 17815 if (!REG_P (SET_DEST (memop))) 17816 return false; 17817 17818 load_reg = SET_DEST (memop); 17819 for (opno = 1; opno < recog_data.n_operands; opno++) 17820 { 17821 rtx operand = recog_data.operand[opno]; 17822 if (REG_P (operand) 17823 && reg_overlap_mentioned_p (load_reg, operand)) 17824 return true; 17825 17826 } 17827 return false; 17828} 17829 17830 17831/* When working around the Cortex-A53 erratum 835769, 17832 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate 17833 instruction and has a preceding memory instruction such that a NOP 17834 should be inserted between them. */ 17835 17836bool 17837aarch64_madd_needs_nop (rtx_insn* insn) 17838{ 17839 enum attr_type attr_type; 17840 rtx_insn *prev; 17841 rtx body; 17842 17843 if (!TARGET_FIX_ERR_A53_835769) 17844 return false; 17845 17846 if (!INSN_P (insn) || recog_memoized (insn) < 0) 17847 return false; 17848 17849 attr_type = get_attr_type (insn); 17850 if (!is_madd_op (attr_type)) 17851 return false; 17852 17853 prev = aarch64_prev_real_insn (insn); 17854 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN. 17855 Restore recog state to INSN to avoid state corruption. 
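   (The check below relies on recog_data describing INSN when it tests
   whether operand 0 of the multiply-accumulate has DImode.)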
*/ 17856 extract_constrain_insn_cached (insn); 17857 17858 if (!prev || !contains_mem_rtx_p (PATTERN (prev))) 17859 return false; 17860 17861 body = single_set (prev); 17862 17863 /* If the previous insn is a memory op and there is no dependency between 17864 it and the DImode madd, emit a NOP between them. If body is NULL then we 17865 have a complex memory operation, probably a load/store pair. 17866 Be conservative for now and emit a NOP. */ 17867 if (GET_MODE (recog_data.operand[0]) == DImode 17868 && (!body || !dep_between_memop_and_curr (body))) 17869 return true; 17870 17871 return false; 17872 17873} 17874 17875 17876/* Implement FINAL_PRESCAN_INSN. */ 17877 17878void 17879aarch64_final_prescan_insn (rtx_insn *insn) 17880{ 17881 if (aarch64_madd_needs_nop (insn)) 17882 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n"); 17883} 17884 17885 17886/* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX 17887 instruction. */ 17888 17889bool 17890aarch64_sve_index_immediate_p (rtx base_or_step) 17891{ 17892 return (CONST_INT_P (base_or_step) 17893 && IN_RANGE (INTVAL (base_or_step), -16, 15)); 17894} 17895 17896/* Return true if X is a valid immediate for the SVE ADD and SUB instructions 17897 when applied to mode MODE. Negate X first if NEGATE_P is true. */ 17898 17899bool 17900aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p) 17901{ 17902 rtx elt = unwrap_const_vec_duplicate (x); 17903 if (!CONST_INT_P (elt)) 17904 return false; 17905 17906 HOST_WIDE_INT val = INTVAL (elt); 17907 if (negate_p) 17908 val = -val; 17909 val &= GET_MODE_MASK (GET_MODE_INNER (mode)); 17910 17911 if (val & 0xff) 17912 return IN_RANGE (val, 0, 0xff); 17913 return IN_RANGE (val, 0, 0xff00); 17914} 17915 17916/* Return true if X is a valid immediate for the SVE SQADD and SQSUB 17917 instructions when applied to mode MODE. Negate X first if NEGATE_P 17918 is true. */ 17919 17920bool 17921aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p) 17922{ 17923 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p)) 17924 return false; 17925 17926 /* After the optional negation, the immediate must be nonnegative. 17927 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127 17928 instead of SQADD Zn.B, Zn.B, #129. */ 17929 rtx elt = unwrap_const_vec_duplicate (x); 17930 return negate_p == (INTVAL (elt) < 0); 17931} 17932 17933/* Return true if X is a valid immediate operand for an SVE logical 17934 instruction such as AND. */ 17935 17936bool 17937aarch64_sve_bitmask_immediate_p (rtx x) 17938{ 17939 rtx elt; 17940 17941 return (const_vec_duplicate_p (x, &elt) 17942 && CONST_INT_P (elt) 17943 && aarch64_bitmask_imm (INTVAL (elt), 17944 GET_MODE_INNER (GET_MODE (x)))); 17945} 17946 17947/* Return true if X is a valid immediate for the SVE DUP and CPY 17948 instructions. */ 17949 17950bool 17951aarch64_sve_dup_immediate_p (rtx x) 17952{ 17953 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x)); 17954 if (!CONST_INT_P (x)) 17955 return false; 17956 17957 HOST_WIDE_INT val = INTVAL (x); 17958 if (val & 0xff) 17959 return IN_RANGE (val, -0x80, 0x7f); 17960 return IN_RANGE (val, -0x8000, 0x7f00); 17961} 17962 17963/* Return true if X is a valid immediate operand for an SVE CMP instruction. 17964 SIGNED_P says whether the operand is signed rather than unsigned. 
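   For example, the signed compares (CMPEQ, CMPGT, CMPLT, ...) accept
   immediates in the range [-16, 15], while the unsigned compares
   (CMPHI, CMPLO, ...) accept immediates in the range [0, 127].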
*/ 17965 17966bool 17967aarch64_sve_cmp_immediate_p (rtx x, bool signed_p) 17968{ 17969 x = unwrap_const_vec_duplicate (x); 17970 return (CONST_INT_P (x) 17971 && (signed_p 17972 ? IN_RANGE (INTVAL (x), -16, 15) 17973 : IN_RANGE (INTVAL (x), 0, 127))); 17974} 17975 17976/* Return true if X is a valid immediate operand for an SVE FADD or FSUB 17977 instruction. Negate X first if NEGATE_P is true. */ 17978 17979bool 17980aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p) 17981{ 17982 rtx elt; 17983 REAL_VALUE_TYPE r; 17984 17985 if (!const_vec_duplicate_p (x, &elt) 17986 || GET_CODE (elt) != CONST_DOUBLE) 17987 return false; 17988 17989 r = *CONST_DOUBLE_REAL_VALUE (elt); 17990 17991 if (negate_p) 17992 r = real_value_negate (&r); 17993 17994 if (real_equal (&r, &dconst1)) 17995 return true; 17996 if (real_equal (&r, &dconsthalf)) 17997 return true; 17998 return false; 17999} 18000 18001/* Return true if X is a valid immediate operand for an SVE FMUL 18002 instruction. */ 18003 18004bool 18005aarch64_sve_float_mul_immediate_p (rtx x) 18006{ 18007 rtx elt; 18008 18009 return (const_vec_duplicate_p (x, &elt) 18010 && GET_CODE (elt) == CONST_DOUBLE 18011 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf) 18012 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2))); 18013} 18014 18015/* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate 18016 for the Advanced SIMD operation described by WHICH and INSN. If INFO 18017 is nonnull, use it to describe valid immediates. */ 18018static bool 18019aarch64_advsimd_valid_immediate_hs (unsigned int val32, 18020 simd_immediate_info *info, 18021 enum simd_immediate_check which, 18022 simd_immediate_info::insn_type insn) 18023{ 18024 /* Try a 4-byte immediate with LSL. */ 18025 for (unsigned int shift = 0; shift < 32; shift += 8) 18026 if ((val32 & (0xff << shift)) == val32) 18027 { 18028 if (info) 18029 *info = simd_immediate_info (SImode, val32 >> shift, insn, 18030 simd_immediate_info::LSL, shift); 18031 return true; 18032 } 18033 18034 /* Try a 2-byte immediate with LSL. */ 18035 unsigned int imm16 = val32 & 0xffff; 18036 if (imm16 == (val32 >> 16)) 18037 for (unsigned int shift = 0; shift < 16; shift += 8) 18038 if ((imm16 & (0xff << shift)) == imm16) 18039 { 18040 if (info) 18041 *info = simd_immediate_info (HImode, imm16 >> shift, insn, 18042 simd_immediate_info::LSL, shift); 18043 return true; 18044 } 18045 18046 /* Try a 4-byte immediate with MSL, except for cases that MVN 18047 can handle. */ 18048 if (which == AARCH64_CHECK_MOV) 18049 for (unsigned int shift = 8; shift < 24; shift += 8) 18050 { 18051 unsigned int low = (1 << shift) - 1; 18052 if (((val32 & (0xff << shift)) | low) == val32) 18053 { 18054 if (info) 18055 *info = simd_immediate_info (SImode, val32 >> shift, insn, 18056 simd_immediate_info::MSL, shift); 18057 return true; 18058 } 18059 } 18060 18061 return false; 18062} 18063 18064/* Return true if replicating VAL64 is a valid immediate for the 18065 Advanced SIMD operation described by WHICH. If INFO is nonnull, 18066 use it to describe valid immediates. 
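   For example, replicating 0x00001200 can be handled as a MOVI of 0x12
   with LSL #8, and a value whose bytes are each either 0x00 or 0xff can be
   handled by the 64-bit byte-mask form of MOVI.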
*/ 18067static bool 18068aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64, 18069 simd_immediate_info *info, 18070 enum simd_immediate_check which) 18071{ 18072 unsigned int val32 = val64 & 0xffffffff; 18073 unsigned int val16 = val64 & 0xffff; 18074 unsigned int val8 = val64 & 0xff; 18075 18076 if (val32 == (val64 >> 32)) 18077 { 18078 if ((which & AARCH64_CHECK_ORR) != 0 18079 && aarch64_advsimd_valid_immediate_hs (val32, info, which, 18080 simd_immediate_info::MOV)) 18081 return true; 18082 18083 if ((which & AARCH64_CHECK_BIC) != 0 18084 && aarch64_advsimd_valid_immediate_hs (~val32, info, which, 18085 simd_immediate_info::MVN)) 18086 return true; 18087 18088 /* Try using a replicated byte. */ 18089 if (which == AARCH64_CHECK_MOV 18090 && val16 == (val32 >> 16) 18091 && val8 == (val16 >> 8)) 18092 { 18093 if (info) 18094 *info = simd_immediate_info (QImode, val8); 18095 return true; 18096 } 18097 } 18098 18099 /* Try using a bit-to-bytemask. */ 18100 if (which == AARCH64_CHECK_MOV) 18101 { 18102 unsigned int i; 18103 for (i = 0; i < 64; i += 8) 18104 { 18105 unsigned char byte = (val64 >> i) & 0xff; 18106 if (byte != 0 && byte != 0xff) 18107 break; 18108 } 18109 if (i == 64) 18110 { 18111 if (info) 18112 *info = simd_immediate_info (DImode, val64); 18113 return true; 18114 } 18115 } 18116 return false; 18117} 18118 18119/* Return true if replicating VAL64 gives a valid immediate for an SVE MOV 18120 instruction. If INFO is nonnull, use it to describe valid immediates. */ 18121 18122static bool 18123aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64, 18124 simd_immediate_info *info) 18125{ 18126 scalar_int_mode mode = DImode; 18127 unsigned int val32 = val64 & 0xffffffff; 18128 if (val32 == (val64 >> 32)) 18129 { 18130 mode = SImode; 18131 unsigned int val16 = val32 & 0xffff; 18132 if (val16 == (val32 >> 16)) 18133 { 18134 mode = HImode; 18135 unsigned int val8 = val16 & 0xff; 18136 if (val8 == (val16 >> 8)) 18137 mode = QImode; 18138 } 18139 } 18140 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode); 18141 if (IN_RANGE (val, -0x80, 0x7f)) 18142 { 18143 /* DUP with no shift. */ 18144 if (info) 18145 *info = simd_immediate_info (mode, val); 18146 return true; 18147 } 18148 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00)) 18149 { 18150 /* DUP with LSL #8. */ 18151 if (info) 18152 *info = simd_immediate_info (mode, val); 18153 return true; 18154 } 18155 if (aarch64_bitmask_imm (val64, mode)) 18156 { 18157 /* DUPM. */ 18158 if (info) 18159 *info = simd_immediate_info (mode, val); 18160 return true; 18161 } 18162 return false; 18163} 18164 18165/* Return true if X is an UNSPEC_PTRUE constant of the form: 18166 18167 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE)) 18168 18169 where PATTERN is the svpattern as a CONST_INT and where ZERO 18170 is a zero constant of the required PTRUE mode (which can have 18171 fewer elements than X's mode, if zero bits are significant). 18172 18173 If so, and if INFO is nonnull, describe the immediate in INFO. 
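   For example, a PTRUE that controls 32-bit elements stores ZERO in
   VNx4BImode, even when X's own mode is VNx16BI.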
*/ 18174bool 18175aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info) 18176{ 18177 if (GET_CODE (x) != CONST) 18178 return false; 18179 18180 x = XEXP (x, 0); 18181 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE) 18182 return false; 18183 18184 if (info) 18185 { 18186 aarch64_svpattern pattern 18187 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0)); 18188 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1)); 18189 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode); 18190 *info = simd_immediate_info (int_mode, pattern); 18191 } 18192 return true; 18193} 18194 18195/* Return true if X is a valid SVE predicate. If INFO is nonnull, use 18196 it to describe valid immediates. */ 18197 18198static bool 18199aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info) 18200{ 18201 if (aarch64_sve_ptrue_svpattern_p (x, info)) 18202 return true; 18203 18204 if (x == CONST0_RTX (GET_MODE (x))) 18205 { 18206 if (info) 18207 *info = simd_immediate_info (DImode, 0); 18208 return true; 18209 } 18210 18211 /* Analyze the value as a VNx16BImode. This should be relatively 18212 efficient, since rtx_vector_builder has enough built-in capacity 18213 to store all VLA predicate constants without needing the heap. */ 18214 rtx_vector_builder builder; 18215 if (!aarch64_get_sve_pred_bits (builder, x)) 18216 return false; 18217 18218 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder); 18219 if (int vl = aarch64_partial_ptrue_length (builder, elt_size)) 18220 { 18221 machine_mode mode = aarch64_sve_pred_mode (elt_size).require (); 18222 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl); 18223 if (pattern != AARCH64_NUM_SVPATTERNS) 18224 { 18225 if (info) 18226 { 18227 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode); 18228 *info = simd_immediate_info (int_mode, pattern); 18229 } 18230 return true; 18231 } 18232 } 18233 return false; 18234} 18235 18236/* Return true if OP is a valid SIMD immediate for the operation 18237 described by WHICH. If INFO is nonnull, use it to describe valid 18238 immediates. */ 18239bool 18240aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info, 18241 enum simd_immediate_check which) 18242{ 18243 machine_mode mode = GET_MODE (op); 18244 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 18245 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT)) 18246 return false; 18247 18248 if (vec_flags & VEC_SVE_PRED) 18249 return aarch64_sve_pred_valid_immediate (op, info); 18250 18251 scalar_mode elt_mode = GET_MODE_INNER (mode); 18252 rtx base, step; 18253 unsigned int n_elts; 18254 if (GET_CODE (op) == CONST_VECTOR 18255 && CONST_VECTOR_DUPLICATE_P (op)) 18256 n_elts = CONST_VECTOR_NPATTERNS (op); 18257 else if ((vec_flags & VEC_SVE_DATA) 18258 && const_vec_series_p (op, &base, &step)) 18259 { 18260 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); 18261 if (!aarch64_sve_index_immediate_p (base) 18262 || !aarch64_sve_index_immediate_p (step)) 18263 return false; 18264 18265 if (info) 18266 { 18267 /* Get the corresponding container mode. E.g. an INDEX on V2SI 18268 should yield two integer values per 128-bit block, meaning 18269 that we need to treat it in the same way as V2DI and then 18270 ignore the upper 32 bits of each element. 
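   (In SVE terms these are the partial-vector modes VNx2SI and VNx2DI;
   the constant is then emitted as e.g. INDEX Zd.D, #base, #step.)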
*/ 18271 elt_mode = aarch64_sve_container_int_mode (mode); 18272 *info = simd_immediate_info (elt_mode, base, step); 18273 } 18274 return true; 18275 } 18276 else if (GET_CODE (op) == CONST_VECTOR 18277 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts)) 18278 /* N_ELTS set above. */; 18279 else 18280 return false; 18281 18282 scalar_float_mode elt_float_mode; 18283 if (n_elts == 1 18284 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode)) 18285 { 18286 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0); 18287 if (aarch64_float_const_zero_rtx_p (elt) 18288 || aarch64_float_const_representable_p (elt)) 18289 { 18290 if (info) 18291 *info = simd_immediate_info (elt_float_mode, elt); 18292 return true; 18293 } 18294 } 18295 18296 /* If all elements in an SVE vector have the same value, we have a free 18297 choice between using the element mode and using the container mode. 18298 Using the element mode means that unused parts of the vector are 18299 duplicates of the used elements, while using the container mode means 18300 that the unused parts are an extension of the used elements. Using the 18301 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid 18302 for its container mode VNx4SI while 0x00000101 isn't. 18303 18304 If not all elements in an SVE vector have the same value, we need the 18305 transition from one element to the next to occur at container boundaries. 18306 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated 18307 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */ 18308 scalar_int_mode elt_int_mode; 18309 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1) 18310 elt_int_mode = aarch64_sve_container_int_mode (mode); 18311 else 18312 elt_int_mode = int_mode_for_mode (elt_mode).require (); 18313 18314 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode); 18315 if (elt_size > 8) 18316 return false; 18317 18318 /* Expand the vector constant out into a byte vector, with the least 18319 significant byte of the register first. */ 18320 auto_vec<unsigned char, 16> bytes; 18321 bytes.reserve (n_elts * elt_size); 18322 for (unsigned int i = 0; i < n_elts; i++) 18323 { 18324 /* The vector is provided in gcc endian-neutral fashion. 18325 For aarch64_be Advanced SIMD, it must be laid out in the vector 18326 register in reverse order. */ 18327 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN); 18328 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i); 18329 18330 if (elt_mode != elt_int_mode) 18331 elt = gen_lowpart (elt_int_mode, elt); 18332 18333 if (!CONST_INT_P (elt)) 18334 return false; 18335 18336 unsigned HOST_WIDE_INT elt_val = INTVAL (elt); 18337 for (unsigned int byte = 0; byte < elt_size; byte++) 18338 { 18339 bytes.quick_push (elt_val & 0xff); 18340 elt_val >>= BITS_PER_UNIT; 18341 } 18342 } 18343 18344 /* The immediate must repeat every eight bytes. */ 18345 unsigned int nbytes = bytes.length (); 18346 for (unsigned i = 8; i < nbytes; ++i) 18347 if (bytes[i] != bytes[i - 8]) 18348 return false; 18349 18350 /* Get the repeating 8-byte value as an integer. No endian correction 18351 is needed here because bytes is already in lsb-first order. 
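   For example, on a little-endian target the V4HI constant { 1, 2, 3, 4 }
   gives bytes { 01 00 02 00 03 00 04 00 } and hence
   val64 == 0x0004000300020001.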
*/ 18352 unsigned HOST_WIDE_INT val64 = 0; 18353 for (unsigned int i = 0; i < 8; i++) 18354 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes] 18355 << (i * BITS_PER_UNIT)); 18356 18357 if (vec_flags & VEC_SVE_DATA) 18358 return aarch64_sve_valid_immediate (val64, info); 18359 else 18360 return aarch64_advsimd_valid_immediate (val64, info, which); 18361} 18362 18363/* Check whether X is a VEC_SERIES-like constant that starts at 0 and 18364 has a step in the range of INDEX. Return the index expression if so, 18365 otherwise return null. */ 18366rtx 18367aarch64_check_zero_based_sve_index_immediate (rtx x) 18368{ 18369 rtx base, step; 18370 if (const_vec_series_p (x, &base, &step) 18371 && base == const0_rtx 18372 && aarch64_sve_index_immediate_p (step)) 18373 return step; 18374 return NULL_RTX; 18375} 18376 18377/* Check of immediate shift constants are within range. */ 18378bool 18379aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left) 18380{ 18381 x = unwrap_const_vec_duplicate (x); 18382 if (!CONST_INT_P (x)) 18383 return false; 18384 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT; 18385 if (left) 18386 return IN_RANGE (INTVAL (x), 0, bit_width - 1); 18387 else 18388 return IN_RANGE (INTVAL (x), 1, bit_width); 18389} 18390 18391/* Return the bitmask CONST_INT to select the bits required by a zero extract 18392 operation of width WIDTH at bit position POS. */ 18393 18394rtx 18395aarch64_mask_from_zextract_ops (rtx width, rtx pos) 18396{ 18397 gcc_assert (CONST_INT_P (width)); 18398 gcc_assert (CONST_INT_P (pos)); 18399 18400 unsigned HOST_WIDE_INT mask 18401 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1; 18402 return GEN_INT (mask << UINTVAL (pos)); 18403} 18404 18405bool 18406aarch64_mov_operand_p (rtx x, machine_mode mode) 18407{ 18408 if (GET_CODE (x) == HIGH 18409 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0)))) 18410 return true; 18411 18412 if (CONST_INT_P (x)) 18413 return true; 18414 18415 if (VECTOR_MODE_P (GET_MODE (x))) 18416 { 18417 /* Require predicate constants to be VNx16BI before RA, so that we 18418 force everything to have a canonical form. */ 18419 if (!lra_in_progress 18420 && !reload_completed 18421 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL 18422 && GET_MODE (x) != VNx16BImode) 18423 return false; 18424 18425 return aarch64_simd_valid_immediate (x, NULL); 18426 } 18427 18428 x = strip_salt (x); 18429 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x)) 18430 return true; 18431 18432 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x)) 18433 return true; 18434 18435 return aarch64_classify_symbolic_expression (x) 18436 == SYMBOL_TINY_ABSOLUTE; 18437} 18438 18439/* Return a const_int vector of VAL. */ 18440rtx 18441aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val) 18442{ 18443 rtx c = gen_int_mode (val, GET_MODE_INNER (mode)); 18444 return gen_const_vec_duplicate (mode, c); 18445} 18446 18447/* Check OP is a legal scalar immediate for the MOVI instruction. 
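   For example, with MODE == SImode a value of 0x3f0000 is accepted
   (MOVI Vd.2S, #0x3f, LSL #16) whereas 0x123456 is rejected.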
*/ 18448 18449bool 18450aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode) 18451{ 18452 machine_mode vmode; 18453 18454 vmode = aarch64_simd_container_mode (mode, 64); 18455 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op)); 18456 return aarch64_simd_valid_immediate (op_v, NULL); 18457} 18458 18459/* Construct and return a PARALLEL RTX vector with elements numbering the 18460 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of 18461 the vector - from the perspective of the architecture. This does not 18462 line up with GCC's perspective on lane numbers, so we end up with 18463 different masks depending on our target endian-ness. The diagram 18464 below may help. We must draw the distinction when building masks 18465 which select one half of the vector. An instruction selecting 18466 architectural low-lanes for a big-endian target, must be described using 18467 a mask selecting GCC high-lanes. 18468 18469 Big-Endian Little-Endian 18470 18471GCC 0 1 2 3 3 2 1 0 18472 | x | x | x | x | | x | x | x | x | 18473Architecture 3 2 1 0 3 2 1 0 18474 18475Low Mask: { 2, 3 } { 0, 1 } 18476High Mask: { 0, 1 } { 2, 3 } 18477 18478 MODE Is the mode of the vector and NUNITS is the number of units in it. */ 18479 18480rtx 18481aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high) 18482{ 18483 rtvec v = rtvec_alloc (nunits / 2); 18484 int high_base = nunits / 2; 18485 int low_base = 0; 18486 int base; 18487 rtx t1; 18488 int i; 18489 18490 if (BYTES_BIG_ENDIAN) 18491 base = high ? low_base : high_base; 18492 else 18493 base = high ? high_base : low_base; 18494 18495 for (i = 0; i < nunits / 2; i++) 18496 RTVEC_ELT (v, i) = GEN_INT (base + i); 18497 18498 t1 = gen_rtx_PARALLEL (mode, v); 18499 return t1; 18500} 18501 18502/* Check OP for validity as a PARALLEL RTX vector with elements 18503 numbering the lanes of either the high (HIGH == TRUE) or low lanes, 18504 from the perspective of the architecture. See the diagram above 18505 aarch64_simd_vect_par_cnst_half for more details. */ 18506 18507bool 18508aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode, 18509 bool high) 18510{ 18511 int nelts; 18512 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts)) 18513 return false; 18514 18515 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high); 18516 HOST_WIDE_INT count_op = XVECLEN (op, 0); 18517 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0); 18518 int i = 0; 18519 18520 if (count_op != count_ideal) 18521 return false; 18522 18523 for (i = 0; i < count_ideal; i++) 18524 { 18525 rtx elt_op = XVECEXP (op, 0, i); 18526 rtx elt_ideal = XVECEXP (ideal, 0, i); 18527 18528 if (!CONST_INT_P (elt_op) 18529 || INTVAL (elt_ideal) != INTVAL (elt_op)) 18530 return false; 18531 } 18532 return true; 18533} 18534 18535/* Return a PARALLEL containing NELTS elements, with element I equal 18536 to BASE + I * STEP. */ 18537 18538rtx 18539aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step) 18540{ 18541 rtvec vec = rtvec_alloc (nelts); 18542 for (unsigned int i = 0; i < nelts; ++i) 18543 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode); 18544 return gen_rtx_PARALLEL (VOIDmode, vec); 18545} 18546 18547/* Return true if OP is a PARALLEL of CONST_INTs that form a linear 18548 series with step STEP. 
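   For example, (parallel [(const_int 3) (const_int 5) (const_int 7)])
   matches when STEP is 2.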
*/ 18549 18550bool 18551aarch64_stepped_int_parallel_p (rtx op, int step) 18552{ 18553 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0))) 18554 return false; 18555 18556 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0)); 18557 for (int i = 1; i < XVECLEN (op, 0); ++i) 18558 if (!CONST_INT_P (XVECEXP (op, 0, i)) 18559 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step) 18560 return false; 18561 18562 return true; 18563} 18564 18565/* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and 18566 HIGH (exclusive). */ 18567void 18568aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high, 18569 const_tree exp) 18570{ 18571 HOST_WIDE_INT lane; 18572 gcc_assert (CONST_INT_P (operand)); 18573 lane = INTVAL (operand); 18574 18575 if (lane < low || lane >= high) 18576 { 18577 if (exp) 18578 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1); 18579 else 18580 error ("lane %wd out of range %wd - %wd", lane, low, high - 1); 18581 } 18582} 18583 18584/* Peform endian correction on lane number N, which indexes a vector 18585 of mode MODE, and return the result as an SImode rtx. */ 18586 18587rtx 18588aarch64_endian_lane_rtx (machine_mode mode, unsigned int n) 18589{ 18590 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode); 18591} 18592 18593/* Return TRUE if OP is a valid vector addressing mode. */ 18594 18595bool 18596aarch64_simd_mem_operand_p (rtx op) 18597{ 18598 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC 18599 || REG_P (XEXP (op, 0))); 18600} 18601 18602/* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */ 18603 18604bool 18605aarch64_sve_ld1r_operand_p (rtx op) 18606{ 18607 struct aarch64_address_info addr; 18608 scalar_mode mode; 18609 18610 return (MEM_P (op) 18611 && is_a <scalar_mode> (GET_MODE (op), &mode) 18612 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false) 18613 && addr.type == ADDRESS_REG_IMM 18614 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset)); 18615} 18616 18617/* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction 18618 where the size of the read data is specified by `mode` and the size of the 18619 vector elements are specified by `elem_mode`. */ 18620bool 18621aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode, 18622 scalar_mode elem_mode) 18623{ 18624 struct aarch64_address_info addr; 18625 if (!MEM_P (op) 18626 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false)) 18627 return false; 18628 18629 if (addr.type == ADDRESS_REG_IMM) 18630 return offset_4bit_signed_scaled_p (mode, addr.const_offset); 18631 18632 if (addr.type == ADDRESS_REG_REG) 18633 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode); 18634 18635 return false; 18636} 18637 18638/* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */ 18639bool 18640aarch64_sve_ld1rq_operand_p (rtx op) 18641{ 18642 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode, 18643 GET_MODE_INNER (GET_MODE (op))); 18644} 18645 18646/* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for 18647 accessing a vector where the element size is specified by `elem_mode`. */ 18648bool 18649aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode) 18650{ 18651 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode); 18652} 18653 18654/* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. 
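   That is, the address must be either a plain base register or a base
   register plus an index register; the only immediate offset accepted
   is zero.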
*/ 18655bool 18656aarch64_sve_ldff1_operand_p (rtx op) 18657{ 18658 if (!MEM_P (op)) 18659 return false; 18660 18661 struct aarch64_address_info addr; 18662 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false)) 18663 return false; 18664 18665 if (addr.type == ADDRESS_REG_IMM) 18666 return known_eq (addr.const_offset, 0); 18667 18668 return addr.type == ADDRESS_REG_REG; 18669} 18670 18671/* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */ 18672bool 18673aarch64_sve_ldnf1_operand_p (rtx op) 18674{ 18675 struct aarch64_address_info addr; 18676 18677 return (MEM_P (op) 18678 && aarch64_classify_address (&addr, XEXP (op, 0), 18679 GET_MODE (op), false) 18680 && addr.type == ADDRESS_REG_IMM); 18681} 18682 18683/* Return true if OP is a valid MEM operand for an SVE LDR instruction. 18684 The conditions for STR are the same. */ 18685bool 18686aarch64_sve_ldr_operand_p (rtx op) 18687{ 18688 struct aarch64_address_info addr; 18689 18690 return (MEM_P (op) 18691 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), 18692 false, ADDR_QUERY_ANY) 18693 && addr.type == ADDRESS_REG_IMM); 18694} 18695 18696/* Return true if OP is a valid address for an SVE PRF[BHWD] instruction, 18697 addressing memory of mode MODE. */ 18698bool 18699aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode) 18700{ 18701 struct aarch64_address_info addr; 18702 if (!aarch64_classify_address (&addr, op, mode, false)) 18703 return false; 18704 18705 if (addr.type == ADDRESS_REG_IMM) 18706 return known_eq (addr.const_offset, 0); 18707 18708 return addr.type == ADDRESS_REG_REG; 18709} 18710 18711/* Return true if OP is a valid MEM operand for an SVE_STRUCT mode. 18712 We need to be able to access the individual pieces, so the range 18713 is different from LD[234] and ST[234]. */ 18714bool 18715aarch64_sve_struct_memory_operand_p (rtx op) 18716{ 18717 if (!MEM_P (op)) 18718 return false; 18719 18720 machine_mode mode = GET_MODE (op); 18721 struct aarch64_address_info addr; 18722 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false, 18723 ADDR_QUERY_ANY) 18724 || addr.type != ADDRESS_REG_IMM) 18725 return false; 18726 18727 poly_int64 first = addr.const_offset; 18728 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR; 18729 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first) 18730 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last)); 18731} 18732 18733/* Emit a register copy from operand to operand, taking care not to 18734 early-clobber source registers in the process. 18735 18736 COUNT is the number of components into which the copy needs to be 18737 decomposed. */ 18738void 18739aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode, 18740 unsigned int count) 18741{ 18742 unsigned int i; 18743 int rdest = REGNO (operands[0]); 18744 int rsrc = REGNO (operands[1]); 18745 18746 if (!reg_overlap_mentioned_p (operands[0], operands[1]) 18747 || rdest < rsrc) 18748 for (i = 0; i < count; i++) 18749 emit_move_insn (gen_rtx_REG (mode, rdest + i), 18750 gen_rtx_REG (mode, rsrc + i)); 18751 else 18752 for (i = 0; i < count; i++) 18753 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1), 18754 gen_rtx_REG (mode, rsrc + count - i - 1)); 18755} 18756 18757/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is 18758 one of VSTRUCT modes: OI, CI, or XI. 
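   For example, an OImode list covers two vector registers and therefore
   takes 8 bytes, i.e. two 4-byte instructions.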
*/ 18759int 18760aarch64_simd_attr_length_rglist (machine_mode mode) 18761{ 18762 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */ 18763 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4; 18764} 18765 18766/* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum 18767 alignment of a vector to 128 bits. SVE predicates have an alignment of 18768 16 bits. */ 18769static HOST_WIDE_INT 18770aarch64_simd_vector_alignment (const_tree type) 18771{ 18772 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can 18773 be set for non-predicate vectors of booleans. Modes are the most 18774 direct way we have of identifying real SVE predicate types. */ 18775 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL) 18776 return 16; 18777 widest_int min_size 18778 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type))); 18779 return wi::umin (min_size, 128).to_uhwi (); 18780} 18781 18782/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */ 18783static poly_uint64 18784aarch64_vectorize_preferred_vector_alignment (const_tree type) 18785{ 18786 if (aarch64_sve_data_mode_p (TYPE_MODE (type))) 18787 { 18788 /* If the length of the vector is a fixed power of 2, try to align 18789 to that length, otherwise don't try to align at all. */ 18790 HOST_WIDE_INT result; 18791 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result) 18792 || !pow2p_hwi (result)) 18793 result = TYPE_ALIGN (TREE_TYPE (type)); 18794 return result; 18795 } 18796 return TYPE_ALIGN (type); 18797} 18798 18799/* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */ 18800static bool 18801aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed) 18802{ 18803 if (is_packed) 18804 return false; 18805 18806 /* For fixed-length vectors, check that the vectorizer will aim for 18807 full-vector alignment. This isn't true for generic GCC vectors 18808 that are wider than the ABI maximum of 128 bits. */ 18809 poly_uint64 preferred_alignment = 18810 aarch64_vectorize_preferred_vector_alignment (type); 18811 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST 18812 && maybe_ne (wi::to_widest (TYPE_SIZE (type)), 18813 preferred_alignment)) 18814 return false; 18815 18816 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */ 18817 return true; 18818} 18819 18820/* Return true if the vector misalignment factor is supported by the 18821 target. */ 18822static bool 18823aarch64_builtin_support_vector_misalignment (machine_mode mode, 18824 const_tree type, int misalignment, 18825 bool is_packed) 18826{ 18827 if (TARGET_SIMD && STRICT_ALIGNMENT) 18828 { 18829 /* Return if movmisalign pattern is not supported for this mode. */ 18830 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing) 18831 return false; 18832 18833 /* Misalignment factor is unknown at compile time. */ 18834 if (misalignment == -1) 18835 return false; 18836 } 18837 return default_builtin_support_vector_misalignment (mode, type, misalignment, 18838 is_packed); 18839} 18840 18841/* If VALS is a vector constant that can be loaded into a register 18842 using DUP, generate instructions to do so and return an RTX to 18843 assign to the register. Otherwise return NULL_RTX. 
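   For example, a V4SI constant whose elements all equal some value X that
   is not a valid MOVI immediate becomes a scalar MOV of X into a GP
   register followed by DUP Vd.4S, Wn.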
*/ 18844static rtx 18845aarch64_simd_dup_constant (rtx vals) 18846{ 18847 machine_mode mode = GET_MODE (vals); 18848 machine_mode inner_mode = GET_MODE_INNER (mode); 18849 rtx x; 18850 18851 if (!const_vec_duplicate_p (vals, &x)) 18852 return NULL_RTX; 18853 18854 /* We can load this constant by using DUP and a constant in a 18855 single ARM register. This will be cheaper than a vector 18856 load. */ 18857 x = copy_to_mode_reg (inner_mode, x); 18858 return gen_vec_duplicate (mode, x); 18859} 18860 18861 18862/* Generate code to load VALS, which is a PARALLEL containing only 18863 constants (for vec_init) or CONST_VECTOR, efficiently into a 18864 register. Returns an RTX to copy into the register, or NULL_RTX 18865 for a PARALLEL that cannot be converted into a CONST_VECTOR. */ 18866static rtx 18867aarch64_simd_make_constant (rtx vals) 18868{ 18869 machine_mode mode = GET_MODE (vals); 18870 rtx const_dup; 18871 rtx const_vec = NULL_RTX; 18872 int n_const = 0; 18873 int i; 18874 18875 if (GET_CODE (vals) == CONST_VECTOR) 18876 const_vec = vals; 18877 else if (GET_CODE (vals) == PARALLEL) 18878 { 18879 /* A CONST_VECTOR must contain only CONST_INTs and 18880 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF). 18881 Only store valid constants in a CONST_VECTOR. */ 18882 int n_elts = XVECLEN (vals, 0); 18883 for (i = 0; i < n_elts; ++i) 18884 { 18885 rtx x = XVECEXP (vals, 0, i); 18886 if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) 18887 n_const++; 18888 } 18889 if (n_const == n_elts) 18890 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)); 18891 } 18892 else 18893 gcc_unreachable (); 18894 18895 if (const_vec != NULL_RTX 18896 && aarch64_simd_valid_immediate (const_vec, NULL)) 18897 /* Load using MOVI/MVNI. */ 18898 return const_vec; 18899 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX) 18900 /* Loaded using DUP. */ 18901 return const_dup; 18902 else if (const_vec != NULL_RTX) 18903 /* Load from constant pool. We cannot take advantage of single-cycle 18904 LD1 because we need a PC-relative addressing mode. */ 18905 return const_vec; 18906 else 18907 /* A PARALLEL containing something not valid inside CONST_VECTOR. 18908 We cannot construct an initializer. */ 18909 return NULL_RTX; 18910} 18911 18912/* Expand a vector initialisation sequence, such that TARGET is 18913 initialised to contain VALS. */ 18914 18915void 18916aarch64_expand_vector_init (rtx target, rtx vals) 18917{ 18918 machine_mode mode = GET_MODE (target); 18919 scalar_mode inner_mode = GET_MODE_INNER (mode); 18920 /* The number of vector elements. */ 18921 int n_elts = XVECLEN (vals, 0); 18922 /* The number of vector elements which are not constant. */ 18923 int n_var = 0; 18924 rtx any_const = NULL_RTX; 18925 /* The first element of vals. */ 18926 rtx v0 = XVECEXP (vals, 0, 0); 18927 bool all_same = true; 18928 18929 /* This is a special vec_init<M><N> where N is not an element mode but a 18930 vector mode with half the elements of M. We expect to find two entries 18931 of mode N in VALS and we must put their concatentation into TARGET. 
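   For example, vec_init<V16QI><V8QI> concatenates two V8QI halves into a
   V16QI result.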
*/ 18932 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0)))) 18933 { 18934 gcc_assert (known_eq (GET_MODE_SIZE (mode), 18935 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0))))); 18936 rtx lo = XVECEXP (vals, 0, 0); 18937 rtx hi = XVECEXP (vals, 0, 1); 18938 machine_mode narrow_mode = GET_MODE (lo); 18939 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode); 18940 gcc_assert (narrow_mode == GET_MODE (hi)); 18941 18942 /* When we want to concatenate a half-width vector with zeroes we can 18943 use the aarch64_combinez[_be] patterns. Just make sure that the 18944 zeroes are in the right half. */ 18945 if (BYTES_BIG_ENDIAN 18946 && aarch64_simd_imm_zero (lo, narrow_mode) 18947 && general_operand (hi, narrow_mode)) 18948 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo)); 18949 else if (!BYTES_BIG_ENDIAN 18950 && aarch64_simd_imm_zero (hi, narrow_mode) 18951 && general_operand (lo, narrow_mode)) 18952 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi)); 18953 else 18954 { 18955 /* Else create the two half-width registers and combine them. */ 18956 if (!REG_P (lo)) 18957 lo = force_reg (GET_MODE (lo), lo); 18958 if (!REG_P (hi)) 18959 hi = force_reg (GET_MODE (hi), hi); 18960 18961 if (BYTES_BIG_ENDIAN) 18962 std::swap (lo, hi); 18963 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi)); 18964 } 18965 return; 18966 } 18967 18968 /* Count the number of variable elements to initialise. */ 18969 for (int i = 0; i < n_elts; ++i) 18970 { 18971 rtx x = XVECEXP (vals, 0, i); 18972 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x))) 18973 ++n_var; 18974 else 18975 any_const = x; 18976 18977 all_same &= rtx_equal_p (x, v0); 18978 } 18979 18980 /* No variable elements, hand off to aarch64_simd_make_constant which knows 18981 how best to handle this. */ 18982 if (n_var == 0) 18983 { 18984 rtx constant = aarch64_simd_make_constant (vals); 18985 if (constant != NULL_RTX) 18986 { 18987 emit_move_insn (target, constant); 18988 return; 18989 } 18990 } 18991 18992 /* Splat a single non-constant element if we can. */ 18993 if (all_same) 18994 { 18995 rtx x = copy_to_mode_reg (inner_mode, v0); 18996 aarch64_emit_move (target, gen_vec_duplicate (mode, x)); 18997 return; 18998 } 18999 19000 enum insn_code icode = optab_handler (vec_set_optab, mode); 19001 gcc_assert (icode != CODE_FOR_nothing); 19002 19003 /* If there are only variable elements, try to optimize 19004 the insertion using dup for the most common element 19005 followed by insertions. */ 19006 19007 /* The algorithm will fill matches[*][0] with the earliest matching element, 19008 and matches[X][1] with the count of duplicate elements (if X is the 19009 earliest element which has duplicates). */ 19010 19011 if (n_var == n_elts && n_elts <= 16) 19012 { 19013 int matches[16][2] = {0}; 19014 for (int i = 0; i < n_elts; i++) 19015 { 19016 for (int j = 0; j <= i; j++) 19017 { 19018 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j))) 19019 { 19020 matches[i][0] = j; 19021 matches[j][1]++; 19022 break; 19023 } 19024 } 19025 } 19026 int maxelement = 0; 19027 int maxv = 0; 19028 for (int i = 0; i < n_elts; i++) 19029 if (matches[i][1] > maxv) 19030 { 19031 maxelement = i; 19032 maxv = matches[i][1]; 19033 } 19034 19035 /* Create a duplicate of the most common element, unless all elements 19036 are equally useless to us, in which case just immediately set the 19037 vector register using the first element. 
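   For example, for { x, y, x, x } we DUP x across the vector and then
   insert y into lane 1.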
*/ 19038 19039 if (maxv == 1) 19040 { 19041 /* For vectors of two 64-bit elements, we can do even better. */ 19042 if (n_elts == 2 19043 && (inner_mode == E_DImode 19044 || inner_mode == E_DFmode)) 19045 19046 { 19047 rtx x0 = XVECEXP (vals, 0, 0); 19048 rtx x1 = XVECEXP (vals, 0, 1); 19049 /* Combine can pick up this case, but handling it directly 19050 here leaves clearer RTL. 19051 19052 This is load_pair_lanes<mode>, and also gives us a clean-up 19053 for store_pair_lanes<mode>. */ 19054 if (memory_operand (x0, inner_mode) 19055 && memory_operand (x1, inner_mode) 19056 && !STRICT_ALIGNMENT 19057 && rtx_equal_p (XEXP (x1, 0), 19058 plus_constant (Pmode, 19059 XEXP (x0, 0), 19060 GET_MODE_SIZE (inner_mode)))) 19061 { 19062 rtx t; 19063 if (inner_mode == DFmode) 19064 t = gen_load_pair_lanesdf (target, x0, x1); 19065 else 19066 t = gen_load_pair_lanesdi (target, x0, x1); 19067 emit_insn (t); 19068 return; 19069 } 19070 } 19071 /* The subreg-move sequence below will move into lane zero of the 19072 vector register. For big-endian we want that position to hold 19073 the last element of VALS. */ 19074 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0; 19075 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement)); 19076 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode)); 19077 } 19078 else 19079 { 19080 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement)); 19081 aarch64_emit_move (target, gen_vec_duplicate (mode, x)); 19082 } 19083 19084 /* Insert the rest. */ 19085 for (int i = 0; i < n_elts; i++) 19086 { 19087 rtx x = XVECEXP (vals, 0, i); 19088 if (matches[i][0] == maxelement) 19089 continue; 19090 x = copy_to_mode_reg (inner_mode, x); 19091 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i))); 19092 } 19093 return; 19094 } 19095 19096 /* Initialise a vector which is part-variable. We want to first try 19097 to build those lanes which are constant in the most efficient way we 19098 can. */ 19099 if (n_var != n_elts) 19100 { 19101 rtx copy = copy_rtx (vals); 19102 19103 /* Load constant part of vector. We really don't care what goes into the 19104 parts we will overwrite, but we're more likely to be able to load the 19105 constant efficiently if it has fewer, larger, repeating parts 19106 (see aarch64_simd_valid_immediate). */ 19107 for (int i = 0; i < n_elts; i++) 19108 { 19109 rtx x = XVECEXP (vals, 0, i); 19110 if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) 19111 continue; 19112 rtx subst = any_const; 19113 for (int bit = n_elts / 2; bit > 0; bit /= 2) 19114 { 19115 /* Look in the copied vector, as more elements are const. */ 19116 rtx test = XVECEXP (copy, 0, i ^ bit); 19117 if (CONST_INT_P (test) || CONST_DOUBLE_P (test)) 19118 { 19119 subst = test; 19120 break; 19121 } 19122 } 19123 XVECEXP (copy, 0, i) = subst; 19124 } 19125 aarch64_expand_vector_init (target, copy); 19126 } 19127 19128 /* Insert the variable lanes directly. */ 19129 for (int i = 0; i < n_elts; i++) 19130 { 19131 rtx x = XVECEXP (vals, 0, i); 19132 if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) 19133 continue; 19134 x = copy_to_mode_reg (inner_mode, x); 19135 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i))); 19136 } 19137} 19138 19139/* Emit RTL corresponding to: 19140 insr TARGET, ELEM. 
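   INSR shifts the vector up by one element and places ELEM in element 0.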
*/ 19141 19142static void 19143emit_insr (rtx target, rtx elem) 19144{ 19145 machine_mode mode = GET_MODE (target); 19146 scalar_mode elem_mode = GET_MODE_INNER (mode); 19147 elem = force_reg (elem_mode, elem); 19148 19149 insn_code icode = optab_handler (vec_shl_insert_optab, mode); 19150 gcc_assert (icode != CODE_FOR_nothing); 19151 emit_insn (GEN_FCN (icode) (target, target, elem)); 19152} 19153 19154/* Subroutine of aarch64_sve_expand_vector_init for handling 19155 trailing constants. 19156 This function works as follows: 19157 (a) Create a new vector consisting of trailing constants. 19158 (b) Initialize TARGET with the constant vector using emit_move_insn. 19159 (c) Insert remaining elements in TARGET using insr. 19160 NELTS is the total number of elements in original vector while 19161 while NELTS_REQD is the number of elements that are actually 19162 significant. 19163 19164 ??? The heuristic used is to do above only if number of constants 19165 is at least half the total number of elements. May need fine tuning. */ 19166 19167static bool 19168aarch64_sve_expand_vector_init_handle_trailing_constants 19169 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd) 19170{ 19171 machine_mode mode = GET_MODE (target); 19172 scalar_mode elem_mode = GET_MODE_INNER (mode); 19173 int n_trailing_constants = 0; 19174 19175 for (int i = nelts_reqd - 1; 19176 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i)); 19177 i--) 19178 n_trailing_constants++; 19179 19180 if (n_trailing_constants >= nelts_reqd / 2) 19181 { 19182 /* Try to use the natural pattern of BUILDER to extend the trailing 19183 constant elements to a full vector. Replace any variables in the 19184 extra elements with zeros. 19185 19186 ??? It would be better if the builders supported "don't care" 19187 elements, with the builder filling in whichever elements 19188 give the most compact encoding. */ 19189 rtx_vector_builder v (mode, nelts, 1); 19190 for (int i = 0; i < nelts; i++) 19191 { 19192 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants); 19193 if (!valid_for_const_vector_p (elem_mode, x)) 19194 x = const0_rtx; 19195 v.quick_push (x); 19196 } 19197 rtx const_vec = v.build (); 19198 emit_move_insn (target, const_vec); 19199 19200 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--) 19201 emit_insr (target, builder.elt (i)); 19202 19203 return true; 19204 } 19205 19206 return false; 19207} 19208 19209/* Subroutine of aarch64_sve_expand_vector_init. 19210 Works as follows: 19211 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER. 19212 (b) Skip trailing elements from BUILDER, which are the same as 19213 element NELTS_REQD - 1. 19214 (c) Insert earlier elements in reverse order in TARGET using insr. 
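   For example, for { a, b, c, c } this generates:
     TARGET = dup (c)
     insr TARGET, b
     insr TARGET, a
   which leaves TARGET containing { a, b, c, c }.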
*/ 19215 19216static void 19217aarch64_sve_expand_vector_init_insert_elems (rtx target, 19218 const rtx_vector_builder &builder, 19219 int nelts_reqd) 19220{ 19221 machine_mode mode = GET_MODE (target); 19222 scalar_mode elem_mode = GET_MODE_INNER (mode); 19223 19224 struct expand_operand ops[2]; 19225 enum insn_code icode = optab_handler (vec_duplicate_optab, mode); 19226 gcc_assert (icode != CODE_FOR_nothing); 19227 19228 create_output_operand (&ops[0], target, mode); 19229 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode); 19230 expand_insn (icode, 2, ops); 19231 19232 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1); 19233 for (int i = nelts_reqd - ndups - 1; i >= 0; i--) 19234 emit_insr (target, builder.elt (i)); 19235} 19236 19237/* Subroutine of aarch64_sve_expand_vector_init to handle case 19238 when all trailing elements of builder are same. 19239 This works as follows: 19240 (a) Use expand_insn interface to broadcast last vector element in TARGET. 19241 (b) Insert remaining elements in TARGET using insr. 19242 19243 ??? The heuristic used is to do above if number of same trailing elements 19244 is at least 3/4 of total number of elements, loosely based on 19245 heuristic from mostly_zeros_p. May need fine-tuning. */ 19246 19247static bool 19248aarch64_sve_expand_vector_init_handle_trailing_same_elem 19249 (rtx target, const rtx_vector_builder &builder, int nelts_reqd) 19250{ 19251 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1); 19252 if (ndups >= (3 * nelts_reqd) / 4) 19253 { 19254 aarch64_sve_expand_vector_init_insert_elems (target, builder, 19255 nelts_reqd - ndups + 1); 19256 return true; 19257 } 19258 19259 return false; 19260} 19261 19262/* Initialize register TARGET from BUILDER. NELTS is the constant number 19263 of elements in BUILDER. 19264 19265 The function tries to initialize TARGET from BUILDER if it fits one 19266 of the special cases outlined below. 19267 19268 Failing that, the function divides BUILDER into two sub-vectors: 19269 v_even = even elements of BUILDER; 19270 v_odd = odd elements of BUILDER; 19271 19272 and recursively calls itself with v_even and v_odd. 19273 19274 if (recursive call succeeded for v_even or v_odd) 19275 TARGET = zip (v_even, v_odd) 19276 19277 The function returns true if it managed to build TARGET from BUILDER 19278 with one of the special cases, false otherwise. 19279 19280 Example: {a, 1, b, 2, c, 3, d, 4} 19281 19282 The vector gets divided into: 19283 v_even = {a, b, c, d} 19284 v_odd = {1, 2, 3, 4} 19285 19286 aarch64_sve_expand_vector_init(v_odd) hits case 1 and 19287 initialize tmp2 from constant vector v_odd using emit_move_insn. 19288 19289 aarch64_sve_expand_vector_init(v_even) fails since v_even contains 19290 4 elements, so we construct tmp1 from v_even using insr: 19291 tmp1 = dup(d) 19292 insr tmp1, c 19293 insr tmp1, b 19294 insr tmp1, a 19295 19296 And finally: 19297 TARGET = zip (tmp1, tmp2) 19298 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */ 19299 19300static bool 19301aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder, 19302 int nelts, int nelts_reqd) 19303{ 19304 machine_mode mode = GET_MODE (target); 19305 19306 /* Case 1: Vector contains trailing constants. */ 19307 19308 if (aarch64_sve_expand_vector_init_handle_trailing_constants 19309 (target, builder, nelts, nelts_reqd)) 19310 return true; 19311 19312 /* Case 2: Vector contains leading constants. 
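   For example, for { 1, 2, 3, a } we reverse it to { a, 3, 2, 1 }, handle
   that with the trailing-constants case above, and then emit an SVE REV to
   restore the original order.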
*/ 19313 19314 rtx_vector_builder rev_builder (mode, nelts_reqd, 1); 19315 for (int i = 0; i < nelts_reqd; i++) 19316 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1)); 19317 rev_builder.finalize (); 19318 19319 if (aarch64_sve_expand_vector_init_handle_trailing_constants 19320 (target, rev_builder, nelts, nelts_reqd)) 19321 { 19322 emit_insn (gen_aarch64_sve_rev (mode, target, target)); 19323 return true; 19324 } 19325 19326 /* Case 3: Vector contains trailing same element. */ 19327 19328 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem 19329 (target, builder, nelts_reqd)) 19330 return true; 19331 19332 /* Case 4: Vector contains leading same element. */ 19333 19334 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem 19335 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts) 19336 { 19337 emit_insn (gen_aarch64_sve_rev (mode, target, target)); 19338 return true; 19339 } 19340 19341 /* Avoid recursing below 4-elements. 19342 ??? The threshold 4 may need fine-tuning. */ 19343 19344 if (nelts_reqd <= 4) 19345 return false; 19346 19347 rtx_vector_builder v_even (mode, nelts, 1); 19348 rtx_vector_builder v_odd (mode, nelts, 1); 19349 19350 for (int i = 0; i < nelts * 2; i += 2) 19351 { 19352 v_even.quick_push (builder.elt (i)); 19353 v_odd.quick_push (builder.elt (i + 1)); 19354 } 19355 19356 v_even.finalize (); 19357 v_odd.finalize (); 19358 19359 rtx tmp1 = gen_reg_rtx (mode); 19360 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even, 19361 nelts, nelts_reqd / 2); 19362 19363 rtx tmp2 = gen_reg_rtx (mode); 19364 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd, 19365 nelts, nelts_reqd / 2); 19366 19367 if (!did_even_p && !did_odd_p) 19368 return false; 19369 19370 /* Initialize v_even and v_odd using INSR if it didn't match any of the 19371 special cases and zip v_even, v_odd. */ 19372 19373 if (!did_even_p) 19374 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2); 19375 19376 if (!did_odd_p) 19377 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2); 19378 19379 rtvec v = gen_rtvec (2, tmp1, tmp2); 19380 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1)); 19381 return true; 19382} 19383 19384/* Initialize register TARGET from the elements in PARALLEL rtx VALS. */ 19385 19386void 19387aarch64_sve_expand_vector_init (rtx target, rtx vals) 19388{ 19389 machine_mode mode = GET_MODE (target); 19390 int nelts = XVECLEN (vals, 0); 19391 19392 rtx_vector_builder v (mode, nelts, 1); 19393 for (int i = 0; i < nelts; i++) 19394 v.quick_push (XVECEXP (vals, 0, i)); 19395 v.finalize (); 19396 19397 /* If neither sub-vectors of v could be initialized specially, 19398 then use INSR to insert all elements from v into TARGET. 19399 ??? This might not be optimal for vectors with large 19400 initializers like 16-element or above. 19401 For nelts < 4, it probably isn't useful to handle specially. */ 19402 19403 if (nelts < 4 19404 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts)) 19405 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts); 19406} 19407 19408/* Check whether VALUE is a vector constant in which every element 19409 is either a power of 2 or a negated power of 2. If so, return 19410 a constant vector of log2s, and flip CODE between PLUS and MINUS 19411 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. 
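   For example, { 4, 8 } becomes { 2, 3 } with CODE left alone, while
   { -4, -8 } becomes { 2, 3 } and flips PLUS to MINUS (and vice versa).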
*/ 19412 19413static rtx 19414aarch64_convert_mult_to_shift (rtx value, rtx_code &code) 19415{ 19416 if (GET_CODE (value) != CONST_VECTOR) 19417 return NULL_RTX; 19418 19419 rtx_vector_builder builder; 19420 if (!builder.new_unary_operation (GET_MODE (value), value, false)) 19421 return NULL_RTX; 19422 19423 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value)); 19424 /* 1 if the result of the multiplication must be negated, 19425 0 if it mustn't, or -1 if we don't yet care. */ 19426 int negate = -1; 19427 unsigned int encoded_nelts = const_vector_encoded_nelts (value); 19428 for (unsigned int i = 0; i < encoded_nelts; ++i) 19429 { 19430 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i); 19431 if (!CONST_SCALAR_INT_P (elt)) 19432 return NULL_RTX; 19433 rtx_mode_t val (elt, int_mode); 19434 wide_int pow2 = wi::neg (val); 19435 if (val != pow2) 19436 { 19437 /* It matters whether we negate or not. Make that choice, 19438 and make sure that it's consistent with previous elements. */ 19439 if (negate == !wi::neg_p (val)) 19440 return NULL_RTX; 19441 negate = wi::neg_p (val); 19442 if (!negate) 19443 pow2 = val; 19444 } 19445 /* POW2 is now the value that we want to be a power of 2. */ 19446 int shift = wi::exact_log2 (pow2); 19447 if (shift < 0) 19448 return NULL_RTX; 19449 builder.quick_push (gen_int_mode (shift, int_mode)); 19450 } 19451 if (negate == -1) 19452 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */ 19453 code = PLUS; 19454 else if (negate == 1) 19455 code = code == PLUS ? MINUS : PLUS; 19456 return builder.build (); 19457} 19458 19459/* Prepare for an integer SVE multiply-add or multiply-subtract pattern; 19460 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the 19461 operands array, in the same order as for fma_optab. Return true if 19462 the function emitted all the necessary instructions, false if the caller 19463 should generate the pattern normally with the new OPERANDS array. */ 19464 19465bool 19466aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code) 19467{ 19468 machine_mode mode = GET_MODE (operands[0]); 19469 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code)) 19470 { 19471 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts, 19472 NULL_RTX, true, OPTAB_DIRECT); 19473 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab, 19474 operands[3], product, operands[0], true, 19475 OPTAB_DIRECT); 19476 return true; 19477 } 19478 operands[2] = force_reg (mode, operands[2]); 19479 return false; 19480} 19481 19482/* Likewise, but for a conditional pattern. */ 19483 19484bool 19485aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code) 19486{ 19487 machine_mode mode = GET_MODE (operands[0]); 19488 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code)) 19489 { 19490 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts, 19491 NULL_RTX, true, OPTAB_DIRECT); 19492 emit_insn (gen_cond (code, mode, operands[0], operands[1], 19493 operands[4], product, operands[5])); 19494 return true; 19495 } 19496 operands[3] = force_reg (mode, operands[3]); 19497 return false; 19498} 19499 19500static unsigned HOST_WIDE_INT 19501aarch64_shift_truncation_mask (machine_mode mode) 19502{ 19503 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode)) 19504 return 0; 19505 return GET_MODE_UNIT_BITSIZE (mode) - 1; 19506} 19507 19508/* Select a format to encode pointers in exception handling data. 
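   For the tiny and small code models a 4-byte pc-relative sdata4 encoding
   is sufficient; otherwise an 8-byte sdata8 encoding is used.  Global
   symbols additionally get DW_EH_PE_indirect.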
*/ 19509int 19510aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global) 19511{ 19512 int type; 19513 switch (aarch64_cmodel) 19514 { 19515 case AARCH64_CMODEL_TINY: 19516 case AARCH64_CMODEL_TINY_PIC: 19517 case AARCH64_CMODEL_SMALL: 19518 case AARCH64_CMODEL_SMALL_PIC: 19519 case AARCH64_CMODEL_SMALL_SPIC: 19520 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient 19521 for everything. */ 19522 type = DW_EH_PE_sdata4; 19523 break; 19524 default: 19525 /* No assumptions here. 8-byte relocs required. */ 19526 type = DW_EH_PE_sdata8; 19527 break; 19528 } 19529 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type; 19530} 19531 19532/* Output .variant_pcs for aarch64_vector_pcs function symbols. */ 19533 19534static void 19535aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name) 19536{ 19537 if (TREE_CODE (decl) == FUNCTION_DECL) 19538 { 19539 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id (); 19540 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE) 19541 { 19542 fprintf (stream, "\t.variant_pcs\t"); 19543 assemble_name (stream, name); 19544 fprintf (stream, "\n"); 19545 } 19546 } 19547} 19548 19549/* The last .arch and .tune assembly strings that we printed. */ 19550static std::string aarch64_last_printed_arch_string; 19551static std::string aarch64_last_printed_tune_string; 19552 19553/* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used 19554 by the function fndecl. */ 19555 19556void 19557aarch64_declare_function_name (FILE *stream, const char* name, 19558 tree fndecl) 19559{ 19560 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl); 19561 19562 struct cl_target_option *targ_options; 19563 if (target_parts) 19564 targ_options = TREE_TARGET_OPTION (target_parts); 19565 else 19566 targ_options = TREE_TARGET_OPTION (target_option_current_node); 19567 gcc_assert (targ_options); 19568 19569 const struct processor *this_arch 19570 = aarch64_get_arch (targ_options->x_explicit_arch); 19571 19572 uint64_t isa_flags = targ_options->x_aarch64_isa_flags; 19573 std::string extension 19574 = aarch64_get_extension_string_for_isa_flags (isa_flags, 19575 this_arch->flags); 19576 /* Only update the assembler .arch string if it is distinct from the last 19577 such string we printed. */ 19578 std::string to_print = this_arch->name + extension; 19579 if (to_print != aarch64_last_printed_arch_string) 19580 { 19581 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ()); 19582 aarch64_last_printed_arch_string = to_print; 19583 } 19584 19585 /* Print the cpu name we're tuning for in the comments, might be 19586 useful to readers of the generated asm. Do it only when it changes 19587 from function to function and verbose assembly is requested. */ 19588 const struct processor *this_tune 19589 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core); 19590 19591 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name) 19592 { 19593 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n", 19594 this_tune->name); 19595 aarch64_last_printed_tune_string = this_tune->name; 19596 } 19597 19598 aarch64_asm_output_variant_pcs (stream, fndecl, name); 19599 19600 /* Don't forget the type directive for ELF. */ 19601 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function"); 19602 ASM_OUTPUT_LABEL (stream, name); 19603 19604 cfun->machine->label_is_assembled = true; 19605} 19606 19607/* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. 
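   When the function label has already been emitted and the function may
   be reached indirectly under BTI, the patchable area is placed after
   the BTI landing pad, giving an entry layout along the lines of:

	<function label>:
		bti	c
		<PATCH_AREA_SIZE NOPs>
		...

   Otherwise the NOPs go at the very start of the function, or before
   the label via the default hook when the label has not been written
   yet.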
*/ 19608 19609void 19610aarch64_print_patchable_function_entry (FILE *file, 19611 unsigned HOST_WIDE_INT patch_area_size, 19612 bool record_p) 19613{ 19614 if (!cfun->machine->label_is_assembled) 19615 { 19616 /* Emit the patching area before the entry label, if any. */ 19617 default_print_patchable_function_entry (file, patch_area_size, 19618 record_p); 19619 return; 19620 } 19621 19622 rtx pa = gen_patchable_area (GEN_INT (patch_area_size), 19623 GEN_INT (record_p)); 19624 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; 19625 19626 if (!aarch64_bti_enabled () 19627 || cgraph_node::get (cfun->decl)->only_called_directly_p ()) 19628 { 19629 /* Emit the patchable_area at the beginning of the function. */ 19630 rtx_insn *insn = emit_insn_before (pa, BB_HEAD (bb)); 19631 INSN_ADDRESSES_NEW (insn, -1); 19632 return; 19633 } 19634 19635 rtx_insn *insn = next_real_nondebug_insn (get_insns ()); 19636 if (!insn 19637 || !INSN_P (insn) 19638 || GET_CODE (PATTERN (insn)) != UNSPEC_VOLATILE 19639 || XINT (PATTERN (insn), 1) != UNSPECV_BTI_C) 19640 { 19641 /* Emit a BTI_C. */ 19642 insn = emit_insn_before (gen_bti_c (), BB_HEAD (bb)); 19643 } 19644 19645 /* Emit the patchable_area after BTI_C. */ 19646 insn = emit_insn_after (pa, insn); 19647 INSN_ADDRESSES_NEW (insn, -1); 19648} 19649 19650/* Output patchable area. */ 19651 19652void 19653aarch64_output_patchable_area (unsigned int patch_area_size, bool record_p) 19654{ 19655 default_print_patchable_function_entry (asm_out_file, patch_area_size, 19656 record_p); 19657} 19658 19659/* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */ 19660 19661void 19662aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target) 19663{ 19664 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0); 19665 const char *value = IDENTIFIER_POINTER (target); 19666 aarch64_asm_output_variant_pcs (stream, decl, name); 19667 ASM_OUTPUT_DEF (stream, name, value); 19668} 19669 19670/* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined 19671 function symbol references. */ 19672 19673void 19674aarch64_asm_output_external (FILE *stream, tree decl, const char* name) 19675{ 19676 default_elf_asm_output_external (stream, decl, name); 19677 aarch64_asm_output_variant_pcs (stream, decl, name); 19678} 19679 19680/* Triggered after a .cfi_startproc directive is emitted into the assembly file. 19681 Used to output the .cfi_b_key_frame directive when signing the current 19682 function with the B key. */ 19683 19684void 19685aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED) 19686{ 19687 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled () 19688 && aarch64_ra_sign_key == AARCH64_KEY_B) 19689 asm_fprintf (f, "\t.cfi_b_key_frame\n"); 19690} 19691 19692/* Implements TARGET_ASM_FILE_START. Output the assembly header. 
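   The header records the default architecture and extensions; for
   instance, building with -march=armv8.2-a+sve typically starts the
   file with something like:

	.arch armv8.2-a+sve

   where the exact extension suffix is whatever
   aarch64_get_extension_string_for_isa_flags computes for the default
   ISA flags.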
*/ 19693 19694static void 19695aarch64_start_file (void) 19696{ 19697 struct cl_target_option *default_options 19698 = TREE_TARGET_OPTION (target_option_default_node); 19699 19700 const struct processor *default_arch 19701 = aarch64_get_arch (default_options->x_explicit_arch); 19702 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags; 19703 std::string extension 19704 = aarch64_get_extension_string_for_isa_flags (default_isa_flags, 19705 default_arch->flags); 19706 19707 aarch64_last_printed_arch_string = default_arch->name + extension; 19708 aarch64_last_printed_tune_string = ""; 19709 asm_fprintf (asm_out_file, "\t.arch %s\n", 19710 aarch64_last_printed_arch_string.c_str ()); 19711 19712 default_file_start (); 19713} 19714 19715/* Emit load exclusive. */ 19716 19717static void 19718aarch64_emit_load_exclusive (machine_mode mode, rtx rval, 19719 rtx mem, rtx model_rtx) 19720{ 19721 if (mode == TImode) 19722 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval), 19723 gen_highpart (DImode, rval), 19724 mem, model_rtx)); 19725 else 19726 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx)); 19727} 19728 19729/* Emit store exclusive. */ 19730 19731static void 19732aarch64_emit_store_exclusive (machine_mode mode, rtx bval, 19733 rtx mem, rtx rval, rtx model_rtx) 19734{ 19735 if (mode == TImode) 19736 emit_insn (gen_aarch64_store_exclusive_pair 19737 (bval, mem, operand_subword (rval, 0, 0, TImode), 19738 operand_subword (rval, 1, 0, TImode), model_rtx)); 19739 else 19740 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx)); 19741} 19742 19743/* Mark the previous jump instruction as unlikely. */ 19744 19745static void 19746aarch64_emit_unlikely_jump (rtx insn) 19747{ 19748 rtx_insn *jump = emit_jump_insn (insn); 19749 add_reg_br_prob_note (jump, profile_probability::very_unlikely ()); 19750} 19751 19752/* We store the names of the various atomic helpers in a 5x5 array. 19753 Return the libcall function given MODE, MODEL and NAMES. 
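   The first index selects the access size (QI, HI, SI, DI or TI) and
   the second the memory-model group, with the __sync models mapping to
   the "_sync" variants.  For example, an SImode compare-and-swap with
   MEMMODEL_ACQUIRE resolves to the out-of-line helper
   "__aarch64_cas4_acq".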
*/ 19754 19755rtx 19756aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx, 19757 const atomic_ool_names *names) 19758{ 19759 memmodel model = memmodel_from_int (INTVAL (model_rtx)); 19760 int mode_idx, model_idx; 19761 19762 switch (mode) 19763 { 19764 case E_QImode: 19765 mode_idx = 0; 19766 break; 19767 case E_HImode: 19768 mode_idx = 1; 19769 break; 19770 case E_SImode: 19771 mode_idx = 2; 19772 break; 19773 case E_DImode: 19774 mode_idx = 3; 19775 break; 19776 case E_TImode: 19777 mode_idx = 4; 19778 break; 19779 default: 19780 gcc_unreachable (); 19781 } 19782 19783 switch (model) 19784 { 19785 case MEMMODEL_RELAXED: 19786 model_idx = 0; 19787 break; 19788 case MEMMODEL_CONSUME: 19789 case MEMMODEL_ACQUIRE: 19790 model_idx = 1; 19791 break; 19792 case MEMMODEL_RELEASE: 19793 model_idx = 2; 19794 break; 19795 case MEMMODEL_ACQ_REL: 19796 case MEMMODEL_SEQ_CST: 19797 model_idx = 3; 19798 break; 19799 case MEMMODEL_SYNC_ACQUIRE: 19800 case MEMMODEL_SYNC_RELEASE: 19801 case MEMMODEL_SYNC_SEQ_CST: 19802 model_idx = 4; 19803 break; 19804 default: 19805 gcc_unreachable (); 19806 } 19807 19808 return init_one_libfunc_visibility (names->str[mode_idx][model_idx], 19809 VISIBILITY_HIDDEN); 19810} 19811 19812#define DEF0(B, N) \ 19813 { "__aarch64_" #B #N "_relax", \ 19814 "__aarch64_" #B #N "_acq", \ 19815 "__aarch64_" #B #N "_rel", \ 19816 "__aarch64_" #B #N "_acq_rel", \ 19817 "__aarch64_" #B #N "_sync" } 19818 19819#define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \ 19820 { NULL, NULL, NULL, NULL } 19821#define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16) 19822 19823static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } }; 19824const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } }; 19825const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } }; 19826const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } }; 19827const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } }; 19828const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } }; 19829 19830#undef DEF0 19831#undef DEF4 19832#undef DEF5 19833 19834/* Expand a compare and swap pattern. */ 19835 19836void 19837aarch64_expand_compare_and_swap (rtx operands[]) 19838{ 19839 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg; 19840 machine_mode mode, r_mode; 19841 19842 bval = operands[0]; 19843 rval = operands[1]; 19844 mem = operands[2]; 19845 oldval = operands[3]; 19846 newval = operands[4]; 19847 is_weak = operands[5]; 19848 mod_s = operands[6]; 19849 mod_f = operands[7]; 19850 mode = GET_MODE (mem); 19851 19852 /* Normally the succ memory model must be stronger than fail, but in the 19853 unlikely event of fail being ACQUIRE and succ being RELEASE we need to 19854 promote succ to ACQ_REL so that we don't lose the acquire semantics. */ 19855 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f))) 19856 && is_mm_release (memmodel_from_int (INTVAL (mod_s)))) 19857 mod_s = GEN_INT (MEMMODEL_ACQ_REL); 19858 19859 r_mode = mode; 19860 if (mode == QImode || mode == HImode) 19861 { 19862 r_mode = SImode; 19863 rval = gen_reg_rtx (r_mode); 19864 } 19865 19866 if (TARGET_LSE) 19867 { 19868 /* The CAS insn requires oldval and rval overlap, but we need to 19869 have a copy of oldval saved across the operation to tell if 19870 the operation is successful. 
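	 In other words, RVAL is primed with a copy of OLDVAL, either by
	 allocating a fresh register or by moving OLDVAL into RVAL, so that
	 the CAS instruction both supplies the expected value and receives
	 the value actually observed, while OLDVAL itself stays live for
	 the comparison that follows.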
*/ 19871 if (reg_overlap_mentioned_p (rval, oldval)) 19872 rval = copy_to_mode_reg (r_mode, oldval); 19873 else 19874 emit_move_insn (rval, gen_lowpart (r_mode, oldval)); 19875 19876 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem, 19877 newval, mod_s)); 19878 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode); 19879 } 19880 else if (TARGET_OUTLINE_ATOMICS) 19881 { 19882 /* Oldval must satisfy compare afterward. */ 19883 if (!aarch64_plus_operand (oldval, mode)) 19884 oldval = force_reg (mode, oldval); 19885 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names); 19886 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode, 19887 oldval, mode, newval, mode, 19888 XEXP (mem, 0), Pmode); 19889 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode); 19890 } 19891 else 19892 { 19893 /* The oldval predicate varies by mode. Test it and force to reg. */ 19894 insn_code code = code_for_aarch64_compare_and_swap (mode); 19895 if (!insn_data[code].operand[2].predicate (oldval, mode)) 19896 oldval = force_reg (mode, oldval); 19897 19898 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval, 19899 is_weak, mod_s, mod_f)); 19900 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM); 19901 } 19902 19903 if (r_mode != mode) 19904 rval = gen_lowpart (mode, rval); 19905 emit_move_insn (operands[1], rval); 19906 19907 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx); 19908 emit_insn (gen_rtx_SET (bval, x)); 19909} 19910 19911/* Emit a barrier, that is appropriate for memory model MODEL, at the end of a 19912 sequence implementing an atomic operation. */ 19913 19914static void 19915aarch64_emit_post_barrier (enum memmodel model) 19916{ 19917 const enum memmodel base_model = memmodel_base (model); 19918 19919 if (is_mm_sync (model) 19920 && (base_model == MEMMODEL_ACQUIRE 19921 || base_model == MEMMODEL_ACQ_REL 19922 || base_model == MEMMODEL_SEQ_CST)) 19923 { 19924 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST))); 19925 } 19926} 19927 19928/* Split a compare and swap pattern. */ 19929 19930void 19931aarch64_split_compare_and_swap (rtx operands[]) 19932{ 19933 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */ 19934 gcc_assert (epilogue_completed); 19935 19936 rtx rval, mem, oldval, newval, scratch, x, model_rtx; 19937 machine_mode mode; 19938 bool is_weak; 19939 rtx_code_label *label1, *label2; 19940 enum memmodel model; 19941 19942 rval = operands[0]; 19943 mem = operands[1]; 19944 oldval = operands[2]; 19945 newval = operands[3]; 19946 is_weak = (operands[4] != const0_rtx); 19947 model_rtx = operands[5]; 19948 scratch = operands[7]; 19949 mode = GET_MODE (mem); 19950 model = memmodel_from_int (INTVAL (model_rtx)); 19951 19952 /* When OLDVAL is zero and we want the strong version we can emit a tighter 19953 loop: 19954 .label1: 19955 LD[A]XR rval, [mem] 19956 CBNZ rval, .label2 19957 ST[L]XR scratch, newval, [mem] 19958 CBNZ scratch, .label1 19959 .label2: 19960 CMP rval, 0. */ 19961 bool strong_zero_p = (!is_weak && !aarch64_track_speculation && 19962 oldval == const0_rtx && mode != TImode); 19963 19964 label1 = NULL; 19965 if (!is_weak) 19966 { 19967 label1 = gen_label_rtx (); 19968 emit_label (label1); 19969 } 19970 label2 = gen_label_rtx (); 19971 19972 /* The initial load can be relaxed for a __sync operation since a final 19973 barrier will be emitted to stop code hoisting. 
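	 For reference, the general strong compare-and-swap (OLDVAL not
	 known to be zero) expands to a sequence along the lines of:

	.label1:
		LD[A]XR	rval, [mem]
		CMP	rval, oldval
		B.NE	.label2
		ST[L]XR	scratch, newval, [mem]
		CBNZ	scratch, .label1
	.label2:

	 while the weak form drops the backward branch and simply compares
	 SCRATCH with zero.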
*/ 19974 if (is_mm_sync (model)) 19975 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED)); 19976 else 19977 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx); 19978 19979 if (strong_zero_p) 19980 x = gen_rtx_NE (VOIDmode, rval, const0_rtx); 19981 else 19982 { 19983 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode); 19984 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx); 19985 } 19986 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, 19987 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx); 19988 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); 19989 19990 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx); 19991 19992 if (!is_weak) 19993 { 19994 if (aarch64_track_speculation) 19995 { 19996 /* Emit an explicit compare instruction, so that we can correctly 19997 track the condition codes. */ 19998 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx); 19999 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx); 20000 } 20001 else 20002 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx); 20003 20004 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, 20005 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx); 20006 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); 20007 } 20008 else 20009 aarch64_gen_compare_reg (NE, scratch, const0_rtx); 20010 20011 emit_label (label2); 20012 20013 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL 20014 to set the condition flags. If this is not used it will be removed by 20015 later passes. */ 20016 if (strong_zero_p) 20017 aarch64_gen_compare_reg (NE, rval, const0_rtx); 20018 20019 /* Emit any final barrier needed for a __sync operation. */ 20020 if (is_mm_sync (model)) 20021 aarch64_emit_post_barrier (model); 20022} 20023 20024/* Split an atomic operation. */ 20025 20026void 20027aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem, 20028 rtx value, rtx model_rtx, rtx cond) 20029{ 20030 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */ 20031 gcc_assert (epilogue_completed); 20032 20033 machine_mode mode = GET_MODE (mem); 20034 machine_mode wmode = (mode == DImode ? DImode : SImode); 20035 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx)); 20036 const bool is_sync = is_mm_sync (model); 20037 rtx_code_label *label; 20038 rtx x; 20039 20040 /* Split the atomic operation into a sequence. */ 20041 label = gen_label_rtx (); 20042 emit_label (label); 20043 20044 if (new_out) 20045 new_out = gen_lowpart (wmode, new_out); 20046 if (old_out) 20047 old_out = gen_lowpart (wmode, old_out); 20048 else 20049 old_out = new_out; 20050 value = simplify_gen_subreg (wmode, value, mode, 0); 20051 20052 /* The initial load can be relaxed for a __sync operation since a final 20053 barrier will be emitted to stop code hoisting. */ 20054 if (is_sync) 20055 aarch64_emit_load_exclusive (mode, old_out, mem, 20056 GEN_INT (MEMMODEL_RELAXED)); 20057 else 20058 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx); 20059 20060 switch (code) 20061 { 20062 case SET: 20063 new_out = value; 20064 break; 20065 20066 case NOT: 20067 x = gen_rtx_AND (wmode, old_out, value); 20068 emit_insn (gen_rtx_SET (new_out, x)); 20069 x = gen_rtx_NOT (wmode, new_out); 20070 emit_insn (gen_rtx_SET (new_out, x)); 20071 break; 20072 20073 case MINUS: 20074 if (CONST_INT_P (value)) 20075 { 20076 value = GEN_INT (-INTVAL (value)); 20077 code = PLUS; 20078 } 20079 /* Fall through. 
*/ 20080 20081 default: 20082 x = gen_rtx_fmt_ee (code, wmode, old_out, value); 20083 emit_insn (gen_rtx_SET (new_out, x)); 20084 break; 20085 } 20086 20087 aarch64_emit_store_exclusive (mode, cond, mem, 20088 gen_lowpart (mode, new_out), model_rtx); 20089 20090 if (aarch64_track_speculation) 20091 { 20092 /* Emit an explicit compare instruction, so that we can correctly 20093 track the condition codes. */ 20094 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx); 20095 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx); 20096 } 20097 else 20098 x = gen_rtx_NE (VOIDmode, cond, const0_rtx); 20099 20100 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, 20101 gen_rtx_LABEL_REF (Pmode, label), pc_rtx); 20102 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); 20103 20104 /* Emit any final barrier needed for a __sync operation. */ 20105 if (is_sync) 20106 aarch64_emit_post_barrier (model); 20107} 20108 20109static void 20110aarch64_init_libfuncs (void) 20111{ 20112 /* Half-precision float operations. The compiler handles all operations 20113 with NULL libfuncs by converting to SFmode. */ 20114 20115 /* Conversions. */ 20116 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee"); 20117 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee"); 20118 20119 /* Arithmetic. */ 20120 set_optab_libfunc (add_optab, HFmode, NULL); 20121 set_optab_libfunc (sdiv_optab, HFmode, NULL); 20122 set_optab_libfunc (smul_optab, HFmode, NULL); 20123 set_optab_libfunc (neg_optab, HFmode, NULL); 20124 set_optab_libfunc (sub_optab, HFmode, NULL); 20125 20126 /* Comparisons. */ 20127 set_optab_libfunc (eq_optab, HFmode, NULL); 20128 set_optab_libfunc (ne_optab, HFmode, NULL); 20129 set_optab_libfunc (lt_optab, HFmode, NULL); 20130 set_optab_libfunc (le_optab, HFmode, NULL); 20131 set_optab_libfunc (ge_optab, HFmode, NULL); 20132 set_optab_libfunc (gt_optab, HFmode, NULL); 20133 set_optab_libfunc (unord_optab, HFmode, NULL); 20134} 20135 20136/* Target hook for c_mode_for_suffix. */ 20137static machine_mode 20138aarch64_c_mode_for_suffix (char suffix) 20139{ 20140 if (suffix == 'q') 20141 return TFmode; 20142 20143 return VOIDmode; 20144} 20145 20146/* We can only represent floating point constants which will fit in 20147 "quarter-precision" values. These values are characterised by 20148 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given 20149 by: 20150 20151 (-1)^s * (n/16) * 2^r 20152 20153 Where: 20154 's' is the sign bit. 20155 'n' is an integer in the range 16 <= n <= 31. 20156 'r' is an integer in the range -3 <= r <= 4. */ 20157 20158/* Return true iff X can be represented by a quarter-precision 20159 floating point immediate operand X. Note, we cannot represent 0.0. */ 20160bool 20161aarch64_float_const_representable_p (rtx x) 20162{ 20163 /* This represents our current view of how many bits 20164 make up the mantissa. */ 20165 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1; 20166 int exponent; 20167 unsigned HOST_WIDE_INT mantissa, mask; 20168 REAL_VALUE_TYPE r, m; 20169 bool fail; 20170 20171 x = unwrap_const_vec_duplicate (x); 20172 if (!CONST_DOUBLE_P (x)) 20173 return false; 20174 20175 if (GET_MODE (x) == VOIDmode 20176 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST)) 20177 return false; 20178 20179 r = *CONST_DOUBLE_REAL_VALUE (x); 20180 20181 /* We cannot represent infinities, NaNs or +/-zero. We won't 20182 know if we have +zero until we analyse the mantissa, but we 20183 can reject the other invalid values. 
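	 For reference, the representable magnitudes run from 0.125
	 (16/16 * 2^-3) up to 31.0 (31/16 * 2^4); e.g. 1.0 and
	 2.75 (22/16 * 2^1) are representable, while 0.0625 (exponent out
	 of range) and 0.1 (not exactly representable in binary) are not.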
*/ 20184 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r) 20185 || REAL_VALUE_MINUS_ZERO (r)) 20186 return false; 20187 20188 /* Extract exponent. */ 20189 r = real_value_abs (&r); 20190 exponent = REAL_EXP (&r); 20191 20192 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the 20193 highest (sign) bit, with a fixed binary point at bit point_pos. 20194 m1 holds the low part of the mantissa, m2 the high part. 20195 WARNING: If we ever have a representation using more than 2 * H_W_I - 1 20196 bits for the mantissa, this can fail (low bits will be lost). */ 20197 real_ldexp (&m, &r, point_pos - exponent); 20198 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2); 20199 20200 /* If the low part of the mantissa has bits set we cannot represent 20201 the value. */ 20202 if (w.ulow () != 0) 20203 return false; 20204 /* We have rejected the lower HOST_WIDE_INT, so update our 20205 understanding of how many bits lie in the mantissa and 20206 look only at the high HOST_WIDE_INT. */ 20207 mantissa = w.elt (1); 20208 point_pos -= HOST_BITS_PER_WIDE_INT; 20209 20210 /* We can only represent values with a mantissa of the form 1.xxxx. */ 20211 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1; 20212 if ((mantissa & mask) != 0) 20213 return false; 20214 20215 /* Having filtered unrepresentable values, we may now remove all 20216 but the highest 5 bits. */ 20217 mantissa >>= point_pos - 5; 20218 20219 /* We cannot represent the value 0.0, so reject it. This is handled 20220 elsewhere. */ 20221 if (mantissa == 0) 20222 return false; 20223 20224 /* Then, as bit 4 is always set, we can mask it off, leaving 20225 the mantissa in the range [0, 15]. */ 20226 mantissa &= ~(1 << 4); 20227 gcc_assert (mantissa <= 15); 20228 20229 /* GCC internally does not use IEEE754-like encoding (where normalized 20230 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c). 20231 Our mantissa values are shifted 4 places to the left relative to 20232 normalized IEEE754 so we must modify the exponent returned by REAL_EXP 20233 by 5 places to correct for GCC's representation. */ 20234 exponent = 5 - exponent; 20235 20236 return (exponent >= 0 && exponent <= 7); 20237} 20238 20239/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC 20240 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to 20241 output MOVI/MVNI, ORR or BIC immediate. */ 20242char* 20243aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width, 20244 enum simd_immediate_check which) 20245{ 20246 bool is_valid; 20247 static char templ[40]; 20248 const char *mnemonic; 20249 const char *shift_op; 20250 unsigned int lane_count = 0; 20251 char element_char; 20252 20253 struct simd_immediate_info info; 20254 20255 /* This will return true to show const_vector is legal for use as either 20256 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate. 20257 It will also update INFO to show how the immediate should be generated. 20258 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. 
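	 For example, a V4SImode vector whose elements are all 0xff0000 is
	 expected to produce an output template along the lines of
	 "movi\t%0.4s, 0xff, lsl 16" for AARCH64_CHECK_MOV.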
*/ 20259 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which); 20260 gcc_assert (is_valid); 20261 20262 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode)); 20263 lane_count = width / GET_MODE_BITSIZE (info.elt_mode); 20264 20265 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT) 20266 { 20267 gcc_assert (info.insn == simd_immediate_info::MOV 20268 && info.u.mov.shift == 0); 20269 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD 20270 move immediate path. */ 20271 if (aarch64_float_const_zero_rtx_p (info.u.mov.value)) 20272 info.u.mov.value = GEN_INT (0); 20273 else 20274 { 20275 const unsigned int buf_size = 20; 20276 char float_buf[buf_size] = {'\0'}; 20277 real_to_decimal_for_mode (float_buf, 20278 CONST_DOUBLE_REAL_VALUE (info.u.mov.value), 20279 buf_size, buf_size, 1, info.elt_mode); 20280 20281 if (lane_count == 1) 20282 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf); 20283 else 20284 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s", 20285 lane_count, element_char, float_buf); 20286 return templ; 20287 } 20288 } 20289 20290 gcc_assert (CONST_INT_P (info.u.mov.value)); 20291 20292 if (which == AARCH64_CHECK_MOV) 20293 { 20294 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi"; 20295 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL 20296 ? "msl" : "lsl"); 20297 if (lane_count == 1) 20298 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX, 20299 mnemonic, UINTVAL (info.u.mov.value)); 20300 else if (info.u.mov.shift) 20301 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " 20302 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count, 20303 element_char, UINTVAL (info.u.mov.value), shift_op, 20304 info.u.mov.shift); 20305 else 20306 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " 20307 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count, 20308 element_char, UINTVAL (info.u.mov.value)); 20309 } 20310 else 20311 { 20312 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */ 20313 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr"; 20314 if (info.u.mov.shift) 20315 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #" 20316 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count, 20317 element_char, UINTVAL (info.u.mov.value), "lsl", 20318 info.u.mov.shift); 20319 else 20320 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #" 20321 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count, 20322 element_char, UINTVAL (info.u.mov.value)); 20323 } 20324 return templ; 20325} 20326 20327char* 20328aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode) 20329{ 20330 20331 /* If a floating point number was passed and we desire to use it in an 20332 integer mode do the conversion to integer. */ 20333 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT) 20334 { 20335 unsigned HOST_WIDE_INT ival; 20336 if (!aarch64_reinterpret_float_as_int (immediate, &ival)) 20337 gcc_unreachable (); 20338 immediate = gen_int_mode (ival, mode); 20339 } 20340 20341 machine_mode vmode; 20342 /* use a 64 bit mode for everything except for DI/DF mode, where we use 20343 a 128 bit vector mode. */ 20344 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64; 20345 20346 vmode = aarch64_simd_container_mode (mode, width); 20347 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate)); 20348 return aarch64_output_simd_mov_immediate (v_op, width); 20349} 20350 20351/* Return the output string to use for moving immediate CONST_VECTOR 20352 into an SVE register. 
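   Typical outputs include "mov\t%0.s, #7" for a duplicated integer
   constant, "index\t%0.s, #0, #1" for a linear series, an "fmov" of the
   decimal value for a non-zero floating-point duplicate, and
   "ptrue\t%0.b, all" or "pfalse\t%0.b" for predicate constants.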
*/ 20353 20354char * 20355aarch64_output_sve_mov_immediate (rtx const_vector) 20356{ 20357 static char templ[40]; 20358 struct simd_immediate_info info; 20359 char element_char; 20360 20361 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info); 20362 gcc_assert (is_valid); 20363 20364 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode)); 20365 20366 machine_mode vec_mode = GET_MODE (const_vector); 20367 if (aarch64_sve_pred_mode_p (vec_mode)) 20368 { 20369 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")]; 20370 if (info.insn == simd_immediate_info::MOV) 20371 { 20372 gcc_assert (info.u.mov.value == const0_rtx); 20373 snprintf (buf, sizeof (buf), "pfalse\t%%0.b"); 20374 } 20375 else 20376 { 20377 gcc_assert (info.insn == simd_immediate_info::PTRUE); 20378 unsigned int total_bytes; 20379 if (info.u.pattern == AARCH64_SV_ALL 20380 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes)) 20381 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char, 20382 total_bytes / GET_MODE_SIZE (info.elt_mode)); 20383 else 20384 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char, 20385 svpattern_token (info.u.pattern)); 20386 } 20387 return buf; 20388 } 20389 20390 if (info.insn == simd_immediate_info::INDEX) 20391 { 20392 snprintf (templ, sizeof (templ), "index\t%%0.%c, #" 20393 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC, 20394 element_char, INTVAL (info.u.index.base), 20395 INTVAL (info.u.index.step)); 20396 return templ; 20397 } 20398 20399 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT) 20400 { 20401 if (aarch64_float_const_zero_rtx_p (info.u.mov.value)) 20402 info.u.mov.value = GEN_INT (0); 20403 else 20404 { 20405 const int buf_size = 20; 20406 char float_buf[buf_size] = {}; 20407 real_to_decimal_for_mode (float_buf, 20408 CONST_DOUBLE_REAL_VALUE (info.u.mov.value), 20409 buf_size, buf_size, 1, info.elt_mode); 20410 20411 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s", 20412 element_char, float_buf); 20413 return templ; 20414 } 20415 } 20416 20417 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC, 20418 element_char, INTVAL (info.u.mov.value)); 20419 return templ; 20420} 20421 20422/* Return the asm template for a PTRUES. CONST_UNSPEC is the 20423 aarch64_sve_ptrue_svpattern_immediate that describes the predicate 20424 pattern. */ 20425 20426char * 20427aarch64_output_sve_ptrues (rtx const_unspec) 20428{ 20429 static char templ[40]; 20430 20431 struct simd_immediate_info info; 20432 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info); 20433 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE); 20434 20435 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode)); 20436 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char, 20437 svpattern_token (info.u.pattern)); 20438 return templ; 20439} 20440 20441/* Split operands into moves from op[1] + op[2] into op[0]. */ 20442 20443void 20444aarch64_split_combinev16qi (rtx operands[3]) 20445{ 20446 unsigned int dest = REGNO (operands[0]); 20447 unsigned int src1 = REGNO (operands[1]); 20448 unsigned int src2 = REGNO (operands[2]); 20449 machine_mode halfmode = GET_MODE (operands[1]); 20450 unsigned int halfregs = REG_NREGS (operands[1]); 20451 rtx destlo, desthi; 20452 20453 gcc_assert (halfmode == V16QImode); 20454 20455 if (src1 == dest && src2 == dest + halfregs) 20456 { 20457 /* No-op move. Can't split to nothing; emit something. 
*/ 20458 emit_note (NOTE_INSN_DELETED); 20459 return; 20460 } 20461 20462 /* Preserve register attributes for variable tracking. */ 20463 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0); 20464 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs, 20465 GET_MODE_SIZE (halfmode)); 20466 20467 /* Special case of reversed high/low parts. */ 20468 if (reg_overlap_mentioned_p (operands[2], destlo) 20469 && reg_overlap_mentioned_p (operands[1], desthi)) 20470 { 20471 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2])); 20472 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2])); 20473 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2])); 20474 } 20475 else if (!reg_overlap_mentioned_p (operands[2], destlo)) 20476 { 20477 /* Try to avoid unnecessary moves if part of the result 20478 is in the right place already. */ 20479 if (src1 != dest) 20480 emit_move_insn (destlo, operands[1]); 20481 if (src2 != dest + halfregs) 20482 emit_move_insn (desthi, operands[2]); 20483 } 20484 else 20485 { 20486 if (src2 != dest + halfregs) 20487 emit_move_insn (desthi, operands[2]); 20488 if (src1 != dest) 20489 emit_move_insn (destlo, operands[1]); 20490 } 20491} 20492 20493/* vec_perm support. */ 20494 20495struct expand_vec_perm_d 20496{ 20497 rtx target, op0, op1; 20498 vec_perm_indices perm; 20499 machine_mode vmode; 20500 unsigned int vec_flags; 20501 bool one_vector_p; 20502 bool testing_p; 20503}; 20504 20505/* Generate a variable permutation. */ 20506 20507static void 20508aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel) 20509{ 20510 machine_mode vmode = GET_MODE (target); 20511 bool one_vector_p = rtx_equal_p (op0, op1); 20512 20513 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode); 20514 gcc_checking_assert (GET_MODE (op0) == vmode); 20515 gcc_checking_assert (GET_MODE (op1) == vmode); 20516 gcc_checking_assert (GET_MODE (sel) == vmode); 20517 gcc_checking_assert (TARGET_SIMD); 20518 20519 if (one_vector_p) 20520 { 20521 if (vmode == V8QImode) 20522 { 20523 /* Expand the argument to a V16QI mode by duplicating it. */ 20524 rtx pair = gen_reg_rtx (V16QImode); 20525 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0)); 20526 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel)); 20527 } 20528 else 20529 { 20530 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel)); 20531 } 20532 } 20533 else 20534 { 20535 rtx pair; 20536 20537 if (vmode == V8QImode) 20538 { 20539 pair = gen_reg_rtx (V16QImode); 20540 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1)); 20541 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel)); 20542 } 20543 else 20544 { 20545 pair = gen_reg_rtx (OImode); 20546 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1)); 20547 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel)); 20548 } 20549 } 20550} 20551 20552/* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL. 20553 NELT is the number of elements in the vector. */ 20554 20555void 20556aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel, 20557 unsigned int nelt) 20558{ 20559 machine_mode vmode = GET_MODE (target); 20560 bool one_vector_p = rtx_equal_p (op0, op1); 20561 rtx mask; 20562 20563 /* The TBL instruction does not use a modulo index, so we must take care 20564 of that ourselves. */ 20565 mask = aarch64_simd_gen_const_vector_dup (vmode, 20566 one_vector_p ? 
nelt - 1 : 2 * nelt - 1); 20567 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN); 20568 20569 /* For big-endian, we also need to reverse the index within the vector 20570 (but not which vector). */ 20571 if (BYTES_BIG_ENDIAN) 20572 { 20573 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */ 20574 if (!one_vector_p) 20575 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1); 20576 sel = expand_simple_binop (vmode, XOR, sel, mask, 20577 NULL, 0, OPTAB_LIB_WIDEN); 20578 } 20579 aarch64_expand_vec_perm_1 (target, op0, op1, sel); 20580} 20581 20582/* Generate (set TARGET (unspec [OP0 OP1] CODE)). */ 20583 20584static void 20585emit_unspec2 (rtx target, int code, rtx op0, rtx op1) 20586{ 20587 emit_insn (gen_rtx_SET (target, 20588 gen_rtx_UNSPEC (GET_MODE (target), 20589 gen_rtvec (2, op0, op1), code))); 20590} 20591 20592/* Expand an SVE vec_perm with the given operands. */ 20593 20594void 20595aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel) 20596{ 20597 machine_mode data_mode = GET_MODE (target); 20598 machine_mode sel_mode = GET_MODE (sel); 20599 /* Enforced by the pattern condition. */ 20600 int nunits = GET_MODE_NUNITS (sel_mode).to_constant (); 20601 20602 /* Note: vec_perm indices are supposed to wrap when they go beyond the 20603 size of the two value vectors, i.e. the upper bits of the indices 20604 are effectively ignored. SVE TBL instead produces 0 for any 20605 out-of-range indices, so we need to modulo all the vec_perm indices 20606 to ensure they are all in range. */ 20607 rtx sel_reg = force_reg (sel_mode, sel); 20608 20609 /* Check if the sel only references the first values vector. */ 20610 if (GET_CODE (sel) == CONST_VECTOR 20611 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1)) 20612 { 20613 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg); 20614 return; 20615 } 20616 20617 /* Check if the two values vectors are the same. */ 20618 if (rtx_equal_p (op0, op1)) 20619 { 20620 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1); 20621 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel, 20622 NULL, 0, OPTAB_DIRECT); 20623 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod); 20624 return; 20625 } 20626 20627 /* Run TBL on for each value vector and combine the results. */ 20628 20629 rtx res0 = gen_reg_rtx (data_mode); 20630 rtx res1 = gen_reg_rtx (data_mode); 20631 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits); 20632 if (GET_CODE (sel) != CONST_VECTOR 20633 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1)) 20634 { 20635 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, 20636 2 * nunits - 1); 20637 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel, 20638 NULL, 0, OPTAB_DIRECT); 20639 } 20640 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg); 20641 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems, 20642 NULL, 0, OPTAB_DIRECT); 20643 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub); 20644 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT) 20645 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1))); 20646 else 20647 emit_unspec2 (target, UNSPEC_IORF, res0, res1); 20648} 20649 20650/* Recognize patterns suitable for the TRN instructions. 
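   For example, with two V4SImode inputs, the permutation { 0, 4, 2, 6 }
   interleaves the even-numbered elements of both vectors and maps to
   TRN1, while { 1, 5, 3, 7 } maps to TRN2.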
*/ 20651static bool 20652aarch64_evpc_trn (struct expand_vec_perm_d *d) 20653{ 20654 HOST_WIDE_INT odd; 20655 poly_uint64 nelt = d->perm.length (); 20656 rtx out, in0, in1, x; 20657 machine_mode vmode = d->vmode; 20658 20659 if (GET_MODE_UNIT_SIZE (vmode) > 8) 20660 return false; 20661 20662 /* Note that these are little-endian tests. 20663 We correct for big-endian later. */ 20664 if (!d->perm[0].is_constant (&odd) 20665 || (odd != 0 && odd != 1) 20666 || !d->perm.series_p (0, 2, odd, 2) 20667 || !d->perm.series_p (1, 2, nelt + odd, 2)) 20668 return false; 20669 20670 /* Success! */ 20671 if (d->testing_p) 20672 return true; 20673 20674 in0 = d->op0; 20675 in1 = d->op1; 20676 /* We don't need a big-endian lane correction for SVE; see the comment 20677 at the head of aarch64-sve.md for details. */ 20678 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD) 20679 { 20680 x = in0, in0 = in1, in1 = x; 20681 odd = !odd; 20682 } 20683 out = d->target; 20684 20685 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1), 20686 odd ? UNSPEC_TRN2 : UNSPEC_TRN1)); 20687 return true; 20688} 20689 20690/* Recognize patterns suitable for the UZP instructions. */ 20691static bool 20692aarch64_evpc_uzp (struct expand_vec_perm_d *d) 20693{ 20694 HOST_WIDE_INT odd; 20695 rtx out, in0, in1, x; 20696 machine_mode vmode = d->vmode; 20697 20698 if (GET_MODE_UNIT_SIZE (vmode) > 8) 20699 return false; 20700 20701 /* Note that these are little-endian tests. 20702 We correct for big-endian later. */ 20703 if (!d->perm[0].is_constant (&odd) 20704 || (odd != 0 && odd != 1) 20705 || !d->perm.series_p (0, 1, odd, 2)) 20706 return false; 20707 20708 /* Success! */ 20709 if (d->testing_p) 20710 return true; 20711 20712 in0 = d->op0; 20713 in1 = d->op1; 20714 /* We don't need a big-endian lane correction for SVE; see the comment 20715 at the head of aarch64-sve.md for details. */ 20716 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD) 20717 { 20718 x = in0, in0 = in1, in1 = x; 20719 odd = !odd; 20720 } 20721 out = d->target; 20722 20723 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1), 20724 odd ? UNSPEC_UZP2 : UNSPEC_UZP1)); 20725 return true; 20726} 20727 20728/* Recognize patterns suitable for the ZIP instructions. */ 20729static bool 20730aarch64_evpc_zip (struct expand_vec_perm_d *d) 20731{ 20732 unsigned int high; 20733 poly_uint64 nelt = d->perm.length (); 20734 rtx out, in0, in1, x; 20735 machine_mode vmode = d->vmode; 20736 20737 if (GET_MODE_UNIT_SIZE (vmode) > 8) 20738 return false; 20739 20740 /* Note that these are little-endian tests. 20741 We correct for big-endian later. */ 20742 poly_uint64 first = d->perm[0]; 20743 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt)) 20744 || !d->perm.series_p (0, 2, first, 1) 20745 || !d->perm.series_p (1, 2, first + nelt, 1)) 20746 return false; 20747 high = maybe_ne (first, 0U); 20748 20749 /* Success! */ 20750 if (d->testing_p) 20751 return true; 20752 20753 in0 = d->op0; 20754 in1 = d->op1; 20755 /* We don't need a big-endian lane correction for SVE; see the comment 20756 at the head of aarch64-sve.md for details. */ 20757 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD) 20758 { 20759 x = in0, in0 = in1, in1 = x; 20760 high = !high; 20761 } 20762 out = d->target; 20763 20764 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1), 20765 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1)); 20766 return true; 20767} 20768 20769/* Recognize patterns for the EXT insn. 
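   For example, the two-operand V4SImode permutation { 1, 2, 3, 4 },
   i.e. the last three elements of the first vector followed by the
   first element of the second, matches EXT with an element offset of 1.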
*/ 20770 20771static bool 20772aarch64_evpc_ext (struct expand_vec_perm_d *d) 20773{ 20774 HOST_WIDE_INT location; 20775 rtx offset; 20776 20777 /* The first element always refers to the first vector. 20778 Check if the extracted indices are increasing by one. */ 20779 if (d->vec_flags == VEC_SVE_PRED 20780 || !d->perm[0].is_constant (&location) 20781 || !d->perm.series_p (0, 1, location, 1)) 20782 return false; 20783 20784 /* Success! */ 20785 if (d->testing_p) 20786 return true; 20787 20788 /* The case where (location == 0) is a no-op for both big- and little-endian, 20789 and is removed by the mid-end at optimization levels -O1 and higher. 20790 20791 We don't need a big-endian lane correction for SVE; see the comment 20792 at the head of aarch64-sve.md for details. */ 20793 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD) 20794 { 20795 /* After setup, we want the high elements of the first vector (stored 20796 at the LSB end of the register), and the low elements of the second 20797 vector (stored at the MSB end of the register). So swap. */ 20798 std::swap (d->op0, d->op1); 20799 /* location != 0 (above), so safe to assume (nelt - location) < nelt. 20800 to_constant () is safe since this is restricted to Advanced SIMD 20801 vectors. */ 20802 location = d->perm.length ().to_constant () - location; 20803 } 20804 20805 offset = GEN_INT (location); 20806 emit_set_insn (d->target, 20807 gen_rtx_UNSPEC (d->vmode, 20808 gen_rtvec (3, d->op0, d->op1, offset), 20809 UNSPEC_EXT)); 20810 return true; 20811} 20812 20813/* Recognize patterns for the REV{64,32,16} insns, which reverse elements 20814 within each 64-bit, 32-bit or 16-bit granule. */ 20815 20816static bool 20817aarch64_evpc_rev_local (struct expand_vec_perm_d *d) 20818{ 20819 HOST_WIDE_INT diff; 20820 unsigned int i, size, unspec; 20821 machine_mode pred_mode; 20822 20823 if (d->vec_flags == VEC_SVE_PRED 20824 || !d->one_vector_p 20825 || !d->perm[0].is_constant (&diff) 20826 || !diff) 20827 return false; 20828 20829 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode); 20830 if (size == 8) 20831 { 20832 unspec = UNSPEC_REV64; 20833 pred_mode = VNx2BImode; 20834 } 20835 else if (size == 4) 20836 { 20837 unspec = UNSPEC_REV32; 20838 pred_mode = VNx4BImode; 20839 } 20840 else if (size == 2) 20841 { 20842 unspec = UNSPEC_REV16; 20843 pred_mode = VNx8BImode; 20844 } 20845 else 20846 return false; 20847 20848 unsigned int step = diff + 1; 20849 for (i = 0; i < step; ++i) 20850 if (!d->perm.series_p (i, step, diff - i, step)) 20851 return false; 20852 20853 /* Success! */ 20854 if (d->testing_p) 20855 return true; 20856 20857 if (d->vec_flags == VEC_SVE_DATA) 20858 { 20859 machine_mode int_mode = aarch64_sve_int_mode (pred_mode); 20860 rtx target = gen_reg_rtx (int_mode); 20861 if (BYTES_BIG_ENDIAN) 20862 /* The act of taking a subreg between INT_MODE and d->vmode 20863 is itself a reversing operation on big-endian targets; 20864 see the comment at the head of aarch64-sve.md for details. 20865 First reinterpret OP0 as INT_MODE without using a subreg 20866 and without changing the contents. */ 20867 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0)); 20868 else 20869 { 20870 /* For SVE we use REV[BHW] unspecs derived from the element size 20871 of v->mode and vector modes whose elements have SIZE bytes. 20872 This ensures that the vector modes match the predicate modes. 
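	     For example, reversing pairs of 32-bit elements within each
	     64-bit granule of a VNx4SImode vector ends up as a REVW on
	     the vector viewed as VNx2DImode, predicated on a VNx2BImode
	     PTRUE.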
*/ 20873 int unspec = aarch64_sve_rev_unspec (d->vmode); 20874 rtx pred = aarch64_ptrue_reg (pred_mode); 20875 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred, 20876 gen_lowpart (int_mode, d->op0))); 20877 } 20878 emit_move_insn (d->target, gen_lowpart (d->vmode, target)); 20879 return true; 20880 } 20881 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec); 20882 emit_set_insn (d->target, src); 20883 return true; 20884} 20885 20886/* Recognize patterns for the REV insn, which reverses elements within 20887 a full vector. */ 20888 20889static bool 20890aarch64_evpc_rev_global (struct expand_vec_perm_d *d) 20891{ 20892 poly_uint64 nelt = d->perm.length (); 20893 20894 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD) 20895 return false; 20896 20897 if (!d->perm.series_p (0, 1, nelt - 1, -1)) 20898 return false; 20899 20900 /* Success! */ 20901 if (d->testing_p) 20902 return true; 20903 20904 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV); 20905 emit_set_insn (d->target, src); 20906 return true; 20907} 20908 20909static bool 20910aarch64_evpc_dup (struct expand_vec_perm_d *d) 20911{ 20912 rtx out = d->target; 20913 rtx in0; 20914 HOST_WIDE_INT elt; 20915 machine_mode vmode = d->vmode; 20916 rtx lane; 20917 20918 if (d->vec_flags == VEC_SVE_PRED 20919 || d->perm.encoding ().encoded_nelts () != 1 20920 || !d->perm[0].is_constant (&elt)) 20921 return false; 20922 20923 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode)) 20924 return false; 20925 20926 /* Success! */ 20927 if (d->testing_p) 20928 return true; 20929 20930 /* The generic preparation in aarch64_expand_vec_perm_const_1 20931 swaps the operand order and the permute indices if it finds 20932 d->perm[0] to be in the second operand. Thus, we can always 20933 use d->op0 and need not do any extra arithmetic to get the 20934 correct lane number. */ 20935 in0 = d->op0; 20936 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */ 20937 20938 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane)); 20939 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel); 20940 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select)); 20941 return true; 20942} 20943 20944static bool 20945aarch64_evpc_tbl (struct expand_vec_perm_d *d) 20946{ 20947 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel; 20948 machine_mode vmode = d->vmode; 20949 20950 /* Make sure that the indices are constant. */ 20951 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts (); 20952 for (unsigned int i = 0; i < encoded_nelts; ++i) 20953 if (!d->perm[i].is_constant ()) 20954 return false; 20955 20956 if (d->testing_p) 20957 return true; 20958 20959 /* Generic code will try constant permutation twice. Once with the 20960 original mode and again with the elements lowered to QImode. 20961 So wait and don't do the selector expansion ourselves. */ 20962 if (vmode != V8QImode && vmode != V16QImode) 20963 return false; 20964 20965 /* to_constant is safe since this routine is specific to Advanced SIMD 20966 vectors. */ 20967 unsigned int nelt = d->perm.length ().to_constant (); 20968 for (unsigned int i = 0; i < nelt; ++i) 20969 /* If big-endian and two vectors we end up with a weird mixed-endian 20970 mode on NEON. Reverse the index within each word but not the word 20971 itself. to_constant is safe because we checked is_constant above. */ 20972 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN 20973 ? 
d->perm[i].to_constant () ^ (nelt - 1) 20974 : d->perm[i].to_constant ()); 20975 20976 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); 20977 sel = force_reg (vmode, sel); 20978 20979 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel); 20980 return true; 20981} 20982 20983/* Try to implement D using an SVE TBL instruction. */ 20984 20985static bool 20986aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d) 20987{ 20988 unsigned HOST_WIDE_INT nelt; 20989 20990 /* Permuting two variable-length vectors could overflow the 20991 index range. */ 20992 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt)) 20993 return false; 20994 20995 if (d->testing_p) 20996 return true; 20997 20998 machine_mode sel_mode = related_int_vector_mode (d->vmode).require (); 20999 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm); 21000 if (d->one_vector_p) 21001 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel)); 21002 else 21003 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel); 21004 return true; 21005} 21006 21007/* Try to implement D using SVE SEL instruction. */ 21008 21009static bool 21010aarch64_evpc_sel (struct expand_vec_perm_d *d) 21011{ 21012 machine_mode vmode = d->vmode; 21013 int unit_size = GET_MODE_UNIT_SIZE (vmode); 21014 21015 if (d->vec_flags != VEC_SVE_DATA 21016 || unit_size > 8) 21017 return false; 21018 21019 int n_patterns = d->perm.encoding ().npatterns (); 21020 poly_int64 vec_len = d->perm.length (); 21021 21022 for (int i = 0; i < n_patterns; ++i) 21023 if (!known_eq (d->perm[i], i) 21024 && !known_eq (d->perm[i], vec_len + i)) 21025 return false; 21026 21027 for (int i = n_patterns; i < n_patterns * 2; i++) 21028 if (!d->perm.series_p (i, n_patterns, i, n_patterns) 21029 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns)) 21030 return false; 21031 21032 if (d->testing_p) 21033 return true; 21034 21035 machine_mode pred_mode = aarch64_sve_pred_mode (vmode); 21036 21037 /* Build a predicate that is true when op0 elements should be used. */ 21038 rtx_vector_builder builder (pred_mode, n_patterns, 2); 21039 for (int i = 0; i < n_patterns * 2; i++) 21040 { 21041 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode) 21042 : CONST0_RTX (BImode); 21043 builder.quick_push (elem); 21044 } 21045 21046 rtx const_vec = builder.build (); 21047 rtx pred = force_reg (pred_mode, const_vec); 21048 /* TARGET = PRED ? OP0 : OP1. */ 21049 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred)); 21050 return true; 21051} 21052 21053static bool 21054aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) 21055{ 21056 /* The pattern matching functions above are written to look for a small 21057 number to begin the sequence (0, 1, N/2). If we begin with an index 21058 from the second operand, we can swap the operands. 
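	 For example, the two-operand V4SImode permutation { 4, 0, 5, 1 }
	 becomes { 0, 4, 1, 5 } once the inputs are rotated and the
	 operands swapped, which the ZIP test below then matches as ZIP1.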
*/ 21059 poly_int64 nelt = d->perm.length (); 21060 if (known_ge (d->perm[0], nelt)) 21061 { 21062 d->perm.rotate_inputs (1); 21063 std::swap (d->op0, d->op1); 21064 } 21065 21066 if ((d->vec_flags == VEC_ADVSIMD 21067 || d->vec_flags == VEC_SVE_DATA 21068 || d->vec_flags == VEC_SVE_PRED) 21069 && known_gt (nelt, 1)) 21070 { 21071 if (aarch64_evpc_rev_local (d)) 21072 return true; 21073 else if (aarch64_evpc_rev_global (d)) 21074 return true; 21075 else if (aarch64_evpc_ext (d)) 21076 return true; 21077 else if (aarch64_evpc_dup (d)) 21078 return true; 21079 else if (aarch64_evpc_zip (d)) 21080 return true; 21081 else if (aarch64_evpc_uzp (d)) 21082 return true; 21083 else if (aarch64_evpc_trn (d)) 21084 return true; 21085 else if (aarch64_evpc_sel (d)) 21086 return true; 21087 if (d->vec_flags == VEC_SVE_DATA) 21088 return aarch64_evpc_sve_tbl (d); 21089 else if (d->vec_flags == VEC_ADVSIMD) 21090 return aarch64_evpc_tbl (d); 21091 } 21092 return false; 21093} 21094 21095/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */ 21096 21097static bool 21098aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, 21099 rtx op1, const vec_perm_indices &sel) 21100{ 21101 struct expand_vec_perm_d d; 21102 21103 /* Check whether the mask can be applied to a single vector. */ 21104 if (sel.ninputs () == 1 21105 || (op0 && rtx_equal_p (op0, op1))) 21106 d.one_vector_p = true; 21107 else if (sel.all_from_input_p (0)) 21108 { 21109 d.one_vector_p = true; 21110 op1 = op0; 21111 } 21112 else if (sel.all_from_input_p (1)) 21113 { 21114 d.one_vector_p = true; 21115 op0 = op1; 21116 } 21117 else 21118 d.one_vector_p = false; 21119 21120 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2, 21121 sel.nelts_per_input ()); 21122 d.vmode = vmode; 21123 d.vec_flags = aarch64_classify_vector_mode (d.vmode); 21124 d.target = target; 21125 d.op0 = op0; 21126 d.op1 = op1; 21127 d.testing_p = !target; 21128 21129 if (!d.testing_p) 21130 return aarch64_expand_vec_perm_const_1 (&d); 21131 21132 rtx_insn *last = get_last_insn (); 21133 bool ret = aarch64_expand_vec_perm_const_1 (&d); 21134 gcc_assert (last == get_last_insn ()); 21135 21136 return ret; 21137} 21138 21139/* Generate a byte permute mask for a register of mode MODE, 21140 which has NUNITS units. */ 21141 21142rtx 21143aarch64_reverse_mask (machine_mode mode, unsigned int nunits) 21144{ 21145 /* We have to reverse each vector because we dont have 21146 a permuted load that can reverse-load according to ABI rules. */ 21147 rtx mask; 21148 rtvec v = rtvec_alloc (16); 21149 unsigned int i, j; 21150 unsigned int usize = GET_MODE_UNIT_SIZE (mode); 21151 21152 gcc_assert (BYTES_BIG_ENDIAN); 21153 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode)); 21154 21155 for (i = 0; i < nunits; i++) 21156 for (j = 0; j < usize; j++) 21157 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j); 21158 mask = gen_rtx_CONST_VECTOR (V16QImode, v); 21159 return force_reg (V16QImode, mask); 21160} 21161 21162/* Expand an SVE integer comparison using the SVE equivalent of: 21163 21164 (set TARGET (CODE OP0 OP1)). */ 21165 21166void 21167aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1) 21168{ 21169 machine_mode pred_mode = GET_MODE (target); 21170 machine_mode data_mode = GET_MODE (op0); 21171 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode, 21172 op0, op1); 21173 if (!rtx_equal_p (target, res)) 21174 emit_move_insn (target, res); 21175} 21176 21177/* Return the UNSPEC_COND_* code for comparison CODE. 
*/ 21178 21179static unsigned int 21180aarch64_unspec_cond_code (rtx_code code) 21181{ 21182 switch (code) 21183 { 21184 case NE: 21185 return UNSPEC_COND_FCMNE; 21186 case EQ: 21187 return UNSPEC_COND_FCMEQ; 21188 case LT: 21189 return UNSPEC_COND_FCMLT; 21190 case GT: 21191 return UNSPEC_COND_FCMGT; 21192 case LE: 21193 return UNSPEC_COND_FCMLE; 21194 case GE: 21195 return UNSPEC_COND_FCMGE; 21196 case UNORDERED: 21197 return UNSPEC_COND_FCMUO; 21198 default: 21199 gcc_unreachable (); 21200 } 21201} 21202 21203/* Emit: 21204 21205 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>)) 21206 21207 where <X> is the operation associated with comparison CODE. 21208 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */ 21209 21210static void 21211aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred, 21212 bool known_ptrue_p, rtx op0, rtx op1) 21213{ 21214 rtx flag = gen_int_mode (known_ptrue_p, SImode); 21215 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred), 21216 gen_rtvec (4, pred, flag, op0, op1), 21217 aarch64_unspec_cond_code (code)); 21218 emit_set_insn (target, unspec); 21219} 21220 21221/* Emit the SVE equivalent of: 21222 21223 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>)) 21224 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>)) 21225 (set TARGET (ior:PRED_MODE TMP1 TMP2)) 21226 21227 where <Xi> is the operation associated with comparison CODEi. 21228 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */ 21229 21230static void 21231aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2, 21232 rtx pred, bool known_ptrue_p, rtx op0, rtx op1) 21233{ 21234 machine_mode pred_mode = GET_MODE (pred); 21235 rtx tmp1 = gen_reg_rtx (pred_mode); 21236 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1); 21237 rtx tmp2 = gen_reg_rtx (pred_mode); 21238 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1); 21239 aarch64_emit_binop (target, ior_optab, tmp1, tmp2); 21240} 21241 21242/* Emit the SVE equivalent of: 21243 21244 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>)) 21245 (set TARGET (not TMP)) 21246 21247 where <X> is the operation associated with comparison CODE. 21248 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */ 21249 21250static void 21251aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred, 21252 bool known_ptrue_p, rtx op0, rtx op1) 21253{ 21254 machine_mode pred_mode = GET_MODE (pred); 21255 rtx tmp = gen_reg_rtx (pred_mode); 21256 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1); 21257 aarch64_emit_unop (target, one_cmpl_optab, tmp); 21258} 21259 21260/* Expand an SVE floating-point comparison using the SVE equivalent of: 21261 21262 (set TARGET (CODE OP0 OP1)) 21263 21264 If CAN_INVERT_P is true, the caller can also handle inverted results; 21265 return true if the result is in fact inverted. */ 21266 21267bool 21268aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, 21269 rtx op0, rtx op1, bool can_invert_p) 21270{ 21271 machine_mode pred_mode = GET_MODE (target); 21272 machine_mode data_mode = GET_MODE (op0); 21273 21274 rtx ptrue = aarch64_ptrue_reg (pred_mode); 21275 switch (code) 21276 { 21277 case UNORDERED: 21278 /* UNORDERED has no immediate form. */ 21279 op1 = force_reg (data_mode, op1); 21280 /* fall through */ 21281 case LT: 21282 case LE: 21283 case GT: 21284 case GE: 21285 case EQ: 21286 case NE: 21287 { 21288 /* There is native support for the comparison. 
*/ 21289 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1); 21290 return false; 21291 } 21292 21293 case LTGT: 21294 /* This is a trapping operation (LT or GT). */ 21295 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1); 21296 return false; 21297 21298 case UNEQ: 21299 if (!flag_trapping_math) 21300 { 21301 /* This would trap for signaling NaNs. */ 21302 op1 = force_reg (data_mode, op1); 21303 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ, 21304 ptrue, true, op0, op1); 21305 return false; 21306 } 21307 /* fall through */ 21308 case UNLT: 21309 case UNLE: 21310 case UNGT: 21311 case UNGE: 21312 if (flag_trapping_math) 21313 { 21314 /* Work out which elements are ordered. */ 21315 rtx ordered = gen_reg_rtx (pred_mode); 21316 op1 = force_reg (data_mode, op1); 21317 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED, 21318 ptrue, true, op0, op1); 21319 21320 /* Test the opposite condition for the ordered elements, 21321 then invert the result. */ 21322 if (code == UNEQ) 21323 code = NE; 21324 else 21325 code = reverse_condition_maybe_unordered (code); 21326 if (can_invert_p) 21327 { 21328 aarch64_emit_sve_fp_cond (target, code, 21329 ordered, false, op0, op1); 21330 return true; 21331 } 21332 aarch64_emit_sve_invert_fp_cond (target, code, 21333 ordered, false, op0, op1); 21334 return false; 21335 } 21336 break; 21337 21338 case ORDERED: 21339 /* ORDERED has no immediate form. */ 21340 op1 = force_reg (data_mode, op1); 21341 break; 21342 21343 default: 21344 gcc_unreachable (); 21345 } 21346 21347 /* There is native support for the inverse comparison. */ 21348 code = reverse_condition_maybe_unordered (code); 21349 if (can_invert_p) 21350 { 21351 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1); 21352 return true; 21353 } 21354 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1); 21355 return false; 21356} 21357 21358/* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode 21359 of the data being selected and CMP_MODE is the mode of the values being 21360 compared. */ 21361 21362void 21363aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode, 21364 rtx *ops) 21365{ 21366 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require (); 21367 rtx pred = gen_reg_rtx (pred_mode); 21368 if (FLOAT_MODE_P (cmp_mode)) 21369 { 21370 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]), 21371 ops[4], ops[5], true)) 21372 std::swap (ops[1], ops[2]); 21373 } 21374 else 21375 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]); 21376 21377 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode)) 21378 ops[1] = force_reg (data_mode, ops[1]); 21379 /* The "false" value can only be zero if the "true" value is a constant. */ 21380 if (register_operand (ops[1], data_mode) 21381 || !aarch64_simd_reg_or_zero (ops[2], data_mode)) 21382 ops[2] = force_reg (data_mode, ops[2]); 21383 21384 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]); 21385 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL)); 21386} 21387 21388/* Implement TARGET_MODES_TIEABLE_P. In principle we should always return 21389 true. However due to issues with register allocation it is preferable 21390 to avoid tieing integer scalar and FP scalar modes. Executing integer 21391 operations in general registers is better than treating them as scalar 21392 vector operations. This reduces latency and avoids redundant int<->FP 21393 moves. 
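   For example, tying DImode to DFmode would make the register allocator
   more willing to keep an integer value in an FP register, forcing extra
   FMOVs around each integer operation on that value.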
So tie modes if they are either the same class, or vector modes 21394 with other vector modes, vector structs or any scalar mode. */ 21395 21396static bool 21397aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2) 21398{ 21399 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2)) 21400 return true; 21401 21402 /* We specifically want to allow elements of "structure" modes to 21403 be tieable to the structure. This more general condition allows 21404 other rarer situations too. The reason we don't extend this to 21405 predicate modes is that there are no predicate structure modes 21406 nor any specific instructions for extracting part of a predicate 21407 register. */ 21408 if (aarch64_vector_data_mode_p (mode1) 21409 && aarch64_vector_data_mode_p (mode2)) 21410 return true; 21411 21412 /* Also allow any scalar modes with vectors. */ 21413 if (aarch64_vector_mode_supported_p (mode1) 21414 || aarch64_vector_mode_supported_p (mode2)) 21415 return true; 21416 21417 return false; 21418} 21419 21420/* Return a new RTX holding the result of moving POINTER forward by 21421 AMOUNT bytes. */ 21422 21423static rtx 21424aarch64_move_pointer (rtx pointer, poly_int64 amount) 21425{ 21426 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount); 21427 21428 return adjust_automodify_address (pointer, GET_MODE (pointer), 21429 next, amount); 21430} 21431 21432/* Return a new RTX holding the result of moving POINTER forward by the 21433 size of the mode it points to. */ 21434 21435static rtx 21436aarch64_progress_pointer (rtx pointer) 21437{ 21438 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer))); 21439} 21440 21441/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by 21442 MODE bytes. */ 21443 21444static void 21445aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst, 21446 machine_mode mode) 21447{ 21448 rtx reg = gen_reg_rtx (mode); 21449 21450 /* "Cast" the pointers to the correct mode. */ 21451 *src = adjust_address (*src, mode, 0); 21452 *dst = adjust_address (*dst, mode, 0); 21453 /* Emit the memcpy. */ 21454 emit_move_insn (reg, *src); 21455 emit_move_insn (*dst, reg); 21456 /* Move the pointers forward. */ 21457 *src = aarch64_progress_pointer (*src); 21458 *dst = aarch64_progress_pointer (*dst); 21459} 21460 21461/* Expand cpymem, as if from a __builtin_memcpy. Return true if 21462 we succeed, otherwise return false. */ 21463 21464bool 21465aarch64_expand_cpymem (rtx *operands) 21466{ 21467 /* These need to be signed as we need to perform arithmetic on n as 21468 signed operations. */ 21469 int n, mode_bits; 21470 rtx dst = operands[0]; 21471 rtx src = operands[1]; 21472 rtx base; 21473 machine_mode cur_mode = BLKmode, next_mode; 21474 bool speed_p = !optimize_function_for_size_p (cfun); 21475 21476 /* When optimizing for size, give a better estimate of the length of a 21477 memcpy call, but use the default otherwise. Moves larger than 8 bytes 21478 will always require an even number of instructions to do now. And each 21479 operation requires both a load+store, so divide the max number by 2. */ 21480 unsigned int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2; 21481 21482 /* We can't do anything smart if the amount to copy is not constant. */ 21483 if (!CONST_INT_P (operands[2])) 21484 return false; 21485 21486 unsigned HOST_WIDE_INT tmp = INTVAL (operands[2]); 21487 21488 /* Try to keep the number of instructions low. 
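     Each candidate move below is a load/store pair covering at most 16
     bytes, so e.g. a 100 byte copy is costed as 6 full moves plus the
     residual handling described next.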
For all cases we will do at 21489 most two moves for the residual amount, since we'll always overlap the 21490 remainder. */ 21491 if (((tmp / 16) + (tmp % 16 ? 2 : 0)) > max_num_moves) 21492 return false; 21493 21494 /* At this point tmp is known to have to fit inside an int. */ 21495 n = tmp; 21496 21497 base = copy_to_mode_reg (Pmode, XEXP (dst, 0)); 21498 dst = adjust_automodify_address (dst, VOIDmode, base, 0); 21499 21500 base = copy_to_mode_reg (Pmode, XEXP (src, 0)); 21501 src = adjust_automodify_address (src, VOIDmode, base, 0); 21502 21503 /* Convert n to bits to make the rest of the code simpler. */ 21504 n = n * BITS_PER_UNIT; 21505 21506 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes 21507 larger than TImode, but we should not use them for loads/stores here. */ 21508 const int copy_limit = GET_MODE_BITSIZE (TImode); 21509 21510 while (n > 0) 21511 { 21512 /* Find the largest mode in which to do the copy in without over reading 21513 or writing. */ 21514 opt_scalar_int_mode mode_iter; 21515 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT) 21516 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit)) 21517 cur_mode = mode_iter.require (); 21518 21519 gcc_assert (cur_mode != BLKmode); 21520 21521 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant (); 21522 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode); 21523 21524 n -= mode_bits; 21525 21526 /* Do certain trailing copies as overlapping if it's going to be 21527 cheaper. i.e. less instructions to do so. For instance doing a 15 21528 byte copy it's more efficient to do two overlapping 8 byte copies than 21529 8 + 6 + 1. */ 21530 if (n > 0 && n <= 8 * BITS_PER_UNIT) 21531 { 21532 next_mode = smallest_mode_for_size (n, MODE_INT); 21533 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant (); 21534 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT); 21535 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT); 21536 n = n_bits; 21537 } 21538 } 21539 21540 return true; 21541} 21542 21543/* Split a DImode store of a CONST_INT SRC to MEM DST as two 21544 SImode stores. Handle the case when the constant has identical 21545 bottom and top halves. This is beneficial when the two stores can be 21546 merged into an STP and we avoid synthesising potentially expensive 21547 immediates twice. Return true if such a split is possible. */ 21548 21549bool 21550aarch64_split_dimode_const_store (rtx dst, rtx src) 21551{ 21552 rtx lo = gen_lowpart (SImode, src); 21553 rtx hi = gen_highpart_mode (SImode, DImode, src); 21554 21555 bool size_p = optimize_function_for_size_p (cfun); 21556 21557 if (!rtx_equal_p (lo, hi)) 21558 return false; 21559 21560 unsigned int orig_cost 21561 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode); 21562 unsigned int lo_cost 21563 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode); 21564 21565 /* We want to transform: 21566 MOV x1, 49370 21567 MOVK x1, 0x140, lsl 16 21568 MOVK x1, 0xc0da, lsl 32 21569 MOVK x1, 0x140, lsl 48 21570 STR x1, [x0] 21571 into: 21572 MOV w1, 49370 21573 MOVK w1, 0x140, lsl 16 21574 STP w1, w1, [x0] 21575 So we want to perform this only when we save two instructions 21576 or more. When optimizing for size, however, accept any code size 21577 savings we can. 
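     In the example above the original DImode immediate costs 4 instructions
     (MOV plus three MOVKs) and the SImode low half costs 2 (MOV plus one
     MOVK), so the split saves two instructions and is performed even when
     optimizing for speed.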
*/ 21578 if (size_p && orig_cost <= lo_cost) 21579 return false; 21580 21581 if (!size_p 21582 && (orig_cost <= lo_cost + 1)) 21583 return false; 21584 21585 rtx mem_lo = adjust_address (dst, SImode, 0); 21586 if (!aarch64_mem_pair_operand (mem_lo, SImode)) 21587 return false; 21588 21589 rtx tmp_reg = gen_reg_rtx (SImode); 21590 aarch64_expand_mov_immediate (tmp_reg, lo); 21591 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode)); 21592 /* Don't emit an explicit store pair as this may not be always profitable. 21593 Let the sched-fusion logic decide whether to merge them. */ 21594 emit_move_insn (mem_lo, tmp_reg); 21595 emit_move_insn (mem_hi, tmp_reg); 21596 21597 return true; 21598} 21599 21600/* Generate RTL for a conditional branch with rtx comparison CODE in 21601 mode CC_MODE. The destination of the unlikely conditional branch 21602 is LABEL_REF. */ 21603 21604void 21605aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode, 21606 rtx label_ref) 21607{ 21608 rtx x; 21609 x = gen_rtx_fmt_ee (code, VOIDmode, 21610 gen_rtx_REG (cc_mode, CC_REGNUM), 21611 const0_rtx); 21612 21613 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, 21614 gen_rtx_LABEL_REF (VOIDmode, label_ref), 21615 pc_rtx); 21616 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); 21617} 21618 21619/* Generate DImode scratch registers for 128-bit (TImode) addition. 21620 21621 OP1 represents the TImode destination operand 1 21622 OP2 represents the TImode destination operand 2 21623 LOW_DEST represents the low half (DImode) of TImode operand 0 21624 LOW_IN1 represents the low half (DImode) of TImode operand 1 21625 LOW_IN2 represents the low half (DImode) of TImode operand 2 21626 HIGH_DEST represents the high half (DImode) of TImode operand 0 21627 HIGH_IN1 represents the high half (DImode) of TImode operand 1 21628 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */ 21629 21630void 21631aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest, 21632 rtx *low_in1, rtx *low_in2, 21633 rtx *high_dest, rtx *high_in1, 21634 rtx *high_in2) 21635{ 21636 *low_dest = gen_reg_rtx (DImode); 21637 *low_in1 = gen_lowpart (DImode, op1); 21638 *low_in2 = simplify_gen_subreg (DImode, op2, TImode, 21639 subreg_lowpart_offset (DImode, TImode)); 21640 *high_dest = gen_reg_rtx (DImode); 21641 *high_in1 = gen_highpart (DImode, op1); 21642 *high_in2 = simplify_gen_subreg (DImode, op2, TImode, 21643 subreg_highpart_offset (DImode, TImode)); 21644} 21645 21646/* Generate DImode scratch registers for 128-bit (TImode) subtraction. 21647 21648 This function differs from 'arch64_addti_scratch_regs' in that 21649 OP1 can be an immediate constant (zero). We must call 21650 subreg_highpart_offset with DImode and TImode arguments, otherwise 21651 VOIDmode will be used for the const_int which generates an internal 21652 error from subreg_size_highpart_offset which does not expect a size of zero. 21653 21654 OP1 represents the TImode destination operand 1 21655 OP2 represents the TImode destination operand 2 21656 LOW_DEST represents the low half (DImode) of TImode operand 0 21657 LOW_IN1 represents the low half (DImode) of TImode operand 1 21658 LOW_IN2 represents the low half (DImode) of TImode operand 2 21659 HIGH_DEST represents the high half (DImode) of TImode operand 0 21660 HIGH_IN1 represents the high half (DImode) of TImode operand 1 21661 HIGH_IN2 represents the high half (DImode) of TImode operand 2. 
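   The expansion in aarch64_expand_subvti then produces a SUBS on the low
   halves followed by an SBCS-style subtract-with-borrow on the high halves,
   roughly (illustrative registers):

     subs    x0, x2, x4
     sbcs    x1, x3, x5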
*/ 21662 21663 21664void 21665aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest, 21666 rtx *low_in1, rtx *low_in2, 21667 rtx *high_dest, rtx *high_in1, 21668 rtx *high_in2) 21669{ 21670 *low_dest = gen_reg_rtx (DImode); 21671 *low_in1 = simplify_gen_subreg (DImode, op1, TImode, 21672 subreg_lowpart_offset (DImode, TImode)); 21673 21674 *low_in2 = simplify_gen_subreg (DImode, op2, TImode, 21675 subreg_lowpart_offset (DImode, TImode)); 21676 *high_dest = gen_reg_rtx (DImode); 21677 21678 *high_in1 = simplify_gen_subreg (DImode, op1, TImode, 21679 subreg_highpart_offset (DImode, TImode)); 21680 *high_in2 = simplify_gen_subreg (DImode, op2, TImode, 21681 subreg_highpart_offset (DImode, TImode)); 21682} 21683 21684/* Generate RTL for 128-bit (TImode) subtraction with overflow. 21685 21686 OP0 represents the TImode destination operand 0 21687 LOW_DEST represents the low half (DImode) of TImode operand 0 21688 LOW_IN1 represents the low half (DImode) of TImode operand 1 21689 LOW_IN2 represents the low half (DImode) of TImode operand 2 21690 HIGH_DEST represents the high half (DImode) of TImode operand 0 21691 HIGH_IN1 represents the high half (DImode) of TImode operand 1 21692 HIGH_IN2 represents the high half (DImode) of TImode operand 2 21693 UNSIGNED_P is true if the operation is being performed on unsigned 21694 values. */ 21695void 21696aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1, 21697 rtx low_in2, rtx high_dest, rtx high_in1, 21698 rtx high_in2, bool unsigned_p) 21699{ 21700 if (low_in2 == const0_rtx) 21701 { 21702 low_dest = low_in1; 21703 high_in2 = force_reg (DImode, high_in2); 21704 if (unsigned_p) 21705 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2)); 21706 else 21707 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2)); 21708 } 21709 else 21710 { 21711 if (aarch64_plus_immediate (low_in2, DImode)) 21712 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2, 21713 GEN_INT (-INTVAL (low_in2)))); 21714 else 21715 { 21716 low_in2 = force_reg (DImode, low_in2); 21717 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2)); 21718 } 21719 high_in2 = force_reg (DImode, high_in2); 21720 21721 if (unsigned_p) 21722 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2)); 21723 else 21724 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2)); 21725 } 21726 21727 emit_move_insn (gen_lowpart (DImode, op0), low_dest); 21728 emit_move_insn (gen_highpart (DImode, op0), high_dest); 21729 21730} 21731 21732/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. 
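   AddressSanitizer checks an access by inspecting the shadow byte at
   (address >> 3) + offset (assuming the usual shadow scale of 3), so for
   LP64 the shadow of address A lives at (A >> 3) + (1 << 36).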
*/ 21733 21734static unsigned HOST_WIDE_INT 21735aarch64_asan_shadow_offset (void) 21736{ 21737 if (TARGET_ILP32) 21738 return (HOST_WIDE_INT_1 << 29); 21739 else 21740 return (HOST_WIDE_INT_1 << 36); 21741} 21742 21743static rtx 21744aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq, 21745 int code, tree treeop0, tree treeop1) 21746{ 21747 machine_mode op_mode, cmp_mode, cc_mode = CCmode; 21748 rtx op0, op1; 21749 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0)); 21750 insn_code icode; 21751 struct expand_operand ops[4]; 21752 21753 start_sequence (); 21754 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL); 21755 21756 op_mode = GET_MODE (op0); 21757 if (op_mode == VOIDmode) 21758 op_mode = GET_MODE (op1); 21759 21760 switch (op_mode) 21761 { 21762 case E_QImode: 21763 case E_HImode: 21764 case E_SImode: 21765 cmp_mode = SImode; 21766 icode = CODE_FOR_cmpsi; 21767 break; 21768 21769 case E_DImode: 21770 cmp_mode = DImode; 21771 icode = CODE_FOR_cmpdi; 21772 break; 21773 21774 case E_SFmode: 21775 cmp_mode = SFmode; 21776 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1); 21777 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf; 21778 break; 21779 21780 case E_DFmode: 21781 cmp_mode = DFmode; 21782 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1); 21783 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf; 21784 break; 21785 21786 default: 21787 end_sequence (); 21788 return NULL_RTX; 21789 } 21790 21791 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp); 21792 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp); 21793 if (!op0 || !op1) 21794 { 21795 end_sequence (); 21796 return NULL_RTX; 21797 } 21798 *prep_seq = get_insns (); 21799 end_sequence (); 21800 21801 create_fixed_operand (&ops[0], op0); 21802 create_fixed_operand (&ops[1], op1); 21803 21804 start_sequence (); 21805 if (!maybe_expand_insn (icode, 2, ops)) 21806 { 21807 end_sequence (); 21808 return NULL_RTX; 21809 } 21810 *gen_seq = get_insns (); 21811 end_sequence (); 21812 21813 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode, 21814 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx); 21815} 21816 21817static rtx 21818aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, 21819 int cmp_code, tree treeop0, tree treeop1, int bit_code) 21820{ 21821 rtx op0, op1, target; 21822 machine_mode op_mode, cmp_mode, cc_mode = CCmode; 21823 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0)); 21824 insn_code icode; 21825 struct expand_operand ops[6]; 21826 int aarch64_cond; 21827 21828 push_to_sequence (*prep_seq); 21829 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL); 21830 21831 op_mode = GET_MODE (op0); 21832 if (op_mode == VOIDmode) 21833 op_mode = GET_MODE (op1); 21834 21835 switch (op_mode) 21836 { 21837 case E_QImode: 21838 case E_HImode: 21839 case E_SImode: 21840 cmp_mode = SImode; 21841 break; 21842 21843 case E_DImode: 21844 cmp_mode = DImode; 21845 break; 21846 21847 case E_SFmode: 21848 cmp_mode = SFmode; 21849 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1); 21850 break; 21851 21852 case E_DFmode: 21853 cmp_mode = DFmode; 21854 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1); 21855 break; 21856 21857 default: 21858 end_sequence (); 21859 return NULL_RTX; 21860 } 21861 21862 icode = code_for_ccmp (cc_mode, cmp_mode); 21863 21864 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp); 21865 op1 = prepare_operand (icode, 
op1, 3, op_mode, cmp_mode, unsignedp); 21866 if (!op0 || !op1) 21867 { 21868 end_sequence (); 21869 return NULL_RTX; 21870 } 21871 *prep_seq = get_insns (); 21872 end_sequence (); 21873 21874 target = gen_rtx_REG (cc_mode, CC_REGNUM); 21875 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code); 21876 21877 if (bit_code != AND) 21878 { 21879 /* Treat the ccmp patterns as canonical and use them where possible, 21880 but fall back to ccmp_rev patterns if there's no other option. */ 21881 rtx_code prev_code = GET_CODE (prev); 21882 machine_mode prev_mode = GET_MODE (XEXP (prev, 0)); 21883 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode) 21884 && !(prev_code == EQ 21885 || prev_code == NE 21886 || prev_code == ORDERED 21887 || prev_code == UNORDERED)) 21888 icode = code_for_ccmp_rev (cc_mode, cmp_mode); 21889 else 21890 { 21891 rtx_code code = reverse_condition (prev_code); 21892 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx); 21893 } 21894 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond); 21895 } 21896 21897 create_fixed_operand (&ops[0], XEXP (prev, 0)); 21898 create_fixed_operand (&ops[1], target); 21899 create_fixed_operand (&ops[2], op0); 21900 create_fixed_operand (&ops[3], op1); 21901 create_fixed_operand (&ops[4], prev); 21902 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond)); 21903 21904 push_to_sequence (*gen_seq); 21905 if (!maybe_expand_insn (icode, 6, ops)) 21906 { 21907 end_sequence (); 21908 return NULL_RTX; 21909 } 21910 21911 *gen_seq = get_insns (); 21912 end_sequence (); 21913 21914 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx); 21915} 21916 21917#undef TARGET_GEN_CCMP_FIRST 21918#define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first 21919 21920#undef TARGET_GEN_CCMP_NEXT 21921#define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next 21922 21923/* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports 21924 instruction fusion of some sort. */ 21925 21926static bool 21927aarch64_macro_fusion_p (void) 21928{ 21929 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING; 21930} 21931 21932 21933/* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR 21934 should be kept together during scheduling. */ 21935 21936static bool 21937aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) 21938{ 21939 rtx set_dest; 21940 rtx prev_set = single_set (prev); 21941 rtx curr_set = single_set (curr); 21942 /* prev and curr are simple SET insns i.e. no flag setting or branching. 
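     For example, AARCH64_FUSE_MOV_MOVK below keeps a pair such as

       mov     x0, 0x1234
       movk    x0, 0x5678, lsl 16

     together (illustrative values; the checks work on the RTL patterns
     rather than on the assembly text).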
*/ 21943 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr); 21944 21945 if (!aarch64_macro_fusion_p ()) 21946 return false; 21947 21948 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK)) 21949 { 21950 /* We are trying to match: 21951 prev (mov) == (set (reg r0) (const_int imm16)) 21952 curr (movk) == (set (zero_extract (reg r0) 21953 (const_int 16) 21954 (const_int 16)) 21955 (const_int imm16_1)) */ 21956 21957 set_dest = SET_DEST (curr_set); 21958 21959 if (GET_CODE (set_dest) == ZERO_EXTRACT 21960 && CONST_INT_P (SET_SRC (curr_set)) 21961 && CONST_INT_P (SET_SRC (prev_set)) 21962 && CONST_INT_P (XEXP (set_dest, 2)) 21963 && INTVAL (XEXP (set_dest, 2)) == 16 21964 && REG_P (XEXP (set_dest, 0)) 21965 && REG_P (SET_DEST (prev_set)) 21966 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set))) 21967 { 21968 return true; 21969 } 21970 } 21971 21972 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD)) 21973 { 21974 21975 /* We're trying to match: 21976 prev (adrp) == (set (reg r1) 21977 (high (symbol_ref ("SYM")))) 21978 curr (add) == (set (reg r0) 21979 (lo_sum (reg r1) 21980 (symbol_ref ("SYM")))) 21981 Note that r0 need not necessarily be the same as r1, especially 21982 during pre-regalloc scheduling. */ 21983 21984 if (satisfies_constraint_Ush (SET_SRC (prev_set)) 21985 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set))) 21986 { 21987 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM 21988 && REG_P (XEXP (SET_SRC (curr_set), 0)) 21989 && REGNO (XEXP (SET_SRC (curr_set), 0)) 21990 == REGNO (SET_DEST (prev_set)) 21991 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0), 21992 XEXP (SET_SRC (curr_set), 1))) 21993 return true; 21994 } 21995 } 21996 21997 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK)) 21998 { 21999 22000 /* We're trying to match: 22001 prev (movk) == (set (zero_extract (reg r0) 22002 (const_int 16) 22003 (const_int 32)) 22004 (const_int imm16_1)) 22005 curr (movk) == (set (zero_extract (reg r0) 22006 (const_int 16) 22007 (const_int 48)) 22008 (const_int imm16_2)) */ 22009 22010 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT 22011 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT 22012 && REG_P (XEXP (SET_DEST (prev_set), 0)) 22013 && REG_P (XEXP (SET_DEST (curr_set), 0)) 22014 && REGNO (XEXP (SET_DEST (prev_set), 0)) 22015 == REGNO (XEXP (SET_DEST (curr_set), 0)) 22016 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2)) 22017 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2)) 22018 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32 22019 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48 22020 && CONST_INT_P (SET_SRC (prev_set)) 22021 && CONST_INT_P (SET_SRC (curr_set))) 22022 return true; 22023 22024 } 22025 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR)) 22026 { 22027 /* We're trying to match: 22028 prev (adrp) == (set (reg r0) 22029 (high (symbol_ref ("SYM")))) 22030 curr (ldr) == (set (reg r1) 22031 (mem (lo_sum (reg r0) 22032 (symbol_ref ("SYM"))))) 22033 or 22034 curr (ldr) == (set (reg r1) 22035 (zero_extend (mem 22036 (lo_sum (reg r0) 22037 (symbol_ref ("SYM")))))) */ 22038 if (satisfies_constraint_Ush (SET_SRC (prev_set)) 22039 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set))) 22040 { 22041 rtx curr_src = SET_SRC (curr_set); 22042 22043 if (GET_CODE (curr_src) == ZERO_EXTEND) 22044 curr_src = XEXP (curr_src, 0); 22045 22046 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM 22047 && REG_P (XEXP (XEXP (curr_src, 0), 0)) 22048 && REGNO (XEXP (XEXP 
(curr_src, 0), 0)) 22049 == REGNO (SET_DEST (prev_set)) 22050 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1), 22051 XEXP (SET_SRC (prev_set), 0))) 22052 return true; 22053 } 22054 } 22055 22056 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */ 22057 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH) 22058 && prev_set && curr_set && any_condjump_p (curr) 22059 && GET_CODE (SET_SRC (prev_set)) == COMPARE 22060 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0))) 22061 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr))) 22062 return true; 22063 22064 /* Fuse flag-setting ALU instructions and conditional branch. */ 22065 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH) 22066 && any_condjump_p (curr)) 22067 { 22068 unsigned int condreg1, condreg2; 22069 rtx cc_reg_1; 22070 aarch64_fixed_condition_code_regs (&condreg1, &condreg2); 22071 cc_reg_1 = gen_rtx_REG (CCmode, condreg1); 22072 22073 if (reg_referenced_p (cc_reg_1, PATTERN (curr)) 22074 && prev 22075 && modified_in_p (cc_reg_1, prev)) 22076 { 22077 enum attr_type prev_type = get_attr_type (prev); 22078 22079 /* FIXME: this misses some instructions that ThunderX considers 22080 simple arithmetic. Simple shifts are also missed here. */ 22081 if (prev_type == TYPE_ALUS_SREG 22082 || prev_type == TYPE_ALUS_IMM 22083 || prev_type == TYPE_LOGICS_REG 22084 || prev_type == TYPE_LOGICS_IMM) 22085 return true; 22086 } 22087 } 22088 22089 /* Fuse ALU instructions and CBZ/CBNZ. */ 22090 if (prev_set 22091 && curr_set 22092 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ) 22093 && any_condjump_p (curr)) 22094 { 22095 /* We're trying to match: 22096 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm))) 22097 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0) 22098 (const_int 0)) 22099 (label_ref ("SYM")) 22100 (pc)) */ 22101 if (SET_DEST (curr_set) == (pc_rtx) 22102 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE 22103 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0)) 22104 && REG_P (SET_DEST (prev_set)) 22105 && REGNO (SET_DEST (prev_set)) 22106 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0))) 22107 { 22108 /* Fuse ALU operations followed by conditional branch instruction. 
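             For example (illustrative operands):

               add     w0, w0, w1
               cbnz    w0, .L10

             The type check below restricts this to simple ALU operations.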
*/ 22109 switch (get_attr_type (prev)) 22110 { 22111 case TYPE_ALU_IMM: 22112 case TYPE_ALU_SREG: 22113 case TYPE_ADC_REG: 22114 case TYPE_ADC_IMM: 22115 case TYPE_ADCS_REG: 22116 case TYPE_ADCS_IMM: 22117 case TYPE_LOGIC_REG: 22118 case TYPE_LOGIC_IMM: 22119 case TYPE_CSEL: 22120 case TYPE_ADR: 22121 case TYPE_MOV_IMM: 22122 case TYPE_SHIFT_REG: 22123 case TYPE_SHIFT_IMM: 22124 case TYPE_BFM: 22125 case TYPE_RBIT: 22126 case TYPE_REV: 22127 case TYPE_EXTEND: 22128 return true; 22129 22130 default:; 22131 } 22132 } 22133 } 22134 22135 /* Fuse A+B+1 and A-B-1 */ 22136 if (simple_sets_p 22137 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1)) 22138 { 22139 /* We're trying to match: 22140 prev == (set (r0) (plus (r0) (r1))) 22141 curr == (set (r0) (plus (r0) (const_int 1))) 22142 or: 22143 prev == (set (r0) (minus (r0) (r1))) 22144 curr == (set (r0) (plus (r0) (const_int -1))) */ 22145 22146 rtx prev_src = SET_SRC (prev_set); 22147 rtx curr_src = SET_SRC (curr_set); 22148 22149 int polarity = 1; 22150 if (GET_CODE (prev_src) == MINUS) 22151 polarity = -1; 22152 22153 if (GET_CODE (curr_src) == PLUS 22154 && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS) 22155 && CONST_INT_P (XEXP (curr_src, 1)) 22156 && INTVAL (XEXP (curr_src, 1)) == polarity 22157 && REG_P (XEXP (curr_src, 0)) 22158 && REG_P (SET_DEST (prev_set)) 22159 && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0))) 22160 return true; 22161 } 22162 22163 return false; 22164} 22165 22166/* Return true iff the instruction fusion described by OP is enabled. */ 22167 22168bool 22169aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op) 22170{ 22171 return (aarch64_tune_params.fusible_ops & op) != 0; 22172} 22173 22174/* If MEM is in the form of [base+offset], extract the two parts 22175 of address and set to BASE and OFFSET, otherwise return false 22176 after clearing BASE and OFFSET. */ 22177 22178bool 22179extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset) 22180{ 22181 rtx addr; 22182 22183 gcc_assert (MEM_P (mem)); 22184 22185 addr = XEXP (mem, 0); 22186 22187 if (REG_P (addr)) 22188 { 22189 *base = addr; 22190 *offset = const0_rtx; 22191 return true; 22192 } 22193 22194 if (GET_CODE (addr) == PLUS 22195 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1))) 22196 { 22197 *base = XEXP (addr, 0); 22198 *offset = XEXP (addr, 1); 22199 return true; 22200 } 22201 22202 *base = NULL_RTX; 22203 *offset = NULL_RTX; 22204 22205 return false; 22206} 22207 22208/* Types for scheduling fusion. */ 22209enum sched_fusion_type 22210{ 22211 SCHED_FUSION_NONE = 0, 22212 SCHED_FUSION_LD_SIGN_EXTEND, 22213 SCHED_FUSION_LD_ZERO_EXTEND, 22214 SCHED_FUSION_LD, 22215 SCHED_FUSION_ST, 22216 SCHED_FUSION_NUM 22217}; 22218 22219/* If INSN is a load or store of address in the form of [base+offset], 22220 extract the two parts and set to BASE and OFFSET. Return scheduling 22221 fusion type this INSN is. 
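   For example, ldr w0, [x1, 16] is classified as SCHED_FUSION_LD with
   BASE x1 and OFFSET 16, while the sign-extending load ldrsw x0, [x1, 16]
   is classified as SCHED_FUSION_LD_SIGN_EXTEND.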
*/ 22222 22223static enum sched_fusion_type 22224fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset) 22225{ 22226 rtx x, dest, src; 22227 enum sched_fusion_type fusion = SCHED_FUSION_LD; 22228 22229 gcc_assert (INSN_P (insn)); 22230 x = PATTERN (insn); 22231 if (GET_CODE (x) != SET) 22232 return SCHED_FUSION_NONE; 22233 22234 src = SET_SRC (x); 22235 dest = SET_DEST (x); 22236 22237 machine_mode dest_mode = GET_MODE (dest); 22238 22239 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode)) 22240 return SCHED_FUSION_NONE; 22241 22242 if (GET_CODE (src) == SIGN_EXTEND) 22243 { 22244 fusion = SCHED_FUSION_LD_SIGN_EXTEND; 22245 src = XEXP (src, 0); 22246 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode) 22247 return SCHED_FUSION_NONE; 22248 } 22249 else if (GET_CODE (src) == ZERO_EXTEND) 22250 { 22251 fusion = SCHED_FUSION_LD_ZERO_EXTEND; 22252 src = XEXP (src, 0); 22253 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode) 22254 return SCHED_FUSION_NONE; 22255 } 22256 22257 if (GET_CODE (src) == MEM && REG_P (dest)) 22258 extract_base_offset_in_addr (src, base, offset); 22259 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx)) 22260 { 22261 fusion = SCHED_FUSION_ST; 22262 extract_base_offset_in_addr (dest, base, offset); 22263 } 22264 else 22265 return SCHED_FUSION_NONE; 22266 22267 if (*base == NULL_RTX || *offset == NULL_RTX) 22268 fusion = SCHED_FUSION_NONE; 22269 22270 return fusion; 22271} 22272 22273/* Implement the TARGET_SCHED_FUSION_PRIORITY hook. 22274 22275 Currently we only support to fuse ldr or str instructions, so FUSION_PRI 22276 and PRI are only calculated for these instructions. For other instruction, 22277 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other 22278 type instruction fusion can be added by returning different priorities. 22279 22280 It's important that irrelevant instructions get the largest FUSION_PRI. */ 22281 22282static void 22283aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri, 22284 int *fusion_pri, int *pri) 22285{ 22286 int tmp, off_val; 22287 rtx base, offset; 22288 enum sched_fusion_type fusion; 22289 22290 gcc_assert (INSN_P (insn)); 22291 22292 tmp = max_pri - 1; 22293 fusion = fusion_load_store (insn, &base, &offset); 22294 if (fusion == SCHED_FUSION_NONE) 22295 { 22296 *pri = tmp; 22297 *fusion_pri = tmp; 22298 return; 22299 } 22300 22301 /* Set FUSION_PRI according to fusion type and base register. */ 22302 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base); 22303 22304 /* Calculate PRI. */ 22305 tmp /= 2; 22306 22307 /* INSN with smaller offset goes first. */ 22308 off_val = (int)(INTVAL (offset)); 22309 if (off_val >= 0) 22310 tmp -= (off_val & 0xfffff); 22311 else 22312 tmp += ((- off_val) & 0xfffff); 22313 22314 *pri = tmp; 22315 return; 22316} 22317 22318/* Implement the TARGET_SCHED_ADJUST_PRIORITY hook. 22319 Adjust priority of sha1h instructions so they are scheduled before 22320 other SHA1 instructions. */ 22321 22322static int 22323aarch64_sched_adjust_priority (rtx_insn *insn, int priority) 22324{ 22325 rtx x = PATTERN (insn); 22326 22327 if (GET_CODE (x) == SET) 22328 { 22329 x = SET_SRC (x); 22330 22331 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H) 22332 return priority + 10; 22333 } 22334 22335 return priority; 22336} 22337 22338/* Given OPERANDS of consecutive load/store, check if we can merge 22339 them into ldp/stp. LOAD is true if they are load instructions. 22340 MODE is the mode of memory operands. 
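   For example, ldr x0, [x2] followed by ldr x1, [x2, 8] can be merged
   into ldp x0, x1, [x2], provided the checks below pass (distinct
   destination registers, the same base, consecutive offsets and matching
   register classes).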
*/ 22341 22342bool 22343aarch64_operands_ok_for_ldpstp (rtx *operands, bool load, 22344 machine_mode mode) 22345{ 22346 HOST_WIDE_INT offval_1, offval_2, msize; 22347 enum reg_class rclass_1, rclass_2; 22348 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2; 22349 22350 /* Allow the tuning structure to disable LDP instruction formation 22351 from combining instructions (e.g., in peephole2). */ 22352 if (load && (aarch64_tune_params.extra_tuning_flags 22353 & AARCH64_EXTRA_TUNE_NO_LDP_COMBINE)) 22354 return false; 22355 22356 if (load) 22357 { 22358 mem_1 = operands[1]; 22359 mem_2 = operands[3]; 22360 reg_1 = operands[0]; 22361 reg_2 = operands[2]; 22362 gcc_assert (REG_P (reg_1) && REG_P (reg_2)); 22363 if (REGNO (reg_1) == REGNO (reg_2)) 22364 return false; 22365 } 22366 else 22367 { 22368 mem_1 = operands[0]; 22369 mem_2 = operands[2]; 22370 reg_1 = operands[1]; 22371 reg_2 = operands[3]; 22372 } 22373 22374 /* The mems cannot be volatile. */ 22375 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)) 22376 return false; 22377 22378 /* If we have SImode and slow unaligned ldp, 22379 check the alignment to be at least 8 byte. */ 22380 if (mode == SImode 22381 && (aarch64_tune_params.extra_tuning_flags 22382 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) 22383 && !optimize_size 22384 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT) 22385 return false; 22386 22387 /* Check if the addresses are in the form of [base+offset]. */ 22388 extract_base_offset_in_addr (mem_1, &base_1, &offset_1); 22389 if (base_1 == NULL_RTX || offset_1 == NULL_RTX) 22390 return false; 22391 extract_base_offset_in_addr (mem_2, &base_2, &offset_2); 22392 if (base_2 == NULL_RTX || offset_2 == NULL_RTX) 22393 return false; 22394 22395 /* Check if the bases are same. */ 22396 if (!rtx_equal_p (base_1, base_2)) 22397 return false; 22398 22399 /* The operands must be of the same size. */ 22400 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)), 22401 GET_MODE_SIZE (GET_MODE (mem_2)))); 22402 22403 offval_1 = INTVAL (offset_1); 22404 offval_2 = INTVAL (offset_2); 22405 /* We should only be trying this for fixed-sized modes. There is no 22406 SVE LDP/STP instruction. */ 22407 msize = GET_MODE_SIZE (mode).to_constant (); 22408 /* Check if the offsets are consecutive. */ 22409 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize)) 22410 return false; 22411 22412 /* Check if the addresses are clobbered by load. */ 22413 if (load) 22414 { 22415 if (reg_mentioned_p (reg_1, mem_1)) 22416 return false; 22417 22418 /* In increasing order, the last load can clobber the address. */ 22419 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2)) 22420 return false; 22421 } 22422 22423 /* One of the memory accesses must be a mempair operand. 22424 If it is not the first one, they need to be swapped by the 22425 peephole. */ 22426 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1)) 22427 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2))) 22428 return false; 22429 22430 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1))) 22431 rclass_1 = FP_REGS; 22432 else 22433 rclass_1 = GENERAL_REGS; 22434 22435 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2))) 22436 rclass_2 = FP_REGS; 22437 else 22438 rclass_2 = GENERAL_REGS; 22439 22440 /* Check if the registers are of same class. */ 22441 if (rclass_1 != rclass_2) 22442 return false; 22443 22444 return true; 22445} 22446 22447/* Given OPERANDS of consecutive load/store that can be merged, 22448 swap them if they are not in ascending order. 
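   For example, str w1, [x0, 4] followed by str w2, [x0] is reordered so
   that the pair can later be emitted as stp w2, w1, [x0].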
*/ 22449void 22450aarch64_swap_ldrstr_operands (rtx* operands, bool load) 22451{ 22452 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2; 22453 HOST_WIDE_INT offval_1, offval_2; 22454 22455 if (load) 22456 { 22457 mem_1 = operands[1]; 22458 mem_2 = operands[3]; 22459 } 22460 else 22461 { 22462 mem_1 = operands[0]; 22463 mem_2 = operands[2]; 22464 } 22465 22466 extract_base_offset_in_addr (mem_1, &base_1, &offset_1); 22467 extract_base_offset_in_addr (mem_2, &base_2, &offset_2); 22468 22469 offval_1 = INTVAL (offset_1); 22470 offval_2 = INTVAL (offset_2); 22471 22472 if (offval_1 > offval_2) 22473 { 22474 /* Irrespective of whether this is a load or a store, 22475 we do the same swap. */ 22476 std::swap (operands[0], operands[2]); 22477 std::swap (operands[1], operands[3]); 22478 } 22479} 22480 22481/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a 22482 comparison between the two. */ 22483int 22484aarch64_host_wide_int_compare (const void *x, const void *y) 22485{ 22486 return wi::cmps (* ((const HOST_WIDE_INT *) x), 22487 * ((const HOST_WIDE_INT *) y)); 22488} 22489 22490/* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the 22491 other pointing to a REG rtx containing an offset, compare the offsets 22492 of the two pairs. 22493 22494 Return: 22495 22496 1 iff offset (X) > offset (Y) 22497 0 iff offset (X) == offset (Y) 22498 -1 iff offset (X) < offset (Y) */ 22499int 22500aarch64_ldrstr_offset_compare (const void *x, const void *y) 22501{ 22502 const rtx * operands_1 = (const rtx *) x; 22503 const rtx * operands_2 = (const rtx *) y; 22504 rtx mem_1, mem_2, base, offset_1, offset_2; 22505 22506 if (MEM_P (operands_1[0])) 22507 mem_1 = operands_1[0]; 22508 else 22509 mem_1 = operands_1[1]; 22510 22511 if (MEM_P (operands_2[0])) 22512 mem_2 = operands_2[0]; 22513 else 22514 mem_2 = operands_2[1]; 22515 22516 /* Extract the offsets. */ 22517 extract_base_offset_in_addr (mem_1, &base, &offset_1); 22518 extract_base_offset_in_addr (mem_2, &base, &offset_2); 22519 22520 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX); 22521 22522 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2)); 22523} 22524 22525/* Given OPERANDS of consecutive load/store, check if we can merge 22526 them into ldp/stp by adjusting the offset. LOAD is true if they 22527 are load instructions. MODE is the mode of memory operands. 22528 22529 Given below consecutive stores: 22530 22531 str w1, [xb, 0x100] 22532 str w1, [xb, 0x104] 22533 str w1, [xb, 0x108] 22534 str w1, [xb, 0x10c] 22535 22536 Though the offsets are out of the range supported by stp, we can 22537 still pair them after adjusting the offset, like: 22538 22539 add scratch, xb, 0x100 22540 stp w1, w1, [scratch] 22541 stp w1, w1, [scratch, 0x8] 22542 22543 The peephole patterns detecting this opportunity should guarantee 22544 the scratch register is avaliable. */ 22545 22546bool 22547aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load, 22548 scalar_mode mode) 22549{ 22550 const int num_insns = 4; 22551 enum reg_class rclass; 22552 HOST_WIDE_INT offvals[num_insns], msize; 22553 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns]; 22554 22555 if (load) 22556 { 22557 for (int i = 0; i < num_insns; i++) 22558 { 22559 reg[i] = operands[2 * i]; 22560 mem[i] = operands[2 * i + 1]; 22561 22562 gcc_assert (REG_P (reg[i])); 22563 } 22564 22565 /* Do not attempt to merge the loads if the loads clobber each other. 
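           For example, if two of the loads write the same destination
           register, as in ldr w1, [x0] and ldr w1, [x0, 8], an LDP could
           not load both values into that register.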
*/ 22566 for (int i = 0; i < 8; i += 2) 22567 for (int j = i + 2; j < 8; j += 2) 22568 if (reg_overlap_mentioned_p (operands[i], operands[j])) 22569 return false; 22570 } 22571 else 22572 for (int i = 0; i < num_insns; i++) 22573 { 22574 mem[i] = operands[2 * i]; 22575 reg[i] = operands[2 * i + 1]; 22576 } 22577 22578 /* Skip if memory operand is by itself valid for ldp/stp. */ 22579 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode)) 22580 return false; 22581 22582 for (int i = 0; i < num_insns; i++) 22583 { 22584 /* The mems cannot be volatile. */ 22585 if (MEM_VOLATILE_P (mem[i])) 22586 return false; 22587 22588 /* Check if the addresses are in the form of [base+offset]. */ 22589 extract_base_offset_in_addr (mem[i], base + i, offset + i); 22590 if (base[i] == NULL_RTX || offset[i] == NULL_RTX) 22591 return false; 22592 } 22593 22594 /* Check if the registers are of same class. */ 22595 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0])) 22596 ? FP_REGS : GENERAL_REGS; 22597 22598 for (int i = 1; i < num_insns; i++) 22599 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i]))) 22600 { 22601 if (rclass != FP_REGS) 22602 return false; 22603 } 22604 else 22605 { 22606 if (rclass != GENERAL_REGS) 22607 return false; 22608 } 22609 22610 /* Only the last register in the order in which they occur 22611 may be clobbered by the load. */ 22612 if (rclass == GENERAL_REGS && load) 22613 for (int i = 0; i < num_insns - 1; i++) 22614 if (reg_mentioned_p (reg[i], mem[i])) 22615 return false; 22616 22617 /* Check if the bases are same. */ 22618 for (int i = 0; i < num_insns - 1; i++) 22619 if (!rtx_equal_p (base[i], base[i + 1])) 22620 return false; 22621 22622 for (int i = 0; i < num_insns; i++) 22623 offvals[i] = INTVAL (offset[i]); 22624 22625 msize = GET_MODE_SIZE (mode); 22626 22627 /* Check if the offsets can be put in the right order to do a ldp/stp. */ 22628 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT), 22629 aarch64_host_wide_int_compare); 22630 22631 if (!(offvals[1] == offvals[0] + msize 22632 && offvals[3] == offvals[2] + msize)) 22633 return false; 22634 22635 /* Check that offsets are within range of each other. The ldp/stp 22636 instructions have 7 bit immediate offsets, so use 0x80. */ 22637 if (offvals[2] - offvals[0] >= msize * 0x80) 22638 return false; 22639 22640 /* The offsets must be aligned with respect to each other. */ 22641 if (offvals[0] % msize != offvals[2] % msize) 22642 return false; 22643 22644 /* If we have SImode and slow unaligned ldp, 22645 check the alignment to be at least 8 byte. */ 22646 if (mode == SImode 22647 && (aarch64_tune_params.extra_tuning_flags 22648 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) 22649 && !optimize_size 22650 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT) 22651 return false; 22652 22653 return true; 22654} 22655 22656/* Given OPERANDS of consecutive load/store, this function pairs them 22657 into LDP/STP after adjusting the offset. It depends on the fact 22658 that the operands can be sorted so the offsets are correct for STP. 22659 MODE is the mode of memory operands. CODE is the rtl operator 22660 which should be applied to all memory operands, it's SIGN_EXTEND, 22661 ZERO_EXTEND or UNKNOWN. 
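   For example, four consecutive SImode stores whose offsets do not fit the
   STP immediate range are rewritten as an ADD into the scratch register
   (operand 8) followed by two STPs at small offsets from it, roughly
   (illustrative registers):

     add     scratch, xb, offset
     stp     w1, w2, [scratch]
     stp     w3, w4, [scratch, 8]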
*/ 22662 22663bool 22664aarch64_gen_adjusted_ldpstp (rtx *operands, bool load, 22665 scalar_mode mode, RTX_CODE code) 22666{ 22667 rtx base, offset_1, offset_3, t1, t2; 22668 rtx mem_1, mem_2, mem_3, mem_4; 22669 rtx temp_operands[8]; 22670 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3, 22671 stp_off_upper_limit, stp_off_lower_limit, msize; 22672 22673 /* We make changes on a copy as we may still bail out. */ 22674 for (int i = 0; i < 8; i ++) 22675 temp_operands[i] = operands[i]; 22676 22677 /* Sort the operands. */ 22678 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare); 22679 22680 /* Copy the memory operands so that if we have to bail for some 22681 reason the original addresses are unchanged. */ 22682 if (load) 22683 { 22684 mem_1 = copy_rtx (temp_operands[1]); 22685 mem_2 = copy_rtx (temp_operands[3]); 22686 mem_3 = copy_rtx (temp_operands[5]); 22687 mem_4 = copy_rtx (temp_operands[7]); 22688 } 22689 else 22690 { 22691 mem_1 = copy_rtx (temp_operands[0]); 22692 mem_2 = copy_rtx (temp_operands[2]); 22693 mem_3 = copy_rtx (temp_operands[4]); 22694 mem_4 = copy_rtx (temp_operands[6]); 22695 gcc_assert (code == UNKNOWN); 22696 } 22697 22698 extract_base_offset_in_addr (mem_1, &base, &offset_1); 22699 extract_base_offset_in_addr (mem_3, &base, &offset_3); 22700 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX 22701 && offset_3 != NULL_RTX); 22702 22703 /* Adjust offset so it can fit in LDP/STP instruction. */ 22704 msize = GET_MODE_SIZE (mode); 22705 stp_off_upper_limit = msize * (0x40 - 1); 22706 stp_off_lower_limit = - msize * 0x40; 22707 22708 off_val_1 = INTVAL (offset_1); 22709 off_val_3 = INTVAL (offset_3); 22710 22711 /* The base offset is optimally half way between the two STP/LDP offsets. */ 22712 if (msize <= 4) 22713 base_off = (off_val_1 + off_val_3) / 2; 22714 else 22715 /* However, due to issues with negative LDP/STP offset generation for 22716 larger modes, for DF, DI and vector modes. we must not use negative 22717 addresses smaller than 9 signed unadjusted bits can store. This 22718 provides the most range in this case. */ 22719 base_off = off_val_1; 22720 22721 /* Adjust the base so that it is aligned with the addresses but still 22722 optimal. */ 22723 if (base_off % msize != off_val_1 % msize) 22724 /* Fix the offset, bearing in mind we want to make it bigger not 22725 smaller. */ 22726 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize; 22727 else if (msize <= 4) 22728 /* The negative range of LDP/STP is one larger than the positive range. */ 22729 base_off += msize; 22730 22731 /* Check if base offset is too big or too small. We can attempt to resolve 22732 this issue by setting it to the maximum value and seeing if the offsets 22733 still fit. */ 22734 if (base_off >= 0x1000) 22735 { 22736 base_off = 0x1000 - 1; 22737 /* We must still make sure that the base offset is aligned with respect 22738 to the address. But it may not be made any bigger. */ 22739 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize; 22740 } 22741 22742 /* Likewise for the case where the base is too small. */ 22743 if (base_off <= -0x1000) 22744 { 22745 base_off = -0x1000 + 1; 22746 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize; 22747 } 22748 22749 /* Offset of the first STP/LDP. */ 22750 new_off_1 = off_val_1 - base_off; 22751 22752 /* Offset of the second STP/LDP. 
*/ 22753 new_off_3 = off_val_3 - base_off; 22754 22755 /* The offsets must be within the range of the LDP/STP instructions. */ 22756 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit 22757 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit) 22758 return false; 22759 22760 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8], 22761 new_off_1), true); 22762 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8], 22763 new_off_1 + msize), true); 22764 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8], 22765 new_off_3), true); 22766 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8], 22767 new_off_3 + msize), true); 22768 22769 if (!aarch64_mem_pair_operand (mem_1, mode) 22770 || !aarch64_mem_pair_operand (mem_3, mode)) 22771 return false; 22772 22773 if (code == ZERO_EXTEND) 22774 { 22775 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1); 22776 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2); 22777 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3); 22778 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4); 22779 } 22780 else if (code == SIGN_EXTEND) 22781 { 22782 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1); 22783 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2); 22784 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3); 22785 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4); 22786 } 22787 22788 if (load) 22789 { 22790 operands[0] = temp_operands[0]; 22791 operands[1] = mem_1; 22792 operands[2] = temp_operands[2]; 22793 operands[3] = mem_2; 22794 operands[4] = temp_operands[4]; 22795 operands[5] = mem_3; 22796 operands[6] = temp_operands[6]; 22797 operands[7] = mem_4; 22798 } 22799 else 22800 { 22801 operands[0] = mem_1; 22802 operands[1] = temp_operands[1]; 22803 operands[2] = mem_2; 22804 operands[3] = temp_operands[3]; 22805 operands[4] = mem_3; 22806 operands[5] = temp_operands[5]; 22807 operands[6] = mem_4; 22808 operands[7] = temp_operands[7]; 22809 } 22810 22811 /* Emit adjusting instruction. */ 22812 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off))); 22813 /* Emit ldp/stp instructions. */ 22814 t1 = gen_rtx_SET (operands[0], operands[1]); 22815 t2 = gen_rtx_SET (operands[2], operands[3]); 22816 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2))); 22817 t1 = gen_rtx_SET (operands[4], operands[5]); 22818 t2 = gen_rtx_SET (operands[6], operands[7]); 22819 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2))); 22820 return true; 22821} 22822 22823/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that 22824 it isn't worth branching around empty masked ops (including masked 22825 stores). */ 22826 22827static bool 22828aarch64_empty_mask_is_expensive (unsigned) 22829{ 22830 return false; 22831} 22832 22833/* Return 1 if pseudo register should be created and used to hold 22834 GOT address for PIC code. */ 22835 22836bool 22837aarch64_use_pseudo_pic_reg (void) 22838{ 22839 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC; 22840} 22841 22842/* Implement TARGET_UNSPEC_MAY_TRAP_P. */ 22843 22844static int 22845aarch64_unspec_may_trap_p (const_rtx x, unsigned flags) 22846{ 22847 switch (XINT (x, 1)) 22848 { 22849 case UNSPEC_GOTSMALLPIC: 22850 case UNSPEC_GOTSMALLPIC28K: 22851 case UNSPEC_GOTTINYPIC: 22852 return 0; 22853 default: 22854 break; 22855 } 22856 22857 return default_unspec_may_trap_p (x, flags); 22858} 22859 22860 22861/* If X is a positive CONST_DOUBLE with a value that is a power of 2 22862 return the log2 of that value. Otherwise return -1. 
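   For example, a CONST_DOUBLE of 8.0 yields 3, while 0.75, -4.0, infinities
   and NaNs all yield -1.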
*/ 22863 22864int 22865aarch64_fpconst_pow_of_2 (rtx x) 22866{ 22867 const REAL_VALUE_TYPE *r; 22868 22869 if (!CONST_DOUBLE_P (x)) 22870 return -1; 22871 22872 r = CONST_DOUBLE_REAL_VALUE (x); 22873 22874 if (REAL_VALUE_NEGATIVE (*r) 22875 || REAL_VALUE_ISNAN (*r) 22876 || REAL_VALUE_ISINF (*r) 22877 || !real_isinteger (r, DFmode)) 22878 return -1; 22879 22880 return exact_log2 (real_to_integer (r)); 22881} 22882 22883/* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a 22884 power of 2 (i.e 1/2^n) return the number of float bits. e.g. for x==(1/2^n) 22885 return n. Otherwise return -1. */ 22886 22887int 22888aarch64_fpconst_pow2_recip (rtx x) 22889{ 22890 REAL_VALUE_TYPE r0; 22891 22892 if (!CONST_DOUBLE_P (x)) 22893 return -1; 22894 22895 r0 = *CONST_DOUBLE_REAL_VALUE (x); 22896 if (exact_real_inverse (DFmode, &r0) 22897 && !REAL_VALUE_NEGATIVE (r0)) 22898 { 22899 int ret = exact_log2 (real_to_integer (&r0)); 22900 if (ret >= 1 && ret <= 32) 22901 return ret; 22902 } 22903 return -1; 22904} 22905 22906/* If X is a vector of equal CONST_DOUBLE values and that value is 22907 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */ 22908 22909int 22910aarch64_vec_fpconst_pow_of_2 (rtx x) 22911{ 22912 int nelts; 22913 if (GET_CODE (x) != CONST_VECTOR 22914 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts)) 22915 return -1; 22916 22917 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT) 22918 return -1; 22919 22920 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0)); 22921 if (firstval <= 0) 22922 return -1; 22923 22924 for (int i = 1; i < nelts; i++) 22925 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval) 22926 return -1; 22927 22928 return firstval; 22929} 22930 22931/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types 22932 to float. 22933 22934 __fp16 always promotes through this hook. 22935 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that 22936 through the generic excess precision logic rather than here. */ 22937 22938static tree 22939aarch64_promoted_type (const_tree t) 22940{ 22941 if (SCALAR_FLOAT_TYPE_P (t) 22942 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node) 22943 return float_type_node; 22944 22945 return NULL_TREE; 22946} 22947 22948/* Implement the TARGET_OPTAB_SUPPORTED_P hook. */ 22949 22950static bool 22951aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode, 22952 optimization_type opt_type) 22953{ 22954 switch (op) 22955 { 22956 case rsqrt_optab: 22957 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1); 22958 22959 default: 22960 return true; 22961 } 22962} 22963 22964/* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */ 22965 22966static unsigned int 22967aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor, 22968 int *offset) 22969{ 22970 /* Polynomial invariant 1 == (VG / 2) - 1. */ 22971 gcc_assert (i == 1); 22972 *factor = 2; 22973 *offset = 1; 22974 return AARCH64_DWARF_VG; 22975} 22976 22977/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE 22978 if MODE is HFmode, and punt to the generic implementation otherwise. */ 22979 22980static bool 22981aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode) 22982{ 22983 return (mode == HFmode 22984 ? true 22985 : default_libgcc_floating_mode_supported_p (mode)); 22986} 22987 22988/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE 22989 if MODE is HFmode, and punt to the generic implementation otherwise. 
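   Returning true for HFmode here is what makes __fp16 and _Float16 scalars
   usable even without the ARMv8.2-A FP16 instructions; arithmetic on them
   is then carried out in SFmode via the promotion and excess-precision
   handling elsewhere in this file.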
*/ 22990 22991static bool 22992aarch64_scalar_mode_supported_p (scalar_mode mode) 22993{ 22994 return (mode == HFmode 22995 ? true 22996 : default_scalar_mode_supported_p (mode)); 22997} 22998 22999/* Set the value of FLT_EVAL_METHOD. 23000 ISO/IEC TS 18661-3 defines two values that we'd like to make use of: 23001 23002 0: evaluate all operations and constants, whose semantic type has at 23003 most the range and precision of type float, to the range and 23004 precision of float; evaluate all other operations and constants to 23005 the range and precision of the semantic type; 23006 23007 N, where _FloatN is a supported interchange floating type 23008 evaluate all operations and constants, whose semantic type has at 23009 most the range and precision of _FloatN type, to the range and 23010 precision of the _FloatN type; evaluate all other operations and 23011 constants to the range and precision of the semantic type; 23012 23013 If we have the ARMv8.2-A extensions then we support _Float16 in native 23014 precision, so we should set this to 16. Otherwise, we support the type, 23015 but want to evaluate expressions in float precision, so set this to 23016 0. */ 23017 23018static enum flt_eval_method 23019aarch64_excess_precision (enum excess_precision_type type) 23020{ 23021 switch (type) 23022 { 23023 case EXCESS_PRECISION_TYPE_FAST: 23024 case EXCESS_PRECISION_TYPE_STANDARD: 23025 /* We can calculate either in 16-bit range and precision or 23026 32-bit range and precision. Make that decision based on whether 23027 we have native support for the ARMv8.2-A 16-bit floating-point 23028 instructions or not. */ 23029 return (TARGET_FP_F16INST 23030 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 23031 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT); 23032 case EXCESS_PRECISION_TYPE_IMPLICIT: 23033 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16; 23034 default: 23035 gcc_unreachable (); 23036 } 23037 return FLT_EVAL_METHOD_UNPREDICTABLE; 23038} 23039 23040/* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be 23041 scheduled for speculative execution. Reject the long-running division 23042 and square-root instructions. */ 23043 23044static bool 23045aarch64_sched_can_speculate_insn (rtx_insn *insn) 23046{ 23047 switch (get_attr_type (insn)) 23048 { 23049 case TYPE_SDIV: 23050 case TYPE_UDIV: 23051 case TYPE_FDIVS: 23052 case TYPE_FDIVD: 23053 case TYPE_FSQRTS: 23054 case TYPE_FSQRTD: 23055 case TYPE_NEON_FP_SQRT_S: 23056 case TYPE_NEON_FP_SQRT_D: 23057 case TYPE_NEON_FP_SQRT_S_Q: 23058 case TYPE_NEON_FP_SQRT_D_Q: 23059 case TYPE_NEON_FP_DIV_S: 23060 case TYPE_NEON_FP_DIV_D: 23061 case TYPE_NEON_FP_DIV_S_Q: 23062 case TYPE_NEON_FP_DIV_D_Q: 23063 return false; 23064 default: 23065 return true; 23066 } 23067} 23068 23069/* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */ 23070 23071static int 23072aarch64_compute_pressure_classes (reg_class *classes) 23073{ 23074 int i = 0; 23075 classes[i++] = GENERAL_REGS; 23076 classes[i++] = FP_REGS; 23077 /* PR_REGS isn't a useful pressure class because many predicate pseudo 23078 registers need to go in PR_LO_REGS at some point during their 23079 lifetime. Splitting it into two halves has the effect of making 23080 all predicates count against PR_LO_REGS, so that we try whenever 23081 possible to restrict the number of live predicates to 8. This 23082 greatly reduces the amount of spilling in certain loops. */ 23083 classes[i++] = PR_LO_REGS; 23084 classes[i++] = PR_HI_REGS; 23085 return i; 23086} 23087 23088/* Implement TARGET_CAN_CHANGE_MODE_CLASS. 
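   For example, a change between a predicate mode such as VNx16BImode and a
   vector mode such as V16QImode is rejected, because moving the value
   between a predicate register and a vector register would need a round
   trip through memory.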
*/ 23089 23090static bool 23091aarch64_can_change_mode_class (machine_mode from, 23092 machine_mode to, reg_class_t) 23093{ 23094 unsigned int from_flags = aarch64_classify_vector_mode (from); 23095 unsigned int to_flags = aarch64_classify_vector_mode (to); 23096 23097 bool from_sve_p = (from_flags & VEC_ANY_SVE); 23098 bool to_sve_p = (to_flags & VEC_ANY_SVE); 23099 23100 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL); 23101 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL); 23102 23103 bool from_pred_p = (from_flags & VEC_SVE_PRED); 23104 bool to_pred_p = (to_flags & VEC_SVE_PRED); 23105 23106 /* Don't allow changes between predicate modes and other modes. 23107 Only predicate registers can hold predicate modes and only 23108 non-predicate registers can hold non-predicate modes, so any 23109 attempt to mix them would require a round trip through memory. */ 23110 if (from_pred_p != to_pred_p) 23111 return false; 23112 23113 /* Don't allow changes between partial SVE modes and other modes. 23114 The contents of partial SVE modes are distributed evenly across 23115 the register, whereas GCC expects them to be clustered together. */ 23116 if (from_partial_sve_p != to_partial_sve_p) 23117 return false; 23118 23119 /* Similarly reject changes between partial SVE modes that have 23120 different patterns of significant and insignificant bits. */ 23121 if (from_partial_sve_p 23122 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to) 23123 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))) 23124 return false; 23125 23126 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u)) 23127 { 23128 /* Don't allow changes between SVE modes and other modes that might 23129 be bigger than 128 bits. In particular, OImode, CImode and XImode 23130 divide into 128-bit quantities while SVE modes divide into 23131 BITS_PER_SVE_VECTOR quantities. */ 23132 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128)) 23133 return false; 23134 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128)) 23135 return false; 23136 } 23137 23138 if (BYTES_BIG_ENDIAN) 23139 { 23140 /* Don't allow changes between SVE data modes and non-SVE modes. 23141 See the comment at the head of aarch64-sve.md for details. */ 23142 if (from_sve_p != to_sve_p) 23143 return false; 23144 23145 /* Don't allow changes in element size: lane 0 of the new vector 23146 would not then be lane 0 of the old vector. See the comment 23147 above aarch64_maybe_expand_sve_subreg_move for a more detailed 23148 description. 23149 23150 In the worst case, this forces a register to be spilled in 23151 one mode and reloaded in the other, which handles the 23152 endianness correctly. */ 23153 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)) 23154 return false; 23155 } 23156 return true; 23157} 23158 23159/* Implement TARGET_EARLY_REMAT_MODES. */ 23160 23161static void 23162aarch64_select_early_remat_modes (sbitmap modes) 23163{ 23164 /* SVE values are not normally live across a call, so it should be 23165 worth doing early rematerialization even in VL-specific mode. */ 23166 for (int i = 0; i < NUM_MACHINE_MODES; ++i) 23167 if (aarch64_sve_mode_p ((machine_mode) i)) 23168 bitmap_set_bit (modes, i); 23169} 23170 23171/* Override the default target speculation_safe_value. */ 23172static rtx 23173aarch64_speculation_safe_value (machine_mode mode, 23174 rtx result, rtx val, rtx failval) 23175{ 23176 /* Maybe we should warn if falling back to hard barriers. 
They are 23177 likely to be noticably more expensive than the alternative below. */ 23178 if (!aarch64_track_speculation) 23179 return default_speculation_safe_value (mode, result, val, failval); 23180 23181 if (!REG_P (val)) 23182 val = copy_to_mode_reg (mode, val); 23183 23184 if (!aarch64_reg_or_zero (failval, mode)) 23185 failval = copy_to_mode_reg (mode, failval); 23186 23187 emit_insn (gen_despeculate_copy (mode, result, val, failval)); 23188 return result; 23189} 23190 23191/* Implement TARGET_ESTIMATED_POLY_VALUE. 23192 Look into the tuning structure for an estimate. 23193 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial 23194 Advanced SIMD 128 bits. */ 23195 23196static HOST_WIDE_INT 23197aarch64_estimated_poly_value (poly_int64 val) 23198{ 23199 enum aarch64_sve_vector_bits_enum width_source 23200 = aarch64_tune_params.sve_width; 23201 23202 /* If we still don't have an estimate, use the default. */ 23203 if (width_source == SVE_SCALABLE) 23204 return default_estimated_poly_value (val); 23205 23206 HOST_WIDE_INT over_128 = width_source - 128; 23207 return val.coeffs[0] + val.coeffs[1] * over_128 / 128; 23208} 23209 23210 23211/* Return true for types that could be supported as SIMD return or 23212 argument types. */ 23213 23214static bool 23215supported_simd_type (tree t) 23216{ 23217 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t)) 23218 { 23219 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t)); 23220 return s == 1 || s == 2 || s == 4 || s == 8; 23221 } 23222 return false; 23223} 23224 23225/* Return true for types that currently are supported as SIMD return 23226 or argument types. */ 23227 23228static bool 23229currently_supported_simd_type (tree t, tree b) 23230{ 23231 if (COMPLEX_FLOAT_TYPE_P (t)) 23232 return false; 23233 23234 if (TYPE_SIZE (t) != TYPE_SIZE (b)) 23235 return false; 23236 23237 return supported_simd_type (t); 23238} 23239 23240/* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */ 23241 23242static int 23243aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, 23244 struct cgraph_simd_clone *clonei, 23245 tree base_type, int num) 23246{ 23247 tree t, ret_type, arg_type; 23248 unsigned int elt_bits, vec_bits, count; 23249 23250 if (!TARGET_SIMD) 23251 return 0; 23252 23253 if (clonei->simdlen 23254 && (clonei->simdlen < 2 23255 || clonei->simdlen > 1024 23256 || (clonei->simdlen & (clonei->simdlen - 1)) != 0)) 23257 { 23258 warning_at (DECL_SOURCE_LOCATION (node->decl), 0, 23259 "unsupported simdlen %d", clonei->simdlen); 23260 return 0; 23261 } 23262 23263 ret_type = TREE_TYPE (TREE_TYPE (node->decl)); 23264 if (TREE_CODE (ret_type) != VOID_TYPE 23265 && !currently_supported_simd_type (ret_type, base_type)) 23266 { 23267 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type)) 23268 warning_at (DECL_SOURCE_LOCATION (node->decl), 0, 23269 "GCC does not currently support mixed size types " 23270 "for %<simd%> functions"); 23271 else if (supported_simd_type (ret_type)) 23272 warning_at (DECL_SOURCE_LOCATION (node->decl), 0, 23273 "GCC does not currently support return type %qT " 23274 "for %<simd%> functions", ret_type); 23275 else 23276 warning_at (DECL_SOURCE_LOCATION (node->decl), 0, 23277 "unsupported return type %qT for %<simd%> functions", 23278 ret_type); 23279 return 0; 23280 } 23281 23282 int i; 23283 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl)); 23284 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE); 23285 23286 for (t = (decl_arg_p ? 
DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0; 23287 t && t != void_list_node; t = TREE_CHAIN (t), i++) 23288 { 23289 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t); 23290 23291 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM 23292 && !currently_supported_simd_type (arg_type, base_type)) 23293 { 23294 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type)) 23295 warning_at (DECL_SOURCE_LOCATION (node->decl), 0, 23296 "GCC does not currently support mixed size types " 23297 "for %<simd%> functions"); 23298 else 23299 warning_at (DECL_SOURCE_LOCATION (node->decl), 0, 23300 "GCC does not currently support argument type %qT " 23301 "for %<simd%> functions", arg_type); 23302 return 0; 23303 } 23304 } 23305 23306 clonei->vecsize_mangle = 'n'; 23307 clonei->mask_mode = VOIDmode; 23308 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type)); 23309 if (clonei->simdlen == 0) 23310 { 23311 count = 2; 23312 vec_bits = (num == 0 ? 64 : 128); 23313 clonei->simdlen = vec_bits / elt_bits; 23314 } 23315 else 23316 { 23317 count = 1; 23318 vec_bits = clonei->simdlen * elt_bits; 23319 if (vec_bits != 64 && vec_bits != 128) 23320 { 23321 warning_at (DECL_SOURCE_LOCATION (node->decl), 0, 23322 "GCC does not currently support simdlen %d for type %qT", 23323 clonei->simdlen, base_type); 23324 return 0; 23325 } 23326 } 23327 clonei->vecsize_int = vec_bits; 23328 clonei->vecsize_float = vec_bits; 23329 return count; 23330} 23331 23332/* Implement TARGET_SIMD_CLONE_ADJUST. */ 23333 23334static void 23335aarch64_simd_clone_adjust (struct cgraph_node *node) 23336{ 23337 /* Add aarch64_vector_pcs target attribute to SIMD clones so they 23338 use the correct ABI. */ 23339 23340 tree t = TREE_TYPE (node->decl); 23341 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default", 23342 TYPE_ATTRIBUTES (t)); 23343} 23344 23345/* Implement TARGET_SIMD_CLONE_USABLE. */ 23346 23347static int 23348aarch64_simd_clone_usable (struct cgraph_node *node) 23349{ 23350 switch (node->simdclone->vecsize_mangle) 23351 { 23352 case 'n': 23353 if (!TARGET_SIMD) 23354 return -1; 23355 return 0; 23356 default: 23357 gcc_unreachable (); 23358 } 23359} 23360 23361/* Implement TARGET_COMP_TYPE_ATTRIBUTES */ 23362 23363static int 23364aarch64_comp_type_attributes (const_tree type1, const_tree type2) 23365{ 23366 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1)) 23367 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2))) 23368 return 0; 23369 return 1; 23370} 23371 23372/* Implement TARGET_GET_MULTILIB_ABI_NAME */ 23373 23374static const char * 23375aarch64_get_multilib_abi_name (void) 23376{ 23377 if (TARGET_BIG_END) 23378 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be"; 23379 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64"; 23380} 23381 23382/* Implement TARGET_STACK_PROTECT_GUARD. In case of a 23383 global variable based guard use the default else 23384 return a null tree. */ 23385static tree 23386aarch64_stack_protect_guard (void) 23387{ 23388 if (aarch64_stack_protector_guard == SSP_GLOBAL) 23389 return default_stack_protect_guard (); 23390 23391 return NULL_TREE; 23392} 23393 23394/* Return the diagnostic message string if conversion from FROMTYPE to 23395 TOTYPE is not allowed, NULL otherwise. */ 23396 23397static const char * 23398aarch64_invalid_conversion (const_tree fromtype, const_tree totype) 23399{ 23400 if (element_mode (fromtype) != element_mode (totype)) 23401 { 23402 /* Do no allow conversions to/from BFmode scalar types. 
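	 Together with the unary and binary operator hooks below, this makes
	 bfloat16_t effectively a storage-only type: its values can have
	 their address taken, be copied and be passed around, but cannot be
	 converted to or combined with other scalar types directly.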
*/ 23403 if (TYPE_MODE (fromtype) == BFmode) 23404 return N_("invalid conversion from type %<bfloat16_t%>"); 23405 if (TYPE_MODE (totype) == BFmode) 23406 return N_("invalid conversion to type %<bfloat16_t%>"); 23407 } 23408 23409 /* Conversion allowed. */ 23410 return NULL; 23411} 23412 23413/* Return the diagnostic message string if the unary operation OP is 23414 not permitted on TYPE, NULL otherwise. */ 23415 23416static const char * 23417aarch64_invalid_unary_op (int op, const_tree type) 23418{ 23419 /* Reject all single-operand operations on BFmode except for &. */ 23420 if (element_mode (type) == BFmode && op != ADDR_EXPR) 23421 return N_("operation not permitted on type %<bfloat16_t%>"); 23422 23423 /* Operation allowed. */ 23424 return NULL; 23425} 23426 23427/* Return the diagnostic message string if the binary operation OP is 23428 not permitted on TYPE1 and TYPE2, NULL otherwise. */ 23429 23430static const char * 23431aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1, 23432 const_tree type2) 23433{ 23434 /* Reject all 2-operand operations on BFmode. */ 23435 if (element_mode (type1) == BFmode 23436 || element_mode (type2) == BFmode) 23437 return N_("operation not permitted on type %<bfloat16_t%>"); 23438 23439 if (VECTOR_TYPE_P (type1) 23440 && VECTOR_TYPE_P (type2) 23441 && !TYPE_INDIVISIBLE_P (type1) 23442 && !TYPE_INDIVISIBLE_P (type2) 23443 && (aarch64_sve::builtin_type_p (type1) 23444 != aarch64_sve::builtin_type_p (type2))) 23445 return N_("cannot combine GNU and SVE vectors in a binary operation"); 23446 23447 /* Operation allowed. */ 23448 return NULL; 23449} 23450 23451/* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE 23452 section at the end if needed. */ 23453#define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000 23454#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) 23455#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1) 23456void 23457aarch64_file_end_indicate_exec_stack () 23458{ 23459 file_end_indicate_exec_stack (); 23460 23461 unsigned feature_1_and = 0; 23462 if (aarch64_bti_enabled ()) 23463 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI; 23464 23465 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE) 23466 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC; 23467 23468 if (feature_1_and) 23469 { 23470 /* Generate .note.gnu.property section. */ 23471 switch_to_section (get_section (".note.gnu.property", 23472 SECTION_NOTYPE, NULL)); 23473 23474 /* PT_NOTE header: namesz, descsz, type. 23475 namesz = 4 ("GNU\0") 23476 descsz = 16 (Size of the program property array) 23477 [(12 + padding) * Number of array elements] 23478 type = 5 (NT_GNU_PROPERTY_TYPE_0). */ 23479 assemble_align (POINTER_SIZE); 23480 assemble_integer (GEN_INT (4), 4, 32, 1); 23481 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1); 23482 assemble_integer (GEN_INT (5), 4, 32, 1); 23483 23484 /* PT_NOTE name. */ 23485 assemble_string ("GNU", 4); 23486 23487 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0: 23488 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND 23489 datasz = 4 23490 data = feature_1_and. */ 23491 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1); 23492 assemble_integer (GEN_INT (4), 4, 32, 1); 23493 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1); 23494 23495 /* Pad the size of the note to the required alignment. 
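	 With 64-bit pointers this rounds the 12 bytes of property data up to
	 the 16-byte descsz emitted above (ROUND_UP (12, POINTER_BYTES));
	 for ILP32 no extra padding is needed.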
      */
      assemble_align (POINTER_SIZE);
    }
}
#undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
#undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
#undef GNU_PROPERTY_AARCH64_FEATURE_1_AND

/* Helper function for straight line speculation.
   Return what barrier should be emitted for straight line speculation
   mitigation.
   When not mitigating against straight line speculation this function returns
   an empty string.
   When mitigating against straight line speculation, use:
   * SB when the v8.5-A SB extension is enabled.
   * DSB+ISB otherwise.  */
const char *
aarch64_sls_barrier (int mitigation_required)
{
  return mitigation_required
    ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
    : "";
}

static GTY (()) tree aarch64_sls_shared_thunks[30];
static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
const char *indirect_symbol_names[30] = {
    "__call_indirect_x0",
    "__call_indirect_x1",
    "__call_indirect_x2",
    "__call_indirect_x3",
    "__call_indirect_x4",
    "__call_indirect_x5",
    "__call_indirect_x6",
    "__call_indirect_x7",
    "__call_indirect_x8",
    "__call_indirect_x9",
    "__call_indirect_x10",
    "__call_indirect_x11",
    "__call_indirect_x12",
    "__call_indirect_x13",
    "__call_indirect_x14",
    "__call_indirect_x15",
    "", /* "__call_indirect_x16", */
    "", /* "__call_indirect_x17", */
    "__call_indirect_x18",
    "__call_indirect_x19",
    "__call_indirect_x20",
    "__call_indirect_x21",
    "__call_indirect_x22",
    "__call_indirect_x23",
    "__call_indirect_x24",
    "__call_indirect_x25",
    "__call_indirect_x26",
    "__call_indirect_x27",
    "__call_indirect_x28",
    "__call_indirect_x29",
};

/* Function to create a BLR thunk.  This thunk is used to mitigate straight
   line speculation.  Instead of a simple BLR that can be speculated past,
   we emit a BL to this thunk, and this thunk contains a BR to the relevant
   register.  These thunks have the relevant speculation barriers put after
   their indirect branch so that speculation is blocked.

   We use such a thunk so the speculation barriers are kept off the
   architecturally executed path in order to reduce the performance overhead.

   When optimizing for size we use stubs shared by the linked object.
   When optimizing for performance we emit stubs for each function in the hope
   that the branch predictor can better train on jumps specific to a given
   function.  */
rtx
aarch64_sls_create_blr_label (int regnum)
{
  gcc_assert (STUB_REGNUM_P (regnum));
  if (optimize_function_for_size_p (cfun))
    {
      /* For the thunks shared between different functions in this compilation
	 unit we use a named symbol -- this is just for users to more easily
	 understand the generated assembly.  */
      aarch64_sls_shared_thunks_needed = true;
      const char *thunk_name = indirect_symbol_names[regnum];
      if (aarch64_sls_shared_thunks[regnum] == NULL)
	{
	  /* Build a decl representing this function stub and record it for
	     later.  We build a decl here so we can use the GCC machinery for
	     handling sections automatically (through `get_named_section` and
	     `make_decl_one_only`).  That saves us a lot of trouble handling
	     the specifics of different output file formats.
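	     make_decl_one_only also marks the stub as one-only (COMDAT where
	     the object format supports it), so objects that each emit the
	     same __call_indirect_xN stub can be linked together without
	     multiple-definition errors.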
*/ 23585 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, 23586 get_identifier (thunk_name), 23587 build_function_type_list (void_type_node, 23588 NULL_TREE)); 23589 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, 23590 NULL_TREE, void_type_node); 23591 TREE_PUBLIC (decl) = 1; 23592 TREE_STATIC (decl) = 1; 23593 DECL_IGNORED_P (decl) = 1; 23594 DECL_ARTIFICIAL (decl) = 1; 23595 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl)); 23596 resolve_unique_section (decl, 0, false); 23597 aarch64_sls_shared_thunks[regnum] = decl; 23598 } 23599 23600 return gen_rtx_SYMBOL_REF (Pmode, thunk_name); 23601 } 23602 23603 if (cfun->machine->call_via[regnum] == NULL) 23604 cfun->machine->call_via[regnum] 23605 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ()); 23606 return cfun->machine->call_via[regnum]; 23607} 23608 23609/* Helper function for aarch64_sls_emit_blr_function_thunks and 23610 aarch64_sls_emit_shared_blr_thunks below. */ 23611static void 23612aarch64_sls_emit_function_stub (FILE *out_file, int regnum) 23613{ 23614 /* Save in x16 and branch to that function so this transformation does 23615 not prevent jumping to `BTI c` instructions. */ 23616 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum); 23617 asm_fprintf (out_file, "\tbr\tx16\n"); 23618} 23619 23620/* Emit all BLR stubs for this particular function. 23621 Here we emit all the BLR stubs needed for the current function. Since we 23622 emit these stubs in a consecutive block we know there will be no speculation 23623 gadgets between each stub, and hence we only emit a speculation barrier at 23624 the end of the stub sequences. 23625 23626 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */ 23627void 23628aarch64_sls_emit_blr_function_thunks (FILE *out_file) 23629{ 23630 if (! aarch64_harden_sls_blr_p ()) 23631 return; 23632 23633 bool any_functions_emitted = false; 23634 /* We must save and restore the current function section since this assembly 23635 is emitted at the end of the function. This means it can be emitted *just 23636 after* the cold section of a function. That cold part would be emitted in 23637 a different section. That switch would trigger a `.cfi_endproc` directive 23638 to be emitted in the original section and a `.cfi_startproc` directive to 23639 be emitted in the new section. Switching to the original section without 23640 restoring would mean that the `.cfi_endproc` emitted as a function ends 23641 would happen in a different section -- leaving an unmatched 23642 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc` 23643 in the standard text section. */ 23644 section *save_text_section = in_section; 23645 switch_to_section (function_section (current_function_decl)); 23646 for (int regnum = 0; regnum < 30; ++regnum) 23647 { 23648 rtx specu_label = cfun->machine->call_via[regnum]; 23649 if (specu_label == NULL) 23650 continue; 23651 23652 targetm.asm_out.print_operand (out_file, specu_label, 0); 23653 asm_fprintf (out_file, ":\n"); 23654 aarch64_sls_emit_function_stub (out_file, regnum); 23655 any_functions_emitted = true; 23656 } 23657 if (any_functions_emitted) 23658 /* Can use the SB if needs be here, since this stub will only be used 23659 by the current function, and hence for the current target. */ 23660 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true)); 23661 switch_to_section (save_text_section); 23662} 23663 23664/* Emit shared BLR stubs for the current compilation unit. 
23665 Over the course of compiling this unit we may have converted some BLR 23666 instructions to a BL to a shared stub function. This is where we emit those 23667 stub functions. 23668 This function is for the stubs shared between different functions in this 23669 compilation unit. We share when optimizing for size instead of speed. 23670 23671 This function is called through the TARGET_ASM_FILE_END hook. */ 23672void 23673aarch64_sls_emit_shared_blr_thunks (FILE *out_file) 23674{ 23675 if (! aarch64_sls_shared_thunks_needed) 23676 return; 23677 23678 for (int regnum = 0; regnum < 30; ++regnum) 23679 { 23680 tree decl = aarch64_sls_shared_thunks[regnum]; 23681 if (!decl) 23682 continue; 23683 23684 const char *name = indirect_symbol_names[regnum]; 23685 switch_to_section (get_named_section (decl, NULL, 0)); 23686 ASM_OUTPUT_ALIGN (out_file, 2); 23687 targetm.asm_out.globalize_label (out_file, name); 23688 /* Only emits if the compiler is configured for an assembler that can 23689 handle visibility directives. */ 23690 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN); 23691 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function"); 23692 ASM_OUTPUT_LABEL (out_file, name); 23693 aarch64_sls_emit_function_stub (out_file, regnum); 23694 /* Use the most conservative target to ensure it can always be used by any 23695 function in the translation unit. */ 23696 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n"); 23697 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl); 23698 } 23699} 23700 23701/* Implement TARGET_ASM_FILE_END. */ 23702void 23703aarch64_asm_file_end () 23704{ 23705 aarch64_sls_emit_shared_blr_thunks (asm_out_file); 23706 /* Since this function will be called for the ASM_FILE_END hook, we ensure 23707 that what would be called otherwise (e.g. `file_end_indicate_exec_stack` 23708 for FreeBSD) still gets called. */ 23709#ifdef TARGET_ASM_FILE_END 23710 TARGET_ASM_FILE_END (); 23711#endif 23712} 23713 23714const char * 23715aarch64_indirect_call_asm (rtx addr) 23716{ 23717 gcc_assert (REG_P (addr)); 23718 if (aarch64_harden_sls_blr_p ()) 23719 { 23720 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr)); 23721 output_asm_insn ("bl\t%0", &stub_label); 23722 } 23723 else 23724 output_asm_insn ("blr\t%0", &addr); 23725 return ""; 23726} 23727 23728/* Target-specific selftests. */ 23729 23730#if CHECKING_P 23731 23732namespace selftest { 23733 23734/* Selftest for the RTL loader. 23735 Verify that the RTL loader copes with a dump from 23736 print_rtx_function. This is essentially just a test that class 23737 function_reader can handle a real dump, but it also verifies 23738 that lookup_reg_by_dump_name correctly handles hard regs. 23739 The presence of hard reg names in the dump means that the test is 23740 target-specific, hence it is in this file. */ 23741 23742static void 23743aarch64_test_loading_full_dump () 23744{ 23745 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl")); 23746 23747 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl))); 23748 23749 rtx_insn *insn_1 = get_insn_by_uid (1); 23750 ASSERT_EQ (NOTE, GET_CODE (insn_1)); 23751 23752 rtx_insn *insn_15 = get_insn_by_uid (15); 23753 ASSERT_EQ (INSN, GET_CODE (insn_15)); 23754 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15))); 23755 23756 /* Verify crtl->return_rtx. */ 23757 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx)); 23758 ASSERT_EQ (0, REGNO (crtl->return_rtx)); 23759 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx)); 23760} 23761 23762/* Run all target-specific selftests. 
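   Currently the only test run here is aarch64_test_loading_full_dump.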
   */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */

#undef TARGET_STACK_PROTECT_GUARD
#define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
#define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk

/* Only the least significant bit is used for initialization guard
   variables.
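   Returning true tells the generic C++ guard code to test only bit 0 of the
   guard variable rather than the whole of its first byte.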
*/ 23841#undef TARGET_CXX_GUARD_MASK_BIT 23842#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true 23843 23844#undef TARGET_C_MODE_FOR_SUFFIX 23845#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix 23846 23847#ifdef TARGET_BIG_ENDIAN_DEFAULT 23848#undef TARGET_DEFAULT_TARGET_FLAGS 23849#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END) 23850#endif 23851 23852#undef TARGET_CLASS_MAX_NREGS 23853#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs 23854 23855#undef TARGET_BUILTIN_DECL 23856#define TARGET_BUILTIN_DECL aarch64_builtin_decl 23857 23858#undef TARGET_BUILTIN_RECIPROCAL 23859#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal 23860 23861#undef TARGET_C_EXCESS_PRECISION 23862#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision 23863 23864#undef TARGET_EXPAND_BUILTIN 23865#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin 23866 23867#undef TARGET_EXPAND_BUILTIN_VA_START 23868#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start 23869 23870#undef TARGET_FOLD_BUILTIN 23871#define TARGET_FOLD_BUILTIN aarch64_fold_builtin 23872 23873#undef TARGET_FUNCTION_ARG 23874#define TARGET_FUNCTION_ARG aarch64_function_arg 23875 23876#undef TARGET_FUNCTION_ARG_ADVANCE 23877#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance 23878 23879#undef TARGET_FUNCTION_ARG_BOUNDARY 23880#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary 23881 23882#undef TARGET_FUNCTION_ARG_PADDING 23883#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding 23884 23885#undef TARGET_GET_RAW_RESULT_MODE 23886#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode 23887#undef TARGET_GET_RAW_ARG_MODE 23888#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode 23889 23890#undef TARGET_FUNCTION_OK_FOR_SIBCALL 23891#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall 23892 23893#undef TARGET_FUNCTION_VALUE 23894#define TARGET_FUNCTION_VALUE aarch64_function_value 23895 23896#undef TARGET_FUNCTION_VALUE_REGNO_P 23897#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p 23898 23899#undef TARGET_GIMPLE_FOLD_BUILTIN 23900#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin 23901 23902#undef TARGET_GIMPLIFY_VA_ARG_EXPR 23903#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr 23904 23905#undef TARGET_INIT_BUILTINS 23906#define TARGET_INIT_BUILTINS aarch64_init_builtins 23907 23908#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS 23909#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \ 23910 aarch64_ira_change_pseudo_allocno_class 23911 23912#undef TARGET_LEGITIMATE_ADDRESS_P 23913#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p 23914 23915#undef TARGET_LEGITIMATE_CONSTANT_P 23916#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p 23917 23918#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT 23919#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \ 23920 aarch64_legitimize_address_displacement 23921 23922#undef TARGET_LIBGCC_CMP_RETURN_MODE 23923#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode 23924 23925#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P 23926#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \ 23927aarch64_libgcc_floating_mode_supported_p 23928 23929#undef TARGET_MANGLE_TYPE 23930#define TARGET_MANGLE_TYPE aarch64_mangle_type 23931 23932#undef TARGET_INVALID_CONVERSION 23933#define TARGET_INVALID_CONVERSION aarch64_invalid_conversion 23934 23935#undef TARGET_INVALID_UNARY_OP 23936#define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op 23937 
23938#undef TARGET_INVALID_BINARY_OP 23939#define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op 23940 23941#undef TARGET_VERIFY_TYPE_CONTEXT 23942#define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context 23943 23944#undef TARGET_MEMORY_MOVE_COST 23945#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost 23946 23947#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL 23948#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul 23949 23950#undef TARGET_MUST_PASS_IN_STACK 23951#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size 23952 23953/* This target hook should return true if accesses to volatile bitfields 23954 should use the narrowest mode possible. It should return false if these 23955 accesses should use the bitfield container type. */ 23956#undef TARGET_NARROW_VOLATILE_BITFIELD 23957#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false 23958 23959#undef TARGET_OPTION_OVERRIDE 23960#define TARGET_OPTION_OVERRIDE aarch64_override_options 23961 23962#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE 23963#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \ 23964 aarch64_override_options_after_change 23965 23966#undef TARGET_OPTION_SAVE 23967#define TARGET_OPTION_SAVE aarch64_option_save 23968 23969#undef TARGET_OPTION_RESTORE 23970#define TARGET_OPTION_RESTORE aarch64_option_restore 23971 23972#undef TARGET_OPTION_PRINT 23973#define TARGET_OPTION_PRINT aarch64_option_print 23974 23975#undef TARGET_OPTION_VALID_ATTRIBUTE_P 23976#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p 23977 23978#undef TARGET_SET_CURRENT_FUNCTION 23979#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function 23980 23981#undef TARGET_PASS_BY_REFERENCE 23982#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference 23983 23984#undef TARGET_PREFERRED_RELOAD_CLASS 23985#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class 23986 23987#undef TARGET_SCHED_REASSOCIATION_WIDTH 23988#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width 23989 23990#undef TARGET_PROMOTED_TYPE 23991#define TARGET_PROMOTED_TYPE aarch64_promoted_type 23992 23993#undef TARGET_SECONDARY_RELOAD 23994#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload 23995 23996#undef TARGET_SHIFT_TRUNCATION_MASK 23997#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask 23998 23999#undef TARGET_SETUP_INCOMING_VARARGS 24000#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs 24001 24002#undef TARGET_STRUCT_VALUE_RTX 24003#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx 24004 24005#undef TARGET_REGISTER_MOVE_COST 24006#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost 24007 24008#undef TARGET_RETURN_IN_MEMORY 24009#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory 24010 24011#undef TARGET_RETURN_IN_MSB 24012#define TARGET_RETURN_IN_MSB aarch64_return_in_msb 24013 24014#undef TARGET_RTX_COSTS 24015#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper 24016 24017#undef TARGET_SCALAR_MODE_SUPPORTED_P 24018#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p 24019 24020#undef TARGET_SCHED_ISSUE_RATE 24021#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate 24022 24023#undef TARGET_SCHED_VARIABLE_ISSUE 24024#define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue 24025 24026#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD 24027#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \ 24028 aarch64_sched_first_cycle_multipass_dfa_lookahead 24029 24030#undef 
TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD 24031#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \ 24032 aarch64_first_cycle_multipass_dfa_lookahead_guard 24033 24034#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS 24035#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \ 24036 aarch64_get_separate_components 24037 24038#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB 24039#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \ 24040 aarch64_components_for_bb 24041 24042#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS 24043#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \ 24044 aarch64_disqualify_components 24045 24046#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS 24047#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \ 24048 aarch64_emit_prologue_components 24049 24050#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS 24051#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \ 24052 aarch64_emit_epilogue_components 24053 24054#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS 24055#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \ 24056 aarch64_set_handled_components 24057 24058#undef TARGET_TRAMPOLINE_INIT 24059#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init 24060 24061#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P 24062#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p 24063 24064#undef TARGET_VECTOR_MODE_SUPPORTED_P 24065#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p 24066 24067#undef TARGET_COMPATIBLE_VECTOR_TYPES_P 24068#define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p 24069 24070#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT 24071#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \ 24072 aarch64_builtin_support_vector_misalignment 24073 24074#undef TARGET_ARRAY_MODE 24075#define TARGET_ARRAY_MODE aarch64_array_mode 24076 24077#undef TARGET_ARRAY_MODE_SUPPORTED_P 24078#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p 24079 24080#undef TARGET_VECTORIZE_ADD_STMT_COST 24081#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost 24082 24083#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST 24084#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \ 24085 aarch64_builtin_vectorization_cost 24086 24087#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE 24088#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode 24089 24090#undef TARGET_VECTORIZE_BUILTINS 24091#define TARGET_VECTORIZE_BUILTINS 24092 24093#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION 24094#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \ 24095 aarch64_builtin_vectorized_function 24096 24097#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES 24098#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \ 24099 aarch64_autovectorize_vector_modes 24100 24101#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV 24102#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \ 24103 aarch64_atomic_assign_expand_fenv 24104 24105/* Section anchor support. */ 24106 24107#undef TARGET_MIN_ANCHOR_OFFSET 24108#define TARGET_MIN_ANCHOR_OFFSET -256 24109 24110/* Limit the maximum anchor offset to 4k-1, since that's the limit for a 24111 byte offset; we can do much more for larger data types, but have no way 24112 to determine the size of the access. We assume accesses are aligned. 
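   (4095 is the largest unsigned immediate offset for a byte access such as
   LDRB/STRB; wider accesses allow larger scaled offsets -- up to 32760 for
   an 8-byte LDR/STR -- but since the access size is unknown we must use the
   conservative byte limit.)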
*/ 24113#undef TARGET_MAX_ANCHOR_OFFSET 24114#define TARGET_MAX_ANCHOR_OFFSET 4095 24115 24116#undef TARGET_VECTOR_ALIGNMENT 24117#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment 24118 24119#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT 24120#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \ 24121 aarch64_vectorize_preferred_vector_alignment 24122#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE 24123#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \ 24124 aarch64_simd_vector_alignment_reachable 24125 24126/* vec_perm support. */ 24127 24128#undef TARGET_VECTORIZE_VEC_PERM_CONST 24129#define TARGET_VECTORIZE_VEC_PERM_CONST \ 24130 aarch64_vectorize_vec_perm_const 24131 24132#undef TARGET_VECTORIZE_RELATED_MODE 24133#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode 24134#undef TARGET_VECTORIZE_GET_MASK_MODE 24135#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode 24136#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE 24137#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \ 24138 aarch64_empty_mask_is_expensive 24139#undef TARGET_PREFERRED_ELSE_VALUE 24140#define TARGET_PREFERRED_ELSE_VALUE \ 24141 aarch64_preferred_else_value 24142 24143#undef TARGET_INIT_LIBFUNCS 24144#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs 24145 24146#undef TARGET_FIXED_CONDITION_CODE_REGS 24147#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs 24148 24149#undef TARGET_FLAGS_REGNUM 24150#define TARGET_FLAGS_REGNUM CC_REGNUM 24151 24152#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS 24153#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true 24154 24155#undef TARGET_ASAN_SHADOW_OFFSET 24156#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset 24157 24158#undef TARGET_LEGITIMIZE_ADDRESS 24159#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address 24160 24161#undef TARGET_SCHED_CAN_SPECULATE_INSN 24162#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn 24163 24164#undef TARGET_CAN_USE_DOLOOP_P 24165#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost 24166 24167#undef TARGET_SCHED_ADJUST_PRIORITY 24168#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority 24169 24170#undef TARGET_SCHED_MACRO_FUSION_P 24171#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p 24172 24173#undef TARGET_SCHED_MACRO_FUSION_PAIR_P 24174#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p 24175 24176#undef TARGET_SCHED_FUSION_PRIORITY 24177#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority 24178 24179#undef TARGET_UNSPEC_MAY_TRAP_P 24180#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p 24181 24182#undef TARGET_USE_PSEUDO_PIC_REG 24183#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg 24184 24185#undef TARGET_PRINT_OPERAND 24186#define TARGET_PRINT_OPERAND aarch64_print_operand 24187 24188#undef TARGET_PRINT_OPERAND_ADDRESS 24189#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address 24190 24191#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA 24192#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra 24193 24194#undef TARGET_OPTAB_SUPPORTED_P 24195#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p 24196 24197#undef TARGET_OMIT_STRUCT_RETURN_REG 24198#define TARGET_OMIT_STRUCT_RETURN_REG true 24199 24200#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE 24201#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \ 24202 aarch64_dwarf_poly_indeterminate_value 24203 24204/* The architecture reserves bits 0 and 1 so use bit 2 for 
descriptors. */ 24205#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS 24206#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4 24207 24208#undef TARGET_HARD_REGNO_NREGS 24209#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs 24210#undef TARGET_HARD_REGNO_MODE_OK 24211#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok 24212 24213#undef TARGET_MODES_TIEABLE_P 24214#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p 24215 24216#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED 24217#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \ 24218 aarch64_hard_regno_call_part_clobbered 24219 24220#undef TARGET_INSN_CALLEE_ABI 24221#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi 24222 24223#undef TARGET_CONSTANT_ALIGNMENT 24224#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment 24225 24226#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE 24227#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \ 24228 aarch64_stack_clash_protection_alloca_probe_range 24229 24230#undef TARGET_COMPUTE_PRESSURE_CLASSES 24231#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes 24232 24233#undef TARGET_CAN_CHANGE_MODE_CLASS 24234#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class 24235 24236#undef TARGET_SELECT_EARLY_REMAT_MODES 24237#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes 24238 24239#undef TARGET_SPECULATION_SAFE_VALUE 24240#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value 24241 24242#undef TARGET_ESTIMATED_POLY_VALUE 24243#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value 24244 24245#undef TARGET_ATTRIBUTE_TABLE 24246#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table 24247 24248#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN 24249#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \ 24250 aarch64_simd_clone_compute_vecsize_and_simdlen 24251 24252#undef TARGET_SIMD_CLONE_ADJUST 24253#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust 24254 24255#undef TARGET_SIMD_CLONE_USABLE 24256#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable 24257 24258#undef TARGET_COMP_TYPE_ATTRIBUTES 24259#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes 24260 24261#undef TARGET_GET_MULTILIB_ABI_NAME 24262#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name 24263 24264#undef TARGET_FNTYPE_ABI 24265#define TARGET_FNTYPE_ABI aarch64_fntype_abi 24266 24267#if CHECKING_P 24268#undef TARGET_RUN_TARGET_SELFTESTS 24269#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests 24270#endif /* #if CHECKING_P */ 24271 24272#undef TARGET_ASM_POST_CFI_STARTPROC 24273#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc 24274 24275#undef TARGET_STRICT_ARGUMENT_NAMING 24276#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true 24277 24278#undef TARGET_MD_ASM_ADJUST 24279#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust 24280 24281#undef TARGET_ASM_FILE_END 24282#define TARGET_ASM_FILE_END aarch64_asm_file_end 24283 24284#undef TARGET_ASM_FUNCTION_EPILOGUE 24285#define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks 24286 24287struct gcc_target targetm = TARGET_INITIALIZER; 24288 24289#include "gt-aarch64.h" 24290