1/* Copyright (C) 1988-2020 Free Software Foundation, Inc. 2 3This file is part of GCC. 4 5GCC is free software; you can redistribute it and/or modify 6it under the terms of the GNU General Public License as published by 7the Free Software Foundation; either version 3, or (at your option) 8any later version. 9 10GCC is distributed in the hope that it will be useful, 11but WITHOUT ANY WARRANTY; without even the implied warranty of 12MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13GNU General Public License for more details. 14 15You should have received a copy of the GNU General Public License 16along with GCC; see the file COPYING3. If not see 17<http://www.gnu.org/licenses/>. */ 18 19#define IN_TARGET_CODE 1 20 21#include "config.h" 22#include "system.h" 23#include "coretypes.h" 24#include "backend.h" 25#include "rtl.h" 26#include "tree.h" 27#include "memmodel.h" 28#include "gimple.h" 29#include "cfghooks.h" 30#include "cfgloop.h" 31#include "df.h" 32#include "tm_p.h" 33#include "stringpool.h" 34#include "expmed.h" 35#include "optabs.h" 36#include "regs.h" 37#include "emit-rtl.h" 38#include "recog.h" 39#include "cgraph.h" 40#include "diagnostic.h" 41#include "cfgbuild.h" 42#include "alias.h" 43#include "fold-const.h" 44#include "attribs.h" 45#include "calls.h" 46#include "stor-layout.h" 47#include "varasm.h" 48#include "output.h" 49#include "insn-attr.h" 50#include "flags.h" 51#include "except.h" 52#include "explow.h" 53#include "expr.h" 54#include "cfgrtl.h" 55#include "common/common-target.h" 56#include "langhooks.h" 57#include "reload.h" 58#include "gimplify.h" 59#include "dwarf2.h" 60#include "tm-constrs.h" 61#include "cselib.h" 62#include "sched-int.h" 63#include "opts.h" 64#include "tree-pass.h" 65#include "context.h" 66#include "pass_manager.h" 67#include "target-globals.h" 68#include "gimple-iterator.h" 69#include "tree-vectorizer.h" 70#include "shrink-wrap.h" 71#include "builtins.h" 72#include "rtl-iter.h" 73#include "tree-iterator.h" 
74#include "dbgcnt.h" 75#include "case-cfn-macros.h" 76#include "dojump.h" 77#include "fold-const-call.h" 78#include "tree-vrp.h" 79#include "tree-ssanames.h" 80#include "selftest.h" 81#include "selftest-rtl.h" 82#include "print-rtl.h" 83#include "intl.h" 84#include "ifcvt.h" 85#include "symbol-summary.h" 86#include "ipa-prop.h" 87#include "ipa-fnsummary.h" 88#include "wide-int-bitmask.h" 89#include "tree-vector-builder.h" 90#include "debug.h" 91#include "dwarf2out.h" 92#include "i386-options.h" 93#include "i386-builtins.h" 94#include "i386-expand.h" 95 96/* Split one or more double-mode RTL references into pairs of half-mode 97 references. The RTL can be REG, offsettable MEM, integer constant, or 98 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to 99 split and "num" is its length. lo_half and hi_half are output arrays 100 that parallel "operands". */ 101 102void 103split_double_mode (machine_mode mode, rtx operands[], 104 int num, rtx lo_half[], rtx hi_half[]) 105{ 106 machine_mode half_mode; 107 unsigned int byte; 108 rtx mem_op = NULL_RTX; 109 int mem_num = 0; 110 111 switch (mode) 112 { 113 case E_TImode: 114 half_mode = DImode; 115 break; 116 case E_DImode: 117 half_mode = SImode; 118 break; 119 case E_P2HImode: 120 half_mode = HImode; 121 break; 122 case E_P2QImode: 123 half_mode = QImode; 124 break; 125 default: 126 gcc_unreachable (); 127 } 128 129 byte = GET_MODE_SIZE (half_mode); 130 131 while (num--) 132 { 133 rtx op = operands[num]; 134 135 /* simplify_subreg refuse to split volatile memory addresses, 136 but we still have to handle it. 
*/ 137 if (MEM_P (op)) 138 { 139 if (mem_op && rtx_equal_p (op, mem_op)) 140 { 141 lo_half[num] = lo_half[mem_num]; 142 hi_half[num] = hi_half[mem_num]; 143 } 144 else 145 { 146 mem_op = op; 147 mem_num = num; 148 lo_half[num] = adjust_address (op, half_mode, 0); 149 hi_half[num] = adjust_address (op, half_mode, byte); 150 } 151 } 152 else 153 { 154 lo_half[num] = simplify_gen_subreg (half_mode, op, 155 GET_MODE (op) == VOIDmode 156 ? mode : GET_MODE (op), 0); 157 hi_half[num] = simplify_gen_subreg (half_mode, op, 158 GET_MODE (op) == VOIDmode 159 ? mode : GET_MODE (op), byte); 160 } 161 } 162} 163 164/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate 165 for the target. */ 166 167void 168ix86_expand_clear (rtx dest) 169{ 170 rtx tmp; 171 172 /* We play register width games, which are only valid after reload. */ 173 gcc_assert (reload_completed); 174 175 /* Avoid HImode and its attendant prefix byte. */ 176 if (GET_MODE_SIZE (GET_MODE (dest)) < 4) 177 dest = gen_rtx_REG (SImode, REGNO (dest)); 178 tmp = gen_rtx_SET (dest, const0_rtx); 179 180 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ()) 181 { 182 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); 183 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob)); 184 } 185 186 emit_insn (tmp); 187} 188 189void 190ix86_expand_move (machine_mode mode, rtx operands[]) 191{ 192 rtx op0, op1; 193 rtx tmp, addend = NULL_RTX; 194 enum tls_model model; 195 196 op0 = operands[0]; 197 op1 = operands[1]; 198 199 switch (GET_CODE (op1)) 200 { 201 case CONST: 202 tmp = XEXP (op1, 0); 203 204 if (GET_CODE (tmp) != PLUS 205 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF) 206 break; 207 208 op1 = XEXP (tmp, 0); 209 addend = XEXP (tmp, 1); 210 /* FALLTHRU */ 211 212 case SYMBOL_REF: 213 model = SYMBOL_REF_TLS_MODEL (op1); 214 215 if (model) 216 op1 = legitimize_tls_address (op1, model, true); 217 else if (ix86_force_load_from_GOT_p (op1)) 218 { 219 /* Load the external function address via GOT 
slot to avoid PLT. */ 220 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1), 221 (TARGET_64BIT 222 ? UNSPEC_GOTPCREL 223 : UNSPEC_GOT)); 224 op1 = gen_rtx_CONST (Pmode, op1); 225 op1 = gen_const_mem (Pmode, op1); 226 set_mem_alias_set (op1, ix86_GOT_alias_set ()); 227 } 228 else 229 { 230 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX); 231 if (tmp) 232 { 233 op1 = tmp; 234 if (!addend) 235 break; 236 } 237 else 238 { 239 op1 = operands[1]; 240 break; 241 } 242 } 243 244 if (addend) 245 { 246 op1 = force_operand (op1, NULL_RTX); 247 op1 = expand_simple_binop (Pmode, PLUS, op1, addend, 248 op0, 1, OPTAB_DIRECT); 249 } 250 else 251 op1 = force_operand (op1, op0); 252 253 if (op1 == op0) 254 return; 255 256 op1 = convert_to_mode (mode, op1, 1); 257 258 default: 259 break; 260 } 261 262 if ((flag_pic || MACHOPIC_INDIRECT) 263 && symbolic_operand (op1, mode)) 264 { 265 if (TARGET_MACHO && !TARGET_64BIT) 266 { 267#if TARGET_MACHO 268 /* dynamic-no-pic */ 269 if (MACHOPIC_INDIRECT) 270 { 271 rtx temp = (op0 && REG_P (op0) && mode == Pmode) 272 ? op0 : gen_reg_rtx (Pmode); 273 op1 = machopic_indirect_data_reference (op1, temp); 274 if (MACHOPIC_PURE) 275 op1 = machopic_legitimize_pic_address (op1, mode, 276 temp == op1 ? 0 : temp); 277 } 278 if (op0 != op1 && GET_CODE (op0) != MEM) 279 { 280 rtx insn = gen_rtx_SET (op0, op1); 281 emit_insn (insn); 282 return; 283 } 284 if (GET_CODE (op0) == MEM) 285 op1 = force_reg (Pmode, op1); 286 else 287 { 288 rtx temp = op0; 289 if (GET_CODE (temp) != REG) 290 temp = gen_reg_rtx (Pmode); 291 temp = legitimize_pic_address (op1, temp); 292 if (temp == op0) 293 return; 294 op1 = temp; 295 } 296 /* dynamic-no-pic */ 297#endif 298 } 299 else 300 { 301 if (MEM_P (op0)) 302 op1 = force_reg (mode, op1); 303 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode))) 304 { 305 rtx reg = can_create_pseudo_p () ? 
NULL_RTX : op0; 306 op1 = legitimize_pic_address (op1, reg); 307 if (op0 == op1) 308 return; 309 op1 = convert_to_mode (mode, op1, 1); 310 } 311 } 312 } 313 else 314 { 315 if (MEM_P (op0) 316 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode) 317 || !push_operand (op0, mode)) 318 && MEM_P (op1)) 319 op1 = force_reg (mode, op1); 320 321 if (push_operand (op0, mode) 322 && ! general_no_elim_operand (op1, mode)) 323 op1 = copy_to_mode_reg (mode, op1); 324 325 /* Force large constants in 64bit compilation into register 326 to get them CSEed. */ 327 if (can_create_pseudo_p () 328 && (mode == DImode) && TARGET_64BIT 329 && immediate_operand (op1, mode) 330 && !x86_64_zext_immediate_operand (op1, VOIDmode) 331 && !register_operand (op0, mode) 332 && optimize) 333 op1 = copy_to_mode_reg (mode, op1); 334 335 if (can_create_pseudo_p () 336 && CONST_DOUBLE_P (op1)) 337 { 338 /* If we are loading a floating point constant to a register, 339 force the value to memory now, since we'll get better code 340 out the back end. */ 341 342 op1 = validize_mem (force_const_mem (mode, op1)); 343 if (!register_operand (op0, mode)) 344 { 345 rtx temp = gen_reg_rtx (mode); 346 emit_insn (gen_rtx_SET (temp, op1)); 347 emit_move_insn (op0, temp); 348 return; 349 } 350 } 351 } 352 353 emit_insn (gen_rtx_SET (op0, op1)); 354} 355 356void 357ix86_expand_vector_move (machine_mode mode, rtx operands[]) 358{ 359 rtx op0 = operands[0], op1 = operands[1]; 360 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU 361 psABI since the biggest alignment is 4 byte for IA MCU psABI. */ 362 unsigned int align = (TARGET_IAMCU 363 ? GET_MODE_BITSIZE (mode) 364 : GET_MODE_ALIGNMENT (mode)); 365 366 if (push_operand (op0, VOIDmode)) 367 op0 = emit_move_resolve_push (mode, op0); 368 369 /* Force constants other than zero into memory. 
We do not know how 370 the instructions used to build constants modify the upper 64 bits 371 of the register, once we have that information we may be able 372 to handle some of them more efficiently. */ 373 if (can_create_pseudo_p () 374 && (CONSTANT_P (op1) 375 || (SUBREG_P (op1) 376 && CONSTANT_P (SUBREG_REG (op1)))) 377 && ((register_operand (op0, mode) 378 && !standard_sse_constant_p (op1, mode)) 379 /* ix86_expand_vector_move_misalign() does not like constants. */ 380 || (SSE_REG_MODE_P (mode) 381 && MEM_P (op0) 382 && MEM_ALIGN (op0) < align))) 383 { 384 if (SUBREG_P (op1)) 385 { 386 machine_mode imode = GET_MODE (SUBREG_REG (op1)); 387 rtx r = force_const_mem (imode, SUBREG_REG (op1)); 388 if (r) 389 r = validize_mem (r); 390 else 391 r = force_reg (imode, SUBREG_REG (op1)); 392 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1)); 393 } 394 else 395 op1 = validize_mem (force_const_mem (mode, op1)); 396 } 397 398 /* We need to check memory alignment for SSE mode since attribute 399 can make operands unaligned. */ 400 if (can_create_pseudo_p () 401 && SSE_REG_MODE_P (mode) 402 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align)) 403 || (MEM_P (op1) && (MEM_ALIGN (op1) < align)))) 404 { 405 rtx tmp[2]; 406 407 /* ix86_expand_vector_move_misalign() does not like both 408 arguments in memory. */ 409 if (!register_operand (op0, mode) 410 && !register_operand (op1, mode)) 411 op1 = force_reg (mode, op1); 412 413 tmp[0] = op0; tmp[1] = op1; 414 ix86_expand_vector_move_misalign (mode, tmp); 415 return; 416 } 417 418 /* Make operand1 a register if it isn't already. */ 419 if (can_create_pseudo_p () 420 && !register_operand (op0, mode) 421 && !register_operand (op1, mode)) 422 { 423 emit_move_insn (op0, force_reg (GET_MODE (op0), op1)); 424 return; 425 } 426 427 emit_insn (gen_rtx_SET (op0, op1)); 428} 429 430/* Split 32-byte AVX unaligned load and store if needed. 
*/ 431 432static void 433ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1) 434{ 435 rtx m; 436 rtx (*extract) (rtx, rtx, rtx); 437 machine_mode mode; 438 439 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD) 440 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE)) 441 { 442 emit_insn (gen_rtx_SET (op0, op1)); 443 return; 444 } 445 446 rtx orig_op0 = NULL_RTX; 447 mode = GET_MODE (op0); 448 switch (GET_MODE_CLASS (mode)) 449 { 450 case MODE_VECTOR_INT: 451 case MODE_INT: 452 if (mode != V32QImode) 453 { 454 if (!MEM_P (op0)) 455 { 456 orig_op0 = op0; 457 op0 = gen_reg_rtx (V32QImode); 458 } 459 else 460 op0 = gen_lowpart (V32QImode, op0); 461 op1 = gen_lowpart (V32QImode, op1); 462 mode = V32QImode; 463 } 464 break; 465 case MODE_VECTOR_FLOAT: 466 break; 467 default: 468 gcc_unreachable (); 469 } 470 471 switch (mode) 472 { 473 default: 474 gcc_unreachable (); 475 case E_V32QImode: 476 extract = gen_avx_vextractf128v32qi; 477 mode = V16QImode; 478 break; 479 case E_V8SFmode: 480 extract = gen_avx_vextractf128v8sf; 481 mode = V4SFmode; 482 break; 483 case E_V4DFmode: 484 extract = gen_avx_vextractf128v4df; 485 mode = V2DFmode; 486 break; 487 } 488 489 if (MEM_P (op1)) 490 { 491 rtx r = gen_reg_rtx (mode); 492 m = adjust_address (op1, mode, 0); 493 emit_move_insn (r, m); 494 m = adjust_address (op1, mode, 16); 495 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m); 496 emit_move_insn (op0, r); 497 } 498 else if (MEM_P (op0)) 499 { 500 m = adjust_address (op0, mode, 0); 501 emit_insn (extract (m, op1, const0_rtx)); 502 m = adjust_address (op0, mode, 16); 503 emit_insn (extract (m, copy_rtx (op1), const1_rtx)); 504 } 505 else 506 gcc_unreachable (); 507 508 if (orig_op0) 509 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0)); 510} 511 512/* Implement the movmisalign patterns for SSE. Non-SSE modes go 513 straight to ix86_expand_vector_move. 
*/ 514/* Code generation for scalar reg-reg moves of single and double precision data: 515 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true) 516 movaps reg, reg 517 else 518 movss reg, reg 519 if (x86_sse_partial_reg_dependency == true) 520 movapd reg, reg 521 else 522 movsd reg, reg 523 524 Code generation for scalar loads of double precision data: 525 if (x86_sse_split_regs == true) 526 movlpd mem, reg (gas syntax) 527 else 528 movsd mem, reg 529 530 Code generation for unaligned packed loads of single precision data 531 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency): 532 if (x86_sse_unaligned_move_optimal) 533 movups mem, reg 534 535 if (x86_sse_partial_reg_dependency == true) 536 { 537 xorps reg, reg 538 movlps mem, reg 539 movhps mem+8, reg 540 } 541 else 542 { 543 movlps mem, reg 544 movhps mem+8, reg 545 } 546 547 Code generation for unaligned packed loads of double precision data 548 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs): 549 if (x86_sse_unaligned_move_optimal) 550 movupd mem, reg 551 552 if (x86_sse_split_regs == true) 553 { 554 movlpd mem, reg 555 movhpd mem+8, reg 556 } 557 else 558 { 559 movsd mem, reg 560 movhpd mem+8, reg 561 } 562 */ 563 564void 565ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[]) 566{ 567 rtx op0, op1, m; 568 569 op0 = operands[0]; 570 op1 = operands[1]; 571 572 /* Use unaligned load/store for AVX512 or when optimizing for size. */ 573 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ()) 574 { 575 emit_insn (gen_rtx_SET (op0, op1)); 576 return; 577 } 578 579 if (TARGET_AVX) 580 { 581 if (GET_MODE_SIZE (mode) == 32) 582 ix86_avx256_split_vector_move_misalign (op0, op1); 583 else 584 /* Always use 128-bit mov<mode>_internal pattern for AVX. 
*/ 585 emit_insn (gen_rtx_SET (op0, op1)); 586 return; 587 } 588 589 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL 590 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) 591 { 592 emit_insn (gen_rtx_SET (op0, op1)); 593 return; 594 } 595 596 /* ??? If we have typed data, then it would appear that using 597 movdqu is the only way to get unaligned data loaded with 598 integer type. */ 599 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT) 600 { 601 emit_insn (gen_rtx_SET (op0, op1)); 602 return; 603 } 604 605 if (MEM_P (op1)) 606 { 607 if (TARGET_SSE2 && mode == V2DFmode) 608 { 609 rtx zero; 610 611 /* When SSE registers are split into halves, we can avoid 612 writing to the top half twice. */ 613 if (TARGET_SSE_SPLIT_REGS) 614 { 615 emit_clobber (op0); 616 zero = op0; 617 } 618 else 619 { 620 /* ??? Not sure about the best option for the Intel chips. 621 The following would seem to satisfy; the register is 622 entirely cleared, breaking the dependency chain. We 623 then store to the upper half, with a dependency depth 624 of one. A rumor has it that Intel recommends two movsd 625 followed by an unpacklpd, but this is unconfirmed. And 626 given that the dependency depth of the unpacklpd would 627 still be one, I'm not sure why this would be better. 
*/ 628 zero = CONST0_RTX (V2DFmode); 629 } 630 631 m = adjust_address (op1, DFmode, 0); 632 emit_insn (gen_sse2_loadlpd (op0, zero, m)); 633 m = adjust_address (op1, DFmode, 8); 634 emit_insn (gen_sse2_loadhpd (op0, op0, m)); 635 } 636 else 637 { 638 rtx t; 639 640 if (mode != V4SFmode) 641 t = gen_reg_rtx (V4SFmode); 642 else 643 t = op0; 644 645 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY) 646 emit_move_insn (t, CONST0_RTX (V4SFmode)); 647 else 648 emit_clobber (t); 649 650 m = adjust_address (op1, V2SFmode, 0); 651 emit_insn (gen_sse_loadlps (t, t, m)); 652 m = adjust_address (op1, V2SFmode, 8); 653 emit_insn (gen_sse_loadhps (t, t, m)); 654 if (mode != V4SFmode) 655 emit_move_insn (op0, gen_lowpart (mode, t)); 656 } 657 } 658 else if (MEM_P (op0)) 659 { 660 if (TARGET_SSE2 && mode == V2DFmode) 661 { 662 m = adjust_address (op0, DFmode, 0); 663 emit_insn (gen_sse2_storelpd (m, op1)); 664 m = adjust_address (op0, DFmode, 8); 665 emit_insn (gen_sse2_storehpd (m, op1)); 666 } 667 else 668 { 669 if (mode != V4SFmode) 670 op1 = gen_lowpart (V4SFmode, op1); 671 672 m = adjust_address (op0, V2SFmode, 0); 673 emit_insn (gen_sse_storelps (m, op1)); 674 m = adjust_address (op0, V2SFmode, 8); 675 emit_insn (gen_sse_storehps (m, copy_rtx (op1))); 676 } 677 } 678 else 679 gcc_unreachable (); 680} 681 682/* Move bits 64:95 to bits 32:63. */ 683 684void 685ix86_move_vector_high_sse_to_mmx (rtx op) 686{ 687 rtx mask = gen_rtx_PARALLEL (VOIDmode, 688 gen_rtvec (4, GEN_INT (0), GEN_INT (2), 689 GEN_INT (0), GEN_INT (0))); 690 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op)); 691 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask); 692 rtx insn = gen_rtx_SET (dest, op); 693 emit_insn (insn); 694} 695 696/* Split MMX pack with signed/unsigned saturation with SSE/SSE2. 
*/ 697 698void 699ix86_split_mmx_pack (rtx operands[], enum rtx_code code) 700{ 701 rtx op0 = operands[0]; 702 rtx op1 = operands[1]; 703 rtx op2 = operands[2]; 704 705 machine_mode dmode = GET_MODE (op0); 706 machine_mode smode = GET_MODE (op1); 707 machine_mode inner_dmode = GET_MODE_INNER (dmode); 708 machine_mode inner_smode = GET_MODE_INNER (smode); 709 710 /* Get the corresponding SSE mode for destination. */ 711 int nunits = 16 / GET_MODE_SIZE (inner_dmode); 712 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode), 713 nunits).require (); 714 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode), 715 nunits / 2).require (); 716 717 /* Get the corresponding SSE mode for source. */ 718 nunits = 16 / GET_MODE_SIZE (inner_smode); 719 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode), 720 nunits).require (); 721 722 /* Generate SSE pack with signed/unsigned saturation. */ 723 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0)); 724 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1)); 725 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2)); 726 727 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1); 728 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2); 729 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode, 730 op1, op2)); 731 emit_insn (insn); 732 733 ix86_move_vector_high_sse_to_mmx (op0); 734} 735 736/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */ 737 738void 739ix86_split_mmx_punpck (rtx operands[], bool high_p) 740{ 741 rtx op0 = operands[0]; 742 rtx op1 = operands[1]; 743 rtx op2 = operands[2]; 744 machine_mode mode = GET_MODE (op0); 745 rtx mask; 746 /* The corresponding SSE mode. 
*/ 747 machine_mode sse_mode, double_sse_mode; 748 749 switch (mode) 750 { 751 case E_V8QImode: 752 sse_mode = V16QImode; 753 double_sse_mode = V32QImode; 754 mask = gen_rtx_PARALLEL (VOIDmode, 755 gen_rtvec (16, 756 GEN_INT (0), GEN_INT (16), 757 GEN_INT (1), GEN_INT (17), 758 GEN_INT (2), GEN_INT (18), 759 GEN_INT (3), GEN_INT (19), 760 GEN_INT (4), GEN_INT (20), 761 GEN_INT (5), GEN_INT (21), 762 GEN_INT (6), GEN_INT (22), 763 GEN_INT (7), GEN_INT (23))); 764 break; 765 766 case E_V4HImode: 767 sse_mode = V8HImode; 768 double_sse_mode = V16HImode; 769 mask = gen_rtx_PARALLEL (VOIDmode, 770 gen_rtvec (8, 771 GEN_INT (0), GEN_INT (8), 772 GEN_INT (1), GEN_INT (9), 773 GEN_INT (2), GEN_INT (10), 774 GEN_INT (3), GEN_INT (11))); 775 break; 776 777 case E_V2SImode: 778 sse_mode = V4SImode; 779 double_sse_mode = V8SImode; 780 mask = gen_rtx_PARALLEL (VOIDmode, 781 gen_rtvec (4, 782 GEN_INT (0), GEN_INT (4), 783 GEN_INT (1), GEN_INT (5))); 784 break; 785 786 default: 787 gcc_unreachable (); 788 } 789 790 /* Generate SSE punpcklXX. */ 791 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0)); 792 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1)); 793 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2)); 794 795 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2); 796 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask); 797 rtx insn = gen_rtx_SET (dest, op2); 798 emit_insn (insn); 799 800 if (high_p) 801 { 802 /* Move bits 64:127 to bits 0:63. */ 803 mask = gen_rtx_PARALLEL (VOIDmode, 804 gen_rtvec (4, GEN_INT (2), GEN_INT (3), 805 GEN_INT (0), GEN_INT (0))); 806 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest)); 807 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask); 808 insn = gen_rtx_SET (dest, op1); 809 emit_insn (insn); 810 } 811} 812 813/* Helper function of ix86_fixup_binary_operands to canonicalize 814 operand order. Returns true if the operands should be swapped. 
*/ 815 816static bool 817ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode, 818 rtx operands[]) 819{ 820 rtx dst = operands[0]; 821 rtx src1 = operands[1]; 822 rtx src2 = operands[2]; 823 824 /* If the operation is not commutative, we can't do anything. */ 825 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH 826 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE) 827 return false; 828 829 /* Highest priority is that src1 should match dst. */ 830 if (rtx_equal_p (dst, src1)) 831 return false; 832 if (rtx_equal_p (dst, src2)) 833 return true; 834 835 /* Next highest priority is that immediate constants come second. */ 836 if (immediate_operand (src2, mode)) 837 return false; 838 if (immediate_operand (src1, mode)) 839 return true; 840 841 /* Lowest priority is that memory references should come second. */ 842 if (MEM_P (src2)) 843 return false; 844 if (MEM_P (src1)) 845 return true; 846 847 return false; 848} 849 850 851/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the 852 destination to use for the operation. If different from the true 853 destination in operands[0], a copy operation will be required. */ 854 855rtx 856ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode, 857 rtx operands[]) 858{ 859 rtx dst = operands[0]; 860 rtx src1 = operands[1]; 861 rtx src2 = operands[2]; 862 863 /* Canonicalize operand order. */ 864 if (ix86_swap_binary_operands_p (code, mode, operands)) 865 { 866 /* It is invalid to swap operands of different modes. */ 867 gcc_assert (GET_MODE (src1) == GET_MODE (src2)); 868 869 std::swap (src1, src2); 870 } 871 872 /* Both source operands cannot be in memory. */ 873 if (MEM_P (src1) && MEM_P (src2)) 874 { 875 /* Optimization: Only read from memory once. 
*/ 876 if (rtx_equal_p (src1, src2)) 877 { 878 src2 = force_reg (mode, src2); 879 src1 = src2; 880 } 881 else if (rtx_equal_p (dst, src1)) 882 src2 = force_reg (mode, src2); 883 else 884 src1 = force_reg (mode, src1); 885 } 886 887 /* If the destination is memory, and we do not have matching source 888 operands, do things in registers. */ 889 if (MEM_P (dst) && !rtx_equal_p (dst, src1)) 890 dst = gen_reg_rtx (mode); 891 892 /* Source 1 cannot be a constant. */ 893 if (CONSTANT_P (src1)) 894 src1 = force_reg (mode, src1); 895 896 /* Source 1 cannot be a non-matching memory. */ 897 if (MEM_P (src1) && !rtx_equal_p (dst, src1)) 898 src1 = force_reg (mode, src1); 899 900 /* Improve address combine. */ 901 if (code == PLUS 902 && GET_MODE_CLASS (mode) == MODE_INT 903 && MEM_P (src2)) 904 src2 = force_reg (mode, src2); 905 906 operands[1] = src1; 907 operands[2] = src2; 908 return dst; 909} 910 911/* Similarly, but assume that the destination has already been 912 set up properly. */ 913 914void 915ix86_fixup_binary_operands_no_copy (enum rtx_code code, 916 machine_mode mode, rtx operands[]) 917{ 918 rtx dst = ix86_fixup_binary_operands (code, mode, operands); 919 gcc_assert (dst == operands[0]); 920} 921 922/* Attempt to expand a binary operator. Make the expansion closer to the 923 actual machine, then just general_operand, which will allow 3 separate 924 memory references (one output, two input) in a single insn. */ 925 926void 927ix86_expand_binary_operator (enum rtx_code code, machine_mode mode, 928 rtx operands[]) 929{ 930 rtx src1, src2, dst, op, clob; 931 932 dst = ix86_fixup_binary_operands (code, mode, operands); 933 src1 = operands[1]; 934 src2 = operands[2]; 935 936 /* Emit the instruction. */ 937 938 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2)); 939 940 if (reload_completed 941 && code == PLUS 942 && !rtx_equal_p (dst, src1)) 943 { 944 /* This is going to be an LEA; avoid splitting it later. 
*/ 945 emit_insn (op); 946 } 947 else 948 { 949 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); 950 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); 951 } 952 953 /* Fix up the destination if needed. */ 954 if (dst != operands[0]) 955 emit_move_insn (operands[0], dst); 956} 957 958/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with 959 the given OPERANDS. */ 960 961void 962ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode, 963 rtx operands[]) 964{ 965 rtx op1 = NULL_RTX, op2 = NULL_RTX; 966 if (SUBREG_P (operands[1])) 967 { 968 op1 = operands[1]; 969 op2 = operands[2]; 970 } 971 else if (SUBREG_P (operands[2])) 972 { 973 op1 = operands[2]; 974 op2 = operands[1]; 975 } 976 /* Optimize (__m128i) d | (__m128i) e and similar code 977 when d and e are float vectors into float vector logical 978 insn. In C/C++ without using intrinsics there is no other way 979 to express vector logical operation on float vectors than 980 to cast them temporarily to integer vectors. 
*/ 981 if (op1 982 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL 983 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR) 984 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT 985 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode) 986 && SUBREG_BYTE (op1) == 0 987 && (GET_CODE (op2) == CONST_VECTOR 988 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2)) 989 && SUBREG_BYTE (op2) == 0)) 990 && can_create_pseudo_p ()) 991 { 992 rtx dst; 993 switch (GET_MODE (SUBREG_REG (op1))) 994 { 995 case E_V4SFmode: 996 case E_V8SFmode: 997 case E_V16SFmode: 998 case E_V2DFmode: 999 case E_V4DFmode: 1000 case E_V8DFmode: 1001 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1))); 1002 if (GET_CODE (op2) == CONST_VECTOR) 1003 { 1004 op2 = gen_lowpart (GET_MODE (dst), op2); 1005 op2 = force_reg (GET_MODE (dst), op2); 1006 } 1007 else 1008 { 1009 op1 = operands[1]; 1010 op2 = SUBREG_REG (operands[2]); 1011 if (!vector_operand (op2, GET_MODE (dst))) 1012 op2 = force_reg (GET_MODE (dst), op2); 1013 } 1014 op1 = SUBREG_REG (op1); 1015 if (!vector_operand (op1, GET_MODE (dst))) 1016 op1 = force_reg (GET_MODE (dst), op1); 1017 emit_insn (gen_rtx_SET (dst, 1018 gen_rtx_fmt_ee (code, GET_MODE (dst), 1019 op1, op2))); 1020 emit_move_insn (operands[0], gen_lowpart (mode, dst)); 1021 return; 1022 default: 1023 break; 1024 } 1025 } 1026 if (!vector_operand (operands[1], mode)) 1027 operands[1] = force_reg (mode, operands[1]); 1028 if (!vector_operand (operands[2], mode)) 1029 operands[2] = force_reg (mode, operands[2]); 1030 ix86_fixup_binary_operands_no_copy (code, mode, operands); 1031 emit_insn (gen_rtx_SET (operands[0], 1032 gen_rtx_fmt_ee (code, mode, operands[1], 1033 operands[2]))); 1034} 1035 1036/* Return TRUE or FALSE depending on whether the binary operator meets the 1037 appropriate constraints. 
*/ 1038 1039bool 1040ix86_binary_operator_ok (enum rtx_code code, machine_mode mode, 1041 rtx operands[3]) 1042{ 1043 rtx dst = operands[0]; 1044 rtx src1 = operands[1]; 1045 rtx src2 = operands[2]; 1046 1047 /* Both source operands cannot be in memory. */ 1048 if (MEM_P (src1) && MEM_P (src2)) 1049 return false; 1050 1051 /* Canonicalize operand order for commutative operators. */ 1052 if (ix86_swap_binary_operands_p (code, mode, operands)) 1053 std::swap (src1, src2); 1054 1055 /* If the destination is memory, we must have a matching source operand. */ 1056 if (MEM_P (dst) && !rtx_equal_p (dst, src1)) 1057 return false; 1058 1059 /* Source 1 cannot be a constant. */ 1060 if (CONSTANT_P (src1)) 1061 return false; 1062 1063 /* Source 1 cannot be a non-matching memory. */ 1064 if (MEM_P (src1) && !rtx_equal_p (dst, src1)) 1065 /* Support "andhi/andsi/anddi" as a zero-extending move. */ 1066 return (code == AND 1067 && (mode == HImode 1068 || mode == SImode 1069 || (TARGET_64BIT && mode == DImode)) 1070 && satisfies_constraint_L (src2)); 1071 1072 return true; 1073} 1074 1075/* Attempt to expand a unary operator. Make the expansion closer to the 1076 actual machine, then just general_operand, which will allow 2 separate 1077 memory references (one output, one input) in a single insn. */ 1078 1079void 1080ix86_expand_unary_operator (enum rtx_code code, machine_mode mode, 1081 rtx operands[]) 1082{ 1083 bool matching_memory = false; 1084 rtx src, dst, op, clob; 1085 1086 dst = operands[0]; 1087 src = operands[1]; 1088 1089 /* If the destination is memory, and we do not have matching source 1090 operands, do things in registers. */ 1091 if (MEM_P (dst)) 1092 { 1093 if (rtx_equal_p (dst, src)) 1094 matching_memory = true; 1095 else 1096 dst = gen_reg_rtx (mode); 1097 } 1098 1099 /* When source operand is memory, destination must match. */ 1100 if (MEM_P (src) && !matching_memory) 1101 src = force_reg (mode, src); 1102 1103 /* Emit the instruction. 
   */

  op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));

  /* NOT does not touch the flags; all other unary ops here clobber them.  */
  if (code == NOT)
    emit_insn (op);
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}

/* Predict just emitted jump instruction to be taken with probability PROB.
   PROB is measured against REG_BR_PROB_BASE.  */

static void
predict_jump (int prob)
{
  rtx_insn *insn = get_last_insn ();
  gcc_assert (JUMP_P (insn));
  add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
}

/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].

   operands[0] = quotient, operands[1] = remainder,
   operands[2] = dividend, operands[3] = divisor (see the REG_EQUAL
   notes attached below).  */

void
ix86_split_idivmod (machine_mode mode, rtx operands[],
		    bool unsigned_p)
{
  rtx_code_label *end_label, *qimode_label;
  rtx div, mod;
  rtx_insn *insn;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);

  operands[2] = force_reg (mode, operands[2]);
  operands[3] = force_reg (mode, operands[3]);

  /* Pick the divmod pattern matching the modes of quotient/remainder;
     the _zext_ variants zero-extend one SImode result into DImode.  */
  switch (mode)
    {
    case E_SImode:
      if (GET_MODE (operands[0]) == SImode)
	{
	  if (GET_MODE (operands[1]) == SImode)
	    gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
	  else
	    gen_divmod4_1
	      = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
	}
      else
	gen_divmod4_1
	  = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
      break;

    case E_DImode:
      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
      break;

    default:
      gcc_unreachable ();
    }

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divmod if dividend and divisor are within
     the range [0-255]: (dividend | divisor) has no bits >= 0x100.  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
				 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
			       gen_rtx_LABEL_REF (VOIDmode, qimode_label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  /* 50/50 — no reason to assume small operands are more likely.  */
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divmod.  */
  div = gen_divmod4_1 (operands[0], operands[1],
		       operands[2], operands[3]);
  emit_insn (div);

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

  /* Build REG_EQUAL notes describing the full-width results.  */
  if (unsigned_p)
    {
      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    }
  else
    {
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    }
  if (mode == SImode)
    {
      if (GET_MODE (operands[0]) != SImode)
	div = gen_rtx_ZERO_EXTEND (DImode, div);
      if (GET_MODE (operands[1]) != SImode)
	mod = gen_rtx_ZERO_EXTEND (DImode, mod);
    }

  /* Extract remainder from AH.  */
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
			       tmp0, GEN_INT (8), GEN_INT (8));
  if (REG_P (operands[1]))
    insn = emit_move_insn (operands[1], tmp1);
  else
    {
      /* Need a new scratch register since the old one has result
	 of 8bit divide.  */
      scratch = gen_reg_rtx (GET_MODE (operands[1]));
      emit_move_insn (scratch, tmp1);
      insn = emit_move_insn (operands[1], scratch);
    }
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_extend_insn
		    (operands[0], tmp1,
		     GET_MODE (operands[0]), QImode, 1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
}

/* Emit x86 binary operand CODE in mode MODE, where the first operand
   matches destination.  The emitted RTX includes a clobber of FLAGS_REG.  */

void
ix86_emit_binop (enum rtx_code code, machine_mode mode,
		 rtx dst, rtx src)
{
  rtx op, clob;

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));

  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
}

/* Return true if regno1 def is nearest to the insn.
   Walks backwards within INSN's basic block; returns false if neither
   register is defined in the block or if INSN heads the block.  */

static bool
find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
{
  rtx_insn *prev = insn;
  rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));

  if (insn == start)
    return false;
  while (prev && prev != start)
    {
      /* Skip notes, labels and debug insns.  */
      if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
	{
	  prev = PREV_INSN (prev);
	  continue;
	}
      if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
	return true;
      else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
	return false;
      prev = PREV_INSN (prev);
    }

  /* None of the regs is defined in the bb.  */
  return false;
}

/* Split lea instructions into a sequence of instructions
   which are executed on ALU to avoid AGU stalls.
   It is assumed that it is allowed to clobber flags register
   at lea position.
 */

void
ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
{
  unsigned int regno0, regno1, regno2;
  struct ix86_address parts;
  rtx target, tmp;
  int ok, adds;

  /* operands[1] is the lea address; decompose it into base, index,
     scale and displacement.  */
  ok = ix86_decompose_address (operands[1], &parts);
  gcc_assert (ok);

  target = gen_lowpart (mode, operands[0]);

  /* regno0 = dest, regno1 = base, regno2 = index (when present).  */
  regno0 = true_regnum (target);
  regno1 = INVALID_REGNUM;
  regno2 = INVALID_REGNUM;

  if (parts.base)
    {
      parts.base = gen_lowpart (mode, parts.base);
      regno1 = true_regnum (parts.base);
    }

  if (parts.index)
    {
      parts.index = gen_lowpart (mode, parts.index);
      regno2 = true_regnum (parts.index);
    }

  if (parts.disp)
    parts.disp = gen_lowpart (mode, parts.disp);

  if (parts.scale > 1)
    {
      /* Case r1 = r1 + ...  */
      if (regno1 == regno0)
	{
	  /* If we have a case r1 = r1 + C * r2 then we
	     should use multiplication which is very
	     expensive.  Assume cost model is wrong if we
	     have such case here.  */
	  gcc_assert (regno2 != regno0);

	  /* Replace the scaled index by SCALE repeated additions;
	     profitable only for the small scales lea allows (2/4/8).  */
	  for (adds = parts.scale; adds > 0; adds--)
	    ix86_emit_binop (PLUS, mode, target, parts.index);
	}
      else
	{
	  /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));

	  /* Use shift for scaling.  */
	  ix86_emit_binop (ASHIFT, mode, target,
			   GEN_INT (exact_log2 (parts.scale)));

	  if (parts.base)
	    ix86_emit_binop (PLUS, mode, target, parts.base);

	  if (parts.disp && parts.disp != const0_rtx)
	    ix86_emit_binop (PLUS, mode, target, parts.disp);
	}
    }
  else if (!parts.base && !parts.index)
    {
      /* Pure displacement: a plain move.  */
      gcc_assert(parts.disp);
      emit_insn (gen_rtx_SET (target, parts.disp));
    }
  else
    {
      if (!parts.base)
	{
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));
	}
      else if (!parts.index)
	{
	  if (regno0 != regno1)
	    emit_insn (gen_rtx_SET (target, parts.base));
	}
      else
	{
	  /* Both base and index present; reuse dest when it already
	     holds one of them.  */
	  if (regno0 == regno1)
	    tmp = parts.index;
	  else if (regno0 == regno2)
	    tmp = parts.base;
	  else
	    {
	      rtx tmp1;

	      /* Find better operand for SET instruction, depending
		 on which definition is farther from the insn.  */
	      if (find_nearest_reg_def (insn, regno1, regno2))
		tmp = parts.index, tmp1 = parts.base;
	      else
		tmp = parts.base, tmp1 = parts.index;

	      emit_insn (gen_rtx_SET (target, tmp));

	      if (parts.disp && parts.disp != const0_rtx)
		ix86_emit_binop (PLUS, mode, target, parts.disp);

	      ix86_emit_binop (PLUS, mode, target, tmp1);
	      return;
	    }

	  ix86_emit_binop (PLUS, mode, target, tmp);
	}

      if (parts.disp && parts.disp != const0_rtx)
	ix86_emit_binop (PLUS, mode, target, parts.disp);
    }
}

/* Post-reload splitter for converting an SF or DFmode value in an
   SSE register into an unsigned SImode.
 */

void
ix86_split_convert_uns_si_sse (rtx operands[])
{
  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  /* operands[1], operands[2] are vector scratch regs, operands[3] is the
     scalar input, operands[4] is the 2**31 constant (reg or mem).  */
  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */
  if (MEM_P (input))
    {
      if (vecmode == V4SFmode)
	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
      else
	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
    }
  else
    {
      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
	emit_insn (gen_sse_movss (value, value, input));
      else
	emit_insn (gen_sse2_movsd (value, value, input));
    }

  emit_move_insn (large, two31);
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  /* large = (2**31 <= value) ? all-ones : 0 (SSE compare mask).  */
  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  /* zero_or_two31 = mask ? 2**31 : 0.  */
  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  /* Bring the value into signed range before the signed conversion.  */
  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  /* Turn the compare mask into 0x80000000 per lane for the final xor.  */
  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
  else
    emit_insn (gen_sse2_cvttpd2dq (x, value));
  value = x;

  /* Add back the 2**31 that was subtracted, as a sign-bit flip.  */
  emit_insn (gen_xorv4si3 (value, value, large));
}

static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
						 machine_mode mode, rtx target,
						 rtx var, int one_var);

/* Convert an unsigned DImode value into a DFmode, using only SSE.
   Expects the 64-bit DImode to be supplied in a pair of integral
   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
   -mfpmath=sse, !optimize_size only.
 */

void
ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;
  rtx x;

  /* Get the 64-bit input into the low half of an XMM register, by
     whichever route the target prefers.  */
  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)
    {
      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);
    }
  else
    {
      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
    }

  x = gen_rtx_CONST_VECTOR (V4SImode,
			    gen_rtvec (4, GEN_INT (0x43300000UL),
				       GEN_INT (0x45300000UL),
				       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */

  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */
  if (TARGET_SSE3)
    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
  else
    {
      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
    }

  ix86_expand_vector_extract (false, target, fp_xmm, 0);
}

/* Not used, but eases macroization of patterns.  */
void
ix86_expand_convert_uns_sixf_sse (rtx, rtx)
{
  gcc_unreachable ();
}

/* Convert an unsigned SImode value into a DFmode.  Only currently used
   for SSE, but applicable anywhere.  */

void
ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO31r;
  rtx x, fp;

  /* x = input - 2**31 (as a signed value), convert, then add 2**31.0
     back in DFmode, which holds 2**32 exactly.  */
  x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
			   NULL, 1, OPTAB_DIRECT);

  fp = gen_reg_rtx (DFmode);
  emit_insn (gen_floatsidf2 (fp, x));

  real_ldexp (&TWO31r, &dconst1, 31);
  x = const_double_from_real_value (TWO31r, DFmode);

  x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
  if (x != target)
    emit_move_insn (target, x);
}

/* Convert a signed DImode value into a DFmode.  Only used for SSE in
   32-bit mode; otherwise we have a direct convert instruction.
 */

void
ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO32r;
  rtx fp_lo, fp_hi, x;

  fp_lo = gen_reg_rtx (DFmode);
  fp_hi = gen_reg_rtx (DFmode);

  /* result = (double) hi * 2**32 + (double) (unsigned) lo.  */
  emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));

  real_ldexp (&TWO32r, &dconst1, 32);
  x = const_double_from_real_value (TWO32r, DFmode);
  fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);

  ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));

  x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
			   0, OPTAB_DIRECT);
  if (x != target)
    emit_move_insn (target, x);
}

/* Convert an unsigned SImode value into a SFmode, using only SSE.
   For x86_32, -mfpmath=sse, !optimize_size only.  */
void
ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE ONE16r;
  rtx fp_hi, fp_lo, int_hi, int_lo, x;

  /* Split into 16-bit halves so each converts exactly, then recombine
     as hi * 2**16 + lo.  */
  real_ldexp (&ONE16r, &dconst1, 16);
  x = const_double_from_real_value (ONE16r, SFmode);
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
				NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
				NULL, 0, OPTAB_DIRECT);
  fp_hi = gen_reg_rtx (SFmode);
  fp_lo = gen_reg_rtx (SFmode);
  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
  emit_insn (gen_floatsisf2 (fp_lo, int_lo));
  fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
			       0, OPTAB_DIRECT);
  fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
			       0, OPTAB_DIRECT);
  if (!rtx_equal_p (target, fp_hi))
    emit_move_insn (target, fp_hi);
}

/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
   a vector of unsigned ints VAL to vector of floats TARGET.  */

void
ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
{
  rtx tmp[8];
  REAL_VALUE_TYPE TWO16r;
  machine_mode intmode = GET_MODE (val);
  machine_mode fltmode = GET_MODE (target);
  rtx (*cvt) (rtx, rtx);

  if (intmode == V4SImode)
    cvt = gen_floatv4siv4sf2;
  else
    cvt = gen_floatv8siv8sf2;
  /* Same split-into-16-bit-halves trick as the scalar version above:
     result = float(hi16) * 2**16 + float(lo16), lane-wise.  */
  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
  tmp[0] = force_reg (intmode, tmp[0]);
  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
				OPTAB_DIRECT);
  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
				NULL_RTX, 1, OPTAB_DIRECT);
  tmp[3] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[3], tmp[1]));
  tmp[4] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[4], tmp[2]));
  real_ldexp (&TWO16r, &dconst1, 16);
  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
  tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
				OPTAB_DIRECT);
  tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
				OPTAB_DIRECT);
  if (tmp[7] != target)
    emit_move_insn (target, tmp[7]);
}

/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
   pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
   This is done by doing just signed conversion if < 0x1p31, and otherwise by
   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */

rtx
ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
{
  REAL_VALUE_TYPE TWO31r;
  rtx two31r, tmp[4];
  machine_mode mode = GET_MODE (val);
  machine_mode scalarmode = GET_MODE_INNER (mode);
  machine_mode intmode = GET_MODE_SIZE (mode) == 32 ?
			 V8SImode : V4SImode;
  rtx (*cmp) (rtx, rtx, rtx, rtx);
  int i;

  for (i = 0; i < 3; i++)
    tmp[i] = gen_reg_rtx (mode);
  real_ldexp (&TWO31r, &dconst1, 31);
  two31r = const_double_from_real_value (TWO31r, scalarmode);
  two31r = ix86_build_const_vector (mode, 1, two31r);
  two31r = force_reg (mode, two31r);
  switch (mode)
    {
    case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    default: gcc_unreachable ();
    }
  /* tmp[0] = (2**31 <= val) all-ones mask, per lane.  */
  tmp[3] = gen_rtx_LE (mode, two31r, val);
  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
  /* tmp[1] = mask ? 2**31 : 0, to be subtracted from VAL.  */
  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
				0, OPTAB_DIRECT);
  /* *xorp = mask ? 0x80000000 : 0 in the integer mode; shift when a
     full-width vector shift exists, otherwise mask with a constant.  */
  if (intmode == V4SImode || TARGET_AVX2)
    *xorp = expand_simple_binop (intmode, ASHIFT,
				 gen_lowpart (intmode, tmp[0]),
				 GEN_INT (31), NULL_RTX, 0,
				 OPTAB_DIRECT);
  else
    {
      rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
      two31 = ix86_build_const_vector (intmode, 1, two31);
      *xorp = expand_simple_binop (intmode, AND,
				   gen_lowpart (intmode, tmp[0]),
				   two31, NULL_RTX, 0,
				   OPTAB_DIRECT);
    }
  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
			      0, OPTAB_DIRECT);
}

/* Generate code for floating point ABS or NEG.  */

void
ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
				rtx operands[])
{
  rtx set, dst, src;
  bool use_sse = false;
  bool vector_mode = VECTOR_MODE_P (mode);
  machine_mode vmode = mode;
  rtvec par;

  if (vector_mode)
    use_sse = true;
  else if (mode == TFmode)
    use_sse = true;
  else if (TARGET_SSE_MATH)
    {
      use_sse = SSE_FLOAT_MODE_P (mode);
      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
    }

  dst = operands[0];
  src = operands[1];

  set = gen_rtx_fmt_e (code, mode, src);
  set = gen_rtx_SET (dst, set);

  if (use_sse)
    {
      rtx mask, use, clob;

      /* NEG and ABS performed with SSE use bitwise mask operations.
	 Create the appropriate mask now.  */
      mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
      use = gen_rtx_USE (VOIDmode, mask);
      if (vector_mode)
	par = gen_rtvec (2, set, use);
      else
	{
	  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
	  par = gen_rtvec (3, set, use, clob);
	}
    }
  else
    {
      rtx clob;

      /* Changing of sign for FP values is doable using integer unit too.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      par = gen_rtvec (2, set, clob);
    }

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}

/* Deconstruct a floating point ABS or NEG operation
   with integer registers into integer operations.
 */

void
ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
			       rtx operands[])
{
  enum rtx_code absneg_op;
  rtx dst, set;

  gcc_assert (operands_match_p (operands[0], operands[1]));

  /* ABS clears the sign bit (AND ~sign), NEG flips it (XOR sign);
     only the word holding the sign bit needs to be touched.  */
  switch (mode)
    {
    case E_SFmode:
      dst = gen_lowpart (SImode, operands[0]);

      if (code == ABS)
	{
	  set = gen_int_mode (0x7fffffff, SImode);
	  absneg_op = AND;
	}
      else
	{
	  set = gen_int_mode (0x80000000, SImode);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    case E_DFmode:
      if (TARGET_64BIT)
	{
	  /* Operate on bit 63 directly via a ZERO_EXTRACT destination.  */
	  dst = gen_lowpart (DImode, operands[0]);
	  dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));

	  if (code == ABS)
	    set = const0_rtx;
	  else
	    set = gen_rtx_NOT (DImode, dst);
	}
      else
	{
	  /* Sign bit lives in the high SImode word on 32-bit.  */
	  dst = gen_highpart (SImode, operands[0]);

	  if (code == ABS)
	    {
	      set = gen_int_mode (0x7fffffff, SImode);
	      absneg_op = AND;
	    }
	  else
	    {
	      set = gen_int_mode (0x80000000, SImode);
	      absneg_op = XOR;
	    }
	  set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
	}
      break;

    case E_XFmode:
      /* Sign+exponent word of the 80-bit value.  */
      dst = gen_rtx_REG (SImode,
			 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
      if (code == ABS)
	{
	  set = GEN_INT (0x7fff);
	  absneg_op = AND;
	}
      else
	{
	  set = GEN_INT (0x8000);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    default:
      gcc_unreachable ();
    }

  set = gen_rtx_SET (dst, set);

  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
  rtvec par = gen_rtvec (2, set, clob);

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}

/* Expand a copysign operation.  Special case operand 0 being a constant.  */

void
ix86_expand_copysign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, op1, mask;

  dest = operands[0];
  op0 = operands[1];
  op1 = operands[2];

  mode = GET_MODE (dest);

  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else if (mode == TFmode)
    vmode = mode;
  else
    gcc_unreachable ();

  mask = ix86_build_signbit_mask (vmode, 0, 0);

  if (CONST_DOUBLE_P (op0))
    {
      /* Only the magnitude of a constant op0 matters; drop its sign.  */
      if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
	op0 = simplify_unary_operation (ABS, mode, op0, mode);

      if (mode == SFmode || mode == DFmode)
	{
	  if (op0 == CONST0_RTX (mode))
	    op0 = CONST0_RTX (vmode);
	  else
	    {
	      rtx v = ix86_build_const_vector (vmode, false, op0);

	      op0 = force_reg (vmode, v);
	    }
	}
      else if (op0 != CONST0_RTX (mode))
	op0 = force_reg (mode, op0);

      emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
    }
  else
    {
      rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);

      emit_insn (gen_copysign3_var
		 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
    }
}

/* Deconstruct a copysign operation into bit masks.  Operand 0 is known to
   be a constant, and so has already been expanded into a vector constant.  */

void
ix86_split_copysign_const (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, mask, x;

  dest = operands[0];
  op0 = operands[1];
  mask = operands[3];

  mode = GET_MODE (dest);
  vmode = GET_MODE (mask);

  /* dest = (dest & signbit-mask) | |op0|.  */
  dest = lowpart_subreg (vmode, dest, mode);
  x = gen_rtx_AND (vmode, dest, mask);
  emit_insn (gen_rtx_SET (dest, x));

  if (op0 != CONST0_RTX (vmode))
    {
      x = gen_rtx_IOR (vmode, dest, op0);
      emit_insn (gen_rtx_SET (dest, x));
    }
}

/* Deconstruct a copysign operation into bit masks.
   Operand 0 is variable, so we have to do two masks.  */

void
ix86_split_copysign_var (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, scratch, op0, op1, mask, nmask, x;

  dest = operands[0];
  scratch = operands[1];
  op0 = operands[2];
  op1 = operands[3];
  nmask = operands[4];
  mask = operands[5];

  mode = GET_MODE (dest);
  vmode = GET_MODE (mask);

  if (rtx_equal_p (op0, op1))
    {
      /* Shouldn't happen often (it's useless, obviously), but when it does
	 we'd generate incorrect code if we continue below.  */
      emit_move_insn (dest, op0);
      return;
    }

  /* Result is (op0 & ~signbit) | (op1 & signbit); the register-allocation
     alternatives below differ only in which hard regs coincide.  */
  if (REG_P (mask) && REGNO (dest) == REGNO (mask))	/* alternative 0 */
    {
      gcc_assert (REGNO (op1) == REGNO (scratch));

      x = gen_rtx_AND (vmode, scratch, mask);
      emit_insn (gen_rtx_SET (scratch, x));

      dest = mask;
      op0 = lowpart_subreg (vmode, op0, mode);
      x = gen_rtx_NOT (vmode, dest);
      x = gen_rtx_AND (vmode, x, op0);
      emit_insn (gen_rtx_SET (dest, x));
    }
  else
    {
      if (REGNO (op1) == REGNO (scratch))		/* alternative 1,3 */
	{
	  x = gen_rtx_AND (vmode, scratch, mask);
	}
      else						/* alternative 2,4 */
	{
	  gcc_assert (REGNO (mask) == REGNO (scratch));
	  op1 = lowpart_subreg (vmode, op1, mode);
	  x = gen_rtx_AND (vmode, scratch, op1);
	}
      emit_insn (gen_rtx_SET (scratch, x));

      if (REGNO (op0) == REGNO (dest))			/* alternative 1,2 */
	{
	  dest = lowpart_subreg (vmode, op0, mode);
	  x = gen_rtx_AND (vmode, dest, nmask);
	}
      else						/* alternative 3,4 */
	{
	  gcc_assert (REGNO (nmask) == REGNO (dest));
	  dest = nmask;
	  op0 = lowpart_subreg (vmode, op0, mode);
	  x = gen_rtx_AND (vmode, dest, op0);
	}
      emit_insn (gen_rtx_SET (dest, x));
    }

  x = gen_rtx_IOR (vmode, dest, scratch);
  emit_insn (gen_rtx_SET (dest, x));
}

/* Expand an xorsign operation.  */

void
ix86_expand_xorsign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, op1, mask;

  dest = operands[0];
  op0 = operands[1];
  op1 = operands[2];

  mode = GET_MODE (dest);

  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    gcc_unreachable ();

  mask = ix86_build_signbit_mask (vmode, 0, 0);

  emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
}

/* Deconstruct an xorsign operation into bit masks.  */

void
ix86_split_xorsign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, mask, x;

  dest = operands[0];
  op0 = operands[1];
  mask = operands[3];

  mode = GET_MODE (dest);
  vmode = GET_MODE (mask);

  /* dest = op0 ^ (dest & signbit-mask): flips op0's sign iff dest
     is negative.  */
  dest = lowpart_subreg (vmode, dest, mode);
  x = gen_rtx_AND (vmode, dest, mask);
  emit_insn (gen_rtx_SET (dest, x));

  op0 = lowpart_subreg (vmode, op0, mode);
  x = gen_rtx_XOR (vmode, dest, op0);
  emit_insn (gen_rtx_SET (dest, x));
}

static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);

void
ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
{
  machine_mode mode = GET_MODE (op0);
  rtx tmp;

  /* Handle special case - vector comparison with boolean result, transform
     it using ptest instruction.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;

      gcc_assert (code == EQ || code == NE);
      /* Generate XOR since we can't check that one operand is zero vector.
	 */
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
      tmp = gen_lowpart (p_mode, tmp);
      /* ptest sets ZF iff the vector is all-zero, i.e. op0 == op1.  */
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
			      gen_rtx_UNSPEC (CCmode,
					      gen_rtvec (2, tmp, tmp),
					      UNSPEC_PTEST)));
      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;
    }

  switch (mode)
    {
    case E_SFmode:
    case E_DFmode:
    case E_XFmode:
    case E_QImode:
    case E_HImode:
    case E_SImode:
      simple:
      /* Single compare + conditional jump.  */
      tmp = ix86_expand_compare (code, op0, op1);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;

    case E_DImode:
      if (TARGET_64BIT)
	goto simple;
      /* For 32-bit target DI comparison may be performed on
	 SSE registers.  To allow this we should avoid split
	 to SI mode which is achieved by doing xor in DI mode
	 and then comparing with zero (which is recognized by
	 STV pass).  We don't compare using xor when optimizing
	 for size.  */
      if (!optimize_insn_for_size_p ()
	  && TARGET_STV
	  && (code == EQ || code == NE))
	{
	  op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
	  op1 = const0_rtx;
	}
      /* FALLTHRU */
    case E_TImode:
      /* Expand DImode branch into multiple compare+branch.  */
      {
	rtx lo[2], hi[2];
	rtx_code_label *label2;
	enum rtx_code code1, code2, code3;
	machine_mode submode;

	/* Canonicalize: keep any constant in op1.  */
	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
	  {
	    std::swap (op0, op1);
	    code = swap_condition (code);
	  }

	split_double_mode (mode, &op0, 1, lo+0, hi+0);
	split_double_mode (mode, &op1, 1, lo+1, hi+1);

	submode = mode == DImode ? SImode : DImode;

	/* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
	   avoid two branches.  This costs one extra insn, so disable when
	   optimizing for size.  */

	if ((code == EQ || code == NE)
	    && (!optimize_insn_for_size_p ()
	        || hi[1] == const0_rtx || lo[1] == const0_rtx))
	  {
	    rtx xor0, xor1;

	    xor1 = hi[0];
	    if (hi[1] != const0_rtx)
	      xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
				   NULL_RTX, 0, OPTAB_WIDEN);

	    xor0 = lo[0];
	    if (lo[1] != const0_rtx)
	      xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
				   NULL_RTX, 0, OPTAB_WIDEN);

	    tmp = expand_binop (submode, ior_optab, xor1, xor0,
				NULL_RTX, 0, OPTAB_WIDEN);

	    ix86_expand_branch (code, tmp, const0_rtx, label);
	    return;
	  }

	/* Otherwise, if we are doing less-than or greater-or-equal-than,
	   op1 is a constant and the low word is zero, then we can just
	   examine the high word.  Similarly for low word -1 and
	   less-or-equal-than or greater-than.  */

	if (CONST_INT_P (hi[1]))
	  switch (code)
	    {
	    case LT: case LTU: case GE: case GEU:
	      if (lo[1] == const0_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    case LE: case LEU: case GT: case GTU:
	      if (lo[1] == constm1_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    default:
	      break;
	    }

	/* Emulate comparisons that do not depend on Zero flag with
	   double-word subtraction.  Note that only Overflow, Sign
	   and Carry flags are valid, so swap arguments and condition
	   of comparisons that would otherwise test Zero flag.  */

	switch (code)
	  {
	  case LE: case LEU: case GT: case GTU:
	    std::swap (lo[0], lo[1]);
	    std::swap (hi[0], hi[1]);
	    code = swap_condition (code);
	    /* FALLTHRU */

	  case LT: case LTU: case GE: case GEU:
	    {
	      bool uns = (code == LTU || code == GEU);
	      rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
		= uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;

	      /* Legitimize operands to match the cmp/sbb patterns'
		 predicates before emitting.  */
	      if (!nonimmediate_operand (lo[0], submode))
		lo[0] = force_reg (submode, lo[0]);
	      if (!x86_64_general_operand (lo[1], submode))
		lo[1] = force_reg (submode, lo[1]);

	      if (!register_operand (hi[0], submode))
		hi[0] = force_reg (submode, hi[0]);
	      if ((uns && !nonimmediate_operand (hi[1], submode))
		  || (!uns && !x86_64_general_operand (hi[1], submode)))
		hi[1] = force_reg (submode, hi[1]);

	      /* cmp on the low halves, then sbb on the high halves;
	         only the resulting flags are needed.  */
	      emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));

	      tmp = gen_rtx_SCRATCH (submode);
	      emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));

	      tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
	      ix86_expand_branch (code, tmp, const0_rtx, label);
	      return;
	    }

	  default:
	    break;
	  }

	/* Otherwise, we need two or three jumps.
	 */

	label2 = gen_label_rtx ();

	code1 = code;
	code2 = swap_condition (code);
	code3 = unsigned_condition (code);

	switch (code)
	  {
	  case LT: case GT: case LTU: case GTU:
	    break;

	  case LE:   code1 = LT;  code2 = GT;  break;
	  case GE:   code1 = GT;  code2 = LT;  break;
	  case LEU:  code1 = LTU; code2 = GTU; break;
	  case GEU:  code1 = GTU; code2 = LTU; break;

	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
	  case NE:   code2 = UNKNOWN; break;

	  default:
	    gcc_unreachable ();
	  }

	/*
	 * a < b =>
	 *    if (hi(a) < hi(b)) goto true;
	 *    if (hi(a) > hi(b)) goto false;
	 *    if (lo(a) < lo(b)) goto true;
	 *  false:
	 */

	if (code1 != UNKNOWN)
	  ix86_expand_branch (code1, hi[0], hi[1], label);
	if (code2 != UNKNOWN)
	  ix86_expand_branch (code2, hi[0], hi[1], label2);

	/* Low halves compare as unsigned regardless of CODE's sign.  */
	ix86_expand_branch (code3, lo[0], lo[1], label);

	if (code2 != UNKNOWN)
	  emit_label (label2);
	return;
      }

    default:
      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
      goto simple;
    }
}

/* Figure out whether to use unordered fp comparisons.  */

static bool
ix86_unordered_fp_compare (enum rtx_code code)
{
  if (!TARGET_IEEE_FP)
    return false;

  switch (code)
    {
    /* Ordered comparisons stay ordered.  */
    case LT:
    case LE:
    case GT:
    case GE:
    case LTGT:
      return false;

    case EQ:
    case NE:

    case UNORDERED:
    case ORDERED:
    case UNLT:
    case UNLE:
    case UNGT:
    case UNGE:
    case UNEQ:
      return true;

    default:
      gcc_unreachable ();
    }
}

/* Return a comparison we can do and that it is equivalent to
   swap_condition (code) apart possibly from orderedness.
   But, never change orderedness if TARGET_IEEE_FP, returning
   UNKNOWN in that case if necessary.  */

static enum rtx_code
ix86_fp_swap_condition (enum rtx_code code)
{
  switch (code)
    {
    case GT:			/* GTU - CF=0 & ZF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLT;
    case GE:			/* GEU - CF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLE;
    case UNLT:			/* LTU - CF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GT;
    case UNLE:			/* LEU - CF=1 | ZF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GE;
    default:
      return swap_condition (code);
    }
}

/* Return cost of comparison CODE using the best strategy for performance.
   All following functions do use number of instructions as a cost metrics.
   In future this should be tweaked to compute bytes for optimize_size and
   take into account performance of various instructions on various CPUs.  */

static int
ix86_fp_comparison_cost (enum rtx_code code)
{
  int arith_cost;

  /* The cost of code using bit-twiddling on %ah.  */
  switch (code)
    {
    case UNLE:
    case UNLT:
    case LTGT:
    case GT:
    case GE:
    case UNORDERED:
    case ORDERED:
    case UNEQ:
      arith_cost = 4;
      break;
    case LT:
    case NE:
    case EQ:
    case UNGE:
      arith_cost = TARGET_IEEE_FP ? 5 : 4;
      break;
    case LE:
    case UNGT:
      arith_cost = TARGET_IEEE_FP ? 6 : 4;
      break;
    default:
      gcc_unreachable ();
    }

  /* comi/sahf strategies beat %ah twiddling only for the pricier codes.  */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      return arith_cost > 4 ? 3 : 2;
    case IX86_FPCMP_SAHF:
      return arith_cost > 4 ? 4 : 3;
    default:
      return arith_cost;
    }
}

/* Swap, force into registers, or otherwise massage the two operands
   to a fp comparison.  The operands are updated in place; the new
   comparison code is returned.
 */

static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx op0 = *pop0, op1 = *pop1;
  machine_mode op_mode = GET_MODE (op0);
  bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);

  /* All of the unordered compare instructions only work on registers.
     The same is true of the fcomi compare instructions.  The XFmode
     compare instructions require registers except when comparing
     against zero or when converting operand 1 from fixed point to
     floating point.  */

  if (!is_sse
      && (unordered_compare
	  || (op_mode == XFmode
	      && ! (standard_80387_constant_p (op0) == 1
		    || standard_80387_constant_p (op1) == 1)
	      && GET_CODE (op1) != FLOAT)
	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
    {
      op0 = force_reg (op_mode, op0);
      op1 = force_reg (op_mode, op1);
    }
  else
    {
      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
	 things around if they appear profitable, otherwise force op0
	 into a register.  */

      if (standard_80387_constant_p (op0) == 0
	  || (MEM_P (op0)
	      && ! (standard_80387_constant_p (op1) == 0
		    || MEM_P (op1))))
	{
	  /* Only swap if the swapped comparison is expressible without
	     changing orderedness (UNKNOWN means it is not).  */
	  enum rtx_code new_code = ix86_fp_swap_condition (code);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (op0, op1);
	      code = new_code;
	    }
	}

      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);

      if (CONSTANT_P (op1))
	{
	  int tmp = standard_80387_constant_p (op1);
	  if (tmp == 0)
	    /* Not an x87 loadable constant: spill it to the constant
	       pool so the compare can reference memory.  */
	    op1 = validize_mem (force_const_mem (op_mode, op1));
	  else if (tmp == 1)
	    {
	      /* 0.0: keep it as fldz only when cmov is unavailable;
		 otherwise a register is preferable.  */
	      if (TARGET_CMOVE)
		op1 = force_reg (op_mode, op1);
	    }
	  else
	    op1 = force_reg (op_mode, op1);
	}
    }

  /* Try to rearrange the comparison to make it cheaper.
   */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);
    }

  *pop0 = op0;
  *pop1 = op1;
  return code;
}

/* Generate insn patterns to do a floating point compare of OPERANDS.
   Returns the flags-user rtx (code applied to the flags register vs 0)
   that a bcc/scc/cmov consumer should test.  */

static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;
  rtx tmp, scratch;

  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    /* Wrap in UNSPEC_NOTRAP so the non-signaling compare form is used.  */
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      /* fcomi/comiss write EFLAGS directly.  */
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
      break;

    case IX86_FPCMP_SAHF:
      /* fnstsw into a scratch, then sahf to move %ah into EFLAGS.  */
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));
      break;

    case IX86_FPCMP_ARITH:
      /* fnstsw into a scratch, then test/and/cmp on its high byte.  */
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.
	 */

      /* The high byte of the fnstsw result holds the x87 condition
	 bits: C0 at 0x01, C2 at 0x04 (set on unordered), C3 at 0x40;
	 0x45 masks all three (see Intel SDM, x87 FPU Status Word).
	 The *_ext_1 patterns operate on that high byte (%ah).  */
      switch (code)
	{
	case GT:
	case UNGT:
	  if (code == GT || !TARGET_IEEE_FP)
	    {
	      /* GT: all of C0/C2/C3 clear.  */
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = EQ;
	    }
	  else
	    {
	      /* UNGT under IEEE: (ah & 0x45) - 1 >= 0x44 iff only C3,
		 or C2 (unordered), was set.  */
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
	      cmp_mode = CCmode;
	      code = GEU;
	    }
	  break;
	case LT:
	case UNLT:
	  if (code == LT && TARGET_IEEE_FP)
	    {
	      /* LT: exactly C0, and neither C2 nor C3.  */
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      /* UNLT (or non-IEEE LT): just test C0.  */
	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case GE:
	case UNGE:
	  if (code == GE || !TARGET_IEEE_FP)
	    {
	      /* GE: neither C0 nor C2 set.  */
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
	      code = EQ;
	    }
	  else
	    {
	      /* UNGE under IEEE: flip C0 so "C0 was set" becomes NE.  */
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case LE:
	case UNLE:
	  if (code == LE && TARGET_IEEE_FP)
	    {
	      /* LE: (ah & 0x45) - 1 < 0x40, i.e. C0 or C3 but not C2.  */
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = LTU;
	    }
	  else
	    {
	      /* UNLE (or non-IEEE LE): any of C0/C2/C3.  */
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = NE;
	    }
	  break;
	case EQ:
	case UNEQ:
	  if (code == EQ && TARGET_IEEE_FP)
	    {
	      /* EQ: exactly C3, with C0 and C2 clear (not unordered).  */
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      /* UNEQ (or non-IEEE EQ): just test C3.  */
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = NE;
	    }
	  break;
	case NE:
	case LTGT:
	  if (code == NE && TARGET_IEEE_FP)
	    {
	      /* NE under IEEE: flip C3 after masking, so "C3 alone"
		 (i.e. equal and ordered) yields zero.  */
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
					     GEN_INT (0x40)));
	      code = NE;
	    }
	  else
	    {
	      /* LTGT (or non-IEEE NE): C3 clear.  */
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = EQ;
	    }
	  break;

	case UNORDERED:
	  /* C2 (0x04) is set on unordered.  */
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = NE;
	  break;
	case ORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = EQ;
	  break;

	default:
	  gcc_unreachable ();
	}
      break;

    default:
      gcc_unreachable();
    }

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
			 const0_rtx);
}

/* Generate insn patterns to do an integer compare of OPERANDS.
   Returns the flags-user rtx (CODE applied to the flags register
   against zero) for the bcc/scc/cmov consumer.  */

static rtx
ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
{
  machine_mode cmpmode;
  rtx tmp, flags;

  /* Pick the CC mode that captures exactly the flag bits CODE needs.  */
  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.
   */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
}

/* Dispatch a comparison to the FP or integer expander depending on the
   mode of OP0; a MODE_CC OP0 means the flags are already set and only
   the flags-user rtx needs to be built.  */
static rtx
ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);

  else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
    {
      gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
      ret = ix86_expand_fp_compare (code, op0, op1);
    }
  else
    ret = ix86_expand_int_compare (code, op0, op1);

  return ret;
}

/* Expand DEST = (OP0 code OP1) as a setcc-style QImode store.  */
void
ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  gcc_assert (GET_MODE (dest) == QImode);

  ret = ix86_expand_compare (code, op0, op1);
  /* Force the flags-user to QImode so it matches the setcc pattern.  */
  PUT_MODE (ret, QImode);
  emit_insn (gen_rtx_SET (dest, ret));
}

/* Expand comparison setting or clearing carry flag.  Return true when
   successful and set pop for the operation.  */
static bool
ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
{
  machine_mode mode
    = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);

  /* Do not handle double-mode compares that go through special path.  */
  if (mode == (TARGET_64BIT ? TImode : DImode))
    return false;

  if (SCALAR_FLOAT_MODE_P (mode))
    {
      rtx compare_op;
      rtx_insn *compare_seq;

      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));

      /* Shortcut:  following common codes never translate
	 into carry flag compares.  */
      if (code == EQ || code == NE || code == UNEQ || code == LTGT
	  || code == ORDERED || code == UNORDERED)
	return false;

      /* These comparisons require zero flag; swap operands so they won't.
	 */
      if ((code == GT || code == UNLE || code == LE || code == UNGT)
	  && !TARGET_IEEE_FP)
	{
	  std::swap (op0, op1);
	  code = swap_condition (code);
	}

      /* Try to expand the comparison and verify that we end up with
	 carry flag based comparison.  This fails to be true only when
	 we decide to expand comparison using arithmetic that is not
	 too common scenario.  */
      start_sequence ();
      compare_op = ix86_expand_fp_compare (code, op0, op1);
      compare_seq = get_insns ();
      end_sequence ();

      if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
	code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
      else
	code = GET_CODE (compare_op);

      if (code != LTU && code != GEU)
	return false;

      /* Only now commit the compare sequence to the insn stream.  */
      emit_insn (compare_seq);
      *pop = compare_op;
      return true;
    }

  if (!INTEGRAL_MODE_P (mode))
    return false;

  switch (code)
    {
    case LTU:
    case GEU:
      break;

    /* Convert a==0 into (unsigned)a<1.  */
    case EQ:
    case NE:
      if (op1 != const0_rtx)
	return false;
      op1 = const1_rtx;
      code = (code == EQ ? LTU : GEU);
      break;

    /* Convert a>b into b<a or a>=b-1.  */
    case GTU:
    case LEU:
      if (CONST_INT_P (op1))
	{
	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
	  /* Bail out on overflow.  We still can swap operands but that
	     would force loading of the constant into register.  */
	  if (op1 == const0_rtx
	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
	    return false;
	  code = (code == GTU ? GEU : LTU);
	}
      else
	{
	  std::swap (op0, op1);
	  code = (code == GTU ? LTU : GEU);
	}
      break;

    /* Convert a>=0 into (unsigned)a<0x80000000.  */
    case LT:
    case GE:
      if (mode == DImode || op1 != const0_rtx)
	return false;
      /* NOTE(review): for SImode this computes 1 << 31 in (signed) int,
	 which is formally UB on the host — presumably benign with the
	 host compilers GCC supports, but HOST_WIDE_INT_1U << ... would
	 be cleaner; confirm against current upstream.  */
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LT ? GEU : LTU);
      break;
    case LE:
    case GT:
      if (mode == DImode || op1 != constm1_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LE ? GEU : LTU);
      break;

    default:
      return false;
    }
  /* Swapping operands may cause constant to appear as first operand.  */
  if (!nonimmediate_operand (op0, VOIDmode))
    {
      if (!can_create_pseudo_p ())
	return false;
      op0 = force_reg (mode, op0);
    }
  *pop = ix86_expand_compare (code, op0, op1);
  gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
  return true;
}

/* Expand conditional increment or decrement using adb/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  */
bool
ix86_expand_int_addcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx flags;
  rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
  rtx compare_op;
  rtx val = const0_rtx;
  bool fpcmp = false;
  machine_mode mode;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  /* Only increments/decrements by one are handled here.  */
  if (operands[3] != const1_rtx
      && operands[3] != constm1_rtx)
    return false;
  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
    return false;
  code = GET_CODE (compare_op);

  flags = XEXP (compare_op, 0);

  if (GET_MODE (flags) == CCFPmode)
    {
      fpcmp = true;
      code = ix86_fp_compare_code_to_integer (code);
    }

  /* Normalize to the LTU (carry set) sense, reversing the stored
     condition in place when needed.  */
  if (code != LTU)
    {
      val = constm1_rtx;
      if (fpcmp)
	PUT_CODE (compare_op,
		  reverse_condition_maybe_unordered
		    (GET_CODE (compare_op)));
      else
	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
    }

  mode = GET_MODE (operands[0]);

  /* Construct either adc or sbb insn.
   */
  /* adc x,0 adds the carry; sbb x,-1 subtracts (-1 + carry), so the two
     forms cover both signs of the adjustment for either carry sense.  */
  if ((code == LTU) == (operands[3] == constm1_rtx))
    insn = gen_sub3_carry;
  else
    insn = gen_add3_carry;

  emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));

  return true;
}

/* Expand a conditional move operands[0] = operands[1] ? operands[2]
   : operands[3] without branches where profitable.  Returns false when
   the generic (branching or cmov-by-generic) path should be used.  */
bool
ix86_expand_int_movcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]), compare_code;
  rtx_insn *compare_seq;
  rtx compare_op;
  machine_mode mode = GET_MODE (operands[0]);
  bool sign_bit_compare_p = false;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  /* Expand the compare into a pending sequence; it is only emitted on
     the paths below that actually need the flags.  */
  start_sequence ();
  compare_op = ix86_expand_compare (code, op0, op1);
  compare_seq = get_insns ();
  end_sequence ();

  compare_code = GET_CODE (compare_op);

  /* x >= 0 / x < 0 and x > -1 / x <= -1 test only the sign bit.  */
  if ((op1 == const0_rtx && (code == GE || code == LT))
      || (op1 == constm1_rtx && (code == GT || code == LE)))
    sign_bit_compare_p = true;

  /* Don't attempt mode expansion here -- if we had to expand 5 or 6
     HImode insns, we'd be swallowed in word prefix ops.  */

  if ((mode != HImode || TARGET_FAST_PREFIX)
      && (mode != (TARGET_64BIT ? TImode : DImode))
      && CONST_INT_P (operands[2])
      && CONST_INT_P (operands[3]))
    {
      rtx out = operands[0];
      HOST_WIDE_INT ct = INTVAL (operands[2]);
      HOST_WIDE_INT cf = INTVAL (operands[3]);
      HOST_WIDE_INT diff;

      diff = ct - cf;
      /* Sign bit compares are better done using shifts than we do by using
	 sbb.  */
      if (sign_bit_compare_p
	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
	{
	  /* Detect overlap between destination and compare sources.
	   */
	  rtx tmp = out;

	  if (!sign_bit_compare_p)
	    {
	      rtx flags;
	      bool fpcmp = false;

	      compare_code = GET_CODE (compare_op);

	      flags = XEXP (compare_op, 0);

	      if (GET_MODE (flags) == CCFPmode)
		{
		  fpcmp = true;
		  compare_code
		    = ix86_fp_compare_code_to_integer (compare_code);
		}

	      /* To simplify rest of code, restrict to the GEU case.  */
	      if (compare_code == LTU)
		{
		  std::swap (ct, cf);
		  compare_code = reverse_condition (compare_code);
		  code = reverse_condition (code);
		}
	      else
		{
		  if (fpcmp)
		    PUT_CODE (compare_op,
			      reverse_condition_maybe_unordered
			        (GET_CODE (compare_op)));
		  else
		    PUT_CODE (compare_op,
			      reverse_condition (GET_CODE (compare_op)));
		}
	      diff = ct - cf;

	      /* sbb writes tmp before the sources die; use a fresh
		 register if the destination overlaps them.  */
	      if (reg_overlap_mentioned_p (out, op0)
		  || reg_overlap_mentioned_p (out, op1))
		tmp = gen_reg_rtx (mode);

	      /* tmp = carry ? -1 : 0 via sbb tmp,tmp.  */
	      if (mode == DImode)
		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
	      else
		emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
						 flags, compare_op));
	    }
	  else
	    {
	      /* Sign-bit case: get the -1/0 mask with a store-flag
		 (arithmetic shift) sequence instead of sbb.  */
	      if (code == GT || code == GE)
		code = reverse_condition (code);
	      else
		{
		  std::swap (ct, cf);
		  diff = ct - cf;
		}
	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
	    }

	  /* tmp now holds -1 when the (normalized) condition is true,
	     0 otherwise; massage it into ct/cf with cheap arithmetic.  */
	  if (diff == 1)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * [addl dest, ct]
	       *
	       * Size 5 - 8.
	       */
	      if (ct)
		tmp = expand_simple_binop (mode, PLUS,
					   tmp, GEN_INT (ct),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else if (cf == -1)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * orl $ct, dest
	       *
	       * Size 8.
	       */
	      tmp = expand_simple_binop (mode, IOR,
					 tmp, GEN_INT (ct),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else if (diff == -1 && ct)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * notl dest
	       * [addl dest, cf]
	       *
	       * Size 8 - 11.
	       */
	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
	      if (cf)
		tmp = expand_simple_binop (mode, PLUS,
					   copy_rtx (tmp), GEN_INT (cf),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * [notl dest]
	       * andl cf - ct, dest
	       * [addl dest, ct]
	       *
	       * Size 8 - 11.
	       */

	      if (cf == 0)
		{
		  /* Fold the final add away by computing the cf value
		     through the complemented mask instead.  */
		  cf = ct;
		  ct = 0;
		  tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
		}

	      tmp = expand_simple_binop (mode, AND,
					 copy_rtx (tmp),
					 gen_int_mode (cf - ct, mode),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);
	      if (ct)
		tmp = expand_simple_binop (mode, PLUS,
					   copy_rtx (tmp), GEN_INT (ct),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }

	  if (!rtx_equal_p (tmp, out))
	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));

	  return true;
	}

      if (diff < 0)
	{
	  /* Prefer a positive diff: reverse the condition and swap the
	     two constants, when the reversed condition is expressible.  */
	  machine_mode cmp_mode = GET_MODE (op0);
	  enum rtx_code new_code;

	  if (SCALAR_FLOAT_MODE_P (cmp_mode))
	    {
	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

	      /* We may be reversing a non-trapping
		 comparison to a trapping comparison.
	       */
	      if (HONOR_NANS (cmp_mode) && flag_trapping_math
		  && code != EQ && code != NE
		  && code != ORDERED && code != UNORDERED)
		new_code = UNKNOWN;
	      else
		new_code = reverse_condition_maybe_unordered (code);
	    }
	  else
	    new_code = ix86_reverse_condition (code, cmp_mode);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (ct, cf);
	      diff = -diff;
	      code = new_code;
	    }
	}

      /* See whether the condition is (or can be rewritten as) a pure
	 sign-bit test; compare_code records the canonical LT/GE form.  */
      compare_code = UNKNOWN;
      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
	  && CONST_INT_P (op1))
	{
	  if (op1 == const0_rtx
	      && (code == LT || code == GE))
	    compare_code = code;
	  else if (op1 == constm1_rtx)
	    {
	      if (code == LE)
		compare_code = LT;
	      else if (code == GT)
		compare_code = GE;
	    }
	}

      /* Optimize dest = (op0 < 0) ? -1 : cf.  */
      if (compare_code != UNKNOWN
	  && GET_MODE (op0) == GET_MODE (out)
	  && (cf == -1 || ct == -1))
	{
	  /* If lea code below could be used, only optimize
	     if it results in a 2 insn sequence.  */

	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
		 || diff == 3 || diff == 5 || diff == 9)
	      || (compare_code == LT && ct == -1)
	      || (compare_code == GE && cf == -1))
	    {
	      /*
	       * notl op1	(if necessary)
	       * sarl $31, op1
	       * orl cf, op1
	       */
	      if (ct != -1)
		{
		  cf = ct;
		  ct = -1;
		  code = reverse_condition (code);
		}

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);

	      out = expand_simple_binop (mode, IOR,
					 out, GEN_INT (cf),
					 out, 1, OPTAB_DIRECT);
	      if (out != operands[0])
		emit_move_insn (operands[0], out);

	      return true;
	    }
	}


      /* diff values expressible as an lea scale (1/2/4/8) or
	 scale-plus-base (3/5/9) allow a setcc + lea sequence.  */
      if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
	   || diff == 3 || diff == 5 || diff == 9)
	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
	  && (mode != DImode
	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
	{
	  /*
	   * xorl dest,dest
	   * cmpl op1,op2
	   * setcc dest
	   * lea cf(dest*(ct-cf)),dest
	   *
	   * Size 14.
	   *
	   * This also catches the degenerate setcc-only case.
	   */

	  rtx tmp;
	  int nops;

	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	  nops = 0;
	  /* On x86_64 the lea instruction operates on Pmode, so we need
	     to get arithmetics done in proper mode to match.
	   */
	  /* Build dest*diff + cf as a (MULT/PLUS) tree; one op can be
	     folded into a single lea, more need a full expansion.  */
	  if (diff == 1)
	    tmp = copy_rtx (out);
	  else
	    {
	      rtx out1;
	      out1 = copy_rtx (out);
	      /* diff & ~1 is the lea scale; the odd bit becomes +base.  */
	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
	      nops++;
	      if (diff & 1)
		{
		  tmp = gen_rtx_PLUS (mode, tmp, out1);
		  nops++;
		}
	    }
	  if (cf != 0)
	    {
	      tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
	      nops++;
	    }
	  if (!rtx_equal_p (tmp, out))
	    {
	      if (nops == 1)
		out = force_operand (tmp, copy_rtx (out));
	      else
		emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
	    }
	  if (!rtx_equal_p (out, operands[0]))
	    emit_move_insn (operands[0], copy_rtx (out));

	  return true;
	}

      /*
       * General case:			Jumpful:
       *   xorl dest,dest		cmpl op1, op2
       *   cmpl op1, op2		movl ct, dest
       *   setcc dest			jcc 1f
       *   decl dest			movl cf, dest
       *   andl (cf-ct),dest		1:
       *   addl ct,dest
       *
       * Size 20.			Size 14.
       *
       * This is reasonably steep, but branch mispredict costs are
       * high on modern cpus, so consider failing only if optimizing
       * for space.
       */

      if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
	  && BRANCH_COST (optimize_insn_for_speed_p (),
			  false) >= 2)
	{
	  if (cf == 0)
	    {
	      /* Try to make cf the nonzero constant: reversing the
		 condition swaps the roles of ct and cf.  */
	      machine_mode cmp_mode = GET_MODE (op0);
	      enum rtx_code new_code;

	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
		{
		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

		  /* We may be reversing a non-trapping
		     comparison to a trapping comparison.
		   */
		  if (HONOR_NANS (cmp_mode) && flag_trapping_math
		      && code != EQ && code != NE
		      && code != ORDERED && code != UNORDERED)
		    new_code = UNKNOWN;
		  else
		    new_code = reverse_condition_maybe_unordered (code);

		}
	      else
		{
		  new_code = ix86_reverse_condition (code, cmp_mode);
		  /* Keep the canonical sign-bit code in sync with the
		     reversed condition.  */
		  if (compare_code != UNKNOWN && new_code != UNKNOWN)
		    compare_code = reverse_condition (compare_code);
		}

	      if (new_code != UNKNOWN)
		{
		  cf = ct;
		  ct = 0;
		  code = new_code;
		}
	    }

	  if (compare_code != UNKNOWN)
	    {
	      /* notl op1	(if needed)
		 sarl $31, op1
		 andl (cf-ct), op1
		 addl ct, op1

		 For x < 0 (resp. x <= -1) there will be no notl,
		 so if possible swap the constants to get rid of the
		 complement.
		 True/false will be -1/0 while code below (store flag
		 followed by decrement) is 0/-1, so the constants need
		 to be exchanged once more.  */

	      if (compare_code == GE || !cf)
		{
		  code = reverse_condition (code);
		  compare_code = LT;
		}
	      else
		std::swap (ct, cf);

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
	    }
	  else
	    {
	      /* No sign-bit shortcut: setcc produces 0/1; decrement to
		 get the -1/0 mask expected below.  */
	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	      out = expand_simple_binop (mode, PLUS, copy_rtx (out),
					 constm1_rtx,
					 copy_rtx (out), 1, OPTAB_DIRECT);
	    }

	  out = expand_simple_binop (mode, AND, copy_rtx (out),
				     gen_int_mode (cf - ct, mode),
				     copy_rtx (out), 1, OPTAB_DIRECT);
	  if (ct)
	    out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
				       copy_rtx (out), 1, OPTAB_DIRECT);
	  if (!rtx_equal_p (out, operands[0]))
	    emit_move_insn (operands[0], copy_rtx (out));

	  return true;
	}
    }

  if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
    {
      /* Try a few things more with specific constants and a variable.
*/ 3304 3305 optab op; 3306 rtx var, orig_out, out, tmp; 3307 3308 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2) 3309 return false; 3310 3311 /* If one of the two operands is an interesting constant, load a 3312 constant with the above and mask it in with a logical operation. */ 3313 3314 if (CONST_INT_P (operands[2])) 3315 { 3316 var = operands[3]; 3317 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx) 3318 operands[3] = constm1_rtx, op = and_optab; 3319 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx) 3320 operands[3] = const0_rtx, op = ior_optab; 3321 else 3322 return false; 3323 } 3324 else if (CONST_INT_P (operands[3])) 3325 { 3326 var = operands[2]; 3327 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx) 3328 operands[2] = constm1_rtx, op = and_optab; 3329 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx) 3330 operands[2] = const0_rtx, op = ior_optab; 3331 else 3332 return false; 3333 } 3334 else 3335 return false; 3336 3337 orig_out = operands[0]; 3338 tmp = gen_reg_rtx (mode); 3339 operands[0] = tmp; 3340 3341 /* Recurse to get the constant loaded. */ 3342 if (!ix86_expand_int_movcc (operands)) 3343 return false; 3344 3345 /* Mask in the interesting variable. */ 3346 out = expand_binop (mode, op, var, tmp, orig_out, 0, 3347 OPTAB_WIDEN); 3348 if (!rtx_equal_p (out, orig_out)) 3349 emit_move_insn (copy_rtx (orig_out), copy_rtx (out)); 3350 3351 return true; 3352 } 3353 3354 /* 3355 * For comparison with above, 3356 * 3357 * movl cf,dest 3358 * movl ct,tmp 3359 * cmpl op1,op2 3360 * cmovcc tmp,dest 3361 * 3362 * Size 15. 3363 */ 3364 3365 if (! nonimmediate_operand (operands[2], mode)) 3366 operands[2] = force_reg (mode, operands[2]); 3367 if (! nonimmediate_operand (operands[3], mode)) 3368 operands[3] = force_reg (mode, operands[3]); 3369 3370 if (! register_operand (operands[2], VOIDmode) 3371 && (mode == QImode 3372 || ! 
	     register_operand (operands[3], VOIDmode)))
    operands[2] = force_reg (mode, operands[2]);

  if (mode == QImode
      && ! register_operand (operands[3], VOIDmode))
    operands[3] = force_reg (mode, operands[3]);

  emit_insn (compare_seq);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode,
						compare_op, operands[2],
						operands[3])));
  return true;
}

/* Detect conditional moves that exactly match min/max operational
   semantics.  Note that this is IEEE safe, as long as we don't
   interchange the operands.

   Returns FALSE if this conditional move doesn't match a MIN/MAX,
   and TRUE if the operation is successful and instructions are emitted.  */

static bool
ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
			   rtx cmp_op1, rtx if_true, rtx if_false)
{
  machine_mode mode;
  bool is_min;
  rtx tmp;

  /* Only LT (and UNGE, which is LT with the arms exchanged) can map
     onto the minss/maxss operand order.  */
  if (code == LT)
    ;
  else if (code == UNGE)
    std::swap (if_true, if_false);
  else
    return false;

  if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
    is_min = true;
  else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
    is_min = false;
  else
    return false;

  mode = GET_MODE (dest);

  /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
     but MODE may be a vector mode and thus not appropriate.  */
  if (!flag_finite_math_only || flag_signed_zeros)
    {
      /* Use the IEEE-semantics unspec forms, which fix operand order.  */
      int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
      rtvec v;

      if_true = force_reg (mode, if_true);
      v = gen_rtvec (2, if_true, if_false);
      tmp = gen_rtx_UNSPEC (mode, v, u);
    }
  else
    {
      code = is_min ? SMIN : SMAX;
      if (MEM_P (if_true) && MEM_P (if_false))
	if_true = force_reg (mode, if_true);
      tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
    }

  emit_insn (gen_rtx_SET (dest, tmp));
  return true;
}

/* Return true if MODE is valid for vector compare to mask register,
   Same result for conditionl vector move with mask register.  */
static bool
ix86_valid_mask_cmp_mode (machine_mode mode)
{
  /* XOP has its own vector conditional movement.  */
  if (TARGET_XOP && !TARGET_AVX512F)
    return false;

  /* AVX512F is needed for mask operation.  */
  if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
    return false;

  /* AVX512BW is needed for vector QI/HImode,
     AVX512VL is needed for 128/256-bit vector.  */
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int vector_size = GET_MODE_SIZE (mode);
  if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
    return false;

  return vector_size == 64 || TARGET_AVX512VL;
}

/* Expand an SSE comparison.  Return the register with the result.  */

static rtx
ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
		     rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmp_ops_mode = GET_MODE (cmp_op0);

  /* In general case result of comparison can differ from operands' type.  */
  machine_mode cmp_mode;

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = false;
  rtx x;

  if (ix86_valid_mask_cmp_mode (cmp_ops_mode))
    {
      /* One mask bit per element; the smallest mask mode is QImode.  */
      unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
      maskcmp = true;
      cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
    }
  else
    cmp_mode = cmp_ops_mode;

  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);

  int (*op1_predicate)(rtx, machine_mode)
    = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;

  if (!op1_predicate (cmp_op1, cmp_ops_mode))
    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);

  /* Use a fresh destination when DEST cannot safely receive the result
     (wrong mode, overlaps an arm, or we want optimizable pseudos).  */
  if (optimize
      || (maskcmp && cmp_mode != mode)
      || (op_true && reg_overlap_mentioned_p (dest, op_true))
      || (op_false && reg_overlap_mentioned_p (dest, op_false)))
    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);

  if (maskcmp)
    {
      bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
      gcc_assert (ok);
      return dest;
    }

  x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);

  if (cmp_mode != mode && !maskcmp)
    {
      x = force_reg (cmp_ops_mode, x);
      convert_move (dest, x, false);
    }
  else
    emit_insn (gen_rtx_SET (dest, x));

  return dest;
}

/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
   operations.  This is used for both scalar and vector conditional moves.  */

void
ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmpmode = GET_MODE (cmp);

  /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
  if (rtx_equal_p (op_true, op_false))
    {
      emit_move_insn (dest, op_true);
      return;
    }

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);

  rtx t2, t3, x;

  /* If we have an integer mask and FP value then we need
     to cast mask to FP mode.
*/ 3546 if (mode != cmpmode && VECTOR_MODE_P (cmpmode)) 3547 { 3548 cmp = force_reg (cmpmode, cmp); 3549 cmp = gen_rtx_SUBREG (mode, cmp, 0); 3550 } 3551 3552 if (maskcmp) 3553 { 3554 /* Using vector move with mask register. */ 3555 cmp = force_reg (cmpmode, cmp); 3556 /* Optimize for mask zero. */ 3557 op_true = (op_true != CONST0_RTX (mode) 3558 ? force_reg (mode, op_true) : op_true); 3559 op_false = (op_false != CONST0_RTX (mode) 3560 ? force_reg (mode, op_false) : op_false); 3561 if (op_true == CONST0_RTX (mode)) 3562 { 3563 rtx (*gen_not) (rtx, rtx); 3564 switch (cmpmode) 3565 { 3566 case E_QImode: gen_not = gen_knotqi; break; 3567 case E_HImode: gen_not = gen_knothi; break; 3568 case E_SImode: gen_not = gen_knotsi; break; 3569 case E_DImode: gen_not = gen_knotdi; break; 3570 default: gcc_unreachable (); 3571 } 3572 rtx n = gen_reg_rtx (cmpmode); 3573 emit_insn (gen_not (n, cmp)); 3574 cmp = n; 3575 /* Reverse op_true op_false. */ 3576 std::swap (op_true, op_false); 3577 } 3578 3579 rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp); 3580 emit_insn (gen_rtx_SET (dest, vec_merge)); 3581 return; 3582 } 3583 else if (vector_all_ones_operand (op_true, mode) 3584 && op_false == CONST0_RTX (mode)) 3585 { 3586 emit_insn (gen_rtx_SET (dest, cmp)); 3587 return; 3588 } 3589 else if (op_false == CONST0_RTX (mode)) 3590 { 3591 op_true = force_reg (mode, op_true); 3592 x = gen_rtx_AND (mode, cmp, op_true); 3593 emit_insn (gen_rtx_SET (dest, x)); 3594 return; 3595 } 3596 else if (op_true == CONST0_RTX (mode)) 3597 { 3598 op_false = force_reg (mode, op_false); 3599 x = gen_rtx_NOT (mode, cmp); 3600 x = gen_rtx_AND (mode, x, op_false); 3601 emit_insn (gen_rtx_SET (dest, x)); 3602 return; 3603 } 3604 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)) 3605 { 3606 op_false = force_reg (mode, op_false); 3607 x = gen_rtx_IOR (mode, cmp, op_false); 3608 emit_insn (gen_rtx_SET (dest, x)); 3609 return; 3610 } 3611 else if (TARGET_XOP) 3612 { 3613 op_true 
= force_reg (mode, op_true); 3614 3615 if (!nonimmediate_operand (op_false, mode)) 3616 op_false = force_reg (mode, op_false); 3617 3618 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp, 3619 op_true, 3620 op_false))); 3621 return; 3622 } 3623 3624 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; 3625 rtx d = dest; 3626 3627 if (!vector_operand (op_true, mode)) 3628 op_true = force_reg (mode, op_true); 3629 3630 op_false = force_reg (mode, op_false); 3631 3632 switch (mode) 3633 { 3634 case E_V4SFmode: 3635 if (TARGET_SSE4_1) 3636 gen = gen_sse4_1_blendvps; 3637 break; 3638 case E_V2DFmode: 3639 if (TARGET_SSE4_1) 3640 gen = gen_sse4_1_blendvpd; 3641 break; 3642 case E_SFmode: 3643 if (TARGET_SSE4_1) 3644 { 3645 gen = gen_sse4_1_blendvss; 3646 op_true = force_reg (mode, op_true); 3647 } 3648 break; 3649 case E_DFmode: 3650 if (TARGET_SSE4_1) 3651 { 3652 gen = gen_sse4_1_blendvsd; 3653 op_true = force_reg (mode, op_true); 3654 } 3655 break; 3656 case E_V16QImode: 3657 case E_V8HImode: 3658 case E_V4SImode: 3659 case E_V2DImode: 3660 if (TARGET_SSE4_1) 3661 { 3662 gen = gen_sse4_1_pblendvb; 3663 if (mode != V16QImode) 3664 d = gen_reg_rtx (V16QImode); 3665 op_false = gen_lowpart (V16QImode, op_false); 3666 op_true = gen_lowpart (V16QImode, op_true); 3667 cmp = gen_lowpart (V16QImode, cmp); 3668 } 3669 break; 3670 case E_V8SFmode: 3671 if (TARGET_AVX) 3672 gen = gen_avx_blendvps256; 3673 break; 3674 case E_V4DFmode: 3675 if (TARGET_AVX) 3676 gen = gen_avx_blendvpd256; 3677 break; 3678 case E_V32QImode: 3679 case E_V16HImode: 3680 case E_V8SImode: 3681 case E_V4DImode: 3682 if (TARGET_AVX2) 3683 { 3684 gen = gen_avx2_pblendvb; 3685 if (mode != V32QImode) 3686 d = gen_reg_rtx (V32QImode); 3687 op_false = gen_lowpart (V32QImode, op_false); 3688 op_true = gen_lowpart (V32QImode, op_true); 3689 cmp = gen_lowpart (V32QImode, cmp); 3690 } 3691 break; 3692 3693 case E_V64QImode: 3694 gen = gen_avx512bw_blendmv64qi; 3695 break; 3696 case E_V32HImode: 3697 gen = 
gen_avx512bw_blendmv32hi; 3698 break; 3699 case E_V16SImode: 3700 gen = gen_avx512f_blendmv16si; 3701 break; 3702 case E_V8DImode: 3703 gen = gen_avx512f_blendmv8di; 3704 break; 3705 case E_V8DFmode: 3706 gen = gen_avx512f_blendmv8df; 3707 break; 3708 case E_V16SFmode: 3709 gen = gen_avx512f_blendmv16sf; 3710 break; 3711 3712 default: 3713 break; 3714 } 3715 3716 if (gen != NULL) 3717 { 3718 emit_insn (gen (d, op_false, op_true, cmp)); 3719 if (d != dest) 3720 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d)); 3721 } 3722 else 3723 { 3724 op_true = force_reg (mode, op_true); 3725 3726 t2 = gen_reg_rtx (mode); 3727 if (optimize) 3728 t3 = gen_reg_rtx (mode); 3729 else 3730 t3 = dest; 3731 3732 x = gen_rtx_AND (mode, op_true, cmp); 3733 emit_insn (gen_rtx_SET (t2, x)); 3734 3735 x = gen_rtx_NOT (mode, cmp); 3736 x = gen_rtx_AND (mode, x, op_false); 3737 emit_insn (gen_rtx_SET (t3, x)); 3738 3739 x = gen_rtx_IOR (mode, t3, t2); 3740 emit_insn (gen_rtx_SET (dest, x)); 3741 } 3742} 3743 3744/* Swap, force into registers, or otherwise massage the two operands 3745 to an sse comparison with a mask result. Thus we differ a bit from 3746 ix86_prepare_fp_compare_args which expects to produce a flags result. 3747 3748 The DEST operand exists to help determine whether to commute commutative 3749 operators. The POP0/POP1 operands are updated in place. The new 3750 comparison code is returned, or UNKNOWN if not implementable. */ 3751 3752static enum rtx_code 3753ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code, 3754 rtx *pop0, rtx *pop1) 3755{ 3756 switch (code) 3757 { 3758 case LTGT: 3759 case UNEQ: 3760 /* AVX supports all the needed comparisons. */ 3761 if (TARGET_AVX) 3762 break; 3763 /* We have no LTGT as an operator. We could implement it with 3764 NE & ORDERED, but this requires an extra temporary. It's 3765 not clear that it's worth it. 
*/ 3766 return UNKNOWN; 3767 3768 case LT: 3769 case LE: 3770 case UNGT: 3771 case UNGE: 3772 /* These are supported directly. */ 3773 break; 3774 3775 case EQ: 3776 case NE: 3777 case UNORDERED: 3778 case ORDERED: 3779 /* AVX has 3 operand comparisons, no need to swap anything. */ 3780 if (TARGET_AVX) 3781 break; 3782 /* For commutative operators, try to canonicalize the destination 3783 operand to be first in the comparison - this helps reload to 3784 avoid extra moves. */ 3785 if (!dest || !rtx_equal_p (dest, *pop1)) 3786 break; 3787 /* FALLTHRU */ 3788 3789 case GE: 3790 case GT: 3791 case UNLE: 3792 case UNLT: 3793 /* These are not supported directly before AVX, and furthermore 3794 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the 3795 comparison operands to transform into something that is 3796 supported. */ 3797 std::swap (*pop0, *pop1); 3798 code = swap_condition (code); 3799 break; 3800 3801 default: 3802 gcc_unreachable (); 3803 } 3804 3805 return code; 3806} 3807 3808/* Expand a floating-point conditional move. Return true if successful. */ 3809 3810bool 3811ix86_expand_fp_movcc (rtx operands[]) 3812{ 3813 machine_mode mode = GET_MODE (operands[0]); 3814 enum rtx_code code = GET_CODE (operands[1]); 3815 rtx tmp, compare_op; 3816 rtx op0 = XEXP (operands[1], 0); 3817 rtx op1 = XEXP (operands[1], 1); 3818 3819 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode)) 3820 { 3821 machine_mode cmode; 3822 3823 /* Since we've no cmove for sse registers, don't force bad register 3824 allocation just to gain access to it. Deny movcc when the 3825 comparison mode doesn't match the move mode. 
*/ 3826 cmode = GET_MODE (op0); 3827 if (cmode == VOIDmode) 3828 cmode = GET_MODE (op1); 3829 if (cmode != mode) 3830 return false; 3831 3832 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1); 3833 if (code == UNKNOWN) 3834 return false; 3835 3836 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1, 3837 operands[2], operands[3])) 3838 return true; 3839 3840 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1, 3841 operands[2], operands[3]); 3842 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]); 3843 return true; 3844 } 3845 3846 if (GET_MODE (op0) == TImode 3847 || (GET_MODE (op0) == DImode 3848 && !TARGET_64BIT)) 3849 return false; 3850 3851 /* The floating point conditional move instructions don't directly 3852 support conditions resulting from a signed integer comparison. */ 3853 3854 compare_op = ix86_expand_compare (code, op0, op1); 3855 if (!fcmov_comparison_operator (compare_op, VOIDmode)) 3856 { 3857 tmp = gen_reg_rtx (QImode); 3858 ix86_expand_setcc (tmp, code, op0, op1); 3859 3860 compare_op = ix86_expand_compare (NE, tmp, const0_rtx); 3861 } 3862 3863 emit_insn (gen_rtx_SET (operands[0], 3864 gen_rtx_IF_THEN_ELSE (mode, compare_op, 3865 operands[2], operands[3]))); 3866 3867 return true; 3868} 3869 3870/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */ 3871 3872static int 3873ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code) 3874{ 3875 switch (code) 3876 { 3877 case EQ: 3878 return 0; 3879 case LT: 3880 case LTU: 3881 return 1; 3882 case LE: 3883 case LEU: 3884 return 2; 3885 case NE: 3886 return 4; 3887 case GE: 3888 case GEU: 3889 return 5; 3890 case GT: 3891 case GTU: 3892 return 6; 3893 default: 3894 gcc_unreachable (); 3895 } 3896} 3897 3898/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. 
*/ 3899 3900static int 3901ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code) 3902{ 3903 switch (code) 3904 { 3905 case EQ: 3906 return 0x00; 3907 case NE: 3908 return 0x04; 3909 case GT: 3910 return 0x0e; 3911 case LE: 3912 return 0x02; 3913 case GE: 3914 return 0x0d; 3915 case LT: 3916 return 0x01; 3917 case UNLE: 3918 return 0x0a; 3919 case UNLT: 3920 return 0x09; 3921 case UNGE: 3922 return 0x05; 3923 case UNGT: 3924 return 0x06; 3925 case UNEQ: 3926 return 0x18; 3927 case LTGT: 3928 return 0x0c; 3929 case ORDERED: 3930 return 0x07; 3931 case UNORDERED: 3932 return 0x03; 3933 default: 3934 gcc_unreachable (); 3935 } 3936} 3937 3938/* Return immediate value to be used in UNSPEC_PCMP 3939 for comparison CODE in MODE. */ 3940 3941static int 3942ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode) 3943{ 3944 if (FLOAT_MODE_P (mode)) 3945 return ix86_fp_cmp_code_to_pcmp_immediate (code); 3946 return ix86_int_cmp_code_to_pcmp_immediate (code); 3947} 3948 3949/* Expand AVX-512 vector comparison. */ 3950 3951bool 3952ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1) 3953{ 3954 machine_mode mask_mode = GET_MODE (dest); 3955 machine_mode cmp_mode = GET_MODE (cmp_op0); 3956 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode)); 3957 int unspec_code; 3958 rtx unspec; 3959 3960 switch (code) 3961 { 3962 case LEU: 3963 case GTU: 3964 case GEU: 3965 case LTU: 3966 unspec_code = UNSPEC_UNSIGNED_PCMP; 3967 break; 3968 3969 default: 3970 unspec_code = UNSPEC_PCMP; 3971 } 3972 3973 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm), 3974 unspec_code); 3975 emit_insn (gen_rtx_SET (dest, unspec)); 3976 3977 return true; 3978} 3979 3980/* Expand fp vector comparison. 
*/ 3981 3982bool 3983ix86_expand_fp_vec_cmp (rtx operands[]) 3984{ 3985 enum rtx_code code = GET_CODE (operands[1]); 3986 rtx cmp; 3987 3988 code = ix86_prepare_sse_fp_compare_args (operands[0], code, 3989 &operands[2], &operands[3]); 3990 if (code == UNKNOWN) 3991 { 3992 rtx temp; 3993 switch (GET_CODE (operands[1])) 3994 { 3995 case LTGT: 3996 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2], 3997 operands[3], NULL, NULL); 3998 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2], 3999 operands[3], NULL, NULL); 4000 code = AND; 4001 break; 4002 case UNEQ: 4003 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2], 4004 operands[3], NULL, NULL); 4005 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2], 4006 operands[3], NULL, NULL); 4007 code = IOR; 4008 break; 4009 default: 4010 gcc_unreachable (); 4011 } 4012 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, 4013 OPTAB_DIRECT); 4014 } 4015 else 4016 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3], 4017 operands[1], operands[2]); 4018 4019 if (operands[0] != cmp) 4020 emit_move_insn (operands[0], cmp); 4021 4022 return true; 4023} 4024 4025static rtx 4026ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, 4027 rtx op_true, rtx op_false, bool *negate) 4028{ 4029 machine_mode data_mode = GET_MODE (dest); 4030 machine_mode mode = GET_MODE (cop0); 4031 rtx x; 4032 4033 *negate = false; 4034 4035 /* XOP supports all of the comparisons on all 128-bit vector int types. */ 4036 if (TARGET_XOP 4037 && (mode == V16QImode || mode == V8HImode 4038 || mode == V4SImode || mode == V2DImode)) 4039 ; 4040 /* AVX512F supports all of the comparsions 4041 on all 128/256/512-bit vector int types. */ 4042 else if (ix86_valid_mask_cmp_mode (mode)) 4043 ; 4044 else 4045 { 4046 /* Canonicalize the comparison to EQ, GT, GTU. 
*/ 4047 switch (code) 4048 { 4049 case EQ: 4050 case GT: 4051 case GTU: 4052 break; 4053 4054 case NE: 4055 case LE: 4056 case LEU: 4057 code = reverse_condition (code); 4058 *negate = true; 4059 break; 4060 4061 case GE: 4062 case GEU: 4063 code = reverse_condition (code); 4064 *negate = true; 4065 /* FALLTHRU */ 4066 4067 case LT: 4068 case LTU: 4069 std::swap (cop0, cop1); 4070 code = swap_condition (code); 4071 break; 4072 4073 default: 4074 gcc_unreachable (); 4075 } 4076 4077 /* Only SSE4.1/SSE4.2 supports V2DImode. */ 4078 if (mode == V2DImode) 4079 { 4080 switch (code) 4081 { 4082 case EQ: 4083 /* SSE4.1 supports EQ. */ 4084 if (!TARGET_SSE4_1) 4085 return NULL; 4086 break; 4087 4088 case GT: 4089 case GTU: 4090 /* SSE4.2 supports GT/GTU. */ 4091 if (!TARGET_SSE4_2) 4092 return NULL; 4093 break; 4094 4095 default: 4096 gcc_unreachable (); 4097 } 4098 } 4099 4100 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode); 4101 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode); 4102 if (*negate) 4103 std::swap (optrue, opfalse); 4104 4105 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when 4106 not using integer masks into min (x, y) == x ? -1 : 0 (i.e. 4107 min (x, y) == x). While we add one instruction (the minimum), 4108 we remove the need for two instructions in the negation, as the 4109 result is done this way. 4110 When using masks, do it for SI/DImode element types, as it is shorter 4111 than the two subtractions. */ 4112 if ((code != EQ 4113 && GET_MODE_SIZE (mode) != 64 4114 && vector_all_ones_operand (opfalse, data_mode) 4115 && optrue == CONST0_RTX (data_mode)) 4116 || (code == GTU 4117 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4 4118 /* Don't do it if not using integer masks and we'd end up with 4119 the right values in the registers though. 
*/ 4120 && (GET_MODE_SIZE (mode) == 64 4121 || !vector_all_ones_operand (optrue, data_mode) 4122 || opfalse != CONST0_RTX (data_mode)))) 4123 { 4124 rtx (*gen) (rtx, rtx, rtx) = NULL; 4125 4126 switch (mode) 4127 { 4128 case E_V16SImode: 4129 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3; 4130 break; 4131 case E_V8DImode: 4132 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3; 4133 cop0 = force_reg (mode, cop0); 4134 cop1 = force_reg (mode, cop1); 4135 break; 4136 case E_V32QImode: 4137 if (TARGET_AVX2) 4138 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3; 4139 break; 4140 case E_V16HImode: 4141 if (TARGET_AVX2) 4142 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3; 4143 break; 4144 case E_V8SImode: 4145 if (TARGET_AVX2) 4146 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3; 4147 break; 4148 case E_V4DImode: 4149 if (TARGET_AVX512VL) 4150 { 4151 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3; 4152 cop0 = force_reg (mode, cop0); 4153 cop1 = force_reg (mode, cop1); 4154 } 4155 break; 4156 case E_V16QImode: 4157 if (code == GTU && TARGET_SSE2) 4158 gen = gen_uminv16qi3; 4159 else if (code == GT && TARGET_SSE4_1) 4160 gen = gen_sminv16qi3; 4161 break; 4162 case E_V8HImode: 4163 if (code == GTU && TARGET_SSE4_1) 4164 gen = gen_uminv8hi3; 4165 else if (code == GT && TARGET_SSE2) 4166 gen = gen_sminv8hi3; 4167 break; 4168 case E_V4SImode: 4169 if (TARGET_SSE4_1) 4170 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3; 4171 break; 4172 case E_V2DImode: 4173 if (TARGET_AVX512VL) 4174 { 4175 gen = (code == GTU) ? 
gen_uminv2di3 : gen_sminv2di3; 4176 cop0 = force_reg (mode, cop0); 4177 cop1 = force_reg (mode, cop1); 4178 } 4179 break; 4180 default: 4181 break; 4182 } 4183 4184 if (gen) 4185 { 4186 rtx tem = gen_reg_rtx (mode); 4187 if (!vector_operand (cop0, mode)) 4188 cop0 = force_reg (mode, cop0); 4189 if (!vector_operand (cop1, mode)) 4190 cop1 = force_reg (mode, cop1); 4191 *negate = !*negate; 4192 emit_insn (gen (tem, cop0, cop1)); 4193 cop1 = tem; 4194 code = EQ; 4195 } 4196 } 4197 4198 /* Unsigned parallel compare is not supported by the hardware. 4199 Play some tricks to turn this into a signed comparison 4200 against 0. */ 4201 if (code == GTU) 4202 { 4203 cop0 = force_reg (mode, cop0); 4204 4205 switch (mode) 4206 { 4207 case E_V16SImode: 4208 case E_V8DImode: 4209 case E_V8SImode: 4210 case E_V4DImode: 4211 case E_V4SImode: 4212 case E_V2DImode: 4213 { 4214 rtx t1, t2, mask; 4215 4216 /* Subtract (-(INT MAX) - 1) from both operands to make 4217 them signed. */ 4218 mask = ix86_build_signbit_mask (mode, true, false); 4219 t1 = gen_reg_rtx (mode); 4220 emit_insn (gen_sub3_insn (t1, cop0, mask)); 4221 4222 t2 = gen_reg_rtx (mode); 4223 emit_insn (gen_sub3_insn (t2, cop1, mask)); 4224 4225 cop0 = t1; 4226 cop1 = t2; 4227 code = GT; 4228 } 4229 break; 4230 4231 case E_V64QImode: 4232 case E_V32HImode: 4233 case E_V32QImode: 4234 case E_V16HImode: 4235 case E_V16QImode: 4236 case E_V8HImode: 4237 /* Perform a parallel unsigned saturating subtraction. */ 4238 x = gen_reg_rtx (mode); 4239 emit_insn (gen_rtx_SET 4240 (x, gen_rtx_US_MINUS (mode, cop0, cop1))); 4241 cop0 = x; 4242 cop1 = CONST0_RTX (mode); 4243 code = EQ; 4244 *negate = !*negate; 4245 break; 4246 4247 default: 4248 gcc_unreachable (); 4249 } 4250 } 4251 } 4252 4253 if (*negate) 4254 std::swap (op_true, op_false); 4255 4256 /* Allow the comparison to be done in one mode, but the movcc to 4257 happen in another mode. 
*/ 4258 if (data_mode == mode) 4259 { 4260 x = ix86_expand_sse_cmp (dest, code, cop0, cop1, 4261 op_true, op_false); 4262 } 4263 else 4264 { 4265 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode)); 4266 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1, 4267 op_true, op_false); 4268 if (GET_MODE (x) == mode) 4269 x = gen_lowpart (data_mode, x); 4270 } 4271 4272 return x; 4273} 4274 4275/* Expand integer vector comparison. */ 4276 4277bool 4278ix86_expand_int_vec_cmp (rtx operands[]) 4279{ 4280 rtx_code code = GET_CODE (operands[1]); 4281 bool negate = false; 4282 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2], 4283 operands[3], NULL, NULL, &negate); 4284 4285 if (!cmp) 4286 return false; 4287 4288 if (negate) 4289 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp, 4290 CONST0_RTX (GET_MODE (cmp)), 4291 NULL, NULL, &negate); 4292 4293 gcc_assert (!negate); 4294 4295 if (operands[0] != cmp) 4296 emit_move_insn (operands[0], cmp); 4297 4298 return true; 4299} 4300 4301/* Expand a floating-point vector conditional move; a vcond operation 4302 rather than a movcc operation. 
*/ 4303 4304bool 4305ix86_expand_fp_vcond (rtx operands[]) 4306{ 4307 enum rtx_code code = GET_CODE (operands[3]); 4308 rtx cmp; 4309 4310 code = ix86_prepare_sse_fp_compare_args (operands[0], code, 4311 &operands[4], &operands[5]); 4312 if (code == UNKNOWN) 4313 { 4314 rtx temp; 4315 switch (GET_CODE (operands[3])) 4316 { 4317 case LTGT: 4318 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4], 4319 operands[5], operands[0], operands[0]); 4320 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4], 4321 operands[5], operands[1], operands[2]); 4322 code = AND; 4323 break; 4324 case UNEQ: 4325 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4], 4326 operands[5], operands[0], operands[0]); 4327 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4], 4328 operands[5], operands[1], operands[2]); 4329 code = IOR; 4330 break; 4331 default: 4332 gcc_unreachable (); 4333 } 4334 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, 4335 OPTAB_DIRECT); 4336 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); 4337 return true; 4338 } 4339 4340 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4], 4341 operands[5], operands[1], operands[2])) 4342 return true; 4343 4344 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5], 4345 operands[1], operands[2]); 4346 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); 4347 return true; 4348} 4349 4350/* Expand a signed/unsigned integral vector conditional move. */ 4351 4352bool 4353ix86_expand_int_vcond (rtx operands[]) 4354{ 4355 machine_mode data_mode = GET_MODE (operands[0]); 4356 machine_mode mode = GET_MODE (operands[4]); 4357 enum rtx_code code = GET_CODE (operands[3]); 4358 bool negate = false; 4359 rtx x, cop0, cop1; 4360 4361 cop0 = operands[4]; 4362 cop1 = operands[5]; 4363 4364 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31 4365 and x < 0 ? 1 : 0 into (unsigned) x >> 31. 
*/ 4366 if ((code == LT || code == GE) 4367 && data_mode == mode 4368 && cop1 == CONST0_RTX (mode) 4369 && operands[1 + (code == LT)] == CONST0_RTX (data_mode) 4370 && GET_MODE_UNIT_SIZE (data_mode) > 1 4371 && GET_MODE_UNIT_SIZE (data_mode) <= 8 4372 && (GET_MODE_SIZE (data_mode) == 16 4373 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32))) 4374 { 4375 rtx negop = operands[2 - (code == LT)]; 4376 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1; 4377 if (negop == CONST1_RTX (data_mode)) 4378 { 4379 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift), 4380 operands[0], 1, OPTAB_DIRECT); 4381 if (res != operands[0]) 4382 emit_move_insn (operands[0], res); 4383 return true; 4384 } 4385 else if (GET_MODE_INNER (data_mode) != DImode 4386 && vector_all_ones_operand (negop, data_mode)) 4387 { 4388 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift), 4389 operands[0], 0, OPTAB_DIRECT); 4390 if (res != operands[0]) 4391 emit_move_insn (operands[0], res); 4392 return true; 4393 } 4394 } 4395 4396 if (!nonimmediate_operand (cop1, mode)) 4397 cop1 = force_reg (mode, cop1); 4398 if (!general_operand (operands[1], data_mode)) 4399 operands[1] = force_reg (data_mode, operands[1]); 4400 if (!general_operand (operands[2], data_mode)) 4401 operands[2] = force_reg (data_mode, operands[2]); 4402 4403 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1, 4404 operands[1], operands[2], &negate); 4405 4406 if (!x) 4407 return false; 4408 4409 ix86_expand_sse_movcc (operands[0], x, operands[1+negate], 4410 operands[2-negate]); 4411 return true; 4412} 4413 4414static bool 4415ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1, 4416 struct expand_vec_perm_d *d) 4417{ 4418 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const 4419 expander, so args are either in d, or in op0, op1 etc. */ 4420 machine_mode mode = GET_MODE (d ? 
d->op0 : op0); 4421 machine_mode maskmode = mode; 4422 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; 4423 4424 switch (mode) 4425 { 4426 case E_V8HImode: 4427 if (TARGET_AVX512VL && TARGET_AVX512BW) 4428 gen = gen_avx512vl_vpermt2varv8hi3; 4429 break; 4430 case E_V16HImode: 4431 if (TARGET_AVX512VL && TARGET_AVX512BW) 4432 gen = gen_avx512vl_vpermt2varv16hi3; 4433 break; 4434 case E_V64QImode: 4435 if (TARGET_AVX512VBMI) 4436 gen = gen_avx512bw_vpermt2varv64qi3; 4437 break; 4438 case E_V32HImode: 4439 if (TARGET_AVX512BW) 4440 gen = gen_avx512bw_vpermt2varv32hi3; 4441 break; 4442 case E_V4SImode: 4443 if (TARGET_AVX512VL) 4444 gen = gen_avx512vl_vpermt2varv4si3; 4445 break; 4446 case E_V8SImode: 4447 if (TARGET_AVX512VL) 4448 gen = gen_avx512vl_vpermt2varv8si3; 4449 break; 4450 case E_V16SImode: 4451 if (TARGET_AVX512F) 4452 gen = gen_avx512f_vpermt2varv16si3; 4453 break; 4454 case E_V4SFmode: 4455 if (TARGET_AVX512VL) 4456 { 4457 gen = gen_avx512vl_vpermt2varv4sf3; 4458 maskmode = V4SImode; 4459 } 4460 break; 4461 case E_V8SFmode: 4462 if (TARGET_AVX512VL) 4463 { 4464 gen = gen_avx512vl_vpermt2varv8sf3; 4465 maskmode = V8SImode; 4466 } 4467 break; 4468 case E_V16SFmode: 4469 if (TARGET_AVX512F) 4470 { 4471 gen = gen_avx512f_vpermt2varv16sf3; 4472 maskmode = V16SImode; 4473 } 4474 break; 4475 case E_V2DImode: 4476 if (TARGET_AVX512VL) 4477 gen = gen_avx512vl_vpermt2varv2di3; 4478 break; 4479 case E_V4DImode: 4480 if (TARGET_AVX512VL) 4481 gen = gen_avx512vl_vpermt2varv4di3; 4482 break; 4483 case E_V8DImode: 4484 if (TARGET_AVX512F) 4485 gen = gen_avx512f_vpermt2varv8di3; 4486 break; 4487 case E_V2DFmode: 4488 if (TARGET_AVX512VL) 4489 { 4490 gen = gen_avx512vl_vpermt2varv2df3; 4491 maskmode = V2DImode; 4492 } 4493 break; 4494 case E_V4DFmode: 4495 if (TARGET_AVX512VL) 4496 { 4497 gen = gen_avx512vl_vpermt2varv4df3; 4498 maskmode = V4DImode; 4499 } 4500 break; 4501 case E_V8DFmode: 4502 if (TARGET_AVX512F) 4503 { 4504 gen = gen_avx512f_vpermt2varv8df3; 4505 maskmode = 
V8DImode; 4506 } 4507 break; 4508 default: 4509 break; 4510 } 4511 4512 if (gen == NULL) 4513 return false; 4514 4515 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const 4516 expander, so args are either in d, or in op0, op1 etc. */ 4517 if (d) 4518 { 4519 rtx vec[64]; 4520 target = d->target; 4521 op0 = d->op0; 4522 op1 = d->op1; 4523 for (int i = 0; i < d->nelt; ++i) 4524 vec[i] = GEN_INT (d->perm[i]); 4525 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); 4526 } 4527 4528 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1)); 4529 return true; 4530} 4531 4532/* Expand a variable vector permutation. */ 4533 4534void 4535ix86_expand_vec_perm (rtx operands[]) 4536{ 4537 rtx target = operands[0]; 4538 rtx op0 = operands[1]; 4539 rtx op1 = operands[2]; 4540 rtx mask = operands[3]; 4541 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32]; 4542 machine_mode mode = GET_MODE (op0); 4543 machine_mode maskmode = GET_MODE (mask); 4544 int w, e, i; 4545 bool one_operand_shuffle = rtx_equal_p (op0, op1); 4546 4547 /* Number of elements in the vector. 
*/ 4548 w = GET_MODE_NUNITS (mode); 4549 e = GET_MODE_UNIT_SIZE (mode); 4550 gcc_assert (w <= 64); 4551 4552 if (TARGET_AVX512F && one_operand_shuffle) 4553 { 4554 rtx (*gen) (rtx, rtx, rtx) = NULL; 4555 switch (mode) 4556 { 4557 case E_V16SImode: 4558 gen =gen_avx512f_permvarv16si; 4559 break; 4560 case E_V16SFmode: 4561 gen = gen_avx512f_permvarv16sf; 4562 break; 4563 case E_V8DImode: 4564 gen = gen_avx512f_permvarv8di; 4565 break; 4566 case E_V8DFmode: 4567 gen = gen_avx512f_permvarv8df; 4568 break; 4569 default: 4570 break; 4571 } 4572 if (gen != NULL) 4573 { 4574 emit_insn (gen (target, op0, mask)); 4575 return; 4576 } 4577 } 4578 4579 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL)) 4580 return; 4581 4582 if (TARGET_AVX2) 4583 { 4584 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode) 4585 { 4586 /* Unfortunately, the VPERMQ and VPERMPD instructions only support 4587 an constant shuffle operand. With a tiny bit of effort we can 4588 use VPERMD instead. A re-interpretation stall for V4DFmode is 4589 unfortunate but there's no avoiding it. 4590 Similarly for V16HImode we don't have instructions for variable 4591 shuffling, while for V32QImode we can use after preparing suitable 4592 masks vpshufb; vpshufb; vpermq; vpor. */ 4593 4594 if (mode == V16HImode) 4595 { 4596 maskmode = mode = V32QImode; 4597 w = 32; 4598 e = 1; 4599 } 4600 else 4601 { 4602 maskmode = mode = V8SImode; 4603 w = 8; 4604 e = 4; 4605 } 4606 t1 = gen_reg_rtx (maskmode); 4607 4608 /* Replicate the low bits of the V4DImode mask into V8SImode: 4609 mask = { A B C D } 4610 t1 = { A A B B C C D D }. 
*/ 4611 for (i = 0; i < w / 2; ++i) 4612 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2); 4613 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); 4614 vt = force_reg (maskmode, vt); 4615 mask = gen_lowpart (maskmode, mask); 4616 if (maskmode == V8SImode) 4617 emit_insn (gen_avx2_permvarv8si (t1, mask, vt)); 4618 else 4619 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt)); 4620 4621 /* Multiply the shuffle indicies by two. */ 4622 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1, 4623 OPTAB_DIRECT); 4624 4625 /* Add one to the odd shuffle indicies: 4626 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */ 4627 for (i = 0; i < w / 2; ++i) 4628 { 4629 vec[i * 2] = const0_rtx; 4630 vec[i * 2 + 1] = const1_rtx; 4631 } 4632 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); 4633 vt = validize_mem (force_const_mem (maskmode, vt)); 4634 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1, 4635 OPTAB_DIRECT); 4636 4637 /* Continue as if V8SImode (resp. V32QImode) was used initially. */ 4638 operands[3] = mask = t1; 4639 target = gen_reg_rtx (mode); 4640 op0 = gen_lowpart (mode, op0); 4641 op1 = gen_lowpart (mode, op1); 4642 } 4643 4644 switch (mode) 4645 { 4646 case E_V8SImode: 4647 /* The VPERMD and VPERMPS instructions already properly ignore 4648 the high bits of the shuffle elements. No need for us to 4649 perform an AND ourselves. 
*/ 4650 if (one_operand_shuffle) 4651 { 4652 emit_insn (gen_avx2_permvarv8si (target, op0, mask)); 4653 if (target != operands[0]) 4654 emit_move_insn (operands[0], 4655 gen_lowpart (GET_MODE (operands[0]), target)); 4656 } 4657 else 4658 { 4659 t1 = gen_reg_rtx (V8SImode); 4660 t2 = gen_reg_rtx (V8SImode); 4661 emit_insn (gen_avx2_permvarv8si (t1, op0, mask)); 4662 emit_insn (gen_avx2_permvarv8si (t2, op1, mask)); 4663 goto merge_two; 4664 } 4665 return; 4666 4667 case E_V8SFmode: 4668 mask = gen_lowpart (V8SImode, mask); 4669 if (one_operand_shuffle) 4670 emit_insn (gen_avx2_permvarv8sf (target, op0, mask)); 4671 else 4672 { 4673 t1 = gen_reg_rtx (V8SFmode); 4674 t2 = gen_reg_rtx (V8SFmode); 4675 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask)); 4676 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask)); 4677 goto merge_two; 4678 } 4679 return; 4680 4681 case E_V4SImode: 4682 /* By combining the two 128-bit input vectors into one 256-bit 4683 input vector, we can use VPERMD and VPERMPS for the full 4684 two-operand shuffle. 
*/ 4685 t1 = gen_reg_rtx (V8SImode); 4686 t2 = gen_reg_rtx (V8SImode); 4687 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1)); 4688 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); 4689 emit_insn (gen_avx2_permvarv8si (t1, t1, t2)); 4690 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx)); 4691 return; 4692 4693 case E_V4SFmode: 4694 t1 = gen_reg_rtx (V8SFmode); 4695 t2 = gen_reg_rtx (V8SImode); 4696 mask = gen_lowpart (V4SImode, mask); 4697 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1)); 4698 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); 4699 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2)); 4700 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx)); 4701 return; 4702 4703 case E_V32QImode: 4704 t1 = gen_reg_rtx (V32QImode); 4705 t2 = gen_reg_rtx (V32QImode); 4706 t3 = gen_reg_rtx (V32QImode); 4707 vt2 = GEN_INT (-128); 4708 vt = gen_const_vec_duplicate (V32QImode, vt2); 4709 vt = force_reg (V32QImode, vt); 4710 for (i = 0; i < 32; i++) 4711 vec[i] = i < 16 ? vt2 : const0_rtx; 4712 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec)); 4713 vt2 = force_reg (V32QImode, vt2); 4714 /* From mask create two adjusted masks, which contain the same 4715 bits as mask in the low 7 bits of each vector element. 4716 The first mask will have the most significant bit clear 4717 if it requests element from the same 128-bit lane 4718 and MSB set if it requests element from the other 128-bit lane. 4719 The second mask will have the opposite values of the MSB, 4720 and additionally will have its 128-bit lanes swapped. 4721 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have 4722 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and 4723 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ... 4724 stands for other 12 bytes. */ 4725 /* The bit whether element is from the same lane or the other 4726 lane is bit 4, so shift it up by 3 to the MSB position. 
*/ 4727 t5 = gen_reg_rtx (V4DImode); 4728 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask), 4729 GEN_INT (3))); 4730 /* Clear MSB bits from the mask just in case it had them set. */ 4731 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask)); 4732 /* After this t1 will have MSB set for elements from other lane. */ 4733 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2)); 4734 /* Clear bits other than MSB. */ 4735 emit_insn (gen_andv32qi3 (t1, t1, vt)); 4736 /* Or in the lower bits from mask into t3. */ 4737 emit_insn (gen_iorv32qi3 (t3, t1, t2)); 4738 /* And invert MSB bits in t1, so MSB is set for elements from the same 4739 lane. */ 4740 emit_insn (gen_xorv32qi3 (t1, t1, vt)); 4741 /* Swap 128-bit lanes in t3. */ 4742 t6 = gen_reg_rtx (V4DImode); 4743 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3), 4744 const2_rtx, GEN_INT (3), 4745 const0_rtx, const1_rtx)); 4746 /* And or in the lower bits from mask into t1. */ 4747 emit_insn (gen_iorv32qi3 (t1, t1, t2)); 4748 if (one_operand_shuffle) 4749 { 4750 /* Each of these shuffles will put 0s in places where 4751 element from the other 128-bit lane is needed, otherwise 4752 will shuffle in the requested value. */ 4753 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, 4754 gen_lowpart (V32QImode, t6))); 4755 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1)); 4756 /* For t3 the 128-bit lanes are swapped again. */ 4757 t7 = gen_reg_rtx (V4DImode); 4758 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3), 4759 const2_rtx, GEN_INT (3), 4760 const0_rtx, const1_rtx)); 4761 /* And oring both together leads to the result. */ 4762 emit_insn (gen_iorv32qi3 (target, t1, 4763 gen_lowpart (V32QImode, t7))); 4764 if (target != operands[0]) 4765 emit_move_insn (operands[0], 4766 gen_lowpart (GET_MODE (operands[0]), target)); 4767 return; 4768 } 4769 4770 t4 = gen_reg_rtx (V32QImode); 4771 /* Similarly to the above one_operand_shuffle code, 4772 just for repeated twice for each operand. 
merge_two: 4773 code will merge the two results together. */ 4774 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, 4775 gen_lowpart (V32QImode, t6))); 4776 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, 4777 gen_lowpart (V32QImode, t6))); 4778 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1)); 4779 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1)); 4780 t7 = gen_reg_rtx (V4DImode); 4781 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4), 4782 const2_rtx, GEN_INT (3), 4783 const0_rtx, const1_rtx)); 4784 t8 = gen_reg_rtx (V4DImode); 4785 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3), 4786 const2_rtx, GEN_INT (3), 4787 const0_rtx, const1_rtx)); 4788 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7))); 4789 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8))); 4790 t1 = t4; 4791 t2 = t3; 4792 goto merge_two; 4793 4794 default: 4795 gcc_assert (GET_MODE_SIZE (mode) <= 16); 4796 break; 4797 } 4798 } 4799 4800 if (TARGET_XOP) 4801 { 4802 /* The XOP VPPERM insn supports three inputs. By ignoring the 4803 one_operand_shuffle special case, we avoid creating another 4804 set of constant vectors in memory. */ 4805 one_operand_shuffle = false; 4806 4807 /* mask = mask & {2*w-1, ...} */ 4808 vt = GEN_INT (2*w - 1); 4809 } 4810 else 4811 { 4812 /* mask = mask & {w-1, ...} */ 4813 vt = GEN_INT (w - 1); 4814 } 4815 4816 vt = gen_const_vec_duplicate (maskmode, vt); 4817 mask = expand_simple_binop (maskmode, AND, mask, vt, 4818 NULL_RTX, 0, OPTAB_DIRECT); 4819 4820 /* For non-QImode operations, convert the word permutation control 4821 into a byte permutation control. */ 4822 if (mode != V16QImode) 4823 { 4824 mask = expand_simple_binop (maskmode, ASHIFT, mask, 4825 GEN_INT (exact_log2 (e)), 4826 NULL_RTX, 0, OPTAB_DIRECT); 4827 4828 /* Convert mask to vector of chars. 
*/ 4829 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask)); 4830 4831 /* Replicate each of the input bytes into byte positions: 4832 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8} 4833 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12} 4834 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */ 4835 for (i = 0; i < 16; ++i) 4836 vec[i] = GEN_INT (i/e * e); 4837 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); 4838 vt = validize_mem (force_const_mem (V16QImode, vt)); 4839 if (TARGET_XOP) 4840 emit_insn (gen_xop_pperm (mask, mask, mask, vt)); 4841 else 4842 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt)); 4843 4844 /* Convert it into the byte positions by doing 4845 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */ 4846 for (i = 0; i < 16; ++i) 4847 vec[i] = GEN_INT (i % e); 4848 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); 4849 vt = validize_mem (force_const_mem (V16QImode, vt)); 4850 emit_insn (gen_addv16qi3 (mask, mask, vt)); 4851 } 4852 4853 /* The actual shuffle operations all operate on V16QImode. */ 4854 op0 = gen_lowpart (V16QImode, op0); 4855 op1 = gen_lowpart (V16QImode, op1); 4856 4857 if (TARGET_XOP) 4858 { 4859 if (GET_MODE (target) != V16QImode) 4860 target = gen_reg_rtx (V16QImode); 4861 emit_insn (gen_xop_pperm (target, op0, op1, mask)); 4862 if (target != operands[0]) 4863 emit_move_insn (operands[0], 4864 gen_lowpart (GET_MODE (operands[0]), target)); 4865 } 4866 else if (one_operand_shuffle) 4867 { 4868 if (GET_MODE (target) != V16QImode) 4869 target = gen_reg_rtx (V16QImode); 4870 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask)); 4871 if (target != operands[0]) 4872 emit_move_insn (operands[0], 4873 gen_lowpart (GET_MODE (operands[0]), target)); 4874 } 4875 else 4876 { 4877 rtx xops[6]; 4878 bool ok; 4879 4880 /* Shuffle the two input vectors independently. 
*/ 4881 t1 = gen_reg_rtx (V16QImode); 4882 t2 = gen_reg_rtx (V16QImode); 4883 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask)); 4884 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask)); 4885 4886 merge_two: 4887 /* Then merge them together. The key is whether any given control 4888 element contained a bit set that indicates the second word. */ 4889 mask = operands[3]; 4890 vt = GEN_INT (w); 4891 if (maskmode == V2DImode && !TARGET_SSE4_1) 4892 { 4893 /* Without SSE4.1, we don't have V2DImode EQ. Perform one 4894 more shuffle to convert the V2DI input mask into a V4SI 4895 input mask. At which point the masking that expand_int_vcond 4896 will work as desired. */ 4897 rtx t3 = gen_reg_rtx (V4SImode); 4898 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask), 4899 const0_rtx, const0_rtx, 4900 const2_rtx, const2_rtx)); 4901 mask = t3; 4902 maskmode = V4SImode; 4903 e = w = 4; 4904 } 4905 4906 vt = gen_const_vec_duplicate (maskmode, vt); 4907 vt = force_reg (maskmode, vt); 4908 mask = expand_simple_binop (maskmode, AND, mask, vt, 4909 NULL_RTX, 0, OPTAB_DIRECT); 4910 4911 if (GET_MODE (target) != mode) 4912 target = gen_reg_rtx (mode); 4913 xops[0] = target; 4914 xops[1] = gen_lowpart (mode, t2); 4915 xops[2] = gen_lowpart (mode, t1); 4916 xops[3] = gen_rtx_EQ (maskmode, mask, vt); 4917 xops[4] = mask; 4918 xops[5] = vt; 4919 ok = ix86_expand_int_vcond (xops); 4920 gcc_assert (ok); 4921 if (target != operands[0]) 4922 emit_move_insn (operands[0], 4923 gen_lowpart (GET_MODE (operands[0]), target)); 4924 } 4925} 4926 4927/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is 4928 true if we should do zero extension, else sign extension. HIGH_P is 4929 true if we want the N/2 high elements, else the low elements. 
   */

void
ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
{
  machine_mode imode = GET_MODE (src);
  rtx tmp;

  if (TARGET_SSE4_1)
    {
      /* With SSE4.1 and wider ISAs we have direct sign/zero extension
	 insns (pmovsx/pmovzx families); select the extend insn for IMODE
	 and, for 256/512-bit inputs, an insn to extract the requested
	 half first.  */
      rtx (*unpack)(rtx, rtx);
      rtx (*extract)(rtx, rtx) = NULL;
      machine_mode halfmode = BLKmode;

      switch (imode)
	{
	case E_V64QImode:
	  if (unsigned_p)
	    unpack = gen_avx512bw_zero_extendv32qiv32hi2;
	  else
	    unpack = gen_avx512bw_sign_extendv32qiv32hi2;
	  halfmode = V32QImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
	  break;
	case E_V32QImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv16qiv16hi2;
	  else
	    unpack = gen_avx2_sign_extendv16qiv16hi2;
	  halfmode = V16QImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
	  break;
	case E_V32HImode:
	  if (unsigned_p)
	    unpack = gen_avx512f_zero_extendv16hiv16si2;
	  else
	    unpack = gen_avx512f_sign_extendv16hiv16si2;
	  halfmode = V16HImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
	  break;
	case E_V16HImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv8hiv8si2;
	  else
	    unpack = gen_avx2_sign_extendv8hiv8si2;
	  halfmode = V8HImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
	  break;
	case E_V16SImode:
	  if (unsigned_p)
	    unpack = gen_avx512f_zero_extendv8siv8di2;
	  else
	    unpack = gen_avx512f_sign_extendv8siv8di2;
	  halfmode = V8SImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
	  break;
	case E_V8SImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv4siv4di2;
	  else
	    unpack = gen_avx2_sign_extendv4siv4di2;
	  halfmode = V4SImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
	  break;
	case E_V16QImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv8qiv8hi2;
	  else
	    unpack = gen_sse4_1_sign_extendv8qiv8hi2;
	  break;
	case E_V8HImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv4hiv4si2;
	  else
	    unpack = gen_sse4_1_sign_extendv4hiv4si2;
	  break;
	case E_V4SImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv2siv2di2;
	  else
	    unpack = gen_sse4_1_sign_extendv2siv2di2;
	  break;
	default:
	  gcc_unreachable ();
	}

      if (GET_MODE_SIZE (imode) >= 32)
	{
	  /* For 256-bit and wider inputs, extract the requested half into
	     TMP first; the extend insn then widens that half into DEST.  */
	  tmp = gen_reg_rtx (halfmode);
	  emit_insn (extract (tmp, src));
	}
      else if (high_p)
	{
	  /* Shift higher 8 bytes to lower 8 bytes.  The 128-bit extend
	     insns only read the low half of their input.  */
	  tmp = gen_reg_rtx (V1TImode);
	  emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
					 GEN_INT (64)));
	  tmp = gen_lowpart (imode, tmp);
	}
      else
	tmp = src;

      emit_insn (unpack (dest, tmp));
    }
  else
    {
      /* Pre-SSE4.1 fallback: emulate the extension by interleaving SRC
	 with the extension bits via punpckh/punpckl.  */
      rtx (*unpack)(rtx, rtx, rtx);

      switch (imode)
	{
	case E_V16QImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv16qi;
	  else
	    unpack = gen_vec_interleave_lowv16qi;
	  break;
	case E_V8HImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv8hi;
	  else
	    unpack = gen_vec_interleave_lowv8hi;
	  break;
	case E_V4SImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv4si;
	  else
	    unpack = gen_vec_interleave_lowv4si;
	  break;
	default:
	  gcc_unreachable ();
	}

      /* The second interleave operand supplies the extension bits:
	 zeros for zero extension, or the mask (0 > SRC) which is
	 all-ones exactly in the elements where SRC is negative, giving
	 sign extension.  */
      if (unsigned_p)
	tmp = force_reg (imode, CONST0_RTX (imode));
      else
	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
				   src, pc_rtx, pc_rtx);

      /* The interleave result still has mode IMODE; retype it to DEST's
	 wider element mode via a lowpart move.  */
      rtx tmp2 = gen_reg_rtx (imode);
      emit_insn (unpack (tmp2, src, tmp));
      emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
    }
}

/* Split operands 0 and 1
   into half-mode parts.  Similar to split_double_mode, but works for
   floating point parameters and non-offsettable memories.  For pushes,
   it returns just stack offsets; the values will be saved in the right
   order.  Up to four parts are generated (PARTS must have room for
   four entries; see gcc_assert below).  Returns the number of parts.  */

static int
ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
{
  int size;

  /* On ia32, XFmode always splits into 3 SImode parts regardless of its
     padded size; other modes split into 4-byte words.  In 64-bit mode,
     split into 8-byte words.  */
  if (!TARGET_64BIT)
    size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
  else
    size = (GET_MODE_SIZE (mode) + 4) / 8;

  gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
  gcc_assert (size >= 2 && size <= 4);

  /* Optimize constant pool reference to immediates.  This is used by fp
     moves, that force all constants to memory to allow combining.  */
  if (MEM_P (operand) && MEM_READONLY_P (operand))
    operand = avoid_constant_pool_reference (operand);

  if (MEM_P (operand) && !offsettable_memref_p (operand))
    {
      /* The only non-offsettable memories we handle are pushes.  */
      int ok = push_operand (operand, VOIDmode);

      gcc_assert (ok);

      /* Every part is the same push rtx; the stack pointer adjustment
	 done by each push provides the offsets.  */
      operand = copy_rtx (operand);
      PUT_MODE (operand, word_mode);
      parts[0] = parts[1] = parts[2] = parts[3] = operand;
      return size;
    }

  if (GET_CODE (operand) == CONST_VECTOR)
    {
      scalar_int_mode imode = int_mode_for_mode (mode).require ();
      /* Caution: if we looked through a constant pool memory above,
	 the operand may actually have a different mode now.  That's
	 ok, since we want to pun this all the way back to an integer.  */
      operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
      gcc_assert (operand != NULL);
      mode = imode;
    }

  if (!TARGET_64BIT)
    {
      if (mode == DImode)
	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      else
	{
	  int i;

	  if (REG_P (operand))
	    {
	      /* Hard registers only; split into consecutive SImode regs.  */
	      gcc_assert (reload_completed);
	      for (i = 0; i < size; i++)
		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
	    }
	  else if (offsettable_memref_p (operand))
	    {
	      operand = adjust_address (operand, SImode, 0);
	      parts[0] = operand;
	      for (i = 1; i < size; i++)
		parts[i] = adjust_address (operand, SImode, 4 * i);
	    }
	  else if (CONST_DOUBLE_P (operand))
	    {
	      const REAL_VALUE_TYPE *r;
	      long l[4];

	      r = CONST_DOUBLE_REAL_VALUE (operand);
	      switch (mode)
		{
		case E_TFmode:
		  real_to_target (l, r, mode);
		  parts[3] = gen_int_mode (l[3], SImode);
		  parts[2] = gen_int_mode (l[2], SImode);
		  break;
		case E_XFmode:
		  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
		     long double may not be 80-bit.  */
		  real_to_target (l, r, mode);
		  parts[2] = gen_int_mode (l[2], SImode);
		  break;
		case E_DFmode:
		  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
		  break;
		default:
		  gcc_unreachable ();
		}
	      parts[1] = gen_int_mode (l[1], SImode);
	      parts[0] = gen_int_mode (l[0], SImode);
	    }
	  else
	    gcc_unreachable ();
	}
    }
  else
    {
      if (mode == TImode)
	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      if (mode == XFmode || mode == TFmode)
	{
	  /* XFmode is 8+4 bytes in 64-bit mode, TFmode is 8+8.  */
	  machine_mode upper_mode = mode==XFmode ? SImode : DImode;
	  if (REG_P (operand))
	    {
	      gcc_assert (reload_completed);
	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
	    }
	  else if (offsettable_memref_p (operand))
	    {
	      operand = adjust_address (operand, DImode, 0);
	      parts[0] = operand;
	      parts[1] = adjust_address (operand, upper_mode, 8);
	    }
	  else if (CONST_DOUBLE_P (operand))
	    {
	      long l[4];

	      real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);

	      /* real_to_target puts 32-bit pieces in each long.  */
	      parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
				       | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
					  << 32), DImode);

	      if (upper_mode == SImode)
		parts[1] = gen_int_mode (l[2], SImode);
	      else
		parts[1]
		  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
				  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
				     << 32), DImode);
	    }
	  else
	    gcc_unreachable ();
	}
    }

  return size;
}

/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   OPERANDS[0] is the destination and OPERANDS[1] the source; the split
   parts are staged through operands 2..5 (destination halves) and
   6..9 (source halves), but all required moves are emitted here.  */

void
ix86_split_long_move (rtx operands[])
{
  rtx part[2][4];
  int nparts, i, j;
  int push = 0;
  int collisions = 0;
  machine_mode mode = GET_MODE (operands[0]);
  bool collisionparts[4];

  /* The DFmode expanders may ask us to move double.
     For 64bit target this is single move.  By hiding the fact
     here we simplify i386.md splitters.  */
  if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
    {
      /* Optimize constant pool reference to immediates.  This is used by
	 fp moves, that force all constants to memory to allow combining.  */

      if (MEM_P (operands[1])
	  && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
	  && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
	operands[1] = get_pool_constant (XEXP (operands[1], 0));
      if (push_operand (operands[0], VOIDmode))
	{
	  operands[0] = copy_rtx (operands[0]);
	  PUT_MODE (operands[0], word_mode);
	}
      else
	operands[0] = gen_lowpart (DImode, operands[0]);
      operands[1] = gen_lowpart (DImode, operands[1]);
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* The only non-offsettable memory we handle is push.  */
  if (push_operand (operands[0], VOIDmode))
    push = 1;
  else
    gcc_assert (!MEM_P (operands[0])
		|| offsettable_memref_p (operands[0]));

  nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
  ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));

  /* When emitting push, take care for source operands on the stack.  */
  if (push && MEM_P (operands[1])
      && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
    {
      rtx src_base = XEXP (part[1][nparts - 1], 0);

      /* Compensate for the stack decrement by 4.  */
      if (!TARGET_64BIT && nparts == 3
	  && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
	src_base = plus_constant (Pmode, src_base, 4);

      /* src_base refers to the stack pointer and is
	 automatically decreased by emitted push.  */
      for (i = 0; i < nparts; i++)
	part[1][i] = change_address (part[1][i],
				     GET_MODE (part[1][i]), src_base);
    }

  /* We need to do copy in the right order in case an address register
     of the source overlaps the destination.  */
  if (REG_P (part[0][0]) && MEM_P (part[1][0]))
    {
      rtx tmp;

      for (i = 0; i < nparts; i++)
	{
	  collisionparts[i]
	    = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
	  if (collisionparts[i])
	    collisions++;
	}

      /* Collision in the middle part can be handled by reordering.  */
      if (collisions == 1 && nparts == 3 && collisionparts [1])
	{
	  std::swap (part[0][1], part[0][2]);
	  std::swap (part[1][1], part[1][2]);
	}
      else if (collisions == 1
	       && nparts == 4
	       && (collisionparts [1] || collisionparts [2]))
	{
	  if (collisionparts [1])
	    {
	      std::swap (part[0][1], part[0][2]);
	      std::swap (part[1][1], part[1][2]);
	    }
	  else
	    {
	      std::swap (part[0][2], part[0][3]);
	      std::swap (part[1][2], part[1][3]);
	    }
	}

      /* If there are more collisions, we can't handle it by reordering.
	 Do an lea to the last part and use only one colliding move.  */
      else if (collisions > 1)
	{
	  rtx base, addr;

	  collisions = 1;

	  base = part[0][nparts - 1];

	  /* Handle the case when the last part isn't valid for lea.
	     Happens in 64-bit mode storing the 12-byte XFmode.  */
	  if (GET_MODE (base) != Pmode)
	    base = gen_rtx_REG (Pmode, REGNO (base));

	  addr = XEXP (part[1][0], 0);
	  if (TARGET_TLS_DIRECT_SEG_REFS)
	    {
	      struct ix86_address parts;
	      int ok = ix86_decompose_address (addr, &parts);
	      gcc_assert (ok);
	      /* It is not valid to use %gs: or %fs: in lea.  */
	      gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
	    }
	  emit_insn (gen_rtx_SET (base, addr));
	  part[1][0] = replace_equiv_address (part[1][0], base);
	  for (i = 1; i < nparts; i++)
	    {
	      tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
	      part[1][i] = replace_equiv_address (part[1][i], tmp);
	    }
	}
    }

  if (push)
    {
      /* Pushes are emitted highest part first so the value ends up in
	 memory in the right order.  */
      if (!TARGET_64BIT)
	{
	  if (nparts == 3)
	    {
	      if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
		emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
	      emit_move_insn (part[0][2], part[1][2]);
	    }
	  else if (nparts == 4)
	    {
	      emit_move_insn (part[0][3], part[1][3]);
	      emit_move_insn (part[0][2], part[1][2]);
	    }
	}
      else
	{
	  /* In 64bit mode we don't have 32bit push available.  In case this is
	     register, it is OK - we will just use larger counterpart.  We also
	     retype memory - these comes from attempt to avoid REX prefix on
	     moving of second half of TFmode value.  */
	  if (GET_MODE (part[1][1]) == SImode)
	    {
	      switch (GET_CODE (part[1][1]))
		{
		case MEM:
		  part[1][1] = adjust_address (part[1][1], DImode, 0);
		  break;

		case REG:
		  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
		  break;

		default:
		  gcc_unreachable ();
		}

	      if (GET_MODE (part[1][0]) == SImode)
		part[1][0] = part[1][1];
	    }
	}
      emit_move_insn (part[0][1], part[1][1]);
      emit_move_insn (part[0][0], part[1][0]);
      return;
    }

  /* Choose correct order to not overwrite the source before it is copied.  */
  if ((REG_P (part[0][0])
       && REG_P (part[1][1])
       && (REGNO (part[0][0]) == REGNO (part[1][1])
	   || (nparts == 3
	       && REGNO (part[0][0]) == REGNO (part[1][2]))
	   || (nparts == 4
	       && REGNO (part[0][0]) == REGNO (part[1][3]))))
      || (collisions > 0
	  && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
    {
      /* Copy highest part first.  */
      for (i = 0, j = nparts - 1; i < nparts; i++, j--)
	{
	  operands[2 + i] = part[0][j];
	  operands[6 + i] = part[1][j];
	}
    }
  else
    {
      for (i = 0; i < nparts; i++)
	{
	  operands[2 + i] = part[0][i];
	  operands[6 + i] = part[1][i];
	}
    }

  /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
  if (optimize_insn_for_size_p ())
    {
      for (j = 0; j < nparts - 1; j++)
	if (CONST_INT_P (operands[6 + j])
	    && operands[6 + j] != const0_rtx
	    && REG_P (operands[2 + j]))
	  for (i = j; i < nparts - 1; i++)
	    if (CONST_INT_P (operands[7 + i])
		&& INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
	      operands[7 + i] = operands[2 + j];
    }

  for (i = 0; i < nparts; i++)
    emit_move_insn (operands[2 + i], operands[6 + i]);

  return;
}

/* Helper function of ix86_split_ashl used to generate an SImode/DImode
   left shift by a constant, either using a single shift or
   a sequence of add instructions.  Note that MODE is the mode of the
   full double-word value being split, so MODE == DImode means OPERAND
   is an SImode half (and vice versa for TImode).  */

static void
ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
{
  if (count == 1
      || (count * ix86_cost->add <= ix86_cost->shift_const
	  && !optimize_insn_for_size_p ()))
    {
      /* x << n computed as n self-additions (x += x).  */
      while (count-- > 0)
	emit_insn (gen_add2_insn (operand, operand));
    }
  else
    {
      rtx (*insn)(rtx, rtx, rtx);

      insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
      emit_insn (insn (operand, operand, GEN_INT (count)));
    }
}

/* Split a double-word left shift: operands[0] = operands[1] << operands[2].
   SCRATCH, if non-NULL, is an extra register usable for the variable-count
   adjustment when cmov is available.  */

void
ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashl3)(rtx, rtx, rtx);
  rtx (*gen_shld)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;
  machine_mode half_mode;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      /* Shift counts are taken modulo the full width, matching hardware.  */
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
	{
	  /* Whole low half shifts into the high half; low becomes zero.  */
	  emit_move_insn (high[0], low[1]);
	  emit_move_insn (low[0], const0_rtx);

	  if (count > half_width)
	    ix86_expand_ashl_const (high[0], count - half_width, mode);
	}
      else
	{
	  /* shld moves bits from the low half into the high half; then
	     shift the low half by itself.  */
	  gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
	  ix86_expand_ashl_const (low[0], count, mode);
	}
      return;
    }

  split_double_mode (mode, operands, 1, low, high);
  half_mode = mode == DImode ? SImode : DImode;

  gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;

  if (operands[1] == const1_rtx)
    {
      /* Assuming we've chosen a QImode capable registers, then 1 << N
	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
      if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
	{
	  rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);

	  /* Test bit log2(half_width) of the count: it selects which half
	     receives the single set bit.  setcc 0/1 into the proper half,
	     then shift both halves by the count (mod half_width in HW).  */
	  ix86_expand_clear (low[0]);
	  ix86_expand_clear (high[0]);
	  emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));

	  d = gen_lowpart (QImode, low[0]);
	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
	  s = gen_rtx_EQ (QImode, flags, const0_rtx);
	  emit_insn (gen_rtx_SET (d, s));

	  d = gen_lowpart (QImode, high[0]);
	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
	  s = gen_rtx_NE (QImode, flags, const0_rtx);
	  emit_insn (gen_rtx_SET (d, s));
	}

      /* Otherwise, we can get the same results by manually performing
	 a bit extract operation on bit 5/6, and then performing the two
	 shifts.  The two methods of getting 0/1 into low/high are exactly
	 the same size.  Avoiding the shift in the bit extract case helps
	 pentium4 a bit; no one else seems to care much either way.  */
      else
	{
	  rtx (*gen_lshr3)(rtx, rtx, rtx);
	  rtx (*gen_and3)(rtx, rtx, rtx);
	  rtx (*gen_xor3)(rtx, rtx, rtx);
	  HOST_WIDE_INT bits;
	  rtx x;

	  if (mode == DImode)
	    {
	      gen_lshr3 = gen_lshrsi3;
	      gen_and3 = gen_andsi3;
	      gen_xor3 = gen_xorsi3;
	      bits = 5;
	    }
	  else
	    {
	      gen_lshr3 = gen_lshrdi3;
	      gen_and3 = gen_anddi3;
	      gen_xor3 = gen_xordi3;
	      bits = 6;
	    }

	  if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
	    x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
	  else
	    x = gen_lowpart (half_mode, operands[2]);
	  emit_insn (gen_rtx_SET (high[0], x));

	  /* high = (count >> bits) & 1; low = high ^ 1.  */
	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
	  emit_insn (gen_and3 (high[0], high[0], const1_rtx));
	  emit_move_insn (low[0], high[0]);
	  emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
	}

      emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
      emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
      return;
    }

  if (operands[1] == constm1_rtx)
    {
      /* For -1 << N, we can avoid the shld instruction, because we
	 know that we're shifting 0...31/63 ones into a -1.  */
      emit_move_insn (low[0], constm1_rtx);
      if (optimize_insn_for_size_p ())
	emit_move_insn (high[0], low[0])<SPAN_REMOVED/>;
      else
	emit_move_insn (high[0], constm1_rtx);
    }
  else
    {
      gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      emit_insn (gen_shld (high[0], low[0], operands[2]));
    }

  emit_insn (gen_ashl3 (low[0], low[0], operands[2]));

  /* Fix up the result when the count is >= half_width: the hardware shift
     used only count mod half_width bits, so conditionally move/zero.  */
  if (TARGET_CMOVE && scratch)
    {
      ix86_expand_clear (scratch);
      emit_insn (gen_x86_shift_adj_1
		 (half_mode, high[0], low[0], operands[2], scratch));
    }
  else
    emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
}

/* Split a double-word arithmetic right shift:
   operands[0] = operands[1] >> operands[2] (sign-propagating).
   SCRATCH, if non-NULL, enables the cmov-based variable-count path.  */

void
ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count == GET_MODE_BITSIZE (mode) - 1)
	{
	  /* Shifting by width-1 yields the sign broadcast into both
	     halves.  */
	  emit_move_insn (high[0], high[1]);
	  emit_insn (gen_ashr3 (high[0], high[0],
				GEN_INT (half_width - 1)));
	  emit_move_insn (low[0], high[0]);

	}
      else if (count >= half_width)
	{
	  /* High half moves into low; new high half is the sign fill.  */
	  emit_move_insn (low[0], high[1]);
	  emit_move_insn (high[0], low[0]);
	  emit_insn (gen_ashr3 (high[0], high[0],
				GEN_INT (half_width - 1)));

	  if (count > half_width)
	    emit_insn (gen_ashr3 (low[0], low[0],
				  GEN_INT (count - half_width)));
	}
      else
	{
	  /* shrd feeds bits from the high half into the low half.  */
	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
	  emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
	}
    }
  else
    {
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_ashr3 (high[0], high[0], operands[2]));

      /* Fix up for counts >= half_width: SCRATCH holds the sign fill.  */
      if (TARGET_CMOVE && scratch)
	{
	  emit_move_insn (scratch, high[0]);
	  emit_insn (gen_ashr3 (scratch, scratch,
				GEN_INT (half_width - 1)));
	  emit_insn (gen_x86_shift_adj_1
		     (half_mode, low[0], high[0], operands[2], scratch));
	}
      else
	emit_insn (gen_x86_shift_adj_3
		   (half_mode, low[0], high[0], operands[2]));
    }
}

/* Split a double-word logical right shift:
   operands[0] = operands[1] >> operands[2] (zero-filling).
   SCRATCH, if non-NULL, enables the cmov-based variable-count path.  */

void
ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_lshr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
	{
	  /* High half moves into low; high half becomes zero.  */
	  emit_move_insn (low[0], high[1]);
	  ix86_expand_clear (high[0]);

	  if (count > half_width)
	    emit_insn (gen_lshr3 (low[0], low[0],
				  GEN_INT (count - half_width)));
	}
      else
	{
	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
	}
    }
  else
    {
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_lshr3 (high[0], high[0], operands[2]));

      /* Fix up for counts >= half_width; zero is the fill value here.  */
      if (TARGET_CMOVE && scratch)
	{
	  ix86_expand_clear (scratch);
	  emit_insn (gen_x86_shift_adj_1
		     (half_mode, low[0], high[0], operands[2], scratch));
	}
      else
	emit_insn (gen_x86_shift_adj_2
		   (half_mode, low[0], high[0], operands[2]));
    }
}

/* Return mode for the memcpy/memset loop counter.  Prefer SImode over
   DImode for constant loop counts.  */

static machine_mode
counter_mode (rtx count_exp)
{
  if (GET_MODE (count_exp) != VOIDmode)
    return GET_MODE (count_exp);
  if (!CONST_INT_P (count_exp))
    return Pmode;
  /* Constant counts that don't fit in 32 bits need DImode.  */
  if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
    return DImode;
  return SImode;
}

/* When ISSETMEM is FALSE, output simple loop to copy memory from SRCPTR
   to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT
   specified in bytes.  When ISSETMEM is TRUE, output the equivalent loop to set
   memory by VALUE (supposed to be in MODE).

   The size is rounded down to whole number of chunk size moved at once.
   SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info.
 */


static void
expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr, rtx value,
			       rtx count, machine_mode mode, int unroll,
			       int expected_size, bool issetmem)
{
  rtx_code_label *out_label, *top_label;
  rtx iter, tmp;
  machine_mode iter_mode = counter_mode (count);
  /* Bytes processed per loop iteration.  */
  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
  rtx piece_size = GEN_INT (piece_size_n);
  rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
  rtx size;
  int i;

  top_label = gen_label_rtx ();
  out_label = gen_label_rtx ();
  iter = gen_reg_rtx (iter_mode);

  /* SIZE = COUNT rounded down to a multiple of the iteration size.  */
  size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
			      NULL, 1, OPTAB_DIRECT);
  /* Those two should combine.  */
  /* Pointer comparison is intentional: GEN_INT (1) returns the shared
     const1_rtx, so this tests for a one-byte iteration size.  */
  if (piece_size == const1_rtx)
    {
      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
			       true, out_label);
      predict_jump (REG_BR_PROB_BASE * 10 / 100);
    }
  emit_move_insn (iter, const0_rtx);

  emit_label (top_label);

  tmp = convert_modes (Pmode, iter_mode, iter, true);

  /* This assert could be relaxed - in this case we'll need to compute
     smallest power of two, containing in PIECE_SIZE_N and pass it to
     offset_address.  */
  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
  destmem = offset_address (destmem, tmp, piece_size_n);
  destmem = adjust_address (destmem, mode, 0);

  if (!issetmem)
    {
      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
      srcmem = adjust_address (srcmem, mode, 0);

      /* When unrolling for chips that reorder memory reads and writes,
	 we can save registers by using single temporary.
	 Also using 4 temporaries is overkill in 32bit mode.  */
      /* NOTE: the "&& 0" permanently disables this single-temporary
	 variant; the load-all-then-store-all path below is always
	 taken.  */
      if (!TARGET_64BIT && 0)
	{
	  for (i = 0; i < unroll; i++)
	    {
	      if (i)
		{
		  destmem = adjust_address (copy_rtx (destmem), mode,
					    GET_MODE_SIZE (mode));
		  srcmem = adjust_address (copy_rtx (srcmem), mode,
					   GET_MODE_SIZE (mode));
		}
	      emit_move_insn (destmem, srcmem);
	    }
	}
      else
	{
	  rtx tmpreg[4];
	  gcc_assert (unroll <= 4);
	  /* Load all UNROLL pieces first, then store them, so loads
	     and stores are not interleaved.  */
	  for (i = 0; i < unroll; i++)
	    {
	      tmpreg[i] = gen_reg_rtx (mode);
	      if (i)
		srcmem = adjust_address (copy_rtx (srcmem), mode,
					 GET_MODE_SIZE (mode));
	      emit_move_insn (tmpreg[i], srcmem);
	    }
	  for (i = 0; i < unroll; i++)
	    {
	      if (i)
		destmem = adjust_address (copy_rtx (destmem), mode,
					  GET_MODE_SIZE (mode));
	      emit_move_insn (destmem, tmpreg[i]);
	    }
	}
    }
  else
    /* memset: store VALUE into each of the UNROLL chunks.  */
    for (i = 0; i < unroll; i++)
      {
	if (i)
	  destmem = adjust_address (copy_rtx (destmem), mode,
				    GET_MODE_SIZE (mode));
	emit_move_insn (destmem, value);
      }

  /* ITER += PIECE_SIZE; loop back while ITER < SIZE.  */
  tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
			     true, OPTAB_LIB_WIDEN);
  if (tmp != iter)
    emit_move_insn (iter, tmp);

  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
			   true, top_label);
  /* Derive a branch probability from the expected trip count, if the
     caller supplied one (EXPECTED_SIZE == -1 means unknown).  */
  if (expected_size != -1)
    {
      expected_size /= GET_MODE_SIZE (mode) * unroll;
      if (expected_size == 0)
	predict_jump (0);
      else if (expected_size > REG_BR_PROB_BASE)
	predict_jump (REG_BR_PROB_BASE - 1);
      else
	predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
		      / expected_size);
    }
  else
    predict_jump (REG_BR_PROB_BASE * 80 / 100);
  /* Advance DESTPTR (and SRCPTR for copies) past the bytes handled by
     the loop, so the caller's epilogue sees updated pointers.  */
  iter = ix86_zero_extend_to_Pmode (iter);
  tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
			     true, OPTAB_LIB_WIDEN);
  if (tmp != destptr)
    emit_move_insn (destptr, tmp);
  if (!issetmem)
    {
      tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
				 true, OPTAB_LIB_WIDEN);
      if (tmp != srcptr)
	emit_move_insn (srcptr, tmp);
    }
  emit_label (out_label);
}

/* Divide COUNTREG by SCALE.  SCALE must be 1 or a power of two;
   for a non-constant COUNTREG the division is emitted as a logical
   right shift by log2 (SCALE).  */
static rtx
scale_counter (rtx countreg, int scale)
{
  rtx sc;

  if (scale == 1)
    return countreg;
  if (CONST_INT_P (countreg))
    return GEN_INT (INTVAL (countreg) / scale);
  gcc_assert (REG_P (countreg));

  sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
			    GEN_INT (exact_log2 (scale)),
			    NULL, 1, OPTAB_DIRECT);
  return sc;
}

/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For setmem case, VALUE is a promoted to a wider size ORIG_VALUE.
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have same meaning as for previous function.  */

static void
expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
			      rtx destptr, rtx srcptr, rtx value, rtx orig_value,
			      rtx count,
			      machine_mode mode, bool issetmem)
{
  rtx destexp;
  rtx srcexp;
  rtx countreg;
  HOST_WIDE_INT rounded_count;

  /* If possible, it is shorter to use rep movs.
     TODO: Maybe it is better to move this logic to decide_alg.  */
  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
      && (!issetmem || orig_value == const0_rtx))
    mode = SImode;

  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);

  /* Count of MODE-sized elements, zero-extended for address arithmetic.  */
  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
						       GET_MODE_SIZE (mode)));
  /* DESTEXP models the final value of the destination pointer after the
     rep instruction: DESTPTR + COUNTREG * GET_MODE_SIZE (mode).  */
  if (mode != QImode)
    {
      destexp = gen_rtx_ASHIFT (Pmode, countreg,
				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
    }
  else
    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
    {
      rounded_count
	= ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
      destmem = shallow_copy_rtx (destmem);
      set_mem_size (destmem, rounded_count);
    }
  else if (MEM_SIZE_KNOWN_P (destmem))
    /* The rep operation covers only a rounded-down prefix; any stale
       size annotation would be wrong.  */
    clear_mem_size (destmem);

  if (issetmem)
    {
      value = force_reg (mode, gen_lowpart (mode, value));
      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
    }
  else
    {
      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
	srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
      /* SRCEXP mirrors DESTEXP for the source pointer.  */
      if (mode != QImode)
	{
	  srcexp = gen_rtx_ASHIFT (Pmode, countreg,
				   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
	  srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
	}
      else
	srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
      if (CONST_INT_P (count))
	{
	  rounded_count
	    = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
	  srcmem = shallow_copy_rtx (srcmem);
	  set_mem_size (srcmem, rounded_count);
	}
      else
	{
	  if (MEM_SIZE_KNOWN_P (srcmem))
	    clear_mem_size (srcmem);
	}
      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
			      destexp, srcexp));
    }
}

/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRC is passed by pointer to be updated on return.
   Return value is updated DST.
   DESTPTR and SRCPTR registers are advanced past the copied bytes.  */
static rtx
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, src = *srcmem, adjust, tempreg;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  piece_size = 1 << floor_log2 (size_to_move);
  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
	 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    {
      gcc_assert (piece_size > 1);
      piece_size >>= 1;
    }

  /* Find the corresponding vector mode with the same size as MOVE_MODE.
     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
    {
      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
	  || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
	{
	  /* No suitable vector mode; fall back to word-sized moves.  */
	  move_mode = word_mode;
	  piece_size = GET_MODE_SIZE (move_mode);
	  code = optab_handler (mov_optab, move_mode);
	}
    }
  gcc_assert (code != CODE_FOR_nothing);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);
  adjust = GEN_INT (piece_size);
  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* We move from memory to memory, so we'll need to do it via
	 a temporary register.  */
      tempreg = gen_reg_rtx (move_mode);
      emit_insn (GEN_FCN (code) (tempreg, src));
      emit_insn (GEN_FCN (code) (dst, tempreg));

      emit_move_insn (destptr,
		      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
      emit_move_insn (srcptr,
		      gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
      src = adjust_automodify_address_nv (src, move_mode, srcptr,
					  piece_size);
    }

  /* Update DST and SRC rtx.  */
  *srcmem = src;
  return dst;
}

/* Helper function for the string operations below.  Test VARIABLE whether
   it is aligned to VALUE bytes.  If true, jump to the label.  */

static rtx_code_label *
ix86_expand_aligntest (rtx variable, int value, bool epilogue)
{
  rtx_code_label *label = gen_label_rtx ();
  rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
  /* Test the VALUE bit(s) of VARIABLE; jump to LABEL when none set.  */
  if (GET_MODE (variable) == DImode)
    emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
  else
    emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
  emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
			   1, label);
  if (epilogue)
    predict_jump (REG_BR_PROB_BASE * 50 / 100);
  else
    predict_jump (REG_BR_PROB_BASE * 90 / 100);
  return label;
}


/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */

static void
expand_cpymem_epilogue (rtx destmem, rtx srcmem,
			rtx destptr, rtx srcptr, rtx count, int max_size)
{
  rtx src, dest;
  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      /* Emit one move per set bit of the residual size, largest first.  */
      for (i = max_size; i >= 1; i >>= 1)
	{
	  if (epilogue_size & i)
	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
	}
      return;
    }
  if (max_size > 8)
    {
      /* Residual too large to enumerate; use a byte-copy loop.  */
      count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
				   count, 1, OPTAB_DIRECT);
      expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
				     count, QImode, 1, 4, false);
      return;
    }

  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.
   */
  if (TARGET_SINGLE_STRINGOP)
    {
      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  src = change_address (srcmem, HImode, srcptr);
	  dest = change_address (destmem, HImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  src = change_address (srcmem, QImode, srcptr);
	  dest = change_address (destmem, QImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
  else
    {
      /* OFFSET tracks how far we have advanced without touching the
	 pointer registers themselves.  */
      rtx offset = force_reg (Pmode, const0_rtx);
      rtx tmp;

      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_move_insn (dest, src);
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, HImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, HImode, tmp);
	  emit_move_insn (dest, src);
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, QImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, QImode, tmp);
	  emit_move_insn (dest, src);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
}

/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   SRC is passed by pointer to be updated on return.
   Return value is updated DST.  */
static rtx
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, adjust;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  move_mode = GET_MODE (promoted_val);
  if (move_mode == VOIDmode)
    move_mode = QImode;
  if (size_to_move < GET_MODE_SIZE (move_mode))
    {
      /* The promoted value is wider than what remains to be stored;
	 narrow both the mode and the value.  */
      unsigned int move_bits = size_to_move * BITS_PER_UNIT;
      move_mode = int_mode_for_size (move_bits, 0).require ();
      promoted_val = gen_lowpart (move_mode, promoted_val);
    }
  piece_size = GET_MODE_SIZE (move_mode);
  code = optab_handler (mov_optab, move_mode);
  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);
  adjust = GEN_INT (piece_size);
  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* Word-sized (or smaller) pieces can use the strset pattern,
	 which also advances DESTPTR.  */
      if (piece_size <= GET_MODE_SIZE (word_mode))
	{
	  emit_insn (gen_strset (destptr, dst, promoted_val));
	  dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					      piece_size);
	  continue;
	}

      emit_insn (GEN_FCN (code) (dst, promoted_val));

      emit_move_insn (destptr,
		      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
    }

  /* Update DST rtx.  */
  return dst;
}
/* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
static void
expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
				 rtx count, int max_size)
{
  count = expand_simple_binop (counter_mode (count), AND, count,
			       GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
  expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
				 gen_lowpart (QImode, value), count, QImode,
				 1, max_size / 2, true);
}

/* Output code to set at most count & (max_size - 1) bytes starting by DEST.
 */
static void
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
			rtx count, int max_size)
{
  rtx dest;

  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      /* One emit_memset per set bit of the residual size; use the wide
	 vector value when the piece is wider than VALUE.  */
      for (i = max_size; i >= 1; i >>= 1)
	{
	  if (epilogue_size & i)
	    {
	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
		destmem = emit_memset (destmem, destptr, vec_value, i);
	      else
		destmem = emit_memset (destmem, destptr, value, i);
	    }
	}
      return;
    }
  if (max_size > 32)
    {
      expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
      return;
    }
  /* Variable count: test each power-of-two bit of COUNT and store the
     corresponding number of bytes when set.  */
  if (max_size > 16)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 8)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 4)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 2)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      dest = change_address (destmem, HImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 1)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      dest = change_address (destmem, QImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
}

/* Adjust COUNTER by the VALUE.  */
static void
ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
{
  /* Emitted as an addition of -VALUE, i.e. a subtraction.  */
  emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
}

/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
   DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
   ignored.
   Return value is updated DESTMEM.
 */

static rtx
expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr, rtx value,
			       rtx vec_value, rtx count, int align,
			       int desired_alignment, bool issetmem)
{
  int i;
  /* For each power of two between ALIGN and DESIRED_ALIGNMENT, test the
     corresponding bit of DESTPTR and, when set, move/set that many
     bytes, so DESTPTR ends up DESIRED_ALIGNMENT-aligned.  */
  for (i = 1; i < desired_alignment; i <<= 1)
    {
      if (align <= i)
	{
	  rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
	  if (issetmem)
	    {
	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
		destmem = emit_memset (destmem, destptr, vec_value, i);
	      else
		destmem = emit_memset (destmem, destptr, value, i);
	    }
	  else
	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
	  ix86_adjust_counter (count, i);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	  /* Past this point the destination is known to be 2*i aligned.  */
	  set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
	}
    }
  return destmem;
}

/* Test if COUNT&SIZE is nonzero and if so, expand cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
static void
expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr,
			       rtx value, rtx vec_value,
			       rtx count, int size,
			       rtx done_label, bool issetmem)
{
  rtx_code_label *label = ix86_expand_aligntest (count, size, false);
  machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
  rtx modesize;
  int n;

  /* If we do not have vector value to copy, we must reduce size.  */
  if (issetmem)
    {
      if (!vec_value)
	{
	  if (GET_MODE (value) == VOIDmode && size > 8)
	    mode = Pmode;
	  else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
	    mode = GET_MODE (value);
	}
      else
	mode = GET_MODE (vec_value), value = vec_value;
    }
  else
    {
      /* Choose appropriate vector mode.  */
      if (size >= 32)
	mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
      else if (size >= 16)
	mode = TARGET_SSE ? V16QImode : DImode;
      srcmem = change_address (srcmem, mode, srcptr);
    }
  destmem = change_address (destmem, mode, destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  gcc_assert (GET_MODE_SIZE (mode) <= size);
  /* Copy/set the first SIZE bytes of the block.  */
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }

  /* Then copy/set the last SIZE bytes (DEST + COUNT - 2*SIZE relative
     to the already-advanced address); the two stores may overlap, which
     is harmless and covers any length in SIZE..2*SIZE-1.  */
  destmem = offset_address (destmem, count, 1);
  destmem = offset_address (destmem, GEN_INT (-2 * size),
			    GET_MODE_SIZE (mode));
  if (!issetmem)
    {
      srcmem = offset_address (srcmem, count, 1);
      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
			       GET_MODE_SIZE (mode));
    }
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }
  emit_jump_insn (gen_jump (done_label));
  emit_barrier ();

  emit_label (label);
  LABEL_NUSES (label) = 1;
}

/* Handle small memcpy (up to SIZE that is supposed to be small power of 2.
   and get ready for the main memcpy loop by copying initial DESIRED_ALIGN-ALIGN
   bytes and last SIZE bytes adjusting DESTPTR/SRCPTR/COUNT in a way we can
   proceed with a loop copying SIZE bytes at once.  Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence.  The label is created
   on demand if *DONE_LABEL is NULL.
   MIN_SIZE is minimal size of block copied.  This value gets adjusted for new
   bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copies block,
   DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicate whether
   we will dispatch to a library call for large blocks.

   In pseudocode we do:

   if (COUNT < SIZE)
     {
       Assume that SIZE is 4.  Bigger sizes are handled analogously
       if (COUNT & 4)
	 {
	   copy 4 bytes from SRCPTR to DESTPTR
	   copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
	   goto done_label
	 }
       if (!COUNT)
	 goto done_label;
       copy 1 byte from SRCPTR to DESTPTR
       if (COUNT & 2)
	 {
	   copy 2 bytes from SRCPTR to DESTPTR
	   copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
	 }
     }
   else
     {
       copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
       copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE

       OLD_DESTPTR = DESTPTR;
       Align DESTPTR up to DESIRED_ALIGN
       SRCPTR += DESTPTR - OLD_DESTPTR
       COUNT -= DEST_PTR - OLD_DESTPTR
       if (DYNAMIC_CHECK)
	 Round COUNT down to multiple of SIZE
       << optional caller supplied zero size guard is here >>
       << optional caller supplied dynamic check is here >>
       << caller supplied main copy loop is here >>
     }
   done_label:
  */
static void
expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
							    rtx *destptr, rtx *srcptr,
							    machine_mode mode,
							    rtx value, rtx vec_value,
							    rtx *count,
							    rtx_code_label **done_label,
							    int size,
							    int desired_align,
							    int align,
							    unsigned HOST_WIDE_INT *min_size,
							    bool dynamic_check,
							    bool issetmem)
{
  rtx_code_label *loop_label = NULL, *label;
  int n;
  rtx modesize;
  int prolog_size = 0;
  rtx mode_value;

  /* Choose proper value to copy.  */
  if (issetmem && VECTOR_MODE_P (mode))
    mode_value = vec_value;
  else
    mode_value = value;
  gcc_assert (GET_MODE_SIZE (mode) <= size);

  /* See if block is big or small, handle small blocks.  */
  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
    {
      int size2 = size;
      loop_label = gen_label_rtx ();

      if (!*done_label)
	*done_label = gen_label_rtx ();

      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
			       1, loop_label);
      size2 >>= 1;

      /* Handle sizes > 3.  */
      for (;size2 > 2; size2 >>= 1)
	expand_small_cpymem_or_setmem (destmem, srcmem,
				       *destptr, *srcptr,
				       value, vec_value,
				       *count,
				       size2, *done_label, issetmem);
      /* Nothing to copy?  Jump to DONE_LABEL if so */
      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
			       1, *done_label);

      /* Do a byte copy.  */
      destmem = change_address (destmem, QImode, *destptr);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (QImode, value));
      else
	{
	  srcmem = change_address (srcmem, QImode, *srcptr);
	  emit_move_insn (destmem, srcmem);
	}

      /* Handle sizes 2 and 3.  */
      label = ix86_expand_aligntest (*count, 2, false);
      destmem = change_address (destmem, HImode, *destptr);
      destmem = offset_address (destmem, *count, 1);
      destmem = offset_address (destmem, GEN_INT (-2), 2);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (HImode, value));
      else
	{
	  srcmem = change_address (srcmem, HImode, *srcptr);
	  srcmem = offset_address (srcmem, *count, 1);
	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
	  emit_move_insn (destmem, srcmem);
	}

      emit_label (label);
      LABEL_NUSES (label) = 1;
      emit_jump_insn (gen_jump (*done_label));
      emit_barrier ();
    }
  else
    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);

  /* Start memcpy for COUNT >= SIZE.  */
  if (loop_label)
    {
      emit_label (loop_label);
      LABEL_NUSES (loop_label) = 1;
    }

  /* Copy first desired_align bytes.  */
  if (!issetmem)
    srcmem = change_address (srcmem, mode, *srcptr);
  destmem = change_address (destmem, mode, *destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  for (n = 0; prolog_size < desired_align - align; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
      prolog_size += GET_MODE_SIZE (mode);
    }


  /* Copy last SIZE bytes.  */
  destmem = offset_address (destmem, *count, 1);
  destmem = offset_address (destmem,
			    GEN_INT (-size - prolog_size),
			    1);
  if (issetmem)
    emit_move_insn (destmem, mode_value);
  else
    {
      srcmem = offset_address (srcmem, *count, 1);
      srcmem = offset_address (srcmem,
			       GEN_INT (-size - prolog_size),
			       1);
      emit_move_insn (destmem, srcmem);
    }
  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
    {
      destmem = offset_address (destmem, modesize, 1);
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  srcmem = offset_address (srcmem, modesize, 1);
	  emit_move_insn (destmem, srcmem);
	}
    }

  /* Align destination.  */
  if (desired_align > 1 && desired_align > align)
    {
      rtx saveddest = *destptr;

      gcc_assert (desired_align <= size);
      /* Align destptr up, place it to new register.  */
      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
				      GEN_INT (prolog_size),
				      NULL_RTX, 1, OPTAB_DIRECT);
      /* Preserve the pointer-ness flag so alias analysis stays precise.  */
      if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
	REG_POINTER (*destptr) = 1;
      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
				      GEN_INT (-desired_align),
				      *destptr, 1, OPTAB_DIRECT);
      /* See how many bytes we skipped.  */
      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
				       *destptr,
				       saveddest, 1, OPTAB_DIRECT);
      /* Adjust srcptr and count.  */
      if (!issetmem)
	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
				       saveddest, *srcptr, 1, OPTAB_DIRECT);
      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				    saveddest, *count, 1, OPTAB_DIRECT);
      /* We copied at most size + prolog_size.  */
      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
	*min_size
	  = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
      else
	*min_size = 0;

      /* Our loops always round down the block size, but for dispatch to
	 library we need precise value.  */
      if (dynamic_check)
	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
    }
  else
    {
      gcc_assert (prolog_size == 0);
      /* Decrease count, so we won't end up copying last word twice.  */
      if (!CONST_INT_P (*count))
	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				      constm1_rtx, *count, 1, OPTAB_DIRECT);
      else
	*count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
				      (unsigned HOST_WIDE_INT)size));
      if (*min_size)
	*min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
    }
}


/* This function is like the previous one, except here we know how many bytes
   need to be copied.  That allows us to update alignment not only of DST, which
   is returned, but also of SRC, which is passed as a pointer for that
   reason.
   DST/SRC are BLKmode MEMs and DESTREG/SRCREG hold their addresses.
   ALIGN_BYTES is the exact number of leading bytes that must be copied
   (or stored, when ISSETMEM) so that the destination reaches
   DESIRED_ALIGN alignment.  VALUE/VEC_VALUE are the promoted memset
   values (VEC_VALUE may be NULL).  Returns the adjusted DST.  */
static rtx
expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
					rtx srcreg, rtx value, rtx vec_value,
					int desired_align, int align_bytes,
					bool issetmem)
{
  rtx src = NULL;
  rtx orig_dst = dst;
  rtx orig_src = NULL;
  int piece_size = 1;
  int copied_bytes = 0;

  if (!issetmem)
    {
      gcc_assert (srcp != NULL);
      src = *srcp;
      orig_src = src;
    }

  /* Emit one move per set bit of ALIGN_BYTES, from the least significant
     bit up, so each piece is naturally aligned after the previous ones.  */
  for (piece_size = 1;
       piece_size <= desired_align && copied_bytes < align_bytes;
       piece_size <<= 1)
    {
      if (align_bytes & piece_size)
	{
	  if (issetmem)
	    {
	      /* Use the vector value only when the piece is wider than
		 the scalar promoted value.  */
	      if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
		dst = emit_memset (dst, destreg, vec_value, piece_size);
	      else
		dst = emit_memset (dst, destreg, value, piece_size);
	    }
	  else
	    dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
	  copied_bytes += piece_size;
	}
    }
  /* Record the achieved destination alignment and shrink the known size
     by the bytes already handled.  */
  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
    set_mem_align (dst, desired_align * BITS_PER_UNIT);
  if (MEM_SIZE_KNOWN_P (orig_dst))
    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);

  if (!issetmem)
    {
      /* Derive the best provable source alignment after ALIGN_BYTES bytes
	 have been consumed.  NOTE(review): presumably get_mem_align_offset
	 returns a negative value when the offset is unknown — the two
	 consecutive >= 0 checks below rely on that.  */
      int src_align_bytes = get_mem_align_offset (src, desired_align
						       * BITS_PER_UNIT);
      if (src_align_bytes >= 0)
	src_align_bytes = desired_align - src_align_bytes;
      if (src_align_bytes >= 0)
	{
	  unsigned int src_align;
	  /* Find the largest power of two on which source and destination
	     offsets agree; that alignment is preserved by the prologue.  */
	  for (src_align = desired_align; src_align >= 2; src_align >>= 1)
	    {
	      if ((src_align_bytes & (src_align - 1))
		   == (align_bytes & (src_align - 1)))
		break;
	    }
	  if (src_align > (unsigned int) desired_align)
	    src_align = desired_align;
	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
	    set_mem_align (src, src_align * BITS_PER_UNIT);
	}
      if (MEM_SIZE_KNOWN_P (orig_src))
	set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
      *srcp = src;
    }

  return dst;
}

/* Return true if ALG can be used in current context.
   Assume we expand memset if MEMSET is true.  HAVE_AS is true when a
   non-default address space is involved.  */
static bool
alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
{
  if (alg == no_stringop)
    return false;
  if (alg == vector_loop)
    return TARGET_SSE || TARGET_AVX;
  /* Algorithms using the rep prefix want at least edi and ecx;
     additionally, memset wants eax and memcpy wants esi.  Don't
     consider such algorithms if the user has appropriated those
     registers for their own purposes, or if we have a non-default
     address space, since some string insns cannot override the segment.  */
  if (alg == rep_prefix_1_byte
      || alg == rep_prefix_4_byte
      || alg == rep_prefix_8_byte)
    {
      if (have_as)
	return false;
      if (fixed_regs[CX_REG]
	  || fixed_regs[DI_REG]
	  || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
	return false;
    }
  return true;
}

/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.

   COUNT is the compile-time byte count (0 when not a constant);
   EXPECTED_SIZE is a size estimate (-1 when unknown); MIN_SIZE/MAX_SIZE
   bound the runtime size.  MEMSET selects the memset cost tables,
   ZERO_MEMSET is true when storing zeros, HAVE_AS flags a non-default
   address space.  On return, *DYNAMIC_CHECK is a size threshold above
   which a runtime dispatch to a library call is emitted (-1 for none)
   and *NOALIGN tells the caller to skip the alignment prologue.
   RECUR is true on the self-recursive call below and stops further
   recursion.  */
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
	    bool memset, bool zero_memset, bool have_as,
	    int *dynamic_check, bool *noalign, bool recur)
{
  const struct stringop_algs *algs;
  bool optimize_for_speed;
  int max = 0;
  const struct processor_costs *cost;
  int i;
  bool any_alg_usable_p = false;

  *noalign = false;
  *dynamic_check = -1;

  /* Even if the string operation call is cold, we still might spend a lot
     of time processing large blocks.  */
  if (optimize_function_for_size_p (cfun)
      || (optimize_insn_for_size_p ()
	  && (max_size < 256
	      || (expected_size != -1 && expected_size < 256))))
    optimize_for_speed = false;
  else
    optimize_for_speed = true;

  cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
  if (memset)
    algs = &cost->memset[TARGET_64BIT != 0];
  else
    algs = &cost->memcpy[TARGET_64BIT != 0];

  /* See maximal size for user defined algorithm.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    {
      enum stringop_alg candidate = algs->size[i].alg;
      bool usable = alg_usable_p (candidate, memset, have_as);
      any_alg_usable_p |= usable;

      /* CANDIDATE being nonzero excludes no_stringop entries.  */
      if (candidate != libcall && candidate && usable)
	max = algs->size[i].max;
    }

  /* If expected size is not known but max size is small enough
     so inline version is a win, set expected size into
     the range.  */
  if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
      && expected_size == -1)
    expected_size = min_size / 2 + max_size / 2;

  /* If user specified the algorithm, honor it if possible.  */
  if (ix86_stringop_alg != no_stringop
      && alg_usable_p (ix86_stringop_alg, memset, have_as))
    return ix86_stringop_alg;
  /* rep; movq or rep; movl is the smallest variant.  */
  else if (!optimize_for_speed)
    {
      *noalign = true;
      /* A count that is not a multiple of 4, or a non-zero memset value,
	 forces the byte variant.  */
      if (!count || (count & 3) || (memset && !zero_memset))
	return alg_usable_p (rep_prefix_1_byte, memset, have_as)
	       ? rep_prefix_1_byte : loop_1_byte;
      else
	return alg_usable_p (rep_prefix_4_byte, memset, have_as)
	       ? rep_prefix_4_byte : loop;
    }
  /* Very tiny blocks are best handled via the loop, REP is expensive to
     setup.  */
  else if (expected_size != -1 && expected_size < 4)
    return loop_1_byte;
  else if (expected_size != -1)
    {
      enum stringop_alg alg = libcall;
      bool alg_noalign = false;
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
	{
	  /* We get here if the algorithms that were not libcall-based
	     were rep-prefix based and we are unable to use rep prefixes
	     based on global register usage.  Break out of the loop and
	     use the heuristic below.  */
	  if (algs->size[i].max == 0)
	    break;
	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
	    {
	      enum stringop_alg candidate = algs->size[i].alg;

	      if (candidate != libcall
		  && alg_usable_p (candidate, memset, have_as))
		{
		  alg = candidate;
		  alg_noalign = algs->size[i].noalign;
		}
	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
		 last non-libcall inline algorithm.  */
	      if (TARGET_INLINE_ALL_STRINGOPS)
		{
		  /* When the current size is best to be copied by a libcall,
		     but we are still forced to inline, run the heuristic below
		     that will pick code for medium sized blocks.  */
		  if (alg != libcall)
		    {
		      *noalign = alg_noalign;
		      return alg;
		    }
		  else if (!any_alg_usable_p)
		    break;
		}
	      else if (alg_usable_p (candidate, memset, have_as))
		{
		  *noalign = algs->size[i].noalign;
		  return candidate;
		}
	    }
	}
    }
  /* When asked to inline the call anyway, try to pick meaningful choice.
     We look for maximal size of block that is faster to copy by hand and
     take blocks of at most of that size guessing that average size will
     be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
      && (algs->unknown_size == libcall
	  || !alg_usable_p (algs->unknown_size, memset, have_as)))
    {
      enum stringop_alg alg;
      HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;

      /* If there aren't any usable algorithms or if recursing already,
	 then recursing on smaller sizes or same size isn't going to
	 find anything.  Just return the simple byte-at-a-time copy loop.  */
      if (!any_alg_usable_p || recur)
	{
	  /* Pick something reasonable.  */
	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
	    *dynamic_check = 128;
	  return loop_1_byte;
	}
      alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
			zero_memset, have_as, dynamic_check, noalign, true);
      gcc_assert (*dynamic_check == -1);
      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
	*dynamic_check = max;
      else
	gcc_assert (alg != libcall);
      return alg;
    }
  return (alg_usable_p (algs->unknown_size, memset, have_as)
	  ? algs->unknown_size : libcall);
}

/* Decide on alignment.  We know that the operand is already aligned to ALIGN
   (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).
   Returns the alignment the prologue should establish (0 when no
   alignment work is useful, e.g. for a libcall).  */
static int
decide_alignment (int align,
		  enum stringop_alg alg,
		  int expected_size,
		  machine_mode move_mode)
{
  int desired_align = 0;

  gcc_assert (alg != no_stringop);

  if (alg == libcall)
    return 0;
  if (move_mode == VOIDmode)
    return 0;

  desired_align = GET_MODE_SIZE (move_mode);
  /* PentiumPro has special logic triggering for 8 byte aligned blocks.
     copying whole cacheline at once.  */
  if (TARGET_PENTIUMPRO
      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
    desired_align = 8;

  if (optimize_size)
    desired_align = 1;
  if (desired_align < align)
    desired_align = align;
  /* Aligning is not worth it for blocks expected to be tiny.  */
  if (expected_size != -1 && expected_size < 4)
    desired_align = align;

  return desired_align;
}


/* Helper function for memcpy.  For QImode value 0xXY produce
   0xXYXYXYXY of width specified by MODE.  This is essentially
   a * 0x10101010, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   slow multiply.  (NOTE(review): despite the comment, the callers
   visible in this file use it to broadcast memset values.)  */
static rtx
promote_duplicated_reg (machine_mode mode, rtx val)
{
  machine_mode valmode = GET_MODE (val);
  rtx tmp;
  /* Number of shift/or steps needed to fill MODE from a byte.  */
  int nops = mode == DImode ? 3 : 2;

  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
  if (val == const0_rtx)
    return copy_to_mode_reg (mode, CONST0_RTX (mode));
  if (CONST_INT_P (val))
    {
      /* Constant value: broadcast at compile time.  */
      HOST_WIDE_INT v = INTVAL (val) & 255;

      v |= v << 8;
      v |= v << 16;
      if (mode == DImode)
	v |= (v << 16) << 16;
      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
    }

  if (valmode == VOIDmode)
    valmode = QImode;
  if (valmode != QImode)
    val = gen_lowpart (QImode, val);
  if (mode == QImode)
    return val;
  if (!TARGET_PARTIAL_REG_STALL)
    nops--;
  /* Choose between a multiply by 0x0101... and a shift/or sequence
     based on the cost tables.  */
  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
      <= (ix86_cost->shift_const + ix86_cost->add) * nops
	  + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
    {
      rtx reg = convert_modes (mode, QImode, val, true);
      tmp = promote_duplicated_reg (mode, const1_rtx);
      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
				  OPTAB_DIRECT);
    }
  else
    {
      rtx reg = convert_modes (mode, QImode, val, true);

      /* Dangling else: the else branch below pairs with this outer if,
	 the inner if/else selects between the SImode and DImode insert.  */
      if (!TARGET_PARTIAL_REG_STALL)
	if (mode == SImode)
	  emit_insn (gen_insvsi_1 (reg, reg));
	else
	  emit_insn (gen_insvdi_1 (reg, reg));
      else
	{
	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
				     NULL, 1, OPTAB_DIRECT);
	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
				     OPTAB_DIRECT);
	}
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      if (mode == SImode)
	return reg;
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      return reg;
    }
}

/* Duplicate value VAL using promote_duplicated_reg into maximal size that will
   be needed
   by main loop copying SIZE_NEEDED chunks and prologue getting
   alignment from ALIGN to DESIRED_ALIGN.  */
static rtx
promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
				int align)
{
  rtx promoted_val;

  /* Pick the widest mode either the main loop's chunk size or the
     alignment prologue can use.  */
  if (TARGET_64BIT
      && (size_needed > 4 || (desired_align > align && desired_align > 4)))
    promoted_val = promote_duplicated_reg (DImode, val);
  else if (size_needed > 2 || (desired_align > align && desired_align > 2))
    promoted_val = promote_duplicated_reg (SImode, val);
  else if (size_needed > 1 || (desired_align > align && desired_align > 1))
    promoted_val = promote_duplicated_reg (HImode, val);
  else
    promoted_val = val;

  return promoted_val;
}

/* Copy the address to a Pmode register.  This is used for x32 to
   truncate DImode TLS address to a SImode register.

   Returns a register marked as REG_POINTER holding ADDR.  */

static rtx
ix86_copy_addr_to_reg (rtx addr)
{
  rtx reg;
  if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
    {
      reg = copy_addr_to_reg (addr);
      REG_POINTER (reg) = 1;
      return reg;
    }
  else
    {
      /* x32: keep the full DImode value in a register and use its
	 SImode low part as the pointer.  */
      gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
      reg = copy_to_mode_reg (DImode, addr);
      REG_POINTER (reg) = 1;
      return gen_rtx_SUBREG (SImode, reg, 0);
    }
}

/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
   operations when profitable.  The code depends upon architecture, block size
   and alignment, but always has one of the following overall structures:

   Aligned move sequence:

   1) Prologue guard: Conditional that jumps up to epilogues for small
      blocks that can be handled by epilogue alone.  This is faster
      but also needed for correctness, since the prologue assumes the block
      is larger than the desired alignment.

      Optional dynamic check for size and libcall for large
      blocks is emitted here too, with -minline-stringops-dynamically.

   2) Prologue: copy first few bytes in order to get destination
      aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
      than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
      copied.  We emit either a jump tree on power of two sized
      blocks, or a byte loop.

   3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
      with specified algorithm.

   4) Epilogue: code copying tail of the block that is too small to be
      handled by main body (or up to size guarded by prologue guard).

   Misaligned move sequence

   1) misaligned move prologue/epilogue containing:
      a) Prologue handling small memory blocks and jumping to done_label
	 (skipped if blocks are known to be large enough)
      b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
	 needed by single possibly misaligned move
	 (skipped if alignment is not needed)
      c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves

   2) Zero size guard dispatching to done_label, if needed

   3) dispatch to library call, if needed,

   4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
      with specified algorithm.
   Returns true when the expansion was emitted inline; false tells the
   caller to fall back to a library call.  */
bool
ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
			   rtx align_exp, rtx expected_align_exp,
			   rtx expected_size_exp, rtx min_size_exp,
			   rtx max_size_exp, rtx probable_max_size_exp,
			   bool issetmem)
{
  rtx destreg;
  rtx srcreg = NULL;
  rtx_code_label *label = NULL;
  rtx tmp;
  rtx_code_label *jump_around_label = NULL;
  HOST_WIDE_INT align = 1;
  unsigned HOST_WIDE_INT count = 0;
  HOST_WIDE_INT expected_size = -1;
  int size_needed = 0, epilogue_size_needed;
  int desired_align = 0, align_bytes = 0;
  enum stringop_alg alg;
  rtx promoted_val = NULL;
  rtx vec_promoted_val = NULL;
  bool force_loopy_epilogue = false;
  int dynamic_check;
  bool need_zero_guard = false;
  bool noalign;
  machine_mode move_mode = VOIDmode;
  machine_mode wider_mode;
  int unroll_factor = 1;
  /* TODO: Once value ranges are available, fill in proper data.  */
  unsigned HOST_WIDE_INT min_size = 0;
  unsigned HOST_WIDE_INT max_size = -1;
  unsigned HOST_WIDE_INT probable_max_size = -1;
  bool misaligned_prologue_used = false;
  bool have_as;

  if (CONST_INT_P (align_exp))
    align = INTVAL (align_exp);
  /* i386 can do misaligned access on reasonably increased cost.  */
  if (CONST_INT_P (expected_align_exp)
      && INTVAL (expected_align_exp) > align)
    align = INTVAL (expected_align_exp);
  /* ALIGN is the minimum of destination and source alignment, but we care here
     just about destination alignment.  */
  else if (!issetmem
	   && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
    align = MEM_ALIGN (dst) / BITS_PER_UNIT;

  if (CONST_INT_P (count_exp))
    {
      min_size = max_size = probable_max_size = count = expected_size
	= INTVAL (count_exp);
      /* When COUNT is 0, there is nothing to do.  */
      if (!count)
	return true;
    }
  else
    {
      if (min_size_exp)
	min_size = INTVAL (min_size_exp);
      if (max_size_exp)
	max_size = INTVAL (max_size_exp);
      if (probable_max_size_exp)
	probable_max_size = INTVAL (probable_max_size_exp);
      if (CONST_INT_P (expected_size_exp))
	expected_size = INTVAL (expected_size_exp);
    }

  /* Make sure we don't need to care about overflow later on.  */
  if (count > (HOST_WIDE_INT_1U << 30))
    return false;

  have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
  if (!issetmem)
    have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));

  /* Step 0: Decide on preferred algorithm, desired alignment and
     size of chunks to be copied by main loop.  */
  alg = decide_alg (count, expected_size, min_size, probable_max_size,
		    issetmem,
		    issetmem && val_exp == const0_rtx, have_as,
		    &dynamic_check, &noalign, false);

  if (dump_file)
    fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
	     stringop_alg_names[alg]);

  if (alg == libcall)
    return false;
  gcc_assert (alg != no_stringop);

  /* For now vector-version of memset is generated only for memory zeroing, as
     creating of promoted vector value is very cheap in this case.  */
  if (issetmem && alg == vector_loop && val_exp != const0_rtx)
    alg = unrolled_loop;

  if (!count)
    count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
  destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
  if (!issetmem)
    srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));

  /* Derive MOVE_MODE and UNROLL_FACTOR from the chosen algorithm.  */
  unroll_factor = 1;
  move_mode = word_mode;
  switch (alg)
    {
    case libcall:
    case no_stringop:
    case last_alg:
      gcc_unreachable ();
    case loop_1_byte:
      need_zero_guard = true;
      move_mode = QImode;
      break;
    case loop:
      need_zero_guard = true;
      break;
    case unrolled_loop:
      need_zero_guard = true;
      unroll_factor = (TARGET_64BIT ? 4 : 2);
      break;
    case vector_loop:
      need_zero_guard = true;
      unroll_factor = 4;
      /* Find the widest supported mode.  */
      move_mode = word_mode;
      while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
	     && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
	move_mode = wider_mode;

      /* Avoid 256-bit moves on targets where they are split into two
	 128-bit operations.  */
      if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
	move_mode = TImode;

      /* Find the corresponding vector mode with the same size as MOVE_MODE.
	 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
      if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
	{
	  int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
	  if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
	      || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
	    move_mode = word_mode;
	}
      gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
      break;
    case rep_prefix_8_byte:
      move_mode = DImode;
      break;
    case rep_prefix_4_byte:
      move_mode = SImode;
      break;
    case rep_prefix_1_byte:
      move_mode = QImode;
      break;
    }
  size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
  epilogue_size_needed = size_needed;

  /* If we are going to call any library calls conditionally, make sure any
     pending stack adjustment happen before the first conditional branch,
     otherwise they will be emitted before the library call only and won't
     happen from the other branches.  */
  if (dynamic_check != -1)
    do_pending_stack_adjust ();

  desired_align = decide_alignment (align, alg, expected_size, move_mode);
  if (!TARGET_ALIGN_STRINGOPS || noalign)
    align = desired_align;

  /* Step 1: Prologue guard.  */

  /* Alignment code needs count to be in register.  */
  if (CONST_INT_P (count_exp) && desired_align > align)
    {
      if (INTVAL (count_exp) > desired_align
	  && INTVAL (count_exp) > size_needed)
	{
	  align_bytes
	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
	  if (align_bytes <= 0)
	    align_bytes = 0;
	  else
	    align_bytes = desired_align - align_bytes;
	}
      if (align_bytes == 0)
	count_exp = force_reg (counter_mode (count_exp), count_exp);
    }
  gcc_assert (desired_align >= 1 && align >= 1);

  /* Misaligned move sequences handle both prologue and epilogue at once.
     Default code generation results in a smaller code for large alignments
     and also avoids redundant job when sizes are known precisely.  */
  misaligned_prologue_used
    = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
       && MAX (desired_align, epilogue_size_needed) <= 32
       && desired_align <= epilogue_size_needed
       && ((desired_align > align && !align_bytes)
	   || (!count && epilogue_size_needed > 1)));

  /* Do the cheap promotion to allow better CSE across the
     main loop and epilogue (i.e. one load of the big constant in the
     front of all code).
     For now the misaligned move sequences do not have fast path
     without broadcasting.  */
  if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
    {
      if (alg == vector_loop)
	{
	  gcc_assert (val_exp == const0_rtx);
	  vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
	  promoted_val = promote_duplicated_reg_to_size (val_exp,
							 GET_MODE_SIZE (word_mode),
							 desired_align, align);
	}
      else
	{
	  promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
							 desired_align, align);
	}
    }
  /* Misaligned move sequences handle both prologues and epilogues at once.
     Default code generation results in smaller code for large alignments and
     also avoids redundant job when sizes are known precisely.  */
  if (misaligned_prologue_used)
    {
      /* Misaligned move prologue handled small blocks by itself.  */
      expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
	   (dst, src, &destreg, &srcreg,
	    move_mode, promoted_val, vec_promoted_val,
	    &count_exp,
	    &jump_around_label,
	    desired_align < align
	    ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
	    desired_align, align, &min_size, dynamic_check, issetmem);
      if (!issetmem)
	src = change_address (src, BLKmode, srcreg);
      dst = change_address (dst, BLKmode, destreg);
      set_mem_align (dst, desired_align * BITS_PER_UNIT);
      epilogue_size_needed = 0;
      if (need_zero_guard
	  && min_size < (unsigned HOST_WIDE_INT) size_needed)
	{
	  /* It is possible that we copied enough so the main loop will not
	     execute.  */
	  gcc_assert (size_needed > 1);
	  if (jump_around_label == NULL_RTX)
	    jump_around_label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (size_needed),
				   LTU, 0, counter_mode (count_exp), 1, jump_around_label);
	  if (expected_size == -1
	      || expected_size < (desired_align - align) / 2 + size_needed)
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
	  else
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
	}
    }
  /* Ensure that alignment prologue won't copy past end of block.  */
  else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
    {
      epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
      /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
	 Make sure it is power of 2.  */
      epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);

      /* To improve performance of small blocks, we jump around the VAL
	 promoting mode.  This means that if the promoted VAL is not constant,
	 we might not use it in the epilogue and have to use byte
	 loop variant.  */
      if (issetmem && epilogue_size_needed > 2 && !promoted_val)
	force_loopy_epilogue = true;
      if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	  || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	{
	  /* If main algorithm works on QImode, no epilogue is needed.
	     For small sizes just don't align anything.  */
	  if (size_needed == 1)
	    desired_align = align;
	  else
	    goto epilogue;
	}
      else if (!count
	       && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	{
	  label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (epilogue_size_needed),
				   LTU, 0, counter_mode (count_exp), 1, label);
	  if (expected_size == -1 || expected_size < epilogue_size_needed)
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
	  else
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
	}
    }

  /* Emit code to decide on runtime whether library call or inline should be
     used.  */
  if (dynamic_check != -1)
    {
      if (!issetmem && CONST_INT_P (count_exp))
	{
	  if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
	    {
	      emit_block_copy_via_libcall (dst, src, count_exp);
	      count_exp = const0_rtx;
	      goto epilogue;
	    }
	}
      else
	{
	  rtx_code_label *hot_label = gen_label_rtx ();
	  if (jump_around_label == NULL_RTX)
	    jump_around_label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
				   LEU, 0, counter_mode (count_exp),
				   1, hot_label);
	  predict_jump (REG_BR_PROB_BASE * 90 / 100);
	  if (issetmem)
	    set_storage_via_libcall (dst, count_exp, val_exp);
	  else
	    emit_block_copy_via_libcall (dst, src, count_exp);
	  emit_jump (jump_around_label);
	  emit_label (hot_label);
	}
    }

  /* Step 2: Alignment prologue.  */
  /* Do the expensive promotion once we branched off the small blocks.  */
  if (issetmem && !promoted_val)
    promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
						   desired_align, align);

  if (desired_align > align && !misaligned_prologue_used)
    {
      if (align_bytes == 0)
	{
	  /* Except for the first move in prologue, we no longer know
	     constant offset in aliasing info.  It doesn't seem worth
	     the pain to maintain it for the first move, so throw away
	     the info early.  */
	  dst = change_address (dst, BLKmode, destreg);
	  if (!issetmem)
	    src = change_address (src, BLKmode, srcreg);
	  dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
					       promoted_val, vec_promoted_val,
					       count_exp, align, desired_align,
					       issetmem);
	  /* At most desired_align - align bytes are copied.  */
	  if (min_size < (unsigned)(desired_align - align))
	    min_size = 0;
	  else
	    min_size -= desired_align - align;
	}
      else
	{
	  /* If we know how many bytes need to be stored before dst is
	     sufficiently aligned, maintain aliasing info accurately.  */
	  dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
							srcreg,
							promoted_val,
							vec_promoted_val,
							desired_align,
							align_bytes,
							issetmem);

	  count_exp = plus_constant (counter_mode (count_exp),
				     count_exp, -align_bytes);
	  count -= align_bytes;
	  min_size -= align_bytes;
	  max_size -= align_bytes;
	}
      if (need_zero_guard
	  && min_size < (unsigned HOST_WIDE_INT) size_needed
	  && (count < (unsigned HOST_WIDE_INT) size_needed
	      || (align_bytes == 0
		  && count < ((unsigned HOST_WIDE_INT) size_needed
			      + desired_align - align))))
	{
	  /* It is possible that we copied enough so the main loop will not
	     execute.  */
	  gcc_assert (size_needed > 1);
	  if (label == NULL_RTX)
	    label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (size_needed),
				   LTU, 0, counter_mode (count_exp), 1, label);
	  if (expected_size == -1
	      || expected_size < (desired_align - align) / 2 + size_needed)
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
	  else
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
	}
    }
  if (label && size_needed == 1)
    {
      emit_label (label);
      LABEL_NUSES (label) = 1;
      label = NULL;
      epilogue_size_needed = 1;
      if (issetmem)
	promoted_val = val_exp;
    }
  else if (label == NULL_RTX && !misaligned_prologue_used)
    epilogue_size_needed = size_needed;

  /* Step 3: Main loop.  */

  switch (alg)
    {
    case libcall:
    case no_stringop:
    case last_alg:
      gcc_unreachable ();
    case loop_1_byte:
    case loop:
    case unrolled_loop:
      expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
				     count_exp, move_mode, unroll_factor,
				     expected_size, issetmem);
      break;
    case vector_loop:
      expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
				     vec_promoted_val, count_exp, move_mode,
				     unroll_factor, expected_size, issetmem);
      break;
    case rep_prefix_8_byte:
    case rep_prefix_4_byte:
    case rep_prefix_1_byte:
      expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
				    val_exp, count_exp, move_mode, issetmem);
      break;
    }
  /* Adjust properly the offset of src and dest memory for aliasing.  */
  if (CONST_INT_P (count_exp))
    {
      if (!issetmem)
	src = adjust_automodify_address_nv (src, BLKmode, srcreg,
					    (count / size_needed) * size_needed);
      dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
					  (count / size_needed) * size_needed);
    }
  else
    {
      if (!issetmem)
	src = change_address (src, BLKmode, srcreg);
      dst = change_address (dst, BLKmode, destreg);
    }

  /* Step 4: Epilogue to copy the remaining bytes.  */
 epilogue:
  if (label)
    {
      /* When the main loop is done, COUNT_EXP might hold original count,
	 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
	 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
	 bytes.  Compensate if needed.  */

      if (size_needed < epilogue_size_needed)
	{
	  tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
				     GEN_INT (size_needed - 1), count_exp, 1,
				     OPTAB_DIRECT);
	  if (tmp != count_exp)
	    emit_move_insn (count_exp, tmp);
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }

  if (count_exp != const0_rtx && epilogue_size_needed > 1)
    {
      if (force_loopy_epilogue)
	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
					 epilogue_size_needed);
      else
	{
	  if (issetmem)
	    expand_setmem_epilogue (dst, destreg, promoted_val,
				    vec_promoted_val, count_exp,
				    epilogue_size_needed);
	  else
	    expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
				    epilogue_size_needed);
	}
    }
  if (jump_around_label)
    emit_label (jump_around_label);
  return true;
}


/* Expand the appropriate insns for doing strlen if not just doing
   repnz; scasb

   out = result, initialized with the start address
   align_rtx = alignment of the address.
7688 scratch = scratch register, initialized with the startaddress when 7689 not aligned, otherwise undefined 7690 7691 This is just the body. It needs the initializations mentioned above and 7692 some address computing at the end. These things are done in i386.md. */ 7693 7694static void 7695ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx) 7696{ 7697 int align; 7698 rtx tmp; 7699 rtx_code_label *align_2_label = NULL; 7700 rtx_code_label *align_3_label = NULL; 7701 rtx_code_label *align_4_label = gen_label_rtx (); 7702 rtx_code_label *end_0_label = gen_label_rtx (); 7703 rtx mem; 7704 rtx tmpreg = gen_reg_rtx (SImode); 7705 rtx scratch = gen_reg_rtx (SImode); 7706 rtx cmp; 7707 7708 align = 0; 7709 if (CONST_INT_P (align_rtx)) 7710 align = INTVAL (align_rtx); 7711 7712 /* Loop to check 1..3 bytes for null to get an aligned pointer. */ 7713 7714 /* Is there a known alignment and is it less than 4? */ 7715 if (align < 4) 7716 { 7717 rtx scratch1 = gen_reg_rtx (Pmode); 7718 emit_move_insn (scratch1, out); 7719 /* Is there a known alignment and is it not 2? */ 7720 if (align != 2) 7721 { 7722 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */ 7723 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */ 7724 7725 /* Leave just the 3 lower bits. */ 7726 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3), 7727 NULL_RTX, 0, OPTAB_WIDEN); 7728 7729 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, 7730 Pmode, 1, align_4_label); 7731 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL, 7732 Pmode, 1, align_2_label); 7733 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL, 7734 Pmode, 1, align_3_label); 7735 } 7736 else 7737 { 7738 /* Since the alignment is 2, we have to check 2 or 0 bytes; 7739 check if is aligned to 4 - byte. 
*/ 7740 7741 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx, 7742 NULL_RTX, 0, OPTAB_WIDEN); 7743 7744 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, 7745 Pmode, 1, align_4_label); 7746 } 7747 7748 mem = change_address (src, QImode, out); 7749 7750 /* Now compare the bytes. */ 7751 7752 /* Compare the first n unaligned byte on a byte per byte basis. */ 7753 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, 7754 QImode, 1, end_0_label); 7755 7756 /* Increment the address. */ 7757 emit_insn (gen_add2_insn (out, const1_rtx)); 7758 7759 /* Not needed with an alignment of 2 */ 7760 if (align != 2) 7761 { 7762 emit_label (align_2_label); 7763 7764 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, 7765 end_0_label); 7766 7767 emit_insn (gen_add2_insn (out, const1_rtx)); 7768 7769 emit_label (align_3_label); 7770 } 7771 7772 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, 7773 end_0_label); 7774 7775 emit_insn (gen_add2_insn (out, const1_rtx)); 7776 } 7777 7778 /* Generate loop to check 4 bytes at a time. It is not a good idea to 7779 align this loop. It gives only huge programs, but does not help to 7780 speed up. */ 7781 emit_label (align_4_label); 7782 7783 mem = change_address (src, SImode, out); 7784 emit_move_insn (scratch, mem); 7785 emit_insn (gen_add2_insn (out, GEN_INT (4))); 7786 7787 /* This formula yields a nonzero result iff one of the bytes is zero. 7788 This saves three branches inside loop and many cycles. 
*/ 7789 7790 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101))); 7791 emit_insn (gen_one_cmplsi2 (scratch, scratch)); 7792 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch)); 7793 emit_insn (gen_andsi3 (tmpreg, tmpreg, 7794 gen_int_mode (0x80808080, SImode))); 7795 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1, 7796 align_4_label); 7797 7798 if (TARGET_CMOVE) 7799 { 7800 rtx reg = gen_reg_rtx (SImode); 7801 rtx reg2 = gen_reg_rtx (Pmode); 7802 emit_move_insn (reg, tmpreg); 7803 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16))); 7804 7805 /* If zero is not in the first two bytes, move two bytes forward. */ 7806 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); 7807 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); 7808 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); 7809 emit_insn (gen_rtx_SET (tmpreg, 7810 gen_rtx_IF_THEN_ELSE (SImode, tmp, 7811 reg, 7812 tmpreg))); 7813 /* Emit lea manually to avoid clobbering of flags. */ 7814 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx))); 7815 7816 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); 7817 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); 7818 emit_insn (gen_rtx_SET (out, 7819 gen_rtx_IF_THEN_ELSE (Pmode, tmp, 7820 reg2, 7821 out))); 7822 } 7823 else 7824 { 7825 rtx_code_label *end_2_label = gen_label_rtx (); 7826 /* Is zero in the first two bytes? */ 7827 7828 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); 7829 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); 7830 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx); 7831 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, 7832 gen_rtx_LABEL_REF (VOIDmode, end_2_label), 7833 pc_rtx); 7834 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); 7835 JUMP_LABEL (tmp) = end_2_label; 7836 7837 /* Not in the first two. Move two bytes forward. */ 7838 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16))); 7839 emit_insn (gen_add2_insn (out, const2_rtx)); 7840 7841 emit_label (end_2_label); 7842 7843 } 7844 7845 /* Avoid branch in fixing the byte. 
*/ 7846 tmpreg = gen_lowpart (QImode, tmpreg); 7847 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg)); 7848 tmp = gen_rtx_REG (CCmode, FLAGS_REG); 7849 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx); 7850 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp)); 7851 7852 emit_label (end_0_label); 7853} 7854 7855/* Expand strlen. */ 7856 7857bool 7858ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) 7859{ 7860if (TARGET_UNROLL_STRLEN 7861 && TARGET_INLINE_ALL_STRINGOPS 7862 && eoschar == const0_rtx 7863 && optimize > 1) 7864 { 7865 /* The generic case of strlen expander is long. Avoid it's 7866 expanding unless TARGET_INLINE_ALL_STRINGOPS. */ 7867 rtx addr = force_reg (Pmode, XEXP (src, 0)); 7868 /* Well it seems that some optimizer does not combine a call like 7869 foo(strlen(bar), strlen(bar)); 7870 when the move and the subtraction is done here. It does calculate 7871 the length just once when these instructions are done inside of 7872 output_strlen_unroll(). But I think since &bar[strlen(bar)] is 7873 often used and I use one fewer register for the lifetime of 7874 output_strlen_unroll() this is better. */ 7875 7876 emit_move_insn (out, addr); 7877 7878 ix86_expand_strlensi_unroll_1 (out, src, align); 7879 7880 /* strlensi_unroll_1 returns the address of the zero at the end of 7881 the string, like memchr(), so compute the length by subtracting 7882 the start address. */ 7883 emit_insn (gen_sub2_insn (out, addr)); 7884 return true; 7885 } 7886 else 7887 return false; 7888} 7889 7890/* For given symbol (function) construct code to compute address of it's PLT 7891 entry in large x86-64 PIC model. 
*/ 7892 7893static rtx 7894construct_plt_address (rtx symbol) 7895{ 7896 rtx tmp, unspec; 7897 7898 gcc_assert (GET_CODE (symbol) == SYMBOL_REF); 7899 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF); 7900 gcc_assert (Pmode == DImode); 7901 7902 tmp = gen_reg_rtx (Pmode); 7903 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF); 7904 7905 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec)); 7906 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx)); 7907 return tmp; 7908} 7909 7910/* Additional registers that are clobbered by SYSV calls. */ 7911 7912static int const x86_64_ms_sysv_extra_clobbered_registers 7913 [NUM_X86_64_MS_CLOBBERED_REGS] = 7914{ 7915 SI_REG, DI_REG, 7916 XMM6_REG, XMM7_REG, 7917 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG, 7918 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG 7919}; 7920 7921rtx_insn * 7922ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, 7923 rtx callarg2, 7924 rtx pop, bool sibcall) 7925{ 7926 rtx vec[3]; 7927 rtx use = NULL, call; 7928 unsigned int vec_len = 0; 7929 tree fndecl; 7930 7931 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) 7932 { 7933 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0)); 7934 if (fndecl 7935 && (lookup_attribute ("interrupt", 7936 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))) 7937 error ("interrupt service routine cannot be called directly"); 7938 } 7939 else 7940 fndecl = NULL_TREE; 7941 7942 if (pop == const0_rtx) 7943 pop = NULL; 7944 gcc_assert (!TARGET_64BIT || !pop); 7945 7946 rtx addr = XEXP (fnaddr, 0); 7947 if (TARGET_MACHO && !TARGET_64BIT) 7948 { 7949#if TARGET_MACHO 7950 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) 7951 fnaddr = machopic_indirect_call_target (fnaddr); 7952#endif 7953 } 7954 else 7955 { 7956 /* Static functions and indirect calls don't need the pic register. Also, 7957 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making 7958 it an indirect call. 
*/ 7959 if (flag_pic 7960 && GET_CODE (addr) == SYMBOL_REF 7961 && !SYMBOL_REF_LOCAL_P (addr)) 7962 { 7963 if (flag_plt 7964 && (SYMBOL_REF_DECL (addr) == NULL_TREE 7965 || !lookup_attribute ("noplt", 7966 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr))))) 7967 { 7968 if (!TARGET_64BIT 7969 || (ix86_cmodel == CM_LARGE_PIC 7970 && DEFAULT_ABI != MS_ABI)) 7971 { 7972 use_reg (&use, gen_rtx_REG (Pmode, 7973 REAL_PIC_OFFSET_TABLE_REGNUM)); 7974 if (ix86_use_pseudo_pic_reg ()) 7975 emit_move_insn (gen_rtx_REG (Pmode, 7976 REAL_PIC_OFFSET_TABLE_REGNUM), 7977 pic_offset_table_rtx); 7978 } 7979 } 7980 else if (!TARGET_PECOFF && !TARGET_MACHO) 7981 { 7982 if (TARGET_64BIT 7983 && ix86_cmodel == CM_LARGE_PIC 7984 && DEFAULT_ABI != MS_ABI) 7985 { 7986 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), 7987 UNSPEC_GOT); 7988 fnaddr = gen_rtx_CONST (Pmode, fnaddr); 7989 fnaddr = force_reg (Pmode, fnaddr); 7990 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr); 7991 } 7992 else if (TARGET_64BIT) 7993 { 7994 fnaddr = gen_rtx_UNSPEC (Pmode, 7995 gen_rtvec (1, addr), 7996 UNSPEC_GOTPCREL); 7997 fnaddr = gen_rtx_CONST (Pmode, fnaddr); 7998 } 7999 else 8000 { 8001 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), 8002 UNSPEC_GOT); 8003 fnaddr = gen_rtx_CONST (Pmode, fnaddr); 8004 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, 8005 fnaddr); 8006 } 8007 fnaddr = gen_const_mem (Pmode, fnaddr); 8008 /* Pmode may not be the same as word_mode for x32, which 8009 doesn't support indirect branch via 32-bit memory slot. 8010 Since x32 GOT slot is 64 bit with zero upper 32 bits, 8011 indirect branch via x32 GOT slot is OK. */ 8012 if (GET_MODE (fnaddr) != word_mode) 8013 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr); 8014 fnaddr = gen_rtx_MEM (QImode, fnaddr); 8015 } 8016 } 8017 } 8018 8019 /* Skip setting up RAX register for -mskip-rax-setup when there are no 8020 parameters passed in vector registers. 
*/ 8021 if (TARGET_64BIT 8022 && (INTVAL (callarg2) > 0 8023 || (INTVAL (callarg2) == 0 8024 && (TARGET_SSE || !flag_skip_rax_setup)))) 8025 { 8026 rtx al = gen_rtx_REG (QImode, AX_REG); 8027 emit_move_insn (al, callarg2); 8028 use_reg (&use, al); 8029 } 8030 8031 if (ix86_cmodel == CM_LARGE_PIC 8032 && !TARGET_PECOFF 8033 && MEM_P (fnaddr) 8034 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF 8035 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode)) 8036 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0))); 8037 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect 8038 branch via x32 GOT slot is OK. */ 8039 else if (!(TARGET_X32 8040 && MEM_P (fnaddr) 8041 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND 8042 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode)) 8043 && (sibcall 8044 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode) 8045 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))) 8046 { 8047 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1); 8048 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr)); 8049 } 8050 8051 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1); 8052 8053 if (retval) 8054 call = gen_rtx_SET (retval, call); 8055 vec[vec_len++] = call; 8056 8057 if (pop) 8058 { 8059 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop); 8060 pop = gen_rtx_SET (stack_pointer_rtx, pop); 8061 vec[vec_len++] = pop; 8062 } 8063 8064 if (cfun->machine->no_caller_saved_registers 8065 && (!fndecl 8066 || (!TREE_THIS_VOLATILE (fndecl) 8067 && !lookup_attribute ("no_caller_saved_registers", 8068 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))) 8069 { 8070 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS; 8071 bool is_64bit_ms_abi = (TARGET_64BIT 8072 && ix86_function_abi (fndecl) == MS_ABI); 8073 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi); 8074 8075 /* If there are no caller-saved registers, add all registers 8076 that are clobbered by the call which returns. 
*/ 8077 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++) 8078 if (!fixed_regs[i] 8079 && (ix86_call_used_regs[i] == 1 8080 || (ix86_call_used_regs[i] & c_mask)) 8081 && !STACK_REGNO_P (i) 8082 && !MMX_REGNO_P (i)) 8083 clobber_reg (&use, 8084 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i)); 8085 } 8086 else if (TARGET_64BIT_MS_ABI 8087 && (!callarg2 || INTVAL (callarg2) != -2)) 8088 { 8089 unsigned i; 8090 8091 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++) 8092 { 8093 int regno = x86_64_ms_sysv_extra_clobbered_registers[i]; 8094 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode; 8095 8096 clobber_reg (&use, gen_rtx_REG (mode, regno)); 8097 } 8098 8099 /* Set here, but it may get cleared later. */ 8100 if (TARGET_CALL_MS2SYSV_XLOGUES) 8101 { 8102 if (!TARGET_SSE) 8103 ; 8104 8105 /* Don't break hot-patched functions. */ 8106 else if (ix86_function_ms_hook_prologue (current_function_decl)) 8107 ; 8108 8109 /* TODO: Cases not yet examined. */ 8110 else if (flag_split_stack) 8111 warn_once_call_ms2sysv_xlogues ("-fsplit-stack"); 8112 8113 else 8114 { 8115 gcc_assert (!reload_completed); 8116 cfun->machine->call_ms2sysv = true; 8117 } 8118 } 8119 } 8120 8121 if (TARGET_MACHO && TARGET_64BIT && !sibcall 8122 && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr)) 8123 || !fndecl || TREE_PUBLIC (fndecl))) 8124 { 8125 /* We allow public functions defined in a TU to bind locally for PIC 8126 code (the default) on 64bit Mach-O. 8127 If such functions are not inlined, we cannot tell at compile-time if 8128 they will be called via the lazy symbol resolver (this can depend on 8129 options given at link-time). Therefore, we must assume that the lazy 8130 resolver could be used which clobbers R11 and R10. 
*/ 8131 clobber_reg (&use, gen_rtx_REG (DImode, R11_REG)); 8132 clobber_reg (&use, gen_rtx_REG (DImode, R10_REG)); 8133 } 8134 8135 if (vec_len > 1) 8136 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec)); 8137 rtx_insn *call_insn = emit_call_insn (call); 8138 if (use) 8139 CALL_INSN_FUNCTION_USAGE (call_insn) = use; 8140 8141 return call_insn; 8142} 8143 8144/* Split simple return with popping POPC bytes from stack to indirect 8145 branch with stack adjustment . */ 8146 8147void 8148ix86_split_simple_return_pop_internal (rtx popc) 8149{ 8150 struct machine_function *m = cfun->machine; 8151 rtx ecx = gen_rtx_REG (SImode, CX_REG); 8152 rtx_insn *insn; 8153 8154 /* There is no "pascal" calling convention in any 64bit ABI. */ 8155 gcc_assert (!TARGET_64BIT); 8156 8157 insn = emit_insn (gen_pop (ecx)); 8158 m->fs.cfa_offset -= UNITS_PER_WORD; 8159 m->fs.sp_offset -= UNITS_PER_WORD; 8160 8161 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); 8162 x = gen_rtx_SET (stack_pointer_rtx, x); 8163 add_reg_note (insn, REG_CFA_ADJUST_CFA, x); 8164 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); 8165 RTX_FRAME_RELATED_P (insn) = 1; 8166 8167 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc); 8168 x = gen_rtx_SET (stack_pointer_rtx, x); 8169 insn = emit_insn (x); 8170 add_reg_note (insn, REG_CFA_ADJUST_CFA, x); 8171 RTX_FRAME_RELATED_P (insn) = 1; 8172 8173 /* Now return address is in ECX. */ 8174 emit_jump_insn (gen_simple_return_indirect_internal (ecx)); 8175} 8176 8177/* Errors in the source file can cause expand_expr to return const0_rtx 8178 where we expect a vector. To avoid crashing, use one of the vector 8179 clear instructions. */ 8180 8181static rtx 8182safe_vector_operand (rtx x, machine_mode mode) 8183{ 8184 if (x == const0_rtx) 8185 x = CONST0_RTX (mode); 8186 return x; 8187} 8188 8189/* Subroutine of ix86_expand_builtin to take care of binop insns. 
*/ 8190 8191static rtx 8192ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target) 8193{ 8194 rtx pat; 8195 tree arg0 = CALL_EXPR_ARG (exp, 0); 8196 tree arg1 = CALL_EXPR_ARG (exp, 1); 8197 rtx op0 = expand_normal (arg0); 8198 rtx op1 = expand_normal (arg1); 8199 machine_mode tmode = insn_data[icode].operand[0].mode; 8200 machine_mode mode0 = insn_data[icode].operand[1].mode; 8201 machine_mode mode1 = insn_data[icode].operand[2].mode; 8202 8203 if (VECTOR_MODE_P (mode0)) 8204 op0 = safe_vector_operand (op0, mode0); 8205 if (VECTOR_MODE_P (mode1)) 8206 op1 = safe_vector_operand (op1, mode1); 8207 8208 if (optimize || !target 8209 || GET_MODE (target) != tmode 8210 || !insn_data[icode].operand[0].predicate (target, tmode)) 8211 target = gen_reg_rtx (tmode); 8212 8213 if (GET_MODE (op1) == SImode && mode1 == TImode) 8214 { 8215 rtx x = gen_reg_rtx (V4SImode); 8216 emit_insn (gen_sse2_loadd (x, op1)); 8217 op1 = gen_lowpart (TImode, x); 8218 } 8219 8220 if (!insn_data[icode].operand[1].predicate (op0, mode0)) 8221 op0 = copy_to_mode_reg (mode0, op0); 8222 if (!insn_data[icode].operand[2].predicate (op1, mode1)) 8223 op1 = copy_to_mode_reg (mode1, op1); 8224 8225 pat = GEN_FCN (icode) (target, op0, op1); 8226 if (! pat) 8227 return 0; 8228 8229 emit_insn (pat); 8230 8231 return target; 8232} 8233 8234/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. 
*/ 8235 8236static rtx 8237ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target, 8238 enum ix86_builtin_func_type m_type, 8239 enum rtx_code sub_code) 8240{ 8241 rtx pat; 8242 int i; 8243 int nargs; 8244 bool comparison_p = false; 8245 bool tf_p = false; 8246 bool last_arg_constant = false; 8247 int num_memory = 0; 8248 struct { 8249 rtx op; 8250 machine_mode mode; 8251 } args[4]; 8252 8253 machine_mode tmode = insn_data[icode].operand[0].mode; 8254 8255 switch (m_type) 8256 { 8257 case MULTI_ARG_4_DF2_DI_I: 8258 case MULTI_ARG_4_DF2_DI_I1: 8259 case MULTI_ARG_4_SF2_SI_I: 8260 case MULTI_ARG_4_SF2_SI_I1: 8261 nargs = 4; 8262 last_arg_constant = true; 8263 break; 8264 8265 case MULTI_ARG_3_SF: 8266 case MULTI_ARG_3_DF: 8267 case MULTI_ARG_3_SF2: 8268 case MULTI_ARG_3_DF2: 8269 case MULTI_ARG_3_DI: 8270 case MULTI_ARG_3_SI: 8271 case MULTI_ARG_3_SI_DI: 8272 case MULTI_ARG_3_HI: 8273 case MULTI_ARG_3_HI_SI: 8274 case MULTI_ARG_3_QI: 8275 case MULTI_ARG_3_DI2: 8276 case MULTI_ARG_3_SI2: 8277 case MULTI_ARG_3_HI2: 8278 case MULTI_ARG_3_QI2: 8279 nargs = 3; 8280 break; 8281 8282 case MULTI_ARG_2_SF: 8283 case MULTI_ARG_2_DF: 8284 case MULTI_ARG_2_DI: 8285 case MULTI_ARG_2_SI: 8286 case MULTI_ARG_2_HI: 8287 case MULTI_ARG_2_QI: 8288 nargs = 2; 8289 break; 8290 8291 case MULTI_ARG_2_DI_IMM: 8292 case MULTI_ARG_2_SI_IMM: 8293 case MULTI_ARG_2_HI_IMM: 8294 case MULTI_ARG_2_QI_IMM: 8295 nargs = 2; 8296 last_arg_constant = true; 8297 break; 8298 8299 case MULTI_ARG_1_SF: 8300 case MULTI_ARG_1_DF: 8301 case MULTI_ARG_1_SF2: 8302 case MULTI_ARG_1_DF2: 8303 case MULTI_ARG_1_DI: 8304 case MULTI_ARG_1_SI: 8305 case MULTI_ARG_1_HI: 8306 case MULTI_ARG_1_QI: 8307 case MULTI_ARG_1_SI_DI: 8308 case MULTI_ARG_1_HI_DI: 8309 case MULTI_ARG_1_HI_SI: 8310 case MULTI_ARG_1_QI_DI: 8311 case MULTI_ARG_1_QI_SI: 8312 case MULTI_ARG_1_QI_HI: 8313 nargs = 1; 8314 break; 8315 8316 case MULTI_ARG_2_DI_CMP: 8317 case MULTI_ARG_2_SI_CMP: 8318 case MULTI_ARG_2_HI_CMP: 8319 case 
MULTI_ARG_2_QI_CMP: 8320 nargs = 2; 8321 comparison_p = true; 8322 break; 8323 8324 case MULTI_ARG_2_SF_TF: 8325 case MULTI_ARG_2_DF_TF: 8326 case MULTI_ARG_2_DI_TF: 8327 case MULTI_ARG_2_SI_TF: 8328 case MULTI_ARG_2_HI_TF: 8329 case MULTI_ARG_2_QI_TF: 8330 nargs = 2; 8331 tf_p = true; 8332 break; 8333 8334 default: 8335 gcc_unreachable (); 8336 } 8337 8338 if (optimize || !target 8339 || GET_MODE (target) != tmode 8340 || !insn_data[icode].operand[0].predicate (target, tmode)) 8341 target = gen_reg_rtx (tmode); 8342 else if (memory_operand (target, tmode)) 8343 num_memory++; 8344 8345 gcc_assert (nargs <= 4); 8346 8347 for (i = 0; i < nargs; i++) 8348 { 8349 tree arg = CALL_EXPR_ARG (exp, i); 8350 rtx op = expand_normal (arg); 8351 int adjust = (comparison_p) ? 1 : 0; 8352 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode; 8353 8354 if (last_arg_constant && i == nargs - 1) 8355 { 8356 if (!insn_data[icode].operand[i + 1].predicate (op, mode)) 8357 { 8358 enum insn_code new_icode = icode; 8359 switch (icode) 8360 { 8361 case CODE_FOR_xop_vpermil2v2df3: 8362 case CODE_FOR_xop_vpermil2v4sf3: 8363 case CODE_FOR_xop_vpermil2v4df3: 8364 case CODE_FOR_xop_vpermil2v8sf3: 8365 error ("the last argument must be a 2-bit immediate"); 8366 return gen_reg_rtx (tmode); 8367 case CODE_FOR_xop_rotlv2di3: 8368 new_icode = CODE_FOR_rotlv2di3; 8369 goto xop_rotl; 8370 case CODE_FOR_xop_rotlv4si3: 8371 new_icode = CODE_FOR_rotlv4si3; 8372 goto xop_rotl; 8373 case CODE_FOR_xop_rotlv8hi3: 8374 new_icode = CODE_FOR_rotlv8hi3; 8375 goto xop_rotl; 8376 case CODE_FOR_xop_rotlv16qi3: 8377 new_icode = CODE_FOR_rotlv16qi3; 8378 xop_rotl: 8379 if (CONST_INT_P (op)) 8380 { 8381 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1; 8382 op = GEN_INT (INTVAL (op) & mask); 8383 gcc_checking_assert 8384 (insn_data[icode].operand[i + 1].predicate (op, mode)); 8385 } 8386 else 8387 { 8388 gcc_checking_assert 8389 (nargs == 2 8390 && insn_data[new_icode].operand[0].mode == tmode 8391 && 
insn_data[new_icode].operand[1].mode == tmode 8392 && insn_data[new_icode].operand[2].mode == mode 8393 && insn_data[new_icode].operand[0].predicate 8394 == insn_data[icode].operand[0].predicate 8395 && insn_data[new_icode].operand[1].predicate 8396 == insn_data[icode].operand[1].predicate); 8397 icode = new_icode; 8398 goto non_constant; 8399 } 8400 break; 8401 default: 8402 gcc_unreachable (); 8403 } 8404 } 8405 } 8406 else 8407 { 8408 non_constant: 8409 if (VECTOR_MODE_P (mode)) 8410 op = safe_vector_operand (op, mode); 8411 8412 /* If we aren't optimizing, only allow one memory operand to be 8413 generated. */ 8414 if (memory_operand (op, mode)) 8415 num_memory++; 8416 8417 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode); 8418 8419 if (optimize 8420 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode) 8421 || num_memory > 1) 8422 op = force_reg (mode, op); 8423 } 8424 8425 args[i].op = op; 8426 args[i].mode = mode; 8427 } 8428 8429 switch (nargs) 8430 { 8431 case 1: 8432 pat = GEN_FCN (icode) (target, args[0].op); 8433 break; 8434 8435 case 2: 8436 if (tf_p) 8437 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, 8438 GEN_INT ((int)sub_code)); 8439 else if (! comparison_p) 8440 pat = GEN_FCN (icode) (target, args[0].op, args[1].op); 8441 else 8442 { 8443 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target), 8444 args[0].op, 8445 args[1].op); 8446 8447 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op); 8448 } 8449 break; 8450 8451 case 3: 8452 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); 8453 break; 8454 8455 case 4: 8456 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op); 8457 break; 8458 8459 default: 8460 gcc_unreachable (); 8461 } 8462 8463 if (! pat) 8464 return 0; 8465 8466 emit_insn (pat); 8467 return target; 8468} 8469 8470/* Subroutine of ix86_expand_args_builtin to take care of scalar unop 8471 insns with vec_merge. 
*/ 8472 8473static rtx 8474ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp, 8475 rtx target) 8476{ 8477 rtx pat; 8478 tree arg0 = CALL_EXPR_ARG (exp, 0); 8479 rtx op1, op0 = expand_normal (arg0); 8480 machine_mode tmode = insn_data[icode].operand[0].mode; 8481 machine_mode mode0 = insn_data[icode].operand[1].mode; 8482 8483 if (optimize || !target 8484 || GET_MODE (target) != tmode 8485 || !insn_data[icode].operand[0].predicate (target, tmode)) 8486 target = gen_reg_rtx (tmode); 8487 8488 if (VECTOR_MODE_P (mode0)) 8489 op0 = safe_vector_operand (op0, mode0); 8490 8491 if ((optimize && !register_operand (op0, mode0)) 8492 || !insn_data[icode].operand[1].predicate (op0, mode0)) 8493 op0 = copy_to_mode_reg (mode0, op0); 8494 8495 op1 = op0; 8496 if (!insn_data[icode].operand[2].predicate (op1, mode0)) 8497 op1 = copy_to_mode_reg (mode0, op1); 8498 8499 pat = GEN_FCN (icode) (target, op0, op1); 8500 if (! pat) 8501 return 0; 8502 emit_insn (pat); 8503 return target; 8504} 8505 8506/* Subroutine of ix86_expand_builtin to take care of comparison insns. */ 8507 8508static rtx 8509ix86_expand_sse_compare (const struct builtin_description *d, 8510 tree exp, rtx target, bool swap) 8511{ 8512 rtx pat; 8513 tree arg0 = CALL_EXPR_ARG (exp, 0); 8514 tree arg1 = CALL_EXPR_ARG (exp, 1); 8515 rtx op0 = expand_normal (arg0); 8516 rtx op1 = expand_normal (arg1); 8517 rtx op2; 8518 machine_mode tmode = insn_data[d->icode].operand[0].mode; 8519 machine_mode mode0 = insn_data[d->icode].operand[1].mode; 8520 machine_mode mode1 = insn_data[d->icode].operand[2].mode; 8521 enum rtx_code comparison = d->comparison; 8522 8523 if (VECTOR_MODE_P (mode0)) 8524 op0 = safe_vector_operand (op0, mode0); 8525 if (VECTOR_MODE_P (mode1)) 8526 op1 = safe_vector_operand (op1, mode1); 8527 8528 /* Swap operands if we have a comparison that isn't available in 8529 hardware. 
*/ 8530 if (swap) 8531 std::swap (op0, op1); 8532 8533 if (optimize || !target 8534 || GET_MODE (target) != tmode 8535 || !insn_data[d->icode].operand[0].predicate (target, tmode)) 8536 target = gen_reg_rtx (tmode); 8537 8538 if ((optimize && !register_operand (op0, mode0)) 8539 || !insn_data[d->icode].operand[1].predicate (op0, mode0)) 8540 op0 = copy_to_mode_reg (mode0, op0); 8541 if ((optimize && !register_operand (op1, mode1)) 8542 || !insn_data[d->icode].operand[2].predicate (op1, mode1)) 8543 op1 = copy_to_mode_reg (mode1, op1); 8544 8545 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1); 8546 pat = GEN_FCN (d->icode) (target, op0, op1, op2); 8547 if (! pat) 8548 return 0; 8549 emit_insn (pat); 8550 return target; 8551} 8552 8553/* Subroutine of ix86_expand_builtin to take care of comi insns. */ 8554 8555static rtx 8556ix86_expand_sse_comi (const struct builtin_description *d, tree exp, 8557 rtx target) 8558{ 8559 rtx pat; 8560 tree arg0 = CALL_EXPR_ARG (exp, 0); 8561 tree arg1 = CALL_EXPR_ARG (exp, 1); 8562 rtx op0 = expand_normal (arg0); 8563 rtx op1 = expand_normal (arg1); 8564 machine_mode mode0 = insn_data[d->icode].operand[0].mode; 8565 machine_mode mode1 = insn_data[d->icode].operand[1].mode; 8566 enum rtx_code comparison = d->comparison; 8567 8568 if (VECTOR_MODE_P (mode0)) 8569 op0 = safe_vector_operand (op0, mode0); 8570 if (VECTOR_MODE_P (mode1)) 8571 op1 = safe_vector_operand (op1, mode1); 8572 8573 /* Swap operands if we have a comparison that isn't available in 8574 hardware. 
*/ 8575 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS) 8576 std::swap (op0, op1); 8577 8578 target = gen_reg_rtx (SImode); 8579 emit_move_insn (target, const0_rtx); 8580 target = gen_rtx_SUBREG (QImode, target, 0); 8581 8582 if ((optimize && !register_operand (op0, mode0)) 8583 || !insn_data[d->icode].operand[0].predicate (op0, mode0)) 8584 op0 = copy_to_mode_reg (mode0, op0); 8585 if ((optimize && !register_operand (op1, mode1)) 8586 || !insn_data[d->icode].operand[1].predicate (op1, mode1)) 8587 op1 = copy_to_mode_reg (mode1, op1); 8588 8589 pat = GEN_FCN (d->icode) (op0, op1); 8590 if (! pat) 8591 return 0; 8592 emit_insn (pat); 8593 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), 8594 gen_rtx_fmt_ee (comparison, QImode, 8595 SET_DEST (pat), 8596 const0_rtx))); 8597 8598 return SUBREG_REG (target); 8599} 8600 8601/* Subroutines of ix86_expand_args_builtin to take care of round insns. */ 8602 8603static rtx 8604ix86_expand_sse_round (const struct builtin_description *d, tree exp, 8605 rtx target) 8606{ 8607 rtx pat; 8608 tree arg0 = CALL_EXPR_ARG (exp, 0); 8609 rtx op1, op0 = expand_normal (arg0); 8610 machine_mode tmode = insn_data[d->icode].operand[0].mode; 8611 machine_mode mode0 = insn_data[d->icode].operand[1].mode; 8612 8613 if (optimize || target == 0 8614 || GET_MODE (target) != tmode 8615 || !insn_data[d->icode].operand[0].predicate (target, tmode)) 8616 target = gen_reg_rtx (tmode); 8617 8618 if (VECTOR_MODE_P (mode0)) 8619 op0 = safe_vector_operand (op0, mode0); 8620 8621 if ((optimize && !register_operand (op0, mode0)) 8622 || !insn_data[d->icode].operand[0].predicate (op0, mode0)) 8623 op0 = copy_to_mode_reg (mode0, op0); 8624 8625 op1 = GEN_INT (d->comparison); 8626 8627 pat = GEN_FCN (d->icode) (target, op0, op1); 8628 if (! 
pat) 8629 return 0; 8630 emit_insn (pat); 8631 return target; 8632} 8633 8634static rtx 8635ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d, 8636 tree exp, rtx target) 8637{ 8638 rtx pat; 8639 tree arg0 = CALL_EXPR_ARG (exp, 0); 8640 tree arg1 = CALL_EXPR_ARG (exp, 1); 8641 rtx op0 = expand_normal (arg0); 8642 rtx op1 = expand_normal (arg1); 8643 rtx op2; 8644 machine_mode tmode = insn_data[d->icode].operand[0].mode; 8645 machine_mode mode0 = insn_data[d->icode].operand[1].mode; 8646 machine_mode mode1 = insn_data[d->icode].operand[2].mode; 8647 8648 if (optimize || target == 0 8649 || GET_MODE (target) != tmode 8650 || !insn_data[d->icode].operand[0].predicate (target, tmode)) 8651 target = gen_reg_rtx (tmode); 8652 8653 op0 = safe_vector_operand (op0, mode0); 8654 op1 = safe_vector_operand (op1, mode1); 8655 8656 if ((optimize && !register_operand (op0, mode0)) 8657 || !insn_data[d->icode].operand[0].predicate (op0, mode0)) 8658 op0 = copy_to_mode_reg (mode0, op0); 8659 if ((optimize && !register_operand (op1, mode1)) 8660 || !insn_data[d->icode].operand[1].predicate (op1, mode1)) 8661 op1 = copy_to_mode_reg (mode1, op1); 8662 8663 op2 = GEN_INT (d->comparison); 8664 8665 pat = GEN_FCN (d->icode) (target, op0, op1, op2); 8666 if (! pat) 8667 return 0; 8668 emit_insn (pat); 8669 return target; 8670} 8671 8672/* Subroutine of ix86_expand_builtin to take care of ptest insns. 
*/ 8673 8674static rtx 8675ix86_expand_sse_ptest (const struct builtin_description *d, tree exp, 8676 rtx target) 8677{ 8678 rtx pat; 8679 tree arg0 = CALL_EXPR_ARG (exp, 0); 8680 tree arg1 = CALL_EXPR_ARG (exp, 1); 8681 rtx op0 = expand_normal (arg0); 8682 rtx op1 = expand_normal (arg1); 8683 machine_mode mode0 = insn_data[d->icode].operand[0].mode; 8684 machine_mode mode1 = insn_data[d->icode].operand[1].mode; 8685 enum rtx_code comparison = d->comparison; 8686 8687 if (VECTOR_MODE_P (mode0)) 8688 op0 = safe_vector_operand (op0, mode0); 8689 if (VECTOR_MODE_P (mode1)) 8690 op1 = safe_vector_operand (op1, mode1); 8691 8692 target = gen_reg_rtx (SImode); 8693 emit_move_insn (target, const0_rtx); 8694 target = gen_rtx_SUBREG (QImode, target, 0); 8695 8696 if ((optimize && !register_operand (op0, mode0)) 8697 || !insn_data[d->icode].operand[0].predicate (op0, mode0)) 8698 op0 = copy_to_mode_reg (mode0, op0); 8699 if ((optimize && !register_operand (op1, mode1)) 8700 || !insn_data[d->icode].operand[1].predicate (op1, mode1)) 8701 op1 = copy_to_mode_reg (mode1, op1); 8702 8703 pat = GEN_FCN (d->icode) (op0, op1); 8704 if (! pat) 8705 return 0; 8706 emit_insn (pat); 8707 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), 8708 gen_rtx_fmt_ee (comparison, QImode, 8709 SET_DEST (pat), 8710 const0_rtx))); 8711 8712 return SUBREG_REG (target); 8713} 8714 8715/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. 
*/ 8716 8717static rtx 8718ix86_expand_sse_pcmpestr (const struct builtin_description *d, 8719 tree exp, rtx target) 8720{ 8721 rtx pat; 8722 tree arg0 = CALL_EXPR_ARG (exp, 0); 8723 tree arg1 = CALL_EXPR_ARG (exp, 1); 8724 tree arg2 = CALL_EXPR_ARG (exp, 2); 8725 tree arg3 = CALL_EXPR_ARG (exp, 3); 8726 tree arg4 = CALL_EXPR_ARG (exp, 4); 8727 rtx scratch0, scratch1; 8728 rtx op0 = expand_normal (arg0); 8729 rtx op1 = expand_normal (arg1); 8730 rtx op2 = expand_normal (arg2); 8731 rtx op3 = expand_normal (arg3); 8732 rtx op4 = expand_normal (arg4); 8733 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm; 8734 8735 tmode0 = insn_data[d->icode].operand[0].mode; 8736 tmode1 = insn_data[d->icode].operand[1].mode; 8737 modev2 = insn_data[d->icode].operand[2].mode; 8738 modei3 = insn_data[d->icode].operand[3].mode; 8739 modev4 = insn_data[d->icode].operand[4].mode; 8740 modei5 = insn_data[d->icode].operand[5].mode; 8741 modeimm = insn_data[d->icode].operand[6].mode; 8742 8743 if (VECTOR_MODE_P (modev2)) 8744 op0 = safe_vector_operand (op0, modev2); 8745 if (VECTOR_MODE_P (modev4)) 8746 op2 = safe_vector_operand (op2, modev4); 8747 8748 if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) 8749 op0 = copy_to_mode_reg (modev2, op0); 8750 if (!insn_data[d->icode].operand[3].predicate (op1, modei3)) 8751 op1 = copy_to_mode_reg (modei3, op1); 8752 if ((optimize && !register_operand (op2, modev4)) 8753 || !insn_data[d->icode].operand[4].predicate (op2, modev4)) 8754 op2 = copy_to_mode_reg (modev4, op2); 8755 if (!insn_data[d->icode].operand[5].predicate (op3, modei5)) 8756 op3 = copy_to_mode_reg (modei5, op3); 8757 8758 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm)) 8759 { 8760 error ("the fifth argument must be an 8-bit immediate"); 8761 return const0_rtx; 8762 } 8763 8764 if (d->code == IX86_BUILTIN_PCMPESTRI128) 8765 { 8766 if (optimize || !target 8767 || GET_MODE (target) != tmode0 8768 || !insn_data[d->icode].operand[0].predicate 
(target, tmode0)) 8769 target = gen_reg_rtx (tmode0); 8770 8771 scratch1 = gen_reg_rtx (tmode1); 8772 8773 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4); 8774 } 8775 else if (d->code == IX86_BUILTIN_PCMPESTRM128) 8776 { 8777 if (optimize || !target 8778 || GET_MODE (target) != tmode1 8779 || !insn_data[d->icode].operand[1].predicate (target, tmode1)) 8780 target = gen_reg_rtx (tmode1); 8781 8782 scratch0 = gen_reg_rtx (tmode0); 8783 8784 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4); 8785 } 8786 else 8787 { 8788 gcc_assert (d->flag); 8789 8790 scratch0 = gen_reg_rtx (tmode0); 8791 scratch1 = gen_reg_rtx (tmode1); 8792 8793 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4); 8794 } 8795 8796 if (! pat) 8797 return 0; 8798 8799 emit_insn (pat); 8800 8801 if (d->flag) 8802 { 8803 target = gen_reg_rtx (SImode); 8804 emit_move_insn (target, const0_rtx); 8805 target = gen_rtx_SUBREG (QImode, target, 0); 8806 8807 emit_insn 8808 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), 8809 gen_rtx_fmt_ee (EQ, QImode, 8810 gen_rtx_REG ((machine_mode) d->flag, 8811 FLAGS_REG), 8812 const0_rtx))); 8813 return SUBREG_REG (target); 8814 } 8815 else 8816 return target; 8817} 8818 8819 8820/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. 
*/ 8821 8822static rtx 8823ix86_expand_sse_pcmpistr (const struct builtin_description *d, 8824 tree exp, rtx target) 8825{ 8826 rtx pat; 8827 tree arg0 = CALL_EXPR_ARG (exp, 0); 8828 tree arg1 = CALL_EXPR_ARG (exp, 1); 8829 tree arg2 = CALL_EXPR_ARG (exp, 2); 8830 rtx scratch0, scratch1; 8831 rtx op0 = expand_normal (arg0); 8832 rtx op1 = expand_normal (arg1); 8833 rtx op2 = expand_normal (arg2); 8834 machine_mode tmode0, tmode1, modev2, modev3, modeimm; 8835 8836 tmode0 = insn_data[d->icode].operand[0].mode; 8837 tmode1 = insn_data[d->icode].operand[1].mode; 8838 modev2 = insn_data[d->icode].operand[2].mode; 8839 modev3 = insn_data[d->icode].operand[3].mode; 8840 modeimm = insn_data[d->icode].operand[4].mode; 8841 8842 if (VECTOR_MODE_P (modev2)) 8843 op0 = safe_vector_operand (op0, modev2); 8844 if (VECTOR_MODE_P (modev3)) 8845 op1 = safe_vector_operand (op1, modev3); 8846 8847 if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) 8848 op0 = copy_to_mode_reg (modev2, op0); 8849 if ((optimize && !register_operand (op1, modev3)) 8850 || !insn_data[d->icode].operand[3].predicate (op1, modev3)) 8851 op1 = copy_to_mode_reg (modev3, op1); 8852 8853 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm)) 8854 { 8855 error ("the third argument must be an 8-bit immediate"); 8856 return const0_rtx; 8857 } 8858 8859 if (d->code == IX86_BUILTIN_PCMPISTRI128) 8860 { 8861 if (optimize || !target 8862 || GET_MODE (target) != tmode0 8863 || !insn_data[d->icode].operand[0].predicate (target, tmode0)) 8864 target = gen_reg_rtx (tmode0); 8865 8866 scratch1 = gen_reg_rtx (tmode1); 8867 8868 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2); 8869 } 8870 else if (d->code == IX86_BUILTIN_PCMPISTRM128) 8871 { 8872 if (optimize || !target 8873 || GET_MODE (target) != tmode1 8874 || !insn_data[d->icode].operand[1].predicate (target, tmode1)) 8875 target = gen_reg_rtx (tmode1); 8876 8877 scratch0 = gen_reg_rtx (tmode0); 8878 8879 pat = GEN_FCN (d->icode) (scratch0, 
target, op0, op1, op2); 8880 } 8881 else 8882 { 8883 gcc_assert (d->flag); 8884 8885 scratch0 = gen_reg_rtx (tmode0); 8886 scratch1 = gen_reg_rtx (tmode1); 8887 8888 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2); 8889 } 8890 8891 if (! pat) 8892 return 0; 8893 8894 emit_insn (pat); 8895 8896 if (d->flag) 8897 { 8898 target = gen_reg_rtx (SImode); 8899 emit_move_insn (target, const0_rtx); 8900 target = gen_rtx_SUBREG (QImode, target, 0); 8901 8902 emit_insn 8903 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), 8904 gen_rtx_fmt_ee (EQ, QImode, 8905 gen_rtx_REG ((machine_mode) d->flag, 8906 FLAGS_REG), 8907 const0_rtx))); 8908 return SUBREG_REG (target); 8909 } 8910 else 8911 return target; 8912} 8913 8914/* Fixup modeless constants to fit required mode. */ 8915 8916static rtx 8917fixup_modeless_constant (rtx x, machine_mode mode) 8918{ 8919 if (GET_MODE (x) == VOIDmode) 8920 x = convert_to_mode (mode, x, 1); 8921 return x; 8922} 8923 8924/* Subroutine of ix86_expand_builtin to take care of insns with 8925 variable number of operands. 
*/ 8926 8927static rtx 8928ix86_expand_args_builtin (const struct builtin_description *d, 8929 tree exp, rtx target) 8930{ 8931 rtx pat, real_target; 8932 unsigned int i, nargs; 8933 unsigned int nargs_constant = 0; 8934 unsigned int mask_pos = 0; 8935 int num_memory = 0; 8936 struct 8937 { 8938 rtx op; 8939 machine_mode mode; 8940 } args[6]; 8941 bool second_arg_count = false; 8942 enum insn_code icode = d->icode; 8943 const struct insn_data_d *insn_p = &insn_data[icode]; 8944 machine_mode tmode = insn_p->operand[0].mode; 8945 machine_mode rmode = VOIDmode; 8946 bool swap = false; 8947 enum rtx_code comparison = d->comparison; 8948 8949 switch ((enum ix86_builtin_func_type) d->flag) 8950 { 8951 case V2DF_FTYPE_V2DF_ROUND: 8952 case V4DF_FTYPE_V4DF_ROUND: 8953 case V8DF_FTYPE_V8DF_ROUND: 8954 case V4SF_FTYPE_V4SF_ROUND: 8955 case V8SF_FTYPE_V8SF_ROUND: 8956 case V16SF_FTYPE_V16SF_ROUND: 8957 case V4SI_FTYPE_V4SF_ROUND: 8958 case V8SI_FTYPE_V8SF_ROUND: 8959 case V16SI_FTYPE_V16SF_ROUND: 8960 return ix86_expand_sse_round (d, exp, target); 8961 case V4SI_FTYPE_V2DF_V2DF_ROUND: 8962 case V8SI_FTYPE_V4DF_V4DF_ROUND: 8963 case V16SI_FTYPE_V8DF_V8DF_ROUND: 8964 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target); 8965 case INT_FTYPE_V8SF_V8SF_PTEST: 8966 case INT_FTYPE_V4DI_V4DI_PTEST: 8967 case INT_FTYPE_V4DF_V4DF_PTEST: 8968 case INT_FTYPE_V4SF_V4SF_PTEST: 8969 case INT_FTYPE_V2DI_V2DI_PTEST: 8970 case INT_FTYPE_V2DF_V2DF_PTEST: 8971 return ix86_expand_sse_ptest (d, exp, target); 8972 case FLOAT128_FTYPE_FLOAT128: 8973 case FLOAT_FTYPE_FLOAT: 8974 case INT_FTYPE_INT: 8975 case UINT_FTYPE_UINT: 8976 case UINT16_FTYPE_UINT16: 8977 case UINT64_FTYPE_INT: 8978 case UINT64_FTYPE_UINT64: 8979 case INT64_FTYPE_INT64: 8980 case INT64_FTYPE_V4SF: 8981 case INT64_FTYPE_V2DF: 8982 case INT_FTYPE_V16QI: 8983 case INT_FTYPE_V8QI: 8984 case INT_FTYPE_V8SF: 8985 case INT_FTYPE_V4DF: 8986 case INT_FTYPE_V4SF: 8987 case INT_FTYPE_V2DF: 8988 case INT_FTYPE_V32QI: 8989 case 
V16QI_FTYPE_V16QI: 8990 case V8SI_FTYPE_V8SF: 8991 case V8SI_FTYPE_V4SI: 8992 case V8HI_FTYPE_V8HI: 8993 case V8HI_FTYPE_V16QI: 8994 case V8QI_FTYPE_V8QI: 8995 case V8SF_FTYPE_V8SF: 8996 case V8SF_FTYPE_V8SI: 8997 case V8SF_FTYPE_V4SF: 8998 case V8SF_FTYPE_V8HI: 8999 case V4SI_FTYPE_V4SI: 9000 case V4SI_FTYPE_V16QI: 9001 case V4SI_FTYPE_V4SF: 9002 case V4SI_FTYPE_V8SI: 9003 case V4SI_FTYPE_V8HI: 9004 case V4SI_FTYPE_V4DF: 9005 case V4SI_FTYPE_V2DF: 9006 case V4HI_FTYPE_V4HI: 9007 case V4DF_FTYPE_V4DF: 9008 case V4DF_FTYPE_V4SI: 9009 case V4DF_FTYPE_V4SF: 9010 case V4DF_FTYPE_V2DF: 9011 case V4SF_FTYPE_V4SF: 9012 case V4SF_FTYPE_V4SI: 9013 case V4SF_FTYPE_V8SF: 9014 case V4SF_FTYPE_V4DF: 9015 case V4SF_FTYPE_V8HI: 9016 case V4SF_FTYPE_V2DF: 9017 case V2DI_FTYPE_V2DI: 9018 case V2DI_FTYPE_V16QI: 9019 case V2DI_FTYPE_V8HI: 9020 case V2DI_FTYPE_V4SI: 9021 case V2DF_FTYPE_V2DF: 9022 case V2DF_FTYPE_V4SI: 9023 case V2DF_FTYPE_V4DF: 9024 case V2DF_FTYPE_V4SF: 9025 case V2DF_FTYPE_V2SI: 9026 case V2SI_FTYPE_V2SI: 9027 case V2SI_FTYPE_V4SF: 9028 case V2SI_FTYPE_V2SF: 9029 case V2SI_FTYPE_V2DF: 9030 case V2SF_FTYPE_V2SF: 9031 case V2SF_FTYPE_V2SI: 9032 case V32QI_FTYPE_V32QI: 9033 case V32QI_FTYPE_V16QI: 9034 case V16HI_FTYPE_V16HI: 9035 case V16HI_FTYPE_V8HI: 9036 case V8SI_FTYPE_V8SI: 9037 case V16HI_FTYPE_V16QI: 9038 case V8SI_FTYPE_V16QI: 9039 case V4DI_FTYPE_V16QI: 9040 case V8SI_FTYPE_V8HI: 9041 case V4DI_FTYPE_V8HI: 9042 case V4DI_FTYPE_V4SI: 9043 case V4DI_FTYPE_V2DI: 9044 case UQI_FTYPE_UQI: 9045 case UHI_FTYPE_UHI: 9046 case USI_FTYPE_USI: 9047 case USI_FTYPE_UQI: 9048 case USI_FTYPE_UHI: 9049 case UDI_FTYPE_UDI: 9050 case UHI_FTYPE_V16QI: 9051 case USI_FTYPE_V32QI: 9052 case UDI_FTYPE_V64QI: 9053 case V16QI_FTYPE_UHI: 9054 case V32QI_FTYPE_USI: 9055 case V64QI_FTYPE_UDI: 9056 case V8HI_FTYPE_UQI: 9057 case V16HI_FTYPE_UHI: 9058 case V32HI_FTYPE_USI: 9059 case V4SI_FTYPE_UQI: 9060 case V8SI_FTYPE_UQI: 9061 case V4SI_FTYPE_UHI: 9062 case V8SI_FTYPE_UHI: 9063 case 
UQI_FTYPE_V8HI: 9064 case UHI_FTYPE_V16HI: 9065 case USI_FTYPE_V32HI: 9066 case UQI_FTYPE_V4SI: 9067 case UQI_FTYPE_V8SI: 9068 case UHI_FTYPE_V16SI: 9069 case UQI_FTYPE_V2DI: 9070 case UQI_FTYPE_V4DI: 9071 case UQI_FTYPE_V8DI: 9072 case V16SI_FTYPE_UHI: 9073 case V2DI_FTYPE_UQI: 9074 case V4DI_FTYPE_UQI: 9075 case V16SI_FTYPE_INT: 9076 case V16SF_FTYPE_V8SF: 9077 case V16SI_FTYPE_V8SI: 9078 case V16SF_FTYPE_V4SF: 9079 case V16SI_FTYPE_V4SI: 9080 case V16SI_FTYPE_V16SF: 9081 case V16SI_FTYPE_V16SI: 9082 case V64QI_FTYPE_V64QI: 9083 case V32HI_FTYPE_V32HI: 9084 case V16SF_FTYPE_V16SF: 9085 case V8DI_FTYPE_UQI: 9086 case V8DI_FTYPE_V8DI: 9087 case V8DF_FTYPE_V4DF: 9088 case V8DF_FTYPE_V2DF: 9089 case V8DF_FTYPE_V8DF: 9090 case V4DI_FTYPE_V4DI: 9091 case V16HI_FTYPE_V16SF: 9092 case V8HI_FTYPE_V8SF: 9093 case V8HI_FTYPE_V4SF: 9094 nargs = 1; 9095 break; 9096 case V4SF_FTYPE_V4SF_VEC_MERGE: 9097 case V2DF_FTYPE_V2DF_VEC_MERGE: 9098 return ix86_expand_unop_vec_merge_builtin (icode, exp, target); 9099 case FLOAT128_FTYPE_FLOAT128_FLOAT128: 9100 case V16QI_FTYPE_V16QI_V16QI: 9101 case V16QI_FTYPE_V8HI_V8HI: 9102 case V16SF_FTYPE_V16SF_V16SF: 9103 case V8QI_FTYPE_V8QI_V8QI: 9104 case V8QI_FTYPE_V4HI_V4HI: 9105 case V8HI_FTYPE_V8HI_V8HI: 9106 case V8HI_FTYPE_V16QI_V16QI: 9107 case V8HI_FTYPE_V4SI_V4SI: 9108 case V8SF_FTYPE_V8SF_V8SF: 9109 case V8SF_FTYPE_V8SF_V8SI: 9110 case V8DF_FTYPE_V8DF_V8DF: 9111 case V4SI_FTYPE_V4SI_V4SI: 9112 case V4SI_FTYPE_V8HI_V8HI: 9113 case V4SI_FTYPE_V2DF_V2DF: 9114 case V4HI_FTYPE_V4HI_V4HI: 9115 case V4HI_FTYPE_V8QI_V8QI: 9116 case V4HI_FTYPE_V2SI_V2SI: 9117 case V4DF_FTYPE_V4DF_V4DF: 9118 case V4DF_FTYPE_V4DF_V4DI: 9119 case V4SF_FTYPE_V4SF_V4SF: 9120 case V4SF_FTYPE_V4SF_V4SI: 9121 case V4SF_FTYPE_V4SF_V2SI: 9122 case V4SF_FTYPE_V4SF_V2DF: 9123 case V4SF_FTYPE_V4SF_UINT: 9124 case V4SF_FTYPE_V4SF_DI: 9125 case V4SF_FTYPE_V4SF_SI: 9126 case V2DI_FTYPE_V2DI_V2DI: 9127 case V2DI_FTYPE_V16QI_V16QI: 9128 case V2DI_FTYPE_V4SI_V4SI: 9129 case 
V2DI_FTYPE_V2DI_V16QI: 9130 case V2SI_FTYPE_V2SI_V2SI: 9131 case V2SI_FTYPE_V4HI_V4HI: 9132 case V2SI_FTYPE_V2SF_V2SF: 9133 case V2DF_FTYPE_V2DF_V2DF: 9134 case V2DF_FTYPE_V2DF_V4SF: 9135 case V2DF_FTYPE_V2DF_V2DI: 9136 case V2DF_FTYPE_V2DF_DI: 9137 case V2DF_FTYPE_V2DF_SI: 9138 case V2DF_FTYPE_V2DF_UINT: 9139 case V2SF_FTYPE_V2SF_V2SF: 9140 case V1DI_FTYPE_V1DI_V1DI: 9141 case V1DI_FTYPE_V8QI_V8QI: 9142 case V1DI_FTYPE_V2SI_V2SI: 9143 case V32QI_FTYPE_V16HI_V16HI: 9144 case V16HI_FTYPE_V8SI_V8SI: 9145 case V64QI_FTYPE_V64QI_V64QI: 9146 case V32QI_FTYPE_V32QI_V32QI: 9147 case V16HI_FTYPE_V32QI_V32QI: 9148 case V16HI_FTYPE_V16HI_V16HI: 9149 case V8SI_FTYPE_V4DF_V4DF: 9150 case V8SI_FTYPE_V8SI_V8SI: 9151 case V8SI_FTYPE_V16HI_V16HI: 9152 case V4DI_FTYPE_V4DI_V4DI: 9153 case V4DI_FTYPE_V8SI_V8SI: 9154 case V8DI_FTYPE_V64QI_V64QI: 9155 if (comparison == UNKNOWN) 9156 return ix86_expand_binop_builtin (icode, exp, target); 9157 nargs = 2; 9158 break; 9159 case V4SF_FTYPE_V4SF_V4SF_SWAP: 9160 case V2DF_FTYPE_V2DF_V2DF_SWAP: 9161 gcc_assert (comparison != UNKNOWN); 9162 nargs = 2; 9163 swap = true; 9164 break; 9165 case V16HI_FTYPE_V16HI_V8HI_COUNT: 9166 case V16HI_FTYPE_V16HI_SI_COUNT: 9167 case V8SI_FTYPE_V8SI_V4SI_COUNT: 9168 case V8SI_FTYPE_V8SI_SI_COUNT: 9169 case V4DI_FTYPE_V4DI_V2DI_COUNT: 9170 case V4DI_FTYPE_V4DI_INT_COUNT: 9171 case V8HI_FTYPE_V8HI_V8HI_COUNT: 9172 case V8HI_FTYPE_V8HI_SI_COUNT: 9173 case V4SI_FTYPE_V4SI_V4SI_COUNT: 9174 case V4SI_FTYPE_V4SI_SI_COUNT: 9175 case V4HI_FTYPE_V4HI_V4HI_COUNT: 9176 case V4HI_FTYPE_V4HI_SI_COUNT: 9177 case V2DI_FTYPE_V2DI_V2DI_COUNT: 9178 case V2DI_FTYPE_V2DI_SI_COUNT: 9179 case V2SI_FTYPE_V2SI_V2SI_COUNT: 9180 case V2SI_FTYPE_V2SI_SI_COUNT: 9181 case V1DI_FTYPE_V1DI_V1DI_COUNT: 9182 case V1DI_FTYPE_V1DI_SI_COUNT: 9183 nargs = 2; 9184 second_arg_count = true; 9185 break; 9186 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT: 9187 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT: 9188 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT: 
9189 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT: 9190 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT: 9191 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT: 9192 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT: 9193 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT: 9194 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT: 9195 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT: 9196 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT: 9197 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT: 9198 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT: 9199 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT: 9200 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT: 9201 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT: 9202 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT: 9203 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT: 9204 nargs = 4; 9205 second_arg_count = true; 9206 break; 9207 case UINT64_FTYPE_UINT64_UINT64: 9208 case UINT_FTYPE_UINT_UINT: 9209 case UINT_FTYPE_UINT_USHORT: 9210 case UINT_FTYPE_UINT_UCHAR: 9211 case UINT16_FTYPE_UINT16_INT: 9212 case UINT8_FTYPE_UINT8_INT: 9213 case UQI_FTYPE_UQI_UQI: 9214 case UHI_FTYPE_UHI_UHI: 9215 case USI_FTYPE_USI_USI: 9216 case UDI_FTYPE_UDI_UDI: 9217 case V16SI_FTYPE_V8DF_V8DF: 9218 case V32HI_FTYPE_V16SF_V16SF: 9219 case V16HI_FTYPE_V8SF_V8SF: 9220 case V8HI_FTYPE_V4SF_V4SF: 9221 case V16HI_FTYPE_V16SF_UHI: 9222 case V8HI_FTYPE_V8SF_UQI: 9223 case V8HI_FTYPE_V4SF_UQI: 9224 nargs = 2; 9225 break; 9226 case V2DI_FTYPE_V2DI_INT_CONVERT: 9227 nargs = 2; 9228 rmode = V1TImode; 9229 nargs_constant = 1; 9230 break; 9231 case V4DI_FTYPE_V4DI_INT_CONVERT: 9232 nargs = 2; 9233 rmode = V2TImode; 9234 nargs_constant = 1; 9235 break; 9236 case V8DI_FTYPE_V8DI_INT_CONVERT: 9237 nargs = 2; 9238 rmode = V4TImode; 9239 nargs_constant = 1; 9240 break; 9241 case V8HI_FTYPE_V8HI_INT: 9242 case V8HI_FTYPE_V8SF_INT: 9243 case V16HI_FTYPE_V16SF_INT: 9244 case V8HI_FTYPE_V4SF_INT: 9245 case V8SF_FTYPE_V8SF_INT: 9246 case V4SF_FTYPE_V16SF_INT: 9247 case V16SF_FTYPE_V16SF_INT: 9248 case V4SI_FTYPE_V4SI_INT: 9249 case V4SI_FTYPE_V8SI_INT: 9250 case 
V4HI_FTYPE_V4HI_INT: 9251 case V4DF_FTYPE_V4DF_INT: 9252 case V4DF_FTYPE_V8DF_INT: 9253 case V4SF_FTYPE_V4SF_INT: 9254 case V4SF_FTYPE_V8SF_INT: 9255 case V2DI_FTYPE_V2DI_INT: 9256 case V2DF_FTYPE_V2DF_INT: 9257 case V2DF_FTYPE_V4DF_INT: 9258 case V16HI_FTYPE_V16HI_INT: 9259 case V8SI_FTYPE_V8SI_INT: 9260 case V16SI_FTYPE_V16SI_INT: 9261 case V4SI_FTYPE_V16SI_INT: 9262 case V4DI_FTYPE_V4DI_INT: 9263 case V2DI_FTYPE_V4DI_INT: 9264 case V4DI_FTYPE_V8DI_INT: 9265 case UQI_FTYPE_UQI_UQI_CONST: 9266 case UHI_FTYPE_UHI_UQI: 9267 case USI_FTYPE_USI_UQI: 9268 case UDI_FTYPE_UDI_UQI: 9269 nargs = 2; 9270 nargs_constant = 1; 9271 break; 9272 case V16QI_FTYPE_V16QI_V16QI_V16QI: 9273 case V8SF_FTYPE_V8SF_V8SF_V8SF: 9274 case V4DF_FTYPE_V4DF_V4DF_V4DF: 9275 case V4SF_FTYPE_V4SF_V4SF_V4SF: 9276 case V2DF_FTYPE_V2DF_V2DF_V2DF: 9277 case V32QI_FTYPE_V32QI_V32QI_V32QI: 9278 case UHI_FTYPE_V16SI_V16SI_UHI: 9279 case UQI_FTYPE_V8DI_V8DI_UQI: 9280 case V16HI_FTYPE_V16SI_V16HI_UHI: 9281 case V16QI_FTYPE_V16SI_V16QI_UHI: 9282 case V16QI_FTYPE_V8DI_V16QI_UQI: 9283 case V16SF_FTYPE_V16SF_V16SF_UHI: 9284 case V16SF_FTYPE_V4SF_V16SF_UHI: 9285 case V16SI_FTYPE_SI_V16SI_UHI: 9286 case V16SI_FTYPE_V16HI_V16SI_UHI: 9287 case V16SI_FTYPE_V16QI_V16SI_UHI: 9288 case V8SF_FTYPE_V4SF_V8SF_UQI: 9289 case V4DF_FTYPE_V2DF_V4DF_UQI: 9290 case V8SI_FTYPE_V4SI_V8SI_UQI: 9291 case V8SI_FTYPE_SI_V8SI_UQI: 9292 case V4SI_FTYPE_V4SI_V4SI_UQI: 9293 case V4SI_FTYPE_SI_V4SI_UQI: 9294 case V4DI_FTYPE_V2DI_V4DI_UQI: 9295 case V4DI_FTYPE_DI_V4DI_UQI: 9296 case V2DI_FTYPE_V2DI_V2DI_UQI: 9297 case V2DI_FTYPE_DI_V2DI_UQI: 9298 case V64QI_FTYPE_V64QI_V64QI_UDI: 9299 case V64QI_FTYPE_V16QI_V64QI_UDI: 9300 case V64QI_FTYPE_QI_V64QI_UDI: 9301 case V32QI_FTYPE_V32QI_V32QI_USI: 9302 case V32QI_FTYPE_V16QI_V32QI_USI: 9303 case V32QI_FTYPE_QI_V32QI_USI: 9304 case V16QI_FTYPE_V16QI_V16QI_UHI: 9305 case V16QI_FTYPE_QI_V16QI_UHI: 9306 case V32HI_FTYPE_V8HI_V32HI_USI: 9307 case V32HI_FTYPE_HI_V32HI_USI: 9308 case 
V16HI_FTYPE_V8HI_V16HI_UHI: 9309 case V16HI_FTYPE_HI_V16HI_UHI: 9310 case V8HI_FTYPE_V8HI_V8HI_UQI: 9311 case V8HI_FTYPE_HI_V8HI_UQI: 9312 case V8SF_FTYPE_V8HI_V8SF_UQI: 9313 case V4SF_FTYPE_V8HI_V4SF_UQI: 9314 case V8SI_FTYPE_V8SF_V8SI_UQI: 9315 case V4SI_FTYPE_V4SF_V4SI_UQI: 9316 case V4DI_FTYPE_V4SF_V4DI_UQI: 9317 case V2DI_FTYPE_V4SF_V2DI_UQI: 9318 case V4SF_FTYPE_V4DI_V4SF_UQI: 9319 case V4SF_FTYPE_V2DI_V4SF_UQI: 9320 case V4DF_FTYPE_V4DI_V4DF_UQI: 9321 case V2DF_FTYPE_V2DI_V2DF_UQI: 9322 case V16QI_FTYPE_V8HI_V16QI_UQI: 9323 case V16QI_FTYPE_V16HI_V16QI_UHI: 9324 case V16QI_FTYPE_V4SI_V16QI_UQI: 9325 case V16QI_FTYPE_V8SI_V16QI_UQI: 9326 case V8HI_FTYPE_V4SI_V8HI_UQI: 9327 case V8HI_FTYPE_V8SI_V8HI_UQI: 9328 case V16QI_FTYPE_V2DI_V16QI_UQI: 9329 case V16QI_FTYPE_V4DI_V16QI_UQI: 9330 case V8HI_FTYPE_V2DI_V8HI_UQI: 9331 case V8HI_FTYPE_V4DI_V8HI_UQI: 9332 case V4SI_FTYPE_V2DI_V4SI_UQI: 9333 case V4SI_FTYPE_V4DI_V4SI_UQI: 9334 case V32QI_FTYPE_V32HI_V32QI_USI: 9335 case UHI_FTYPE_V16QI_V16QI_UHI: 9336 case USI_FTYPE_V32QI_V32QI_USI: 9337 case UDI_FTYPE_V64QI_V64QI_UDI: 9338 case UQI_FTYPE_V8HI_V8HI_UQI: 9339 case UHI_FTYPE_V16HI_V16HI_UHI: 9340 case USI_FTYPE_V32HI_V32HI_USI: 9341 case UQI_FTYPE_V4SI_V4SI_UQI: 9342 case UQI_FTYPE_V8SI_V8SI_UQI: 9343 case UQI_FTYPE_V2DI_V2DI_UQI: 9344 case UQI_FTYPE_V4DI_V4DI_UQI: 9345 case V4SF_FTYPE_V2DF_V4SF_UQI: 9346 case V4SF_FTYPE_V4DF_V4SF_UQI: 9347 case V16SI_FTYPE_V16SI_V16SI_UHI: 9348 case V16SI_FTYPE_V4SI_V16SI_UHI: 9349 case V2DI_FTYPE_V4SI_V2DI_UQI: 9350 case V2DI_FTYPE_V8HI_V2DI_UQI: 9351 case V2DI_FTYPE_V16QI_V2DI_UQI: 9352 case V4DI_FTYPE_V4DI_V4DI_UQI: 9353 case V4DI_FTYPE_V4SI_V4DI_UQI: 9354 case V4DI_FTYPE_V8HI_V4DI_UQI: 9355 case V4DI_FTYPE_V16QI_V4DI_UQI: 9356 case V4DI_FTYPE_V4DF_V4DI_UQI: 9357 case V2DI_FTYPE_V2DF_V2DI_UQI: 9358 case V4SI_FTYPE_V4DF_V4SI_UQI: 9359 case V4SI_FTYPE_V2DF_V4SI_UQI: 9360 case V4SI_FTYPE_V8HI_V4SI_UQI: 9361 case V4SI_FTYPE_V16QI_V4SI_UQI: 9362 case V4DI_FTYPE_V4DI_V4DI_V4DI: 9363 
case V8DF_FTYPE_V2DF_V8DF_UQI: 9364 case V8DF_FTYPE_V4DF_V8DF_UQI: 9365 case V8DF_FTYPE_V8DF_V8DF_UQI: 9366 case V8SF_FTYPE_V8SF_V8SF_UQI: 9367 case V8SF_FTYPE_V8SI_V8SF_UQI: 9368 case V4DF_FTYPE_V4DF_V4DF_UQI: 9369 case V4SF_FTYPE_V4SF_V4SF_UQI: 9370 case V2DF_FTYPE_V2DF_V2DF_UQI: 9371 case V2DF_FTYPE_V4SF_V2DF_UQI: 9372 case V2DF_FTYPE_V4SI_V2DF_UQI: 9373 case V4SF_FTYPE_V4SI_V4SF_UQI: 9374 case V4DF_FTYPE_V4SF_V4DF_UQI: 9375 case V4DF_FTYPE_V4SI_V4DF_UQI: 9376 case V8SI_FTYPE_V8SI_V8SI_UQI: 9377 case V8SI_FTYPE_V8HI_V8SI_UQI: 9378 case V8SI_FTYPE_V16QI_V8SI_UQI: 9379 case V8DF_FTYPE_V8SI_V8DF_UQI: 9380 case V8DI_FTYPE_DI_V8DI_UQI: 9381 case V16SF_FTYPE_V8SF_V16SF_UHI: 9382 case V16SI_FTYPE_V8SI_V16SI_UHI: 9383 case V16HI_FTYPE_V16HI_V16HI_UHI: 9384 case V8HI_FTYPE_V16QI_V8HI_UQI: 9385 case V16HI_FTYPE_V16QI_V16HI_UHI: 9386 case V32HI_FTYPE_V32HI_V32HI_USI: 9387 case V32HI_FTYPE_V32QI_V32HI_USI: 9388 case V8DI_FTYPE_V16QI_V8DI_UQI: 9389 case V8DI_FTYPE_V2DI_V8DI_UQI: 9390 case V8DI_FTYPE_V4DI_V8DI_UQI: 9391 case V8DI_FTYPE_V8DI_V8DI_UQI: 9392 case V8DI_FTYPE_V8HI_V8DI_UQI: 9393 case V8DI_FTYPE_V8SI_V8DI_UQI: 9394 case V8HI_FTYPE_V8DI_V8HI_UQI: 9395 case V8SI_FTYPE_V8DI_V8SI_UQI: 9396 case V4SI_FTYPE_V4SI_V4SI_V4SI: 9397 case V16SI_FTYPE_V16SI_V16SI_V16SI: 9398 case V8DI_FTYPE_V8DI_V8DI_V8DI: 9399 case V32HI_FTYPE_V32HI_V32HI_V32HI: 9400 case V2DI_FTYPE_V2DI_V2DI_V2DI: 9401 case V16HI_FTYPE_V16HI_V16HI_V16HI: 9402 case V8SI_FTYPE_V8SI_V8SI_V8SI: 9403 case V8HI_FTYPE_V8HI_V8HI_V8HI: 9404 case V32HI_FTYPE_V16SF_V16SF_USI: 9405 case V16HI_FTYPE_V8SF_V8SF_UHI: 9406 case V8HI_FTYPE_V4SF_V4SF_UQI: 9407 case V16HI_FTYPE_V16SF_V16HI_UHI: 9408 case V8HI_FTYPE_V8SF_V8HI_UQI: 9409 case V8HI_FTYPE_V4SF_V8HI_UQI: 9410 case V16SF_FTYPE_V16SF_V32HI_V32HI: 9411 case V8SF_FTYPE_V8SF_V16HI_V16HI: 9412 case V4SF_FTYPE_V4SF_V8HI_V8HI: 9413 nargs = 3; 9414 break; 9415 case V32QI_FTYPE_V32QI_V32QI_INT: 9416 case V16HI_FTYPE_V16HI_V16HI_INT: 9417 case V16QI_FTYPE_V16QI_V16QI_INT: 9418 
case V4DI_FTYPE_V4DI_V4DI_INT: 9419 case V8HI_FTYPE_V8HI_V8HI_INT: 9420 case V8SI_FTYPE_V8SI_V8SI_INT: 9421 case V8SI_FTYPE_V8SI_V4SI_INT: 9422 case V8SF_FTYPE_V8SF_V8SF_INT: 9423 case V8SF_FTYPE_V8SF_V4SF_INT: 9424 case V4SI_FTYPE_V4SI_V4SI_INT: 9425 case V4DF_FTYPE_V4DF_V4DF_INT: 9426 case V16SF_FTYPE_V16SF_V16SF_INT: 9427 case V16SF_FTYPE_V16SF_V4SF_INT: 9428 case V16SI_FTYPE_V16SI_V4SI_INT: 9429 case V4DF_FTYPE_V4DF_V2DF_INT: 9430 case V4SF_FTYPE_V4SF_V4SF_INT: 9431 case V2DI_FTYPE_V2DI_V2DI_INT: 9432 case V4DI_FTYPE_V4DI_V2DI_INT: 9433 case V2DF_FTYPE_V2DF_V2DF_INT: 9434 case UQI_FTYPE_V8DI_V8UDI_INT: 9435 case UQI_FTYPE_V8DF_V8DF_INT: 9436 case UQI_FTYPE_V2DF_V2DF_INT: 9437 case UQI_FTYPE_V4SF_V4SF_INT: 9438 case UHI_FTYPE_V16SI_V16SI_INT: 9439 case UHI_FTYPE_V16SF_V16SF_INT: 9440 case V64QI_FTYPE_V64QI_V64QI_INT: 9441 case V32HI_FTYPE_V32HI_V32HI_INT: 9442 case V16SI_FTYPE_V16SI_V16SI_INT: 9443 case V8DI_FTYPE_V8DI_V8DI_INT: 9444 nargs = 3; 9445 nargs_constant = 1; 9446 break; 9447 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT: 9448 nargs = 3; 9449 rmode = V4DImode; 9450 nargs_constant = 1; 9451 break; 9452 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT: 9453 nargs = 3; 9454 rmode = V2DImode; 9455 nargs_constant = 1; 9456 break; 9457 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT: 9458 nargs = 3; 9459 rmode = DImode; 9460 nargs_constant = 1; 9461 break; 9462 case V2DI_FTYPE_V2DI_UINT_UINT: 9463 nargs = 3; 9464 nargs_constant = 2; 9465 break; 9466 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT: 9467 nargs = 3; 9468 rmode = V8DImode; 9469 nargs_constant = 1; 9470 break; 9471 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT: 9472 nargs = 5; 9473 rmode = V8DImode; 9474 mask_pos = 2; 9475 nargs_constant = 1; 9476 break; 9477 case QI_FTYPE_V8DF_INT_UQI: 9478 case QI_FTYPE_V4DF_INT_UQI: 9479 case QI_FTYPE_V2DF_INT_UQI: 9480 case HI_FTYPE_V16SF_INT_UHI: 9481 case QI_FTYPE_V8SF_INT_UQI: 9482 case QI_FTYPE_V4SF_INT_UQI: 9483 case V4SI_FTYPE_V4SI_V4SI_UHI: 9484 case V8SI_FTYPE_V8SI_V8SI_UHI: 9485 nargs = 
3; 9486 mask_pos = 1; 9487 nargs_constant = 1; 9488 break; 9489 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT: 9490 nargs = 5; 9491 rmode = V4DImode; 9492 mask_pos = 2; 9493 nargs_constant = 1; 9494 break; 9495 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT: 9496 nargs = 5; 9497 rmode = V2DImode; 9498 mask_pos = 2; 9499 nargs_constant = 1; 9500 break; 9501 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI: 9502 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI: 9503 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI: 9504 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI: 9505 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI: 9506 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI: 9507 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI: 9508 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI: 9509 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI: 9510 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI: 9511 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI: 9512 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI: 9513 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI: 9514 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI: 9515 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI: 9516 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI: 9517 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI: 9518 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI: 9519 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI: 9520 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI: 9521 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI: 9522 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI: 9523 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI: 9524 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI: 9525 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI: 9526 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI: 9527 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI: 9528 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI: 9529 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI: 9530 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI: 9531 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI: 9532 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI: 9533 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI: 9534 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI: 9535 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI: 9536 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI: 9537 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI: 9538 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI: 9539 case 
V4DI_FTYPE_V4DI_V4DI_V4DI_UQI: 9540 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI: 9541 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI: 9542 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI: 9543 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI: 9544 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI: 9545 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI: 9546 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI: 9547 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI: 9548 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI: 9549 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI: 9550 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI: 9551 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI: 9552 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI: 9553 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI: 9554 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI: 9555 nargs = 4; 9556 break; 9557 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT: 9558 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT: 9559 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT: 9560 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT: 9561 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT: 9562 nargs = 4; 9563 nargs_constant = 1; 9564 break; 9565 case UQI_FTYPE_V4DI_V4DI_INT_UQI: 9566 case UQI_FTYPE_V8SI_V8SI_INT_UQI: 9567 case QI_FTYPE_V4DF_V4DF_INT_UQI: 9568 case QI_FTYPE_V8SF_V8SF_INT_UQI: 9569 case UQI_FTYPE_V2DI_V2DI_INT_UQI: 9570 case UQI_FTYPE_V4SI_V4SI_INT_UQI: 9571 case UQI_FTYPE_V2DF_V2DF_INT_UQI: 9572 case UQI_FTYPE_V4SF_V4SF_INT_UQI: 9573 case UDI_FTYPE_V64QI_V64QI_INT_UDI: 9574 case USI_FTYPE_V32QI_V32QI_INT_USI: 9575 case UHI_FTYPE_V16QI_V16QI_INT_UHI: 9576 case USI_FTYPE_V32HI_V32HI_INT_USI: 9577 case UHI_FTYPE_V16HI_V16HI_INT_UHI: 9578 case UQI_FTYPE_V8HI_V8HI_INT_UQI: 9579 nargs = 4; 9580 mask_pos = 1; 9581 nargs_constant = 1; 9582 break; 9583 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT: 9584 nargs = 4; 9585 nargs_constant = 2; 9586 break; 9587 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED: 9588 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG: 9589 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI: 9590 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI: 9591 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI: 9592 nargs = 4; 9593 break; 9594 case UQI_FTYPE_V8DI_V8DI_INT_UQI: 9595 case 
UHI_FTYPE_V16SI_V16SI_INT_UHI: 9596 mask_pos = 1; 9597 nargs = 4; 9598 nargs_constant = 1; 9599 break; 9600 case V8SF_FTYPE_V8SF_INT_V8SF_UQI: 9601 case V4SF_FTYPE_V4SF_INT_V4SF_UQI: 9602 case V2DF_FTYPE_V4DF_INT_V2DF_UQI: 9603 case V2DI_FTYPE_V4DI_INT_V2DI_UQI: 9604 case V8SF_FTYPE_V16SF_INT_V8SF_UQI: 9605 case V8SI_FTYPE_V16SI_INT_V8SI_UQI: 9606 case V2DF_FTYPE_V8DF_INT_V2DF_UQI: 9607 case V2DI_FTYPE_V8DI_INT_V2DI_UQI: 9608 case V4SF_FTYPE_V8SF_INT_V4SF_UQI: 9609 case V4SI_FTYPE_V8SI_INT_V4SI_UQI: 9610 case V8HI_FTYPE_V8SF_INT_V8HI_UQI: 9611 case V8HI_FTYPE_V4SF_INT_V8HI_UQI: 9612 case V32HI_FTYPE_V32HI_INT_V32HI_USI: 9613 case V16HI_FTYPE_V16HI_INT_V16HI_UHI: 9614 case V8HI_FTYPE_V8HI_INT_V8HI_UQI: 9615 case V4DI_FTYPE_V4DI_INT_V4DI_UQI: 9616 case V2DI_FTYPE_V2DI_INT_V2DI_UQI: 9617 case V8SI_FTYPE_V8SI_INT_V8SI_UQI: 9618 case V4SI_FTYPE_V4SI_INT_V4SI_UQI: 9619 case V4DF_FTYPE_V4DF_INT_V4DF_UQI: 9620 case V2DF_FTYPE_V2DF_INT_V2DF_UQI: 9621 case V8DF_FTYPE_V8DF_INT_V8DF_UQI: 9622 case V16SF_FTYPE_V16SF_INT_V16SF_UHI: 9623 case V16HI_FTYPE_V16SF_INT_V16HI_UHI: 9624 case V16SI_FTYPE_V16SI_INT_V16SI_UHI: 9625 case V4SI_FTYPE_V16SI_INT_V4SI_UQI: 9626 case V4DI_FTYPE_V8DI_INT_V4DI_UQI: 9627 case V4DF_FTYPE_V8DF_INT_V4DF_UQI: 9628 case V4SF_FTYPE_V16SF_INT_V4SF_UQI: 9629 case V8DI_FTYPE_V8DI_INT_V8DI_UQI: 9630 nargs = 4; 9631 mask_pos = 2; 9632 nargs_constant = 1; 9633 break; 9634 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI: 9635 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI: 9636 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI: 9637 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI: 9638 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI: 9639 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI: 9640 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI: 9641 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI: 9642 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI: 9643 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI: 9644 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI: 9645 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI: 9646 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI: 9647 
case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI: 9648 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI: 9649 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI: 9650 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI: 9651 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI: 9652 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI: 9653 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI: 9654 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI: 9655 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI: 9656 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI: 9657 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI: 9658 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI: 9659 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI: 9660 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI: 9661 nargs = 5; 9662 mask_pos = 2; 9663 nargs_constant = 1; 9664 break; 9665 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI: 9666 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI: 9667 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI: 9668 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI: 9669 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI: 9670 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI: 9671 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI: 9672 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI: 9673 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI: 9674 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI: 9675 nargs = 5; 9676 mask_pos = 1; 9677 nargs_constant = 1; 9678 break; 9679 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI: 9680 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI: 9681 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI: 9682 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT: 9683 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT: 9684 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT: 9685 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT: 9686 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT: 9687 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT: 9688 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT: 9689 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT: 9690 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT: 9691 nargs = 5; 9692 mask_pos = 1; 9693 nargs_constant = 2; 9694 break; 9695 9696 default: 9697 gcc_unreachable (); 9698 } 9699 9700 gcc_assert (nargs <= ARRAY_SIZE (args)); 9701 9702 if 
(comparison != UNKNOWN) 9703 { 9704 gcc_assert (nargs == 2); 9705 return ix86_expand_sse_compare (d, exp, target, swap); 9706 } 9707 9708 if (rmode == VOIDmode || rmode == tmode) 9709 { 9710 if (optimize 9711 || target == 0 9712 || GET_MODE (target) != tmode 9713 || !insn_p->operand[0].predicate (target, tmode)) 9714 target = gen_reg_rtx (tmode); 9715 else if (memory_operand (target, tmode)) 9716 num_memory++; 9717 real_target = target; 9718 } 9719 else 9720 { 9721 real_target = gen_reg_rtx (tmode); 9722 target = lowpart_subreg (rmode, real_target, tmode); 9723 } 9724 9725 for (i = 0; i < nargs; i++) 9726 { 9727 tree arg = CALL_EXPR_ARG (exp, i); 9728 rtx op = expand_normal (arg); 9729 machine_mode mode = insn_p->operand[i + 1].mode; 9730 bool match = insn_p->operand[i + 1].predicate (op, mode); 9731 9732 if (second_arg_count && i == 1) 9733 { 9734 /* SIMD shift insns take either an 8-bit immediate or 9735 register as count. But builtin functions take int as 9736 count. If count doesn't match, we put it in register. 9737 The instructions are using 64-bit count, if op is just 9738 32-bit, zero-extend it, as negative shift counts 9739 are undefined behavior and zero-extension is more 9740 efficient. 
*/ 9741 if (!match) 9742 { 9743 if (SCALAR_INT_MODE_P (GET_MODE (op))) 9744 op = convert_modes (mode, GET_MODE (op), op, 1); 9745 else 9746 op = lowpart_subreg (mode, op, GET_MODE (op)); 9747 if (!insn_p->operand[i + 1].predicate (op, mode)) 9748 op = copy_to_reg (op); 9749 } 9750 } 9751 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || 9752 (!mask_pos && (nargs - i) <= nargs_constant)) 9753 { 9754 if (!match) 9755 switch (icode) 9756 { 9757 case CODE_FOR_avx_vinsertf128v4di: 9758 case CODE_FOR_avx_vextractf128v4di: 9759 error ("the last argument must be an 1-bit immediate"); 9760 return const0_rtx; 9761 9762 case CODE_FOR_avx512f_cmpv8di3_mask: 9763 case CODE_FOR_avx512f_cmpv16si3_mask: 9764 case CODE_FOR_avx512f_ucmpv8di3_mask: 9765 case CODE_FOR_avx512f_ucmpv16si3_mask: 9766 case CODE_FOR_avx512vl_cmpv4di3_mask: 9767 case CODE_FOR_avx512vl_cmpv8si3_mask: 9768 case CODE_FOR_avx512vl_ucmpv4di3_mask: 9769 case CODE_FOR_avx512vl_ucmpv8si3_mask: 9770 case CODE_FOR_avx512vl_cmpv2di3_mask: 9771 case CODE_FOR_avx512vl_cmpv4si3_mask: 9772 case CODE_FOR_avx512vl_ucmpv2di3_mask: 9773 case CODE_FOR_avx512vl_ucmpv4si3_mask: 9774 error ("the last argument must be a 3-bit immediate"); 9775 return const0_rtx; 9776 9777 case CODE_FOR_sse4_1_roundsd: 9778 case CODE_FOR_sse4_1_roundss: 9779 9780 case CODE_FOR_sse4_1_roundpd: 9781 case CODE_FOR_sse4_1_roundps: 9782 case CODE_FOR_avx_roundpd256: 9783 case CODE_FOR_avx_roundps256: 9784 9785 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix: 9786 case CODE_FOR_sse4_1_roundps_sfix: 9787 case CODE_FOR_avx_roundpd_vec_pack_sfix256: 9788 case CODE_FOR_avx_roundps_sfix256: 9789 9790 case CODE_FOR_sse4_1_blendps: 9791 case CODE_FOR_avx_blendpd256: 9792 case CODE_FOR_avx_vpermilv4df: 9793 case CODE_FOR_avx_vpermilv4df_mask: 9794 case CODE_FOR_avx512f_getmantv8df_mask: 9795 case CODE_FOR_avx512f_getmantv16sf_mask: 9796 case CODE_FOR_avx512vl_getmantv8sf_mask: 9797 case CODE_FOR_avx512vl_getmantv4df_mask: 9798 case 
CODE_FOR_avx512vl_getmantv4sf_mask: 9799 case CODE_FOR_avx512vl_getmantv2df_mask: 9800 case CODE_FOR_avx512dq_rangepv8df_mask_round: 9801 case CODE_FOR_avx512dq_rangepv16sf_mask_round: 9802 case CODE_FOR_avx512dq_rangepv4df_mask: 9803 case CODE_FOR_avx512dq_rangepv8sf_mask: 9804 case CODE_FOR_avx512dq_rangepv2df_mask: 9805 case CODE_FOR_avx512dq_rangepv4sf_mask: 9806 case CODE_FOR_avx_shufpd256_mask: 9807 error ("the last argument must be a 4-bit immediate"); 9808 return const0_rtx; 9809 9810 case CODE_FOR_sha1rnds4: 9811 case CODE_FOR_sse4_1_blendpd: 9812 case CODE_FOR_avx_vpermilv2df: 9813 case CODE_FOR_avx_vpermilv2df_mask: 9814 case CODE_FOR_xop_vpermil2v2df3: 9815 case CODE_FOR_xop_vpermil2v4sf3: 9816 case CODE_FOR_xop_vpermil2v4df3: 9817 case CODE_FOR_xop_vpermil2v8sf3: 9818 case CODE_FOR_avx512f_vinsertf32x4_mask: 9819 case CODE_FOR_avx512f_vinserti32x4_mask: 9820 case CODE_FOR_avx512f_vextractf32x4_mask: 9821 case CODE_FOR_avx512f_vextracti32x4_mask: 9822 case CODE_FOR_sse2_shufpd: 9823 case CODE_FOR_sse2_shufpd_mask: 9824 case CODE_FOR_avx512dq_shuf_f64x2_mask: 9825 case CODE_FOR_avx512dq_shuf_i64x2_mask: 9826 case CODE_FOR_avx512vl_shuf_i32x4_mask: 9827 case CODE_FOR_avx512vl_shuf_f32x4_mask: 9828 error ("the last argument must be a 2-bit immediate"); 9829 return const0_rtx; 9830 9831 case CODE_FOR_avx_vextractf128v4df: 9832 case CODE_FOR_avx_vextractf128v8sf: 9833 case CODE_FOR_avx_vextractf128v8si: 9834 case CODE_FOR_avx_vinsertf128v4df: 9835 case CODE_FOR_avx_vinsertf128v8sf: 9836 case CODE_FOR_avx_vinsertf128v8si: 9837 case CODE_FOR_avx512f_vinsertf64x4_mask: 9838 case CODE_FOR_avx512f_vinserti64x4_mask: 9839 case CODE_FOR_avx512f_vextractf64x4_mask: 9840 case CODE_FOR_avx512f_vextracti64x4_mask: 9841 case CODE_FOR_avx512dq_vinsertf32x8_mask: 9842 case CODE_FOR_avx512dq_vinserti32x8_mask: 9843 case CODE_FOR_avx512vl_vinsertv4df: 9844 case CODE_FOR_avx512vl_vinsertv4di: 9845 case CODE_FOR_avx512vl_vinsertv8sf: 9846 case CODE_FOR_avx512vl_vinsertv8si: 
9847 error ("the last argument must be a 1-bit immediate"); 9848 return const0_rtx; 9849 9850 case CODE_FOR_avx_vmcmpv2df3: 9851 case CODE_FOR_avx_vmcmpv4sf3: 9852 case CODE_FOR_avx_cmpv2df3: 9853 case CODE_FOR_avx_cmpv4sf3: 9854 case CODE_FOR_avx_cmpv4df3: 9855 case CODE_FOR_avx_cmpv8sf3: 9856 case CODE_FOR_avx512f_cmpv8df3_mask: 9857 case CODE_FOR_avx512f_cmpv16sf3_mask: 9858 case CODE_FOR_avx512f_vmcmpv2df3_mask: 9859 case CODE_FOR_avx512f_vmcmpv4sf3_mask: 9860 error ("the last argument must be a 5-bit immediate"); 9861 return const0_rtx; 9862 9863 default: 9864 switch (nargs_constant) 9865 { 9866 case 2: 9867 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || 9868 (!mask_pos && (nargs - i) == nargs_constant)) 9869 { 9870 error ("the next to last argument must be an 8-bit immediate"); 9871 break; 9872 } 9873 /* FALLTHRU */ 9874 case 1: 9875 error ("the last argument must be an 8-bit immediate"); 9876 break; 9877 default: 9878 gcc_unreachable (); 9879 } 9880 return const0_rtx; 9881 } 9882 } 9883 else 9884 { 9885 if (VECTOR_MODE_P (mode)) 9886 op = safe_vector_operand (op, mode); 9887 9888 /* If we aren't optimizing, only allow one memory operand to 9889 be generated. 
*/ 9890 if (memory_operand (op, mode)) 9891 num_memory++; 9892 9893 op = fixup_modeless_constant (op, mode); 9894 9895 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) 9896 { 9897 if (optimize || !match || num_memory > 1) 9898 op = copy_to_mode_reg (mode, op); 9899 } 9900 else 9901 { 9902 op = copy_to_reg (op); 9903 op = lowpart_subreg (mode, op, GET_MODE (op)); 9904 } 9905 } 9906 9907 args[i].op = op; 9908 args[i].mode = mode; 9909 } 9910 9911 switch (nargs) 9912 { 9913 case 1: 9914 pat = GEN_FCN (icode) (real_target, args[0].op); 9915 break; 9916 case 2: 9917 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op); 9918 break; 9919 case 3: 9920 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, 9921 args[2].op); 9922 break; 9923 case 4: 9924 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, 9925 args[2].op, args[3].op); 9926 break; 9927 case 5: 9928 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, 9929 args[2].op, args[3].op, args[4].op); 9930 break; 9931 case 6: 9932 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, 9933 args[2].op, args[3].op, args[4].op, 9934 args[5].op); 9935 break; 9936 default: 9937 gcc_unreachable (); 9938 } 9939 9940 if (! pat) 9941 return 0; 9942 9943 emit_insn (pat); 9944 return target; 9945} 9946 9947/* Transform pattern of following layout: 9948 (set A 9949 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING)) 9950 ) 9951 into: 9952 (set (A B)) */ 9953 9954static rtx 9955ix86_erase_embedded_rounding (rtx pat) 9956{ 9957 if (GET_CODE (pat) == INSN) 9958 pat = PATTERN (pat); 9959 9960 gcc_assert (GET_CODE (pat) == SET); 9961 rtx src = SET_SRC (pat); 9962 gcc_assert (XVECLEN (src, 0) == 2); 9963 rtx p0 = XVECEXP (src, 0, 0); 9964 gcc_assert (GET_CODE (src) == UNSPEC 9965 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING); 9966 rtx res = gen_rtx_SET (SET_DEST (pat), p0); 9967 return res; 9968} 9969 9970/* Subroutine of ix86_expand_round_builtin to take care of comi insns 9971 with rounding. 
*/
static rtx
ix86_expand_sse_comi_round (const struct builtin_description *d,
			    tree exp, rtx target)
{
  rtx pat, set_dst;
  /* Four call operands: the two scalar comparands, the comparison
     predicate immediate and the rounding/SAE immediate.  */
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;

  /* See avxintrin.h for values.  The three tables below are indexed by
     the _CMP_* predicate immediate (0..31): the RTL comparison code, the
     ordered/unordered flavor and the signaling/non-signaling flavor.  */
  static const enum rtx_code comparisons[32] =
    {
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
    };
  static const bool ordereds[32] =
    {
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false,
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false
    };
  static const bool non_signalings[32] =
    {
      true, false, false, true, true, false, false, true,
      true, false, false, true, true, false, false, true,
      false, true, true, false, false, true, true, false,
      false, true, true, false, false, true, true, false
    };

  /* The predicate must be a compile-time constant in [0, 31].  */
  if (!CONST_INT_P (op2))
    {
      error ("the third argument must be comparison constant");
      return const0_rtx;
    }
  if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
    {
      error ("incorrect comparison mode");
      return const0_rtx;
    }

  if (!insn_p->operand[2].predicate (op3, SImode))
    {
      error ("incorrect rounding operand");
      return const0_rtx;
    }

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  enum rtx_code comparison = comparisons[INTVAL (op2)];
  bool ordered = ordereds[INTVAL (op2)];
  bool non_signaling = non_signalings[INTVAL (op2)];
  rtx const_val = const0_rtx;

  /* Map the requested comparison onto a CC mode the flags register can
     represent after COMI/UCOMI, swapping operands where only the
     reversed form is supported.  */
  bool check_unordered = false;
  machine_mode mode = CCFPmode;
  switch (comparison)
    {
    case ORDERED:
      if (!ordered)
	{
	  /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
	  if (!non_signaling)
	    ordered = true;
	  mode = CCSmode;
	}
      else
	{
	  /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S.  */
	  if (non_signaling)
	    ordered = false;
	  mode = CCPmode;
	}
      comparison = NE;
      break;
    case UNORDERED:
      if (ordered)
	{
	  /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS.  */
	  if (non_signaling)
	    ordered = false;
	  mode = CCSmode;
	}
      else
	{
	  /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S.  */
	  if (!non_signaling)
	    ordered = true;
	  mode = CCPmode;
	}
      comparison = EQ;
      break;

    case LE:	/* -> GE  */
    case LT:	/* -> GT  */
    case UNGE:	/* -> UNLE  */
    case UNGT:	/* -> UNLT  */
      std::swap (op0, op1);
      comparison = swap_condition (comparison);
      /* FALLTHRU */
    case GT:
    case GE:
    case UNEQ:
    case UNLT:
    case UNLE:
    case LTGT:
      /* These are supported by CCFPmode.  NB: Use ordered/signaling
	 COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
	 with NAN operands.  */
      if (ordered == non_signaling)
	ordered = !ordered;
      break;
    case EQ:
      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
	 _CMP_EQ_OQ/_CMP_EQ_OS.  */
      check_unordered = true;
      mode = CCZmode;
      break;
    case NE:
      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
	 _CMP_NEQ_UQ/_CMP_NEQ_US.  */
      gcc_assert (!ordered);
      check_unordered = true;
      mode = CCZmode;
      const_val = const1_rtx;
      break;
    default:
      gcc_unreachable ();
    }

  /* Preload the result with the value used when the operands compare
     unordered (0, or 1 for NE); the setcc below only overwrites the
     low byte when the jump over it is not taken.  */
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const_val);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_p->operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_p->operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  /*
     1. COMI: ordered and signaling.
     2. UCOMI: unordered and non-signaling.
   */
  if (non_signaling)
    icode = (icode == CODE_FOR_sse_comi_round
	     ? CODE_FOR_sse_ucomi_round
	     : CODE_FOR_sse2_ucomi_round);

  pat = GEN_FCN (icode) (op0, op1, op3);
  if (! pat)
    return 0;

  /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
  if (INTVAL (op3) == NO_ROUND)
    {
      pat = ix86_erase_embedded_rounding (pat);
      if (! pat)
	return 0;

      set_dst = SET_DEST (pat);
    }
  else
    {
      gcc_assert (GET_CODE (pat) == SET);
      set_dst = SET_DEST (pat);
    }

  emit_insn (pat);

  rtx_code_label *label = NULL;

  /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
     with NAN operands.  Emit a branch over the setcc that keeps the
     preloaded CONST_VAL when the comparison is unordered.  */
  if (check_unordered)
    {
      gcc_assert (comparison == EQ || comparison == NE);

      rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
      label = gen_label_rtx ();
      rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    }

  /* NB: Set CCFPmode and check a different CCmode which is in subset
     of CCFPmode.  */
  if (GET_MODE (set_dst) != mode)
    {
      gcc_assert (mode == CCAmode || mode == CCCmode
		  || mode == CCOmode || mode == CCPmode
		  || mode == CCSmode || mode == CCZmode);
      set_dst = gen_rtx_REG (mode, FLAGS_REG);
    }

  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  set_dst,
					  const0_rtx)));

  if (label)
    emit_label (label);

  return SUBREG_REG (target);
}

/* Subroutine of ix86_expand_builtin to take care of insns with embedded
   rounding/SAE: classify the builtin's signature to find the argument
   counts, expand all operands, validate the trailing immediates, and
   drop the embedded-rounding wrapper again when the rounding operand is
   NO_ROUND.  */

static rtx
ix86_expand_round_builtin (const struct builtin_description *d,
			   tree exp, rtx target)
{
  rtx pat;
  unsigned int i, nargs;
  struct
    {
      rtx op;
      machine_mode mode;
    } args[6];
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  unsigned int nargs_constant = 0;
  unsigned int redundant_embed_rnd = 0;

  /* Derive NARGS and the number of trailing constant operands
     (NARGS_CONSTANT, counted before the final rounding immediate)
     from the builtin's function type.  */
  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case UINT64_FTYPE_V2DF_INT:
    case UINT64_FTYPE_V4SF_INT:
    case UINT_FTYPE_V2DF_INT:
    case UINT_FTYPE_V4SF_INT:
    case INT64_FTYPE_V2DF_INT:
    case INT64_FTYPE_V4SF_INT:
    case INT_FTYPE_V2DF_INT:
    case INT_FTYPE_V4SF_INT:
      nargs = 2;
      break;
    case V4SF_FTYPE_V4SF_UINT_INT:
    case V4SF_FTYPE_V4SF_UINT64_INT:
    case V2DF_FTYPE_V2DF_UINT64_INT:
    case V4SF_FTYPE_V4SF_INT_INT:
    case V4SF_FTYPE_V4SF_INT64_INT:
    case V2DF_FTYPE_V2DF_INT64_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V2DF_INT:
    case V2DF_FTYPE_V2DF_V4SF_INT:
      nargs = 3;
      break;
    case V8SF_FTYPE_V8DF_V8SF_QI_INT:
    case V8DF_FTYPE_V8DF_V8DF_QI_INT:
    case V8SI_FTYPE_V8DF_V8SI_QI_INT:
    case V8DI_FTYPE_V8DF_V8DI_QI_INT:
    case V8SF_FTYPE_V8DI_V8SF_QI_INT:
    case V8DF_FTYPE_V8DI_V8DF_QI_INT:
    case V16SF_FTYPE_V16SF_V16SF_HI_INT:
    case V8DI_FTYPE_V8SF_V8DI_QI_INT:
    case V16SF_FTYPE_V16SI_V16SF_HI_INT:
    case V16SI_FTYPE_V16SF_V16SI_HI_INT:
    case V8DF_FTYPE_V8SF_V8DF_QI_INT:
    case V16SF_FTYPE_V16HI_V16SF_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
      nargs = 4;
      break;
    case V4SF_FTYPE_V4SF_V4SF_INT_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_INT:
      nargs_constant = 2;
      nargs = 4;
      break;
    case INT_FTYPE_V4SF_V4SF_INT_INT:
    case INT_FTYPE_V2DF_V2DF_INT_INT:
      /* COMI/UCOMI with rounding is handled separately.  */
      return ix86_expand_sse_comi_round (d, exp, target);
    case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
    case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
      nargs = 5;
      break;
    case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
      nargs_constant = 4;
      nargs = 5;
      break;
    case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
    case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
    case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
    case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
      nargs_constant = 3;
      nargs = 5;
      break;
    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
      nargs = 6;
      nargs_constant = 4;
      break;
    case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
      nargs = 6;
      nargs_constant = 3;
      break;
    default:
      gcc_unreachable ();
    }
  gcc_assert (nargs <= ARRAY_SIZE (args));

  if (optimize
      || target == 0
      || GET_MODE (target) != tmode
      || !insn_p->operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      if (i == nargs - nargs_constant)
	{
	  /* This is the immediate operand; diagnose an out-of-range
	     value with the width the specific instruction accepts.  */
	  if (!match)
	    {
	      switch (icode)
		{
		case CODE_FOR_avx512f_getmantv8df_mask_round:
		case CODE_FOR_avx512f_getmantv16sf_mask_round:
		case CODE_FOR_avx512f_vgetmantv2df_round:
		case CODE_FOR_avx512f_vgetmantv2df_mask_round:
		case CODE_FOR_avx512f_vgetmantv4sf_round:
		case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
		  error ("the immediate argument must be a 4-bit immediate");
		  return const0_rtx;
		case CODE_FOR_avx512f_cmpv8df3_mask_round:
		case CODE_FOR_avx512f_cmpv16sf3_mask_round:
		case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
		case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
		  error ("the immediate argument must be a 5-bit immediate");
		  return const0_rtx;
		default:
		  error ("the immediate argument must be an 8-bit immediate");
		  return const0_rtx;
		}
	    }
	}
      else if (i == nargs-1)
	{
	  /* The final operand is the rounding/SAE immediate.  */
	  if (!insn_p->operand[nargs].predicate (op, SImode))
	    {
	      error ("incorrect rounding operand");
	      return const0_rtx;
	    }

	  /* If there is no rounding use normal version of the pattern.  */
	  if (INTVAL (op) == NO_ROUND)
	    redundant_embed_rnd = 1;
	}
      else
	{
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  op = fixup_modeless_constant (op, mode);

	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
	    {
	      if (optimize || !match)
		op = copy_to_mode_reg (mode, op);
	    }
	  else
	    {
	      op = copy_to_reg (op);
	      op = lowpart_subreg (mode, op, GET_MODE (op));
	    }
	}

      args[i].op = op;
      args[i].mode = mode;
    }

  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, args[0].op);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
			     args[2].op);
      break;
    case 4:
      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
			     args[2].op, args[3].op);
      break;
    case 5:
      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
			     args[2].op, args[3].op, args[4].op);
      break;
    case 6:
      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
			     args[2].op, args[3].op, args[4].op,
			     args[5].op);
      break;
    default:
      gcc_unreachable ();
    }

  if (!pat)
    return 0;

  if (redundant_embed_rnd)
    pat = ix86_erase_embedded_rounding (pat);

  emit_insn (pat);
  return target;
}

/* Subroutine of ix86_expand_builtin to take care of special insns
   with variable number of operands.
*/

static rtx
ix86_expand_special_args_builtin (const struct builtin_description *d,
				  tree exp, rtx target)
{
  tree arg;
  rtx pat, op;
  /* MEMORY is the index (into the insn's operands, after the target) of
     the memory operand, or ARRAY_SIZE (args) when the memory operand is
     the target itself.  */
  unsigned int i, nargs, arg_adjust, memory;
  bool aligned_mem = false;
  struct
    {
      rtx op;
      machine_mode mode;
    } args[3];
  enum insn_code icode = d->icode;
  bool last_arg_constant = false;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  enum { load, store } klass;

  /* Classify the builtin by its function type: argument count, whether
     it is a load or a store, and which operand (if any) is memory.  */
  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case VOID_FTYPE_VOID:
      emit_insn (GEN_FCN (icode) (target));
      return 0;
    case VOID_FTYPE_UINT64:
    case VOID_FTYPE_UNSIGNED:
      nargs = 0;
      klass = store;
      memory = 0;
      break;

    case INT_FTYPE_VOID:
    case USHORT_FTYPE_VOID:
    case UINT64_FTYPE_VOID:
    case UINT_FTYPE_VOID:
    case UNSIGNED_FTYPE_VOID:
      nargs = 0;
      klass = load;
      memory = 0;
      break;
    case UINT64_FTYPE_PUNSIGNED:
    case V2DI_FTYPE_PV2DI:
    case V4DI_FTYPE_PV4DI:
    case V32QI_FTYPE_PCCHAR:
    case V16QI_FTYPE_PCCHAR:
    case V8SF_FTYPE_PCV4SF:
    case V8SF_FTYPE_PCFLOAT:
    case V4SF_FTYPE_PCFLOAT:
    case V4DF_FTYPE_PCV2DF:
    case V4DF_FTYPE_PCDOUBLE:
    case V2DF_FTYPE_PCDOUBLE:
    case VOID_FTYPE_PVOID:
    case V8DI_FTYPE_PV8DI:
      nargs = 1;
      klass = load;
      memory = 0;
      switch (icode)
	{
	case CODE_FOR_sse4_1_movntdqa:
	case CODE_FOR_avx2_movntdqa:
	case CODE_FOR_avx512f_movntdqa:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      break;
    case VOID_FTYPE_PV2SF_V4SF:
    case VOID_FTYPE_PV8DI_V8DI:
    case VOID_FTYPE_PV4DI_V4DI:
    case VOID_FTYPE_PV2DI_V2DI:
    case VOID_FTYPE_PCHAR_V32QI:
    case VOID_FTYPE_PCHAR_V16QI:
    case VOID_FTYPE_PFLOAT_V16SF:
    case VOID_FTYPE_PFLOAT_V8SF:
    case VOID_FTYPE_PFLOAT_V4SF:
    case VOID_FTYPE_PDOUBLE_V8DF:
    case VOID_FTYPE_PDOUBLE_V4DF:
    case VOID_FTYPE_PDOUBLE_V2DF:
    case VOID_FTYPE_PLONGLONG_LONGLONG:
    case VOID_FTYPE_PULONGLONG_ULONGLONG:
    case VOID_FTYPE_PUNSIGNED_UNSIGNED:
    case VOID_FTYPE_PINT_INT:
      nargs = 1;
      klass = store;
      /* Reserve memory operand for target.  */
      memory = ARRAY_SIZE (args);
      switch (icode)
	{
	/* These builtins and instructions require the memory
	   to be properly aligned.  */
	case CODE_FOR_avx_movntv4di:
	case CODE_FOR_sse2_movntv2di:
	case CODE_FOR_avx_movntv8sf:
	case CODE_FOR_sse_movntv4sf:
	case CODE_FOR_sse4a_vmmovntv4sf:
	case CODE_FOR_avx_movntv4df:
	case CODE_FOR_sse2_movntv2df:
	case CODE_FOR_sse4a_vmmovntv2df:
	case CODE_FOR_sse2_movntidi:
	case CODE_FOR_sse_movntq:
	case CODE_FOR_sse2_movntisi:
	case CODE_FOR_avx512f_movntv16sf:
	case CODE_FOR_avx512f_movntv8df:
	case CODE_FOR_avx512f_movntv8di:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      break;
    case VOID_FTYPE_PVOID_PCVOID:
      nargs = 1;
      klass = store;
      memory = 0;

      break;
    case V4SF_FTYPE_V4SF_PCV2SF:
    case V2DF_FTYPE_V2DF_PCDOUBLE:
      nargs = 2;
      klass = load;
      memory = 1;
      break;
    case V8SF_FTYPE_PCV8SF_V8SI:
    case V4DF_FTYPE_PCV4DF_V4DI:
    case V4SF_FTYPE_PCV4SF_V4SI:
    case V2DF_FTYPE_PCV2DF_V2DI:
    case V8SI_FTYPE_PCV8SI_V8SI:
    case V4DI_FTYPE_PCV4DI_V4DI:
    case V4SI_FTYPE_PCV4SI_V4SI:
    case V2DI_FTYPE_PCV2DI_V2DI:
    case VOID_FTYPE_INT_INT64:
      nargs = 2;
      klass = load;
      memory = 0;
      break;
    case VOID_FTYPE_PV8DF_V8DF_UQI:
    case VOID_FTYPE_PV4DF_V4DF_UQI:
    case VOID_FTYPE_PV2DF_V2DF_UQI:
    case VOID_FTYPE_PV16SF_V16SF_UHI:
    case VOID_FTYPE_PV8SF_V8SF_UQI:
    case VOID_FTYPE_PV4SF_V4SF_UQI:
    case VOID_FTYPE_PV8DI_V8DI_UQI:
    case VOID_FTYPE_PV4DI_V4DI_UQI:
    case VOID_FTYPE_PV2DI_V2DI_UQI:
    case VOID_FTYPE_PV16SI_V16SI_UHI:
    case VOID_FTYPE_PV8SI_V8SI_UQI:
    case VOID_FTYPE_PV4SI_V4SI_UQI:
    case VOID_FTYPE_PV64QI_V64QI_UDI:
    case VOID_FTYPE_PV32HI_V32HI_USI:
    case VOID_FTYPE_PV32QI_V32QI_USI:
    case VOID_FTYPE_PV16QI_V16QI_UHI:
    case VOID_FTYPE_PV16HI_V16HI_UHI:
    case VOID_FTYPE_PV8HI_V8HI_UQI:
      switch (icode)
	{
	/* These builtins and instructions require the memory
	   to be properly aligned.  */
	case CODE_FOR_avx512f_storev16sf_mask:
	case CODE_FOR_avx512f_storev16si_mask:
	case CODE_FOR_avx512f_storev8df_mask:
	case CODE_FOR_avx512f_storev8di_mask:
	case CODE_FOR_avx512vl_storev8sf_mask:
	case CODE_FOR_avx512vl_storev8si_mask:
	case CODE_FOR_avx512vl_storev4df_mask:
	case CODE_FOR_avx512vl_storev4di_mask:
	case CODE_FOR_avx512vl_storev4sf_mask:
	case CODE_FOR_avx512vl_storev4si_mask:
	case CODE_FOR_avx512vl_storev2df_mask:
	case CODE_FOR_avx512vl_storev2di_mask:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      /* FALLTHRU */
    case VOID_FTYPE_PV8SF_V8SI_V8SF:
    case VOID_FTYPE_PV4DF_V4DI_V4DF:
    case VOID_FTYPE_PV4SF_V4SI_V4SF:
    case VOID_FTYPE_PV2DF_V2DI_V2DF:
    case VOID_FTYPE_PV8SI_V8SI_V8SI:
    case VOID_FTYPE_PV4DI_V4DI_V4DI:
    case VOID_FTYPE_PV4SI_V4SI_V4SI:
    case VOID_FTYPE_PV2DI_V2DI_V2DI:
    case VOID_FTYPE_PV8SI_V8DI_UQI:
    case VOID_FTYPE_PV8HI_V8DI_UQI:
    case VOID_FTYPE_PV16HI_V16SI_UHI:
    case VOID_FTYPE_PV16QI_V8DI_UQI:
    case VOID_FTYPE_PV16QI_V16SI_UHI:
    case VOID_FTYPE_PV4SI_V4DI_UQI:
    case VOID_FTYPE_PV4SI_V2DI_UQI:
    case VOID_FTYPE_PV8HI_V4DI_UQI:
    case VOID_FTYPE_PV8HI_V2DI_UQI:
    case VOID_FTYPE_PV8HI_V8SI_UQI:
    case VOID_FTYPE_PV8HI_V4SI_UQI:
    case VOID_FTYPE_PV16QI_V4DI_UQI:
    case VOID_FTYPE_PV16QI_V2DI_UQI:
    case VOID_FTYPE_PV16QI_V8SI_UQI:
    case VOID_FTYPE_PV16QI_V4SI_UQI:
    case VOID_FTYPE_PCHAR_V64QI_UDI:
    case VOID_FTYPE_PCHAR_V32QI_USI:
    case VOID_FTYPE_PCHAR_V16QI_UHI:
    case VOID_FTYPE_PSHORT_V32HI_USI:
    case VOID_FTYPE_PSHORT_V16HI_UHI:
    case VOID_FTYPE_PSHORT_V8HI_UQI:
    case VOID_FTYPE_PINT_V16SI_UHI:
    case VOID_FTYPE_PINT_V8SI_UQI:
    case VOID_FTYPE_PINT_V4SI_UQI:
    case VOID_FTYPE_PINT64_V8DI_UQI:
    case VOID_FTYPE_PINT64_V4DI_UQI:
    case VOID_FTYPE_PINT64_V2DI_UQI:
    case VOID_FTYPE_PDOUBLE_V8DF_UQI:
    case VOID_FTYPE_PDOUBLE_V4DF_UQI:
    case VOID_FTYPE_PDOUBLE_V2DF_UQI:
    case VOID_FTYPE_PFLOAT_V16SF_UHI:
    case VOID_FTYPE_PFLOAT_V8SF_UQI:
    case VOID_FTYPE_PFLOAT_V4SF_UQI:
    case VOID_FTYPE_PV32QI_V32HI_USI:
    case VOID_FTYPE_PV16QI_V16HI_UHI:
    case VOID_FTYPE_PV8QI_V8HI_UQI:
      nargs = 2;
      klass = store;
      /* Reserve memory operand for target.  */
      memory = ARRAY_SIZE (args);
      break;
    case V4SF_FTYPE_PCV4SF_V4SF_UQI:
    case V8SF_FTYPE_PCV8SF_V8SF_UQI:
    case V16SF_FTYPE_PCV16SF_V16SF_UHI:
    case V4SI_FTYPE_PCV4SI_V4SI_UQI:
    case V8SI_FTYPE_PCV8SI_V8SI_UQI:
    case V16SI_FTYPE_PCV16SI_V16SI_UHI:
    case V2DF_FTYPE_PCV2DF_V2DF_UQI:
    case V4DF_FTYPE_PCV4DF_V4DF_UQI:
    case V8DF_FTYPE_PCV8DF_V8DF_UQI:
    case V2DI_FTYPE_PCV2DI_V2DI_UQI:
    case V4DI_FTYPE_PCV4DI_V4DI_UQI:
    case V8DI_FTYPE_PCV8DI_V8DI_UQI:
    case V64QI_FTYPE_PCV64QI_V64QI_UDI:
    case V32HI_FTYPE_PCV32HI_V32HI_USI:
    case V32QI_FTYPE_PCV32QI_V32QI_USI:
    case V16QI_FTYPE_PCV16QI_V16QI_UHI:
    case V16HI_FTYPE_PCV16HI_V16HI_UHI:
    case V8HI_FTYPE_PCV8HI_V8HI_UQI:
      switch (icode)
	{
	/* These builtins and instructions require the memory
	   to be properly aligned.  */
	case CODE_FOR_avx512f_loadv16sf_mask:
	case CODE_FOR_avx512f_loadv16si_mask:
	case CODE_FOR_avx512f_loadv8df_mask:
	case CODE_FOR_avx512f_loadv8di_mask:
	case CODE_FOR_avx512vl_loadv8sf_mask:
	case CODE_FOR_avx512vl_loadv8si_mask:
	case CODE_FOR_avx512vl_loadv4df_mask:
	case CODE_FOR_avx512vl_loadv4di_mask:
	case CODE_FOR_avx512vl_loadv4sf_mask:
	case CODE_FOR_avx512vl_loadv4si_mask:
	case CODE_FOR_avx512vl_loadv2df_mask:
	case CODE_FOR_avx512vl_loadv2di_mask:
	case CODE_FOR_avx512bw_loadv64qi_mask:
	case CODE_FOR_avx512vl_loadv32qi_mask:
	case CODE_FOR_avx512vl_loadv16qi_mask:
	case CODE_FOR_avx512bw_loadv32hi_mask:
	case CODE_FOR_avx512vl_loadv16hi_mask:
	case CODE_FOR_avx512vl_loadv8hi_mask:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      /* FALLTHRU */
    case V64QI_FTYPE_PCCHAR_V64QI_UDI:
    case V32QI_FTYPE_PCCHAR_V32QI_USI:
    case V16QI_FTYPE_PCCHAR_V16QI_UHI:
    case V32HI_FTYPE_PCSHORT_V32HI_USI:
    case V16HI_FTYPE_PCSHORT_V16HI_UHI:
    case V8HI_FTYPE_PCSHORT_V8HI_UQI:
    case V16SI_FTYPE_PCINT_V16SI_UHI:
    case V8SI_FTYPE_PCINT_V8SI_UQI:
    case V4SI_FTYPE_PCINT_V4SI_UQI:
    case V8DI_FTYPE_PCINT64_V8DI_UQI:
    case V4DI_FTYPE_PCINT64_V4DI_UQI:
    case V2DI_FTYPE_PCINT64_V2DI_UQI:
    case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
    case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
    case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
    case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
    case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
    case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
      nargs = 3;
      klass = load;
      memory = 0;
      break;
    case VOID_FTYPE_UINT_UINT_UINT:
    case VOID_FTYPE_UINT64_UINT_UINT:
    case UCHAR_FTYPE_UINT_UINT_UINT:
    case UCHAR_FTYPE_UINT64_UINT_UINT:
      nargs = 3;
      klass = load;
      memory = ARRAY_SIZE (args);
      last_arg_constant = true;
      break;
    default:
      gcc_unreachable ();
    }

  gcc_assert (nargs <= ARRAY_SIZE (args));

  if (klass == store)
    {
      arg = CALL_EXPR_ARG (exp, 0);
      op = expand_normal (arg);
      gcc_assert (target == 0);
      if (memory)
	{
	  op = ix86_zero_extend_to_Pmode (op);
	  target = gen_rtx_MEM (tmode, op);
	  /* target at this point has just BITS_PER_UNIT MEM_ALIGN
	     on it.  Try to improve it using get_pointer_alignment,
	     and if the special builtin is one that requires strict
	     mode alignment, also from it's GET_MODE_ALIGNMENT.
	     Failure to do so could lead to ix86_legitimate_combined_insn
	     rejecting all changes to such insns.  */
	  unsigned int align = get_pointer_alignment (arg);
	  if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
	    align = GET_MODE_ALIGNMENT (tmode);
	  if (MEM_ALIGN (target) < align)
	    set_mem_align (target, align);
	}
      else
	target = force_reg (tmode, op);
      arg_adjust = 1;
    }
  else
    {
      arg_adjust = 0;
      if (optimize
	  || target == 0
	  || !register_operand (target, tmode)
	  || GET_MODE (target) != tmode)
	target = gen_reg_rtx (tmode);
    }

  for (i = 0; i < nargs; i++)
    {
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match;

      arg = CALL_EXPR_ARG (exp, i + arg_adjust);
      op = expand_normal (arg);
      match = insn_p->operand[i + 1].predicate (op, mode);

      if (last_arg_constant && (i + 1) == nargs)
	{
	  if (!match)
	    {
	      if (icode == CODE_FOR_lwp_lwpvalsi3
		  || icode == CODE_FOR_lwp_lwpinssi3
		  || icode == CODE_FOR_lwp_lwpvaldi3
		  || icode == CODE_FOR_lwp_lwpinsdi3)
		error ("the last argument must be a 32-bit immediate");
	      else
		error ("the last argument must be an 8-bit immediate");
	      return const0_rtx;
	    }
	}
      else
	{
	  if (i == memory)
	    {
	      /* This must be the memory operand.  */
	      op = ix86_zero_extend_to_Pmode (op);
	      op = gen_rtx_MEM (mode, op);
	      /* op at this point has just BITS_PER_UNIT MEM_ALIGN
		 on it.  Try to improve it using get_pointer_alignment,
		 and if the special builtin is one that requires strict
		 mode alignment, also from it's GET_MODE_ALIGNMENT.
		 Failure to do so could lead to ix86_legitimate_combined_insn
		 rejecting all changes to such insns.  */
	      unsigned int align = get_pointer_alignment (arg);
	      if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
		align = GET_MODE_ALIGNMENT (mode);
	      if (MEM_ALIGN (op) < align)
		set_mem_align (op, align);
	    }
	  else
	    {
	      /* This must be register.  */
	      if (VECTOR_MODE_P (mode))
		op = safe_vector_operand (op, mode);

	      op = fixup_modeless_constant (op, mode);

	      if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
		op = copy_to_mode_reg (mode, op);
	      else
		{
		  op = copy_to_reg (op);
		  op = lowpart_subreg (mode, op, GET_MODE (op));
		}
	    }
	}

      args[i].op = op;
      args[i].mode = mode;
    }

  switch (nargs)
    {
    case 0:
      pat = GEN_FCN (icode) (target);
      break;
    case 1:
      pat = GEN_FCN (icode) (target, args[0].op);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
      break;
    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;
  emit_insn (pat);
  /* Stores return no value; loads return the (possibly fresh) target.  */
  return klass == store ? 0 : target;
}

/* Return the integer constant in ARG.  Constrain it to be in the range
   of the subparts of VEC_TYPE; issue an error if not.
*/ 10852 10853static int 10854get_element_number (tree vec_type, tree arg) 10855{ 10856 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1; 10857 10858 if (!tree_fits_uhwi_p (arg) 10859 || (elt = tree_to_uhwi (arg), elt > max)) 10860 { 10861 error ("selector must be an integer constant in the range " 10862 "[0, %wi]", max); 10863 return 0; 10864 } 10865 10866 return elt; 10867} 10868 10869/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around 10870 ix86_expand_vector_init. We DO have language-level syntax for this, in 10871 the form of (type){ init-list }. Except that since we can't place emms 10872 instructions from inside the compiler, we can't allow the use of MMX 10873 registers unless the user explicitly asks for it. So we do *not* define 10874 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead 10875 we have builtins invoked by mmintrin.h that gives us license to emit 10876 these sorts of instructions. */ 10877 10878static rtx 10879ix86_expand_vec_init_builtin (tree type, tree exp, rtx target) 10880{ 10881 machine_mode tmode = TYPE_MODE (type); 10882 machine_mode inner_mode = GET_MODE_INNER (tmode); 10883 int i, n_elt = GET_MODE_NUNITS (tmode); 10884 rtvec v = rtvec_alloc (n_elt); 10885 10886 gcc_assert (VECTOR_MODE_P (tmode)); 10887 gcc_assert (call_expr_nargs (exp) == n_elt); 10888 10889 for (i = 0; i < n_elt; ++i) 10890 { 10891 rtx x = expand_normal (CALL_EXPR_ARG (exp, i)); 10892 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x); 10893 } 10894 10895 if (!target || !register_operand (target, tmode)) 10896 target = gen_reg_rtx (tmode); 10897 10898 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v)); 10899 return target; 10900} 10901 10902/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around 10903 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we 10904 had a language-level syntax for referencing vector elements. 
*/

static rtx
ix86_expand_vec_ext_builtin (tree exp, rtx target)
{
  machine_mode tmode, mode0;
  tree arg0, arg1;
  int elt;
  rtx op0;

  /* Arg 0 is the vector, arg 1 the element selector.  */
  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);

  op0 = expand_normal (arg0);
  /* get_element_number diagnoses a non-constant or out-of-range selector
     and returns 0 in that case.  */
  elt = get_element_number (TREE_TYPE (arg0), arg1);

  /* tmode is the element mode, mode0 the full vector mode.  */
  tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  mode0 = TYPE_MODE (TREE_TYPE (arg0));
  gcc_assert (VECTOR_MODE_P (mode0));

  op0 = force_reg (mode0, op0);

  if (optimize || !target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_extract (true, target, op0, elt);

  return target;
}

/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
   a language-level syntax for referencing vector elements.  */

static rtx
ix86_expand_vec_set_builtin (tree exp)
{
  machine_mode tmode, mode1;
  tree arg0, arg1, arg2;
  int elt;
  rtx op0, op1, target;

  /* Arg 0 is the vector, arg 1 the new element value, arg 2 the
     element selector.  */
  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);
  arg2 = CALL_EXPR_ARG (exp, 2);

  /* tmode is the vector mode, mode1 the element mode.  */
  tmode = TYPE_MODE (TREE_TYPE (arg0));
  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  gcc_assert (VECTOR_MODE_P (tmode));

  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
  elt = get_element_number (TREE_TYPE (arg0), arg2);

  if (GET_MODE (op1) != mode1)
    op1 = convert_modes (mode1, GET_MODE (op1), op1, true);

  op0 = force_reg (tmode, op0);
  op1 = force_reg (mode1, op1);

  /* OP0 is the source of these builtin functions and shouldn't be
     modified.  Create a copy, use it and return it as target.
*/ 10966 target = gen_reg_rtx (tmode); 10967 emit_move_insn (target, op0); 10968 ix86_expand_vector_set (true, target, op1, elt); 10969 10970 return target; 10971} 10972 10973/* Expand an expression EXP that calls a built-in function, 10974 with result going to TARGET if that's convenient 10975 (and in mode MODE if that's convenient). 10976 SUBTARGET may be used as the target for computing one of EXP's operands. 10977 IGNORE is nonzero if the value is to be ignored. */ 10978 10979rtx 10980ix86_expand_builtin (tree exp, rtx target, rtx subtarget, 10981 machine_mode mode, int ignore) 10982{ 10983 size_t i; 10984 enum insn_code icode, icode2; 10985 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); 10986 tree arg0, arg1, arg2, arg3, arg4; 10987 rtx op0, op1, op2, op3, op4, pat, pat2, insn; 10988 machine_mode mode0, mode1, mode2, mode3, mode4; 10989 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl); 10990 10991 /* For CPU builtins that can be folded, fold first and expand the fold. */ 10992 switch (fcode) 10993 { 10994 case IX86_BUILTIN_CPU_INIT: 10995 { 10996 /* Make it call __cpu_indicator_init in libgcc. 
*/ 10997 tree call_expr, fndecl, type; 10998 type = build_function_type_list (integer_type_node, NULL_TREE); 10999 fndecl = build_fn_decl ("__cpu_indicator_init", type); 11000 call_expr = build_call_expr (fndecl, 0); 11001 return expand_expr (call_expr, target, mode, EXPAND_NORMAL); 11002 } 11003 case IX86_BUILTIN_CPU_IS: 11004 case IX86_BUILTIN_CPU_SUPPORTS: 11005 { 11006 tree arg0 = CALL_EXPR_ARG (exp, 0); 11007 tree fold_expr = fold_builtin_cpu (fndecl, &arg0); 11008 gcc_assert (fold_expr != NULL_TREE); 11009 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL); 11010 } 11011 } 11012 11013 HOST_WIDE_INT isa = ix86_isa_flags; 11014 HOST_WIDE_INT isa2 = ix86_isa_flags2; 11015 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa; 11016 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2; 11017 /* The general case is we require all the ISAs specified in bisa{,2} 11018 to be enabled. 11019 The exceptions are: 11020 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A 11021 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 11022 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 11023 where for each such pair it is sufficient if either of the ISAs is 11024 enabled, plus if it is ored with other options also those others. 11025 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. 
*/ 11026 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) 11027 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) 11028 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0) 11029 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A); 11030 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) 11031 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) 11032 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0) 11033 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32); 11034 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) 11035 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) 11036 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0) 11037 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4); 11038 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE) 11039 { 11040 bisa &= ~OPTION_MASK_ISA_MMX; 11041 bisa |= OPTION_MASK_ISA_SSE2; 11042 } 11043 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2) 11044 { 11045 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT; 11046 if (TARGET_ABI_X32) 11047 bisa |= OPTION_MASK_ABI_X32; 11048 else 11049 bisa |= OPTION_MASK_ABI_64; 11050 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL, 11051 (enum fpmath_unit) 0, 11052 (enum prefer_vector_width) 0, 11053 false, add_abi_p); 11054 if (!opts) 11055 error ("%qE needs unknown isa option", fndecl); 11056 else 11057 { 11058 gcc_assert (opts != NULL); 11059 error ("%qE needs isa option %s", fndecl, opts); 11060 free (opts); 11061 } 11062 return expand_call (exp, target, ignore); 11063 } 11064 11065 switch (fcode) 11066 { 11067 case IX86_BUILTIN_MASKMOVQ: 11068 case IX86_BUILTIN_MASKMOVDQU: 11069 icode = (fcode == IX86_BUILTIN_MASKMOVQ 11070 ? CODE_FOR_mmx_maskmovq 11071 : CODE_FOR_sse2_maskmovdqu); 11072 /* Note the arg order is different from the operand order. 
*/ 11073 arg1 = CALL_EXPR_ARG (exp, 0); 11074 arg2 = CALL_EXPR_ARG (exp, 1); 11075 arg0 = CALL_EXPR_ARG (exp, 2); 11076 op0 = expand_normal (arg0); 11077 op1 = expand_normal (arg1); 11078 op2 = expand_normal (arg2); 11079 mode0 = insn_data[icode].operand[0].mode; 11080 mode1 = insn_data[icode].operand[1].mode; 11081 mode2 = insn_data[icode].operand[2].mode; 11082 11083 op0 = ix86_zero_extend_to_Pmode (op0); 11084 op0 = gen_rtx_MEM (mode1, op0); 11085 11086 if (!insn_data[icode].operand[0].predicate (op0, mode0)) 11087 op0 = copy_to_mode_reg (mode0, op0); 11088 if (!insn_data[icode].operand[1].predicate (op1, mode1)) 11089 op1 = copy_to_mode_reg (mode1, op1); 11090 if (!insn_data[icode].operand[2].predicate (op2, mode2)) 11091 op2 = copy_to_mode_reg (mode2, op2); 11092 pat = GEN_FCN (icode) (op0, op1, op2); 11093 if (! pat) 11094 return 0; 11095 emit_insn (pat); 11096 return 0; 11097 11098 case IX86_BUILTIN_LDMXCSR: 11099 op0 = expand_normal (CALL_EXPR_ARG (exp, 0)); 11100 target = assign_386_stack_local (SImode, SLOT_TEMP); 11101 emit_move_insn (target, op0); 11102 emit_insn (gen_sse_ldmxcsr (target)); 11103 return 0; 11104 11105 case IX86_BUILTIN_STMXCSR: 11106 target = assign_386_stack_local (SImode, SLOT_TEMP); 11107 emit_insn (gen_sse_stmxcsr (target)); 11108 return copy_to_mode_reg (SImode, target); 11109 11110 case IX86_BUILTIN_CLFLUSH: 11111 arg0 = CALL_EXPR_ARG (exp, 0); 11112 op0 = expand_normal (arg0); 11113 icode = CODE_FOR_sse2_clflush; 11114 if (!insn_data[icode].operand[0].predicate (op0, Pmode)) 11115 op0 = ix86_zero_extend_to_Pmode (op0); 11116 11117 emit_insn (gen_sse2_clflush (op0)); 11118 return 0; 11119 11120 case IX86_BUILTIN_CLWB: 11121 arg0 = CALL_EXPR_ARG (exp, 0); 11122 op0 = expand_normal (arg0); 11123 icode = CODE_FOR_clwb; 11124 if (!insn_data[icode].operand[0].predicate (op0, Pmode)) 11125 op0 = ix86_zero_extend_to_Pmode (op0); 11126 11127 emit_insn (gen_clwb (op0)); 11128 return 0; 11129 11130 case IX86_BUILTIN_CLFLUSHOPT: 11131 arg0 = 
CALL_EXPR_ARG (exp, 0); 11132 op0 = expand_normal (arg0); 11133 icode = CODE_FOR_clflushopt; 11134 if (!insn_data[icode].operand[0].predicate (op0, Pmode)) 11135 op0 = ix86_zero_extend_to_Pmode (op0); 11136 11137 emit_insn (gen_clflushopt (op0)); 11138 return 0; 11139 11140 case IX86_BUILTIN_MONITOR: 11141 case IX86_BUILTIN_MONITORX: 11142 arg0 = CALL_EXPR_ARG (exp, 0); 11143 arg1 = CALL_EXPR_ARG (exp, 1); 11144 arg2 = CALL_EXPR_ARG (exp, 2); 11145 op0 = expand_normal (arg0); 11146 op1 = expand_normal (arg1); 11147 op2 = expand_normal (arg2); 11148 if (!REG_P (op0)) 11149 op0 = ix86_zero_extend_to_Pmode (op0); 11150 if (!REG_P (op1)) 11151 op1 = copy_to_mode_reg (SImode, op1); 11152 if (!REG_P (op2)) 11153 op2 = copy_to_mode_reg (SImode, op2); 11154 11155 emit_insn (fcode == IX86_BUILTIN_MONITOR 11156 ? gen_sse3_monitor (Pmode, op0, op1, op2) 11157 : gen_monitorx (Pmode, op0, op1, op2)); 11158 return 0; 11159 11160 case IX86_BUILTIN_MWAIT: 11161 arg0 = CALL_EXPR_ARG (exp, 0); 11162 arg1 = CALL_EXPR_ARG (exp, 1); 11163 op0 = expand_normal (arg0); 11164 op1 = expand_normal (arg1); 11165 if (!REG_P (op0)) 11166 op0 = copy_to_mode_reg (SImode, op0); 11167 if (!REG_P (op1)) 11168 op1 = copy_to_mode_reg (SImode, op1); 11169 emit_insn (gen_sse3_mwait (op0, op1)); 11170 return 0; 11171 11172 case IX86_BUILTIN_MWAITX: 11173 arg0 = CALL_EXPR_ARG (exp, 0); 11174 arg1 = CALL_EXPR_ARG (exp, 1); 11175 arg2 = CALL_EXPR_ARG (exp, 2); 11176 op0 = expand_normal (arg0); 11177 op1 = expand_normal (arg1); 11178 op2 = expand_normal (arg2); 11179 if (!REG_P (op0)) 11180 op0 = copy_to_mode_reg (SImode, op0); 11181 if (!REG_P (op1)) 11182 op1 = copy_to_mode_reg (SImode, op1); 11183 if (!REG_P (op2)) 11184 op2 = copy_to_mode_reg (SImode, op2); 11185 emit_insn (gen_mwaitx (op0, op1, op2)); 11186 return 0; 11187 11188 case IX86_BUILTIN_UMONITOR: 11189 arg0 = CALL_EXPR_ARG (exp, 0); 11190 op0 = expand_normal (arg0); 11191 11192 op0 = ix86_zero_extend_to_Pmode (op0); 11193 emit_insn 
(gen_umonitor (Pmode, op0)); 11194 return 0; 11195 11196 case IX86_BUILTIN_UMWAIT: 11197 case IX86_BUILTIN_TPAUSE: 11198 arg0 = CALL_EXPR_ARG (exp, 0); 11199 arg1 = CALL_EXPR_ARG (exp, 1); 11200 op0 = expand_normal (arg0); 11201 op1 = expand_normal (arg1); 11202 11203 if (!REG_P (op0)) 11204 op0 = copy_to_mode_reg (SImode, op0); 11205 11206 op1 = force_reg (DImode, op1); 11207 11208 if (TARGET_64BIT) 11209 { 11210 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), 11211 NULL, 1, OPTAB_DIRECT); 11212 switch (fcode) 11213 { 11214 case IX86_BUILTIN_UMWAIT: 11215 icode = CODE_FOR_umwait_rex64; 11216 break; 11217 case IX86_BUILTIN_TPAUSE: 11218 icode = CODE_FOR_tpause_rex64; 11219 break; 11220 default: 11221 gcc_unreachable (); 11222 } 11223 11224 op2 = gen_lowpart (SImode, op2); 11225 op1 = gen_lowpart (SImode, op1); 11226 pat = GEN_FCN (icode) (op0, op1, op2); 11227 } 11228 else 11229 { 11230 switch (fcode) 11231 { 11232 case IX86_BUILTIN_UMWAIT: 11233 icode = CODE_FOR_umwait; 11234 break; 11235 case IX86_BUILTIN_TPAUSE: 11236 icode = CODE_FOR_tpause; 11237 break; 11238 default: 11239 gcc_unreachable (); 11240 } 11241 pat = GEN_FCN (icode) (op0, op1); 11242 } 11243 11244 if (!pat) 11245 return 0; 11246 11247 emit_insn (pat); 11248 11249 if (target == 0 11250 || !register_operand (target, QImode)) 11251 target = gen_reg_rtx (QImode); 11252 11253 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), 11254 const0_rtx); 11255 emit_insn (gen_rtx_SET (target, pat)); 11256 11257 return target; 11258 11259 case IX86_BUILTIN_CLZERO: 11260 arg0 = CALL_EXPR_ARG (exp, 0); 11261 op0 = expand_normal (arg0); 11262 if (!REG_P (op0)) 11263 op0 = ix86_zero_extend_to_Pmode (op0); 11264 emit_insn (gen_clzero (Pmode, op0)); 11265 return 0; 11266 11267 case IX86_BUILTIN_CLDEMOTE: 11268 arg0 = CALL_EXPR_ARG (exp, 0); 11269 op0 = expand_normal (arg0); 11270 icode = CODE_FOR_cldemote; 11271 if (!insn_data[icode].operand[0].predicate (op0, Pmode)) 11272 op0 = 
ix86_zero_extend_to_Pmode (op0); 11273 11274 emit_insn (gen_cldemote (op0)); 11275 return 0; 11276 11277 case IX86_BUILTIN_VEC_INIT_V2SI: 11278 case IX86_BUILTIN_VEC_INIT_V4HI: 11279 case IX86_BUILTIN_VEC_INIT_V8QI: 11280 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target); 11281 11282 case IX86_BUILTIN_VEC_EXT_V2DF: 11283 case IX86_BUILTIN_VEC_EXT_V2DI: 11284 case IX86_BUILTIN_VEC_EXT_V4SF: 11285 case IX86_BUILTIN_VEC_EXT_V4SI: 11286 case IX86_BUILTIN_VEC_EXT_V8HI: 11287 case IX86_BUILTIN_VEC_EXT_V2SI: 11288 case IX86_BUILTIN_VEC_EXT_V4HI: 11289 case IX86_BUILTIN_VEC_EXT_V16QI: 11290 return ix86_expand_vec_ext_builtin (exp, target); 11291 11292 case IX86_BUILTIN_VEC_SET_V2DI: 11293 case IX86_BUILTIN_VEC_SET_V4SF: 11294 case IX86_BUILTIN_VEC_SET_V4SI: 11295 case IX86_BUILTIN_VEC_SET_V8HI: 11296 case IX86_BUILTIN_VEC_SET_V4HI: 11297 case IX86_BUILTIN_VEC_SET_V16QI: 11298 return ix86_expand_vec_set_builtin (exp); 11299 11300 case IX86_BUILTIN_NANQ: 11301 case IX86_BUILTIN_NANSQ: 11302 return expand_call (exp, target, ignore); 11303 11304 case IX86_BUILTIN_RDPID: 11305 11306 op0 = gen_reg_rtx (word_mode); 11307 11308 if (TARGET_64BIT) 11309 { 11310 insn = gen_rdpid_rex64 (op0); 11311 op0 = convert_to_mode (SImode, op0, 1); 11312 } 11313 else 11314 insn = gen_rdpid (op0); 11315 11316 emit_insn (insn); 11317 11318 if (target == 0 11319 || !register_operand (target, SImode)) 11320 target = gen_reg_rtx (SImode); 11321 11322 emit_move_insn (target, op0); 11323 return target; 11324 11325 case IX86_BUILTIN_2INTERSECTD512: 11326 case IX86_BUILTIN_2INTERSECTQ512: 11327 case IX86_BUILTIN_2INTERSECTD256: 11328 case IX86_BUILTIN_2INTERSECTQ256: 11329 case IX86_BUILTIN_2INTERSECTD128: 11330 case IX86_BUILTIN_2INTERSECTQ128: 11331 arg0 = CALL_EXPR_ARG (exp, 0); 11332 arg1 = CALL_EXPR_ARG (exp, 1); 11333 arg2 = CALL_EXPR_ARG (exp, 2); 11334 arg3 = CALL_EXPR_ARG (exp, 3); 11335 op0 = expand_normal (arg0); 11336 op1 = expand_normal (arg1); 11337 op2 = expand_normal 
(arg2); 11338 op3 = expand_normal (arg3); 11339 11340 if (!address_operand (op0, VOIDmode)) 11341 { 11342 op0 = convert_memory_address (Pmode, op0); 11343 op0 = copy_addr_to_reg (op0); 11344 } 11345 if (!address_operand (op1, VOIDmode)) 11346 { 11347 op1 = convert_memory_address (Pmode, op1); 11348 op1 = copy_addr_to_reg (op1); 11349 } 11350 11351 switch (fcode) 11352 { 11353 case IX86_BUILTIN_2INTERSECTD512: 11354 mode4 = P2HImode; 11355 icode = CODE_FOR_avx512vp2intersect_2intersectv16si; 11356 break; 11357 case IX86_BUILTIN_2INTERSECTQ512: 11358 mode4 = P2QImode; 11359 icode = CODE_FOR_avx512vp2intersect_2intersectv8di; 11360 break; 11361 case IX86_BUILTIN_2INTERSECTD256: 11362 mode4 = P2QImode; 11363 icode = CODE_FOR_avx512vp2intersect_2intersectv8si; 11364 break; 11365 case IX86_BUILTIN_2INTERSECTQ256: 11366 mode4 = P2QImode; 11367 icode = CODE_FOR_avx512vp2intersect_2intersectv4di; 11368 break; 11369 case IX86_BUILTIN_2INTERSECTD128: 11370 mode4 = P2QImode; 11371 icode = CODE_FOR_avx512vp2intersect_2intersectv4si; 11372 break; 11373 case IX86_BUILTIN_2INTERSECTQ128: 11374 mode4 = P2QImode; 11375 icode = CODE_FOR_avx512vp2intersect_2intersectv2di; 11376 break; 11377 default: 11378 gcc_unreachable (); 11379 } 11380 11381 mode2 = insn_data[icode].operand[1].mode; 11382 mode3 = insn_data[icode].operand[2].mode; 11383 if (!insn_data[icode].operand[1].predicate (op2, mode2)) 11384 op2 = copy_to_mode_reg (mode2, op2); 11385 if (!insn_data[icode].operand[2].predicate (op3, mode3)) 11386 op3 = copy_to_mode_reg (mode3, op3); 11387 11388 op4 = gen_reg_rtx (mode4); 11389 emit_insn (GEN_FCN (icode) (op4, op2, op3)); 11390 mode0 = mode4 == P2HImode ? 
HImode : QImode; 11391 emit_move_insn (gen_rtx_MEM (mode0, op0), 11392 gen_lowpart (mode0, op4)); 11393 emit_move_insn (gen_rtx_MEM (mode0, op1), 11394 gen_highpart (mode0, op4)); 11395 11396 return 0; 11397 11398 case IX86_BUILTIN_RDPMC: 11399 case IX86_BUILTIN_RDTSC: 11400 case IX86_BUILTIN_RDTSCP: 11401 case IX86_BUILTIN_XGETBV: 11402 11403 op0 = gen_reg_rtx (DImode); 11404 op1 = gen_reg_rtx (DImode); 11405 11406 if (fcode == IX86_BUILTIN_RDPMC) 11407 { 11408 arg0 = CALL_EXPR_ARG (exp, 0); 11409 op2 = expand_normal (arg0); 11410 if (!register_operand (op2, SImode)) 11411 op2 = copy_to_mode_reg (SImode, op2); 11412 11413 insn = (TARGET_64BIT 11414 ? gen_rdpmc_rex64 (op0, op1, op2) 11415 : gen_rdpmc (op0, op2)); 11416 emit_insn (insn); 11417 } 11418 else if (fcode == IX86_BUILTIN_XGETBV) 11419 { 11420 arg0 = CALL_EXPR_ARG (exp, 0); 11421 op2 = expand_normal (arg0); 11422 if (!register_operand (op2, SImode)) 11423 op2 = copy_to_mode_reg (SImode, op2); 11424 11425 insn = (TARGET_64BIT 11426 ? gen_xgetbv_rex64 (op0, op1, op2) 11427 : gen_xgetbv (op0, op2)); 11428 emit_insn (insn); 11429 } 11430 else if (fcode == IX86_BUILTIN_RDTSC) 11431 { 11432 insn = (TARGET_64BIT 11433 ? gen_rdtsc_rex64 (op0, op1) 11434 : gen_rdtsc (op0)); 11435 emit_insn (insn); 11436 } 11437 else 11438 { 11439 op2 = gen_reg_rtx (SImode); 11440 11441 insn = (TARGET_64BIT 11442 ? 
gen_rdtscp_rex64 (op0, op1, op2) 11443 : gen_rdtscp (op0, op2)); 11444 emit_insn (insn); 11445 11446 arg0 = CALL_EXPR_ARG (exp, 0); 11447 op4 = expand_normal (arg0); 11448 if (!address_operand (op4, VOIDmode)) 11449 { 11450 op4 = convert_memory_address (Pmode, op4); 11451 op4 = copy_addr_to_reg (op4); 11452 } 11453 emit_move_insn (gen_rtx_MEM (SImode, op4), op2); 11454 } 11455 11456 if (target == 0 11457 || !register_operand (target, DImode)) 11458 target = gen_reg_rtx (DImode); 11459 11460 if (TARGET_64BIT) 11461 { 11462 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32), 11463 op1, 1, OPTAB_DIRECT); 11464 op0 = expand_simple_binop (DImode, IOR, op0, op1, 11465 op0, 1, OPTAB_DIRECT); 11466 } 11467 11468 emit_move_insn (target, op0); 11469 return target; 11470 11471 case IX86_BUILTIN_ENQCMD: 11472 case IX86_BUILTIN_ENQCMDS: 11473 case IX86_BUILTIN_MOVDIR64B: 11474 11475 arg0 = CALL_EXPR_ARG (exp, 0); 11476 arg1 = CALL_EXPR_ARG (exp, 1); 11477 op0 = expand_normal (arg0); 11478 op1 = expand_normal (arg1); 11479 11480 op0 = ix86_zero_extend_to_Pmode (op0); 11481 if (!address_operand (op1, VOIDmode)) 11482 { 11483 op1 = convert_memory_address (Pmode, op1); 11484 op1 = copy_addr_to_reg (op1); 11485 } 11486 op1 = gen_rtx_MEM (XImode, op1); 11487 11488 if (fcode == IX86_BUILTIN_MOVDIR64B) 11489 { 11490 emit_insn (gen_movdir64b (Pmode, op0, op1)); 11491 return 0; 11492 } 11493 else 11494 { 11495 rtx pat; 11496 11497 target = gen_reg_rtx (SImode); 11498 emit_move_insn (target, const0_rtx); 11499 target = gen_rtx_SUBREG (QImode, target, 0); 11500 11501 if (fcode == IX86_BUILTIN_ENQCMD) 11502 pat = gen_enqcmd (UNSPECV_ENQCMD, Pmode, op0, op1); 11503 else 11504 pat = gen_enqcmd (UNSPECV_ENQCMDS, Pmode, op0, op1); 11505 11506 emit_insn (pat); 11507 11508 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), 11509 gen_rtx_fmt_ee (EQ, QImode, 11510 SET_DEST (pat), 11511 const0_rtx))); 11512 11513 return SUBREG_REG (target); 11514 } 11515 11516 case 
IX86_BUILTIN_FXSAVE: 11517 case IX86_BUILTIN_FXRSTOR: 11518 case IX86_BUILTIN_FXSAVE64: 11519 case IX86_BUILTIN_FXRSTOR64: 11520 case IX86_BUILTIN_FNSTENV: 11521 case IX86_BUILTIN_FLDENV: 11522 mode0 = BLKmode; 11523 switch (fcode) 11524 { 11525 case IX86_BUILTIN_FXSAVE: 11526 icode = CODE_FOR_fxsave; 11527 break; 11528 case IX86_BUILTIN_FXRSTOR: 11529 icode = CODE_FOR_fxrstor; 11530 break; 11531 case IX86_BUILTIN_FXSAVE64: 11532 icode = CODE_FOR_fxsave64; 11533 break; 11534 case IX86_BUILTIN_FXRSTOR64: 11535 icode = CODE_FOR_fxrstor64; 11536 break; 11537 case IX86_BUILTIN_FNSTENV: 11538 icode = CODE_FOR_fnstenv; 11539 break; 11540 case IX86_BUILTIN_FLDENV: 11541 icode = CODE_FOR_fldenv; 11542 break; 11543 default: 11544 gcc_unreachable (); 11545 } 11546 11547 arg0 = CALL_EXPR_ARG (exp, 0); 11548 op0 = expand_normal (arg0); 11549 11550 if (!address_operand (op0, VOIDmode)) 11551 { 11552 op0 = convert_memory_address (Pmode, op0); 11553 op0 = copy_addr_to_reg (op0); 11554 } 11555 op0 = gen_rtx_MEM (mode0, op0); 11556 11557 pat = GEN_FCN (icode) (op0); 11558 if (pat) 11559 emit_insn (pat); 11560 return 0; 11561 11562 case IX86_BUILTIN_XSETBV: 11563 arg0 = CALL_EXPR_ARG (exp, 0); 11564 arg1 = CALL_EXPR_ARG (exp, 1); 11565 op0 = expand_normal (arg0); 11566 op1 = expand_normal (arg1); 11567 11568 if (!REG_P (op0)) 11569 op0 = copy_to_mode_reg (SImode, op0); 11570 11571 op1 = force_reg (DImode, op1); 11572 11573 if (TARGET_64BIT) 11574 { 11575 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), 11576 NULL, 1, OPTAB_DIRECT); 11577 11578 icode = CODE_FOR_xsetbv_rex64; 11579 11580 op2 = gen_lowpart (SImode, op2); 11581 op1 = gen_lowpart (SImode, op1); 11582 pat = GEN_FCN (icode) (op0, op1, op2); 11583 } 11584 else 11585 { 11586 icode = CODE_FOR_xsetbv; 11587 11588 pat = GEN_FCN (icode) (op0, op1); 11589 } 11590 if (pat) 11591 emit_insn (pat); 11592 return 0; 11593 11594 case IX86_BUILTIN_XSAVE: 11595 case IX86_BUILTIN_XRSTOR: 11596 case IX86_BUILTIN_XSAVE64: 
11597 case IX86_BUILTIN_XRSTOR64: 11598 case IX86_BUILTIN_XSAVEOPT: 11599 case IX86_BUILTIN_XSAVEOPT64: 11600 case IX86_BUILTIN_XSAVES: 11601 case IX86_BUILTIN_XRSTORS: 11602 case IX86_BUILTIN_XSAVES64: 11603 case IX86_BUILTIN_XRSTORS64: 11604 case IX86_BUILTIN_XSAVEC: 11605 case IX86_BUILTIN_XSAVEC64: 11606 arg0 = CALL_EXPR_ARG (exp, 0); 11607 arg1 = CALL_EXPR_ARG (exp, 1); 11608 op0 = expand_normal (arg0); 11609 op1 = expand_normal (arg1); 11610 11611 if (!address_operand (op0, VOIDmode)) 11612 { 11613 op0 = convert_memory_address (Pmode, op0); 11614 op0 = copy_addr_to_reg (op0); 11615 } 11616 op0 = gen_rtx_MEM (BLKmode, op0); 11617 11618 op1 = force_reg (DImode, op1); 11619 11620 if (TARGET_64BIT) 11621 { 11622 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), 11623 NULL, 1, OPTAB_DIRECT); 11624 switch (fcode) 11625 { 11626 case IX86_BUILTIN_XSAVE: 11627 icode = CODE_FOR_xsave_rex64; 11628 break; 11629 case IX86_BUILTIN_XRSTOR: 11630 icode = CODE_FOR_xrstor_rex64; 11631 break; 11632 case IX86_BUILTIN_XSAVE64: 11633 icode = CODE_FOR_xsave64; 11634 break; 11635 case IX86_BUILTIN_XRSTOR64: 11636 icode = CODE_FOR_xrstor64; 11637 break; 11638 case IX86_BUILTIN_XSAVEOPT: 11639 icode = CODE_FOR_xsaveopt_rex64; 11640 break; 11641 case IX86_BUILTIN_XSAVEOPT64: 11642 icode = CODE_FOR_xsaveopt64; 11643 break; 11644 case IX86_BUILTIN_XSAVES: 11645 icode = CODE_FOR_xsaves_rex64; 11646 break; 11647 case IX86_BUILTIN_XRSTORS: 11648 icode = CODE_FOR_xrstors_rex64; 11649 break; 11650 case IX86_BUILTIN_XSAVES64: 11651 icode = CODE_FOR_xsaves64; 11652 break; 11653 case IX86_BUILTIN_XRSTORS64: 11654 icode = CODE_FOR_xrstors64; 11655 break; 11656 case IX86_BUILTIN_XSAVEC: 11657 icode = CODE_FOR_xsavec_rex64; 11658 break; 11659 case IX86_BUILTIN_XSAVEC64: 11660 icode = CODE_FOR_xsavec64; 11661 break; 11662 default: 11663 gcc_unreachable (); 11664 } 11665 11666 op2 = gen_lowpart (SImode, op2); 11667 op1 = gen_lowpart (SImode, op1); 11668 pat = GEN_FCN (icode) (op0, op1, 
op2); 11669 } 11670 else 11671 { 11672 switch (fcode) 11673 { 11674 case IX86_BUILTIN_XSAVE: 11675 icode = CODE_FOR_xsave; 11676 break; 11677 case IX86_BUILTIN_XRSTOR: 11678 icode = CODE_FOR_xrstor; 11679 break; 11680 case IX86_BUILTIN_XSAVEOPT: 11681 icode = CODE_FOR_xsaveopt; 11682 break; 11683 case IX86_BUILTIN_XSAVES: 11684 icode = CODE_FOR_xsaves; 11685 break; 11686 case IX86_BUILTIN_XRSTORS: 11687 icode = CODE_FOR_xrstors; 11688 break; 11689 case IX86_BUILTIN_XSAVEC: 11690 icode = CODE_FOR_xsavec; 11691 break; 11692 default: 11693 gcc_unreachable (); 11694 } 11695 pat = GEN_FCN (icode) (op0, op1); 11696 } 11697 11698 if (pat) 11699 emit_insn (pat); 11700 return 0; 11701 11702 case IX86_BUILTIN_LLWPCB: 11703 arg0 = CALL_EXPR_ARG (exp, 0); 11704 op0 = expand_normal (arg0); 11705 icode = CODE_FOR_lwp_llwpcb; 11706 if (!insn_data[icode].operand[0].predicate (op0, Pmode)) 11707 op0 = ix86_zero_extend_to_Pmode (op0); 11708 emit_insn (gen_lwp_llwpcb (op0)); 11709 return 0; 11710 11711 case IX86_BUILTIN_SLWPCB: 11712 icode = CODE_FOR_lwp_slwpcb; 11713 if (!target 11714 || !insn_data[icode].operand[0].predicate (target, Pmode)) 11715 target = gen_reg_rtx (Pmode); 11716 emit_insn (gen_lwp_slwpcb (target)); 11717 return target; 11718 11719 case IX86_BUILTIN_BEXTRI32: 11720 case IX86_BUILTIN_BEXTRI64: 11721 arg0 = CALL_EXPR_ARG (exp, 0); 11722 arg1 = CALL_EXPR_ARG (exp, 1); 11723 op0 = expand_normal (arg0); 11724 op1 = expand_normal (arg1); 11725 icode = (fcode == IX86_BUILTIN_BEXTRI32 11726 ? 
CODE_FOR_tbm_bextri_si 11727 : CODE_FOR_tbm_bextri_di); 11728 if (!CONST_INT_P (op1)) 11729 { 11730 error ("last argument must be an immediate"); 11731 return const0_rtx; 11732 } 11733 else 11734 { 11735 unsigned char length = (INTVAL (op1) >> 8) & 0xFF; 11736 unsigned char lsb_index = INTVAL (op1) & 0xFF; 11737 op1 = GEN_INT (length); 11738 op2 = GEN_INT (lsb_index); 11739 11740 mode1 = insn_data[icode].operand[1].mode; 11741 if (!insn_data[icode].operand[1].predicate (op0, mode1)) 11742 op0 = copy_to_mode_reg (mode1, op0); 11743 11744 mode0 = insn_data[icode].operand[0].mode; 11745 if (target == 0 11746 || !register_operand (target, mode0)) 11747 target = gen_reg_rtx (mode0); 11748 11749 pat = GEN_FCN (icode) (target, op0, op1, op2); 11750 if (pat) 11751 emit_insn (pat); 11752 return target; 11753 } 11754 11755 case IX86_BUILTIN_RDRAND16_STEP: 11756 icode = CODE_FOR_rdrandhi_1; 11757 mode0 = HImode; 11758 goto rdrand_step; 11759 11760 case IX86_BUILTIN_RDRAND32_STEP: 11761 icode = CODE_FOR_rdrandsi_1; 11762 mode0 = SImode; 11763 goto rdrand_step; 11764 11765 case IX86_BUILTIN_RDRAND64_STEP: 11766 icode = CODE_FOR_rdranddi_1; 11767 mode0 = DImode; 11768 11769rdrand_step: 11770 arg0 = CALL_EXPR_ARG (exp, 0); 11771 op1 = expand_normal (arg0); 11772 if (!address_operand (op1, VOIDmode)) 11773 { 11774 op1 = convert_memory_address (Pmode, op1); 11775 op1 = copy_addr_to_reg (op1); 11776 } 11777 11778 op0 = gen_reg_rtx (mode0); 11779 emit_insn (GEN_FCN (icode) (op0)); 11780 11781 emit_move_insn (gen_rtx_MEM (mode0, op1), op0); 11782 11783 op1 = gen_reg_rtx (SImode); 11784 emit_move_insn (op1, CONST1_RTX (SImode)); 11785 11786 /* Emit SImode conditional move. 
*/ 11787 if (mode0 == HImode) 11788 { 11789 if (TARGET_ZERO_EXTEND_WITH_AND 11790 && optimize_function_for_speed_p (cfun)) 11791 { 11792 op2 = force_reg (SImode, const0_rtx); 11793 11794 emit_insn (gen_movstricthi 11795 (gen_lowpart (HImode, op2), op0)); 11796 } 11797 else 11798 { 11799 op2 = gen_reg_rtx (SImode); 11800 11801 emit_insn (gen_zero_extendhisi2 (op2, op0)); 11802 } 11803 } 11804 else if (mode0 == SImode) 11805 op2 = op0; 11806 else 11807 op2 = gen_rtx_SUBREG (SImode, op0, 0); 11808 11809 if (target == 0 11810 || !register_operand (target, SImode)) 11811 target = gen_reg_rtx (SImode); 11812 11813 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG), 11814 const0_rtx); 11815 emit_insn (gen_rtx_SET (target, 11816 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1))); 11817 return target; 11818 11819 case IX86_BUILTIN_RDSEED16_STEP: 11820 icode = CODE_FOR_rdseedhi_1; 11821 mode0 = HImode; 11822 goto rdseed_step; 11823 11824 case IX86_BUILTIN_RDSEED32_STEP: 11825 icode = CODE_FOR_rdseedsi_1; 11826 mode0 = SImode; 11827 goto rdseed_step; 11828 11829 case IX86_BUILTIN_RDSEED64_STEP: 11830 icode = CODE_FOR_rdseeddi_1; 11831 mode0 = DImode; 11832 11833rdseed_step: 11834 arg0 = CALL_EXPR_ARG (exp, 0); 11835 op1 = expand_normal (arg0); 11836 if (!address_operand (op1, VOIDmode)) 11837 { 11838 op1 = convert_memory_address (Pmode, op1); 11839 op1 = copy_addr_to_reg (op1); 11840 } 11841 11842 op0 = gen_reg_rtx (mode0); 11843 emit_insn (GEN_FCN (icode) (op0)); 11844 11845 emit_move_insn (gen_rtx_MEM (mode0, op1), op0); 11846 11847 op2 = gen_reg_rtx (QImode); 11848 11849 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), 11850 const0_rtx); 11851 emit_insn (gen_rtx_SET (op2, pat)); 11852 11853 if (target == 0 11854 || !register_operand (target, SImode)) 11855 target = gen_reg_rtx (SImode); 11856 11857 emit_insn (gen_zero_extendqisi2 (target, op2)); 11858 return target; 11859 11860 case IX86_BUILTIN_SBB32: 11861 icode = CODE_FOR_subborrowsi; 11862 icode2 = 
CODE_FOR_subborrowsi_0; 11863 mode0 = SImode; 11864 mode1 = DImode; 11865 mode2 = CCmode; 11866 goto handlecarry; 11867 11868 case IX86_BUILTIN_SBB64: 11869 icode = CODE_FOR_subborrowdi; 11870 icode2 = CODE_FOR_subborrowdi_0; 11871 mode0 = DImode; 11872 mode1 = TImode; 11873 mode2 = CCmode; 11874 goto handlecarry; 11875 11876 case IX86_BUILTIN_ADDCARRYX32: 11877 icode = CODE_FOR_addcarrysi; 11878 icode2 = CODE_FOR_addcarrysi_0; 11879 mode0 = SImode; 11880 mode1 = DImode; 11881 mode2 = CCCmode; 11882 goto handlecarry; 11883 11884 case IX86_BUILTIN_ADDCARRYX64: 11885 icode = CODE_FOR_addcarrydi; 11886 icode2 = CODE_FOR_addcarrydi_0; 11887 mode0 = DImode; 11888 mode1 = TImode; 11889 mode2 = CCCmode; 11890 11891 handlecarry: 11892 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */ 11893 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */ 11894 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */ 11895 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */ 11896 11897 op1 = expand_normal (arg0); 11898 if (!integer_zerop (arg0)) 11899 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1)); 11900 11901 op2 = expand_normal (arg1); 11902 if (!register_operand (op2, mode0)) 11903 op2 = copy_to_mode_reg (mode0, op2); 11904 11905 op3 = expand_normal (arg2); 11906 if (!register_operand (op3, mode0)) 11907 op3 = copy_to_mode_reg (mode0, op3); 11908 11909 op4 = expand_normal (arg3); 11910 if (!address_operand (op4, VOIDmode)) 11911 { 11912 op4 = convert_memory_address (Pmode, op4); 11913 op4 = copy_addr_to_reg (op4); 11914 } 11915 11916 op0 = gen_reg_rtx (mode0); 11917 if (integer_zerop (arg0)) 11918 { 11919 /* If arg0 is 0, optimize right away into add or sub 11920 instruction that sets CCCmode flags. */ 11921 op1 = gen_rtx_REG (mode2, FLAGS_REG); 11922 emit_insn (GEN_FCN (icode2) (op0, op2, op3)); 11923 } 11924 else 11925 { 11926 /* Generate CF from input operand. 
*/ 11927 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx)); 11928 11929 /* Generate instruction that consumes CF. */ 11930 op1 = gen_rtx_REG (CCCmode, FLAGS_REG); 11931 pat = gen_rtx_LTU (mode1, op1, const0_rtx); 11932 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx); 11933 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2)); 11934 } 11935 11936 /* Return current CF value. */ 11937 if (target == 0) 11938 target = gen_reg_rtx (QImode); 11939 11940 pat = gen_rtx_LTU (QImode, op1, const0_rtx); 11941 emit_insn (gen_rtx_SET (target, pat)); 11942 11943 /* Store the result. */ 11944 emit_move_insn (gen_rtx_MEM (mode0, op4), op0); 11945 11946 return target; 11947 11948 case IX86_BUILTIN_READ_FLAGS: 11949 if (ignore) 11950 return const0_rtx; 11951 11952 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG))); 11953 11954 if (optimize 11955 || target == NULL_RTX 11956 || !nonimmediate_operand (target, word_mode) 11957 || GET_MODE (target) != word_mode) 11958 target = gen_reg_rtx (word_mode); 11959 11960 emit_insn (gen_pop (target)); 11961 return target; 11962 11963 case IX86_BUILTIN_WRITE_FLAGS: 11964 11965 arg0 = CALL_EXPR_ARG (exp, 0); 11966 op0 = expand_normal (arg0); 11967 if (!general_no_elim_operand (op0, word_mode)) 11968 op0 = copy_to_mode_reg (word_mode, op0); 11969 11970 emit_insn (gen_push (op0)); 11971 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG))); 11972 return 0; 11973 11974 case IX86_BUILTIN_KTESTC8: 11975 icode = CODE_FOR_ktestqi; 11976 mode3 = CCCmode; 11977 goto kortest; 11978 11979 case IX86_BUILTIN_KTESTZ8: 11980 icode = CODE_FOR_ktestqi; 11981 mode3 = CCZmode; 11982 goto kortest; 11983 11984 case IX86_BUILTIN_KTESTC16: 11985 icode = CODE_FOR_ktesthi; 11986 mode3 = CCCmode; 11987 goto kortest; 11988 11989 case IX86_BUILTIN_KTESTZ16: 11990 icode = CODE_FOR_ktesthi; 11991 mode3 = CCZmode; 11992 goto kortest; 11993 11994 case IX86_BUILTIN_KTESTC32: 11995 icode = CODE_FOR_ktestsi; 11996 mode3 = CCCmode; 11997 goto kortest; 11998 11999 
case IX86_BUILTIN_KTESTZ32: 12000 icode = CODE_FOR_ktestsi; 12001 mode3 = CCZmode; 12002 goto kortest; 12003 12004 case IX86_BUILTIN_KTESTC64: 12005 icode = CODE_FOR_ktestdi; 12006 mode3 = CCCmode; 12007 goto kortest; 12008 12009 case IX86_BUILTIN_KTESTZ64: 12010 icode = CODE_FOR_ktestdi; 12011 mode3 = CCZmode; 12012 goto kortest; 12013 12014 case IX86_BUILTIN_KORTESTC8: 12015 icode = CODE_FOR_kortestqi; 12016 mode3 = CCCmode; 12017 goto kortest; 12018 12019 case IX86_BUILTIN_KORTESTZ8: 12020 icode = CODE_FOR_kortestqi; 12021 mode3 = CCZmode; 12022 goto kortest; 12023 12024 case IX86_BUILTIN_KORTESTC16: 12025 icode = CODE_FOR_kortesthi; 12026 mode3 = CCCmode; 12027 goto kortest; 12028 12029 case IX86_BUILTIN_KORTESTZ16: 12030 icode = CODE_FOR_kortesthi; 12031 mode3 = CCZmode; 12032 goto kortest; 12033 12034 case IX86_BUILTIN_KORTESTC32: 12035 icode = CODE_FOR_kortestsi; 12036 mode3 = CCCmode; 12037 goto kortest; 12038 12039 case IX86_BUILTIN_KORTESTZ32: 12040 icode = CODE_FOR_kortestsi; 12041 mode3 = CCZmode; 12042 goto kortest; 12043 12044 case IX86_BUILTIN_KORTESTC64: 12045 icode = CODE_FOR_kortestdi; 12046 mode3 = CCCmode; 12047 goto kortest; 12048 12049 case IX86_BUILTIN_KORTESTZ64: 12050 icode = CODE_FOR_kortestdi; 12051 mode3 = CCZmode; 12052 12053 kortest: 12054 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */ 12055 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. 
*/ 12056 op0 = expand_normal (arg0); 12057 op1 = expand_normal (arg1); 12058 12059 mode0 = insn_data[icode].operand[0].mode; 12060 mode1 = insn_data[icode].operand[1].mode; 12061 12062 if (GET_MODE (op0) != VOIDmode) 12063 op0 = force_reg (GET_MODE (op0), op0); 12064 12065 op0 = gen_lowpart (mode0, op0); 12066 12067 if (!insn_data[icode].operand[0].predicate (op0, mode0)) 12068 op0 = copy_to_mode_reg (mode0, op0); 12069 12070 if (GET_MODE (op1) != VOIDmode) 12071 op1 = force_reg (GET_MODE (op1), op1); 12072 12073 op1 = gen_lowpart (mode1, op1); 12074 12075 if (!insn_data[icode].operand[1].predicate (op1, mode1)) 12076 op1 = copy_to_mode_reg (mode1, op1); 12077 12078 target = gen_reg_rtx (QImode); 12079 12080 /* Emit kortest. */ 12081 emit_insn (GEN_FCN (icode) (op0, op1)); 12082 /* And use setcc to return result from flags. */ 12083 ix86_expand_setcc (target, EQ, 12084 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx); 12085 return target; 12086 12087 case IX86_BUILTIN_GATHERSIV2DF: 12088 icode = CODE_FOR_avx2_gathersiv2df; 12089 goto gather_gen; 12090 case IX86_BUILTIN_GATHERSIV4DF: 12091 icode = CODE_FOR_avx2_gathersiv4df; 12092 goto gather_gen; 12093 case IX86_BUILTIN_GATHERDIV2DF: 12094 icode = CODE_FOR_avx2_gatherdiv2df; 12095 goto gather_gen; 12096 case IX86_BUILTIN_GATHERDIV4DF: 12097 icode = CODE_FOR_avx2_gatherdiv4df; 12098 goto gather_gen; 12099 case IX86_BUILTIN_GATHERSIV4SF: 12100 icode = CODE_FOR_avx2_gathersiv4sf; 12101 goto gather_gen; 12102 case IX86_BUILTIN_GATHERSIV8SF: 12103 icode = CODE_FOR_avx2_gathersiv8sf; 12104 goto gather_gen; 12105 case IX86_BUILTIN_GATHERDIV4SF: 12106 icode = CODE_FOR_avx2_gatherdiv4sf; 12107 goto gather_gen; 12108 case IX86_BUILTIN_GATHERDIV8SF: 12109 icode = CODE_FOR_avx2_gatherdiv8sf; 12110 goto gather_gen; 12111 case IX86_BUILTIN_GATHERSIV2DI: 12112 icode = CODE_FOR_avx2_gathersiv2di; 12113 goto gather_gen; 12114 case IX86_BUILTIN_GATHERSIV4DI: 12115 icode = CODE_FOR_avx2_gathersiv4di; 12116 goto gather_gen; 12117 case 
IX86_BUILTIN_GATHERDIV2DI: 12118 icode = CODE_FOR_avx2_gatherdiv2di; 12119 goto gather_gen; 12120 case IX86_BUILTIN_GATHERDIV4DI: 12121 icode = CODE_FOR_avx2_gatherdiv4di; 12122 goto gather_gen; 12123 case IX86_BUILTIN_GATHERSIV4SI: 12124 icode = CODE_FOR_avx2_gathersiv4si; 12125 goto gather_gen; 12126 case IX86_BUILTIN_GATHERSIV8SI: 12127 icode = CODE_FOR_avx2_gathersiv8si; 12128 goto gather_gen; 12129 case IX86_BUILTIN_GATHERDIV4SI: 12130 icode = CODE_FOR_avx2_gatherdiv4si; 12131 goto gather_gen; 12132 case IX86_BUILTIN_GATHERDIV8SI: 12133 icode = CODE_FOR_avx2_gatherdiv8si; 12134 goto gather_gen; 12135 case IX86_BUILTIN_GATHERALTSIV4DF: 12136 icode = CODE_FOR_avx2_gathersiv4df; 12137 goto gather_gen; 12138 case IX86_BUILTIN_GATHERALTDIV8SF: 12139 icode = CODE_FOR_avx2_gatherdiv8sf; 12140 goto gather_gen; 12141 case IX86_BUILTIN_GATHERALTSIV4DI: 12142 icode = CODE_FOR_avx2_gathersiv4di; 12143 goto gather_gen; 12144 case IX86_BUILTIN_GATHERALTDIV8SI: 12145 icode = CODE_FOR_avx2_gatherdiv8si; 12146 goto gather_gen; 12147 case IX86_BUILTIN_GATHER3SIV16SF: 12148 icode = CODE_FOR_avx512f_gathersiv16sf; 12149 goto gather_gen; 12150 case IX86_BUILTIN_GATHER3SIV8DF: 12151 icode = CODE_FOR_avx512f_gathersiv8df; 12152 goto gather_gen; 12153 case IX86_BUILTIN_GATHER3DIV16SF: 12154 icode = CODE_FOR_avx512f_gatherdiv16sf; 12155 goto gather_gen; 12156 case IX86_BUILTIN_GATHER3DIV8DF: 12157 icode = CODE_FOR_avx512f_gatherdiv8df; 12158 goto gather_gen; 12159 case IX86_BUILTIN_GATHER3SIV16SI: 12160 icode = CODE_FOR_avx512f_gathersiv16si; 12161 goto gather_gen; 12162 case IX86_BUILTIN_GATHER3SIV8DI: 12163 icode = CODE_FOR_avx512f_gathersiv8di; 12164 goto gather_gen; 12165 case IX86_BUILTIN_GATHER3DIV16SI: 12166 icode = CODE_FOR_avx512f_gatherdiv16si; 12167 goto gather_gen; 12168 case IX86_BUILTIN_GATHER3DIV8DI: 12169 icode = CODE_FOR_avx512f_gatherdiv8di; 12170 goto gather_gen; 12171 case IX86_BUILTIN_GATHER3ALTSIV8DF: 12172 icode = CODE_FOR_avx512f_gathersiv8df; 12173 goto 
gather_gen; 12174 case IX86_BUILTIN_GATHER3ALTDIV16SF: 12175 icode = CODE_FOR_avx512f_gatherdiv16sf; 12176 goto gather_gen; 12177 case IX86_BUILTIN_GATHER3ALTSIV8DI: 12178 icode = CODE_FOR_avx512f_gathersiv8di; 12179 goto gather_gen; 12180 case IX86_BUILTIN_GATHER3ALTDIV16SI: 12181 icode = CODE_FOR_avx512f_gatherdiv16si; 12182 goto gather_gen; 12183 case IX86_BUILTIN_GATHER3SIV2DF: 12184 icode = CODE_FOR_avx512vl_gathersiv2df; 12185 goto gather_gen; 12186 case IX86_BUILTIN_GATHER3SIV4DF: 12187 icode = CODE_FOR_avx512vl_gathersiv4df; 12188 goto gather_gen; 12189 case IX86_BUILTIN_GATHER3DIV2DF: 12190 icode = CODE_FOR_avx512vl_gatherdiv2df; 12191 goto gather_gen; 12192 case IX86_BUILTIN_GATHER3DIV4DF: 12193 icode = CODE_FOR_avx512vl_gatherdiv4df; 12194 goto gather_gen; 12195 case IX86_BUILTIN_GATHER3SIV4SF: 12196 icode = CODE_FOR_avx512vl_gathersiv4sf; 12197 goto gather_gen; 12198 case IX86_BUILTIN_GATHER3SIV8SF: 12199 icode = CODE_FOR_avx512vl_gathersiv8sf; 12200 goto gather_gen; 12201 case IX86_BUILTIN_GATHER3DIV4SF: 12202 icode = CODE_FOR_avx512vl_gatherdiv4sf; 12203 goto gather_gen; 12204 case IX86_BUILTIN_GATHER3DIV8SF: 12205 icode = CODE_FOR_avx512vl_gatherdiv8sf; 12206 goto gather_gen; 12207 case IX86_BUILTIN_GATHER3SIV2DI: 12208 icode = CODE_FOR_avx512vl_gathersiv2di; 12209 goto gather_gen; 12210 case IX86_BUILTIN_GATHER3SIV4DI: 12211 icode = CODE_FOR_avx512vl_gathersiv4di; 12212 goto gather_gen; 12213 case IX86_BUILTIN_GATHER3DIV2DI: 12214 icode = CODE_FOR_avx512vl_gatherdiv2di; 12215 goto gather_gen; 12216 case IX86_BUILTIN_GATHER3DIV4DI: 12217 icode = CODE_FOR_avx512vl_gatherdiv4di; 12218 goto gather_gen; 12219 case IX86_BUILTIN_GATHER3SIV4SI: 12220 icode = CODE_FOR_avx512vl_gathersiv4si; 12221 goto gather_gen; 12222 case IX86_BUILTIN_GATHER3SIV8SI: 12223 icode = CODE_FOR_avx512vl_gathersiv8si; 12224 goto gather_gen; 12225 case IX86_BUILTIN_GATHER3DIV4SI: 12226 icode = CODE_FOR_avx512vl_gatherdiv4si; 12227 goto gather_gen; 12228 case 
IX86_BUILTIN_GATHER3DIV8SI: 12229 icode = CODE_FOR_avx512vl_gatherdiv8si; 12230 goto gather_gen; 12231 case IX86_BUILTIN_GATHER3ALTSIV4DF: 12232 icode = CODE_FOR_avx512vl_gathersiv4df; 12233 goto gather_gen; 12234 case IX86_BUILTIN_GATHER3ALTDIV8SF: 12235 icode = CODE_FOR_avx512vl_gatherdiv8sf; 12236 goto gather_gen; 12237 case IX86_BUILTIN_GATHER3ALTSIV4DI: 12238 icode = CODE_FOR_avx512vl_gathersiv4di; 12239 goto gather_gen; 12240 case IX86_BUILTIN_GATHER3ALTDIV8SI: 12241 icode = CODE_FOR_avx512vl_gatherdiv8si; 12242 goto gather_gen; 12243 case IX86_BUILTIN_SCATTERSIV16SF: 12244 icode = CODE_FOR_avx512f_scattersiv16sf; 12245 goto scatter_gen; 12246 case IX86_BUILTIN_SCATTERSIV8DF: 12247 icode = CODE_FOR_avx512f_scattersiv8df; 12248 goto scatter_gen; 12249 case IX86_BUILTIN_SCATTERDIV16SF: 12250 icode = CODE_FOR_avx512f_scatterdiv16sf; 12251 goto scatter_gen; 12252 case IX86_BUILTIN_SCATTERDIV8DF: 12253 icode = CODE_FOR_avx512f_scatterdiv8df; 12254 goto scatter_gen; 12255 case IX86_BUILTIN_SCATTERSIV16SI: 12256 icode = CODE_FOR_avx512f_scattersiv16si; 12257 goto scatter_gen; 12258 case IX86_BUILTIN_SCATTERSIV8DI: 12259 icode = CODE_FOR_avx512f_scattersiv8di; 12260 goto scatter_gen; 12261 case IX86_BUILTIN_SCATTERDIV16SI: 12262 icode = CODE_FOR_avx512f_scatterdiv16si; 12263 goto scatter_gen; 12264 case IX86_BUILTIN_SCATTERDIV8DI: 12265 icode = CODE_FOR_avx512f_scatterdiv8di; 12266 goto scatter_gen; 12267 case IX86_BUILTIN_SCATTERSIV8SF: 12268 icode = CODE_FOR_avx512vl_scattersiv8sf; 12269 goto scatter_gen; 12270 case IX86_BUILTIN_SCATTERSIV4SF: 12271 icode = CODE_FOR_avx512vl_scattersiv4sf; 12272 goto scatter_gen; 12273 case IX86_BUILTIN_SCATTERSIV4DF: 12274 icode = CODE_FOR_avx512vl_scattersiv4df; 12275 goto scatter_gen; 12276 case IX86_BUILTIN_SCATTERSIV2DF: 12277 icode = CODE_FOR_avx512vl_scattersiv2df; 12278 goto scatter_gen; 12279 case IX86_BUILTIN_SCATTERDIV8SF: 12280 icode = CODE_FOR_avx512vl_scatterdiv8sf; 12281 goto scatter_gen; 12282 case 
IX86_BUILTIN_SCATTERDIV4SF: 12283 icode = CODE_FOR_avx512vl_scatterdiv4sf; 12284 goto scatter_gen; 12285 case IX86_BUILTIN_SCATTERDIV4DF: 12286 icode = CODE_FOR_avx512vl_scatterdiv4df; 12287 goto scatter_gen; 12288 case IX86_BUILTIN_SCATTERDIV2DF: 12289 icode = CODE_FOR_avx512vl_scatterdiv2df; 12290 goto scatter_gen; 12291 case IX86_BUILTIN_SCATTERSIV8SI: 12292 icode = CODE_FOR_avx512vl_scattersiv8si; 12293 goto scatter_gen; 12294 case IX86_BUILTIN_SCATTERSIV4SI: 12295 icode = CODE_FOR_avx512vl_scattersiv4si; 12296 goto scatter_gen; 12297 case IX86_BUILTIN_SCATTERSIV4DI: 12298 icode = CODE_FOR_avx512vl_scattersiv4di; 12299 goto scatter_gen; 12300 case IX86_BUILTIN_SCATTERSIV2DI: 12301 icode = CODE_FOR_avx512vl_scattersiv2di; 12302 goto scatter_gen; 12303 case IX86_BUILTIN_SCATTERDIV8SI: 12304 icode = CODE_FOR_avx512vl_scatterdiv8si; 12305 goto scatter_gen; 12306 case IX86_BUILTIN_SCATTERDIV4SI: 12307 icode = CODE_FOR_avx512vl_scatterdiv4si; 12308 goto scatter_gen; 12309 case IX86_BUILTIN_SCATTERDIV4DI: 12310 icode = CODE_FOR_avx512vl_scatterdiv4di; 12311 goto scatter_gen; 12312 case IX86_BUILTIN_SCATTERDIV2DI: 12313 icode = CODE_FOR_avx512vl_scatterdiv2di; 12314 goto scatter_gen; 12315 case IX86_BUILTIN_GATHERPFDPD: 12316 icode = CODE_FOR_avx512pf_gatherpfv8sidf; 12317 goto vec_prefetch_gen; 12318 case IX86_BUILTIN_SCATTERALTSIV8DF: 12319 icode = CODE_FOR_avx512f_scattersiv8df; 12320 goto scatter_gen; 12321 case IX86_BUILTIN_SCATTERALTDIV16SF: 12322 icode = CODE_FOR_avx512f_scatterdiv16sf; 12323 goto scatter_gen; 12324 case IX86_BUILTIN_SCATTERALTSIV8DI: 12325 icode = CODE_FOR_avx512f_scattersiv8di; 12326 goto scatter_gen; 12327 case IX86_BUILTIN_SCATTERALTDIV16SI: 12328 icode = CODE_FOR_avx512f_scatterdiv16si; 12329 goto scatter_gen; 12330 case IX86_BUILTIN_SCATTERALTSIV4DF: 12331 icode = CODE_FOR_avx512vl_scattersiv4df; 12332 goto scatter_gen; 12333 case IX86_BUILTIN_SCATTERALTDIV8SF: 12334 icode = CODE_FOR_avx512vl_scatterdiv8sf; 12335 goto scatter_gen; 12336 
case IX86_BUILTIN_SCATTERALTSIV4DI: 12337 icode = CODE_FOR_avx512vl_scattersiv4di; 12338 goto scatter_gen; 12339 case IX86_BUILTIN_SCATTERALTDIV8SI: 12340 icode = CODE_FOR_avx512vl_scatterdiv8si; 12341 goto scatter_gen; 12342 case IX86_BUILTIN_SCATTERALTSIV2DF: 12343 icode = CODE_FOR_avx512vl_scattersiv2df; 12344 goto scatter_gen; 12345 case IX86_BUILTIN_SCATTERALTDIV4SF: 12346 icode = CODE_FOR_avx512vl_scatterdiv4sf; 12347 goto scatter_gen; 12348 case IX86_BUILTIN_SCATTERALTSIV2DI: 12349 icode = CODE_FOR_avx512vl_scattersiv2di; 12350 goto scatter_gen; 12351 case IX86_BUILTIN_SCATTERALTDIV4SI: 12352 icode = CODE_FOR_avx512vl_scatterdiv4si; 12353 goto scatter_gen; 12354 case IX86_BUILTIN_GATHERPFDPS: 12355 icode = CODE_FOR_avx512pf_gatherpfv16sisf; 12356 goto vec_prefetch_gen; 12357 case IX86_BUILTIN_GATHERPFQPD: 12358 icode = CODE_FOR_avx512pf_gatherpfv8didf; 12359 goto vec_prefetch_gen; 12360 case IX86_BUILTIN_GATHERPFQPS: 12361 icode = CODE_FOR_avx512pf_gatherpfv8disf; 12362 goto vec_prefetch_gen; 12363 case IX86_BUILTIN_SCATTERPFDPD: 12364 icode = CODE_FOR_avx512pf_scatterpfv8sidf; 12365 goto vec_prefetch_gen; 12366 case IX86_BUILTIN_SCATTERPFDPS: 12367 icode = CODE_FOR_avx512pf_scatterpfv16sisf; 12368 goto vec_prefetch_gen; 12369 case IX86_BUILTIN_SCATTERPFQPD: 12370 icode = CODE_FOR_avx512pf_scatterpfv8didf; 12371 goto vec_prefetch_gen; 12372 case IX86_BUILTIN_SCATTERPFQPS: 12373 icode = CODE_FOR_avx512pf_scatterpfv8disf; 12374 goto vec_prefetch_gen; 12375 12376 gather_gen: 12377 rtx half; 12378 rtx (*gen) (rtx, rtx); 12379 12380 arg0 = CALL_EXPR_ARG (exp, 0); 12381 arg1 = CALL_EXPR_ARG (exp, 1); 12382 arg2 = CALL_EXPR_ARG (exp, 2); 12383 arg3 = CALL_EXPR_ARG (exp, 3); 12384 arg4 = CALL_EXPR_ARG (exp, 4); 12385 op0 = expand_normal (arg0); 12386 op1 = expand_normal (arg1); 12387 op2 = expand_normal (arg2); 12388 op3 = expand_normal (arg3); 12389 op4 = expand_normal (arg4); 12390 /* Note the arg order is different from the operand order. 
*/ 12391 mode0 = insn_data[icode].operand[1].mode; 12392 mode2 = insn_data[icode].operand[3].mode; 12393 mode3 = insn_data[icode].operand[4].mode; 12394 mode4 = insn_data[icode].operand[5].mode; 12395 12396 if (target == NULL_RTX 12397 || GET_MODE (target) != insn_data[icode].operand[0].mode 12398 || !insn_data[icode].operand[0].predicate (target, 12399 GET_MODE (target))) 12400 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode); 12401 else 12402 subtarget = target; 12403 12404 switch (fcode) 12405 { 12406 case IX86_BUILTIN_GATHER3ALTSIV8DF: 12407 case IX86_BUILTIN_GATHER3ALTSIV8DI: 12408 half = gen_reg_rtx (V8SImode); 12409 if (!nonimmediate_operand (op2, V16SImode)) 12410 op2 = copy_to_mode_reg (V16SImode, op2); 12411 emit_insn (gen_vec_extract_lo_v16si (half, op2)); 12412 op2 = half; 12413 break; 12414 case IX86_BUILTIN_GATHER3ALTSIV4DF: 12415 case IX86_BUILTIN_GATHER3ALTSIV4DI: 12416 case IX86_BUILTIN_GATHERALTSIV4DF: 12417 case IX86_BUILTIN_GATHERALTSIV4DI: 12418 half = gen_reg_rtx (V4SImode); 12419 if (!nonimmediate_operand (op2, V8SImode)) 12420 op2 = copy_to_mode_reg (V8SImode, op2); 12421 emit_insn (gen_vec_extract_lo_v8si (half, op2)); 12422 op2 = half; 12423 break; 12424 case IX86_BUILTIN_GATHER3ALTDIV16SF: 12425 case IX86_BUILTIN_GATHER3ALTDIV16SI: 12426 half = gen_reg_rtx (mode0); 12427 if (mode0 == V8SFmode) 12428 gen = gen_vec_extract_lo_v16sf; 12429 else 12430 gen = gen_vec_extract_lo_v16si; 12431 if (!nonimmediate_operand (op0, GET_MODE (op0))) 12432 op0 = copy_to_mode_reg (GET_MODE (op0), op0); 12433 emit_insn (gen (half, op0)); 12434 op0 = half; 12435 op3 = lowpart_subreg (QImode, op3, HImode); 12436 break; 12437 case IX86_BUILTIN_GATHER3ALTDIV8SF: 12438 case IX86_BUILTIN_GATHER3ALTDIV8SI: 12439 case IX86_BUILTIN_GATHERALTDIV8SF: 12440 case IX86_BUILTIN_GATHERALTDIV8SI: 12441 half = gen_reg_rtx (mode0); 12442 if (mode0 == V4SFmode) 12443 gen = gen_vec_extract_lo_v8sf; 12444 else 12445 gen = gen_vec_extract_lo_v8si; 12446 if 
(!nonimmediate_operand (op0, GET_MODE (op0))) 12447 op0 = copy_to_mode_reg (GET_MODE (op0), op0); 12448 emit_insn (gen (half, op0)); 12449 op0 = half; 12450 if (VECTOR_MODE_P (GET_MODE (op3))) 12451 { 12452 half = gen_reg_rtx (mode0); 12453 if (!nonimmediate_operand (op3, GET_MODE (op3))) 12454 op3 = copy_to_mode_reg (GET_MODE (op3), op3); 12455 emit_insn (gen (half, op3)); 12456 op3 = half; 12457 } 12458 break; 12459 default: 12460 break; 12461 } 12462 12463 /* Force memory operand only with base register here. But we 12464 don't want to do it on memory operand for other builtin 12465 functions. */ 12466 op1 = ix86_zero_extend_to_Pmode (op1); 12467 12468 if (!insn_data[icode].operand[1].predicate (op0, mode0)) 12469 op0 = copy_to_mode_reg (mode0, op0); 12470 if (!insn_data[icode].operand[2].predicate (op1, Pmode)) 12471 op1 = copy_to_mode_reg (Pmode, op1); 12472 if (!insn_data[icode].operand[3].predicate (op2, mode2)) 12473 op2 = copy_to_mode_reg (mode2, op2); 12474 12475 op3 = fixup_modeless_constant (op3, mode3); 12476 12477 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode) 12478 { 12479 if (!insn_data[icode].operand[4].predicate (op3, mode3)) 12480 op3 = copy_to_mode_reg (mode3, op3); 12481 } 12482 else 12483 { 12484 op3 = copy_to_reg (op3); 12485 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3)); 12486 } 12487 if (!insn_data[icode].operand[5].predicate (op4, mode4)) 12488 { 12489 error ("the last argument must be scale 1, 2, 4, 8"); 12490 return const0_rtx; 12491 } 12492 12493 /* Optimize. If mask is known to have all high bits set, 12494 replace op0 with pc_rtx to signal that the instruction 12495 overwrites the whole destination and doesn't use its 12496 previous contents. 
*/ 12497 if (optimize) 12498 { 12499 if (TREE_CODE (arg3) == INTEGER_CST) 12500 { 12501 if (integer_all_onesp (arg3)) 12502 op0 = pc_rtx; 12503 } 12504 else if (TREE_CODE (arg3) == VECTOR_CST) 12505 { 12506 unsigned int negative = 0; 12507 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i) 12508 { 12509 tree cst = VECTOR_CST_ELT (arg3, i); 12510 if (TREE_CODE (cst) == INTEGER_CST 12511 && tree_int_cst_sign_bit (cst)) 12512 negative++; 12513 else if (TREE_CODE (cst) == REAL_CST 12514 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst))) 12515 negative++; 12516 } 12517 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3))) 12518 op0 = pc_rtx; 12519 } 12520 else if (TREE_CODE (arg3) == SSA_NAME 12521 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE) 12522 { 12523 /* Recognize also when mask is like: 12524 __v2df src = _mm_setzero_pd (); 12525 __v2df mask = _mm_cmpeq_pd (src, src); 12526 or 12527 __v8sf src = _mm256_setzero_ps (); 12528 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ); 12529 as that is a cheaper way to load all ones into 12530 a register than having to load a constant from 12531 memory. */ 12532 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3); 12533 if (is_gimple_call (def_stmt)) 12534 { 12535 tree fndecl = gimple_call_fndecl (def_stmt); 12536 if (fndecl 12537 && fndecl_built_in_p (fndecl, BUILT_IN_MD)) 12538 switch (DECL_MD_FUNCTION_CODE (fndecl)) 12539 { 12540 case IX86_BUILTIN_CMPPD: 12541 case IX86_BUILTIN_CMPPS: 12542 case IX86_BUILTIN_CMPPD256: 12543 case IX86_BUILTIN_CMPPS256: 12544 if (!integer_zerop (gimple_call_arg (def_stmt, 2))) 12545 break; 12546 /* FALLTHRU */ 12547 case IX86_BUILTIN_CMPEQPD: 12548 case IX86_BUILTIN_CMPEQPS: 12549 if (initializer_zerop (gimple_call_arg (def_stmt, 0)) 12550 && initializer_zerop (gimple_call_arg (def_stmt, 12551 1))) 12552 op0 = pc_rtx; 12553 break; 12554 default: 12555 break; 12556 } 12557 } 12558 } 12559 } 12560 12561 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4); 12562 if (! 
pat) 12563 return const0_rtx; 12564 emit_insn (pat); 12565 12566 switch (fcode) 12567 { 12568 case IX86_BUILTIN_GATHER3DIV16SF: 12569 if (target == NULL_RTX) 12570 target = gen_reg_rtx (V8SFmode); 12571 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget)); 12572 break; 12573 case IX86_BUILTIN_GATHER3DIV16SI: 12574 if (target == NULL_RTX) 12575 target = gen_reg_rtx (V8SImode); 12576 emit_insn (gen_vec_extract_lo_v16si (target, subtarget)); 12577 break; 12578 case IX86_BUILTIN_GATHER3DIV8SF: 12579 case IX86_BUILTIN_GATHERDIV8SF: 12580 if (target == NULL_RTX) 12581 target = gen_reg_rtx (V4SFmode); 12582 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget)); 12583 break; 12584 case IX86_BUILTIN_GATHER3DIV8SI: 12585 case IX86_BUILTIN_GATHERDIV8SI: 12586 if (target == NULL_RTX) 12587 target = gen_reg_rtx (V4SImode); 12588 emit_insn (gen_vec_extract_lo_v8si (target, subtarget)); 12589 break; 12590 default: 12591 target = subtarget; 12592 break; 12593 } 12594 return target; 12595 12596 scatter_gen: 12597 arg0 = CALL_EXPR_ARG (exp, 0); 12598 arg1 = CALL_EXPR_ARG (exp, 1); 12599 arg2 = CALL_EXPR_ARG (exp, 2); 12600 arg3 = CALL_EXPR_ARG (exp, 3); 12601 arg4 = CALL_EXPR_ARG (exp, 4); 12602 op0 = expand_normal (arg0); 12603 op1 = expand_normal (arg1); 12604 op2 = expand_normal (arg2); 12605 op3 = expand_normal (arg3); 12606 op4 = expand_normal (arg4); 12607 mode1 = insn_data[icode].operand[1].mode; 12608 mode2 = insn_data[icode].operand[2].mode; 12609 mode3 = insn_data[icode].operand[3].mode; 12610 mode4 = insn_data[icode].operand[4].mode; 12611 12612 /* Scatter instruction stores operand op3 to memory with 12613 indices from op2 and scale from op4 under writemask op1. 12614 If index operand op2 has more elements then source operand 12615 op3 one need to use only its low half. And vice versa. 
*/ 12616 switch (fcode) 12617 { 12618 case IX86_BUILTIN_SCATTERALTSIV8DF: 12619 case IX86_BUILTIN_SCATTERALTSIV8DI: 12620 half = gen_reg_rtx (V8SImode); 12621 if (!nonimmediate_operand (op2, V16SImode)) 12622 op2 = copy_to_mode_reg (V16SImode, op2); 12623 emit_insn (gen_vec_extract_lo_v16si (half, op2)); 12624 op2 = half; 12625 break; 12626 case IX86_BUILTIN_SCATTERALTDIV16SF: 12627 case IX86_BUILTIN_SCATTERALTDIV16SI: 12628 half = gen_reg_rtx (mode3); 12629 if (mode3 == V8SFmode) 12630 gen = gen_vec_extract_lo_v16sf; 12631 else 12632 gen = gen_vec_extract_lo_v16si; 12633 if (!nonimmediate_operand (op3, GET_MODE (op3))) 12634 op3 = copy_to_mode_reg (GET_MODE (op3), op3); 12635 emit_insn (gen (half, op3)); 12636 op3 = half; 12637 break; 12638 case IX86_BUILTIN_SCATTERALTSIV4DF: 12639 case IX86_BUILTIN_SCATTERALTSIV4DI: 12640 half = gen_reg_rtx (V4SImode); 12641 if (!nonimmediate_operand (op2, V8SImode)) 12642 op2 = copy_to_mode_reg (V8SImode, op2); 12643 emit_insn (gen_vec_extract_lo_v8si (half, op2)); 12644 op2 = half; 12645 break; 12646 case IX86_BUILTIN_SCATTERALTDIV8SF: 12647 case IX86_BUILTIN_SCATTERALTDIV8SI: 12648 half = gen_reg_rtx (mode3); 12649 if (mode3 == V4SFmode) 12650 gen = gen_vec_extract_lo_v8sf; 12651 else 12652 gen = gen_vec_extract_lo_v8si; 12653 if (!nonimmediate_operand (op3, GET_MODE (op3))) 12654 op3 = copy_to_mode_reg (GET_MODE (op3), op3); 12655 emit_insn (gen (half, op3)); 12656 op3 = half; 12657 break; 12658 case IX86_BUILTIN_SCATTERALTSIV2DF: 12659 case IX86_BUILTIN_SCATTERALTSIV2DI: 12660 if (!nonimmediate_operand (op2, V4SImode)) 12661 op2 = copy_to_mode_reg (V4SImode, op2); 12662 break; 12663 case IX86_BUILTIN_SCATTERALTDIV4SF: 12664 case IX86_BUILTIN_SCATTERALTDIV4SI: 12665 if (!nonimmediate_operand (op3, GET_MODE (op3))) 12666 op3 = copy_to_mode_reg (GET_MODE (op3), op3); 12667 break; 12668 default: 12669 break; 12670 } 12671 12672 /* Force memory operand only with base register here. 
But we 12673 don't want to do it on memory operand for other builtin 12674 functions. */ 12675 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1)); 12676 12677 if (!insn_data[icode].operand[0].predicate (op0, Pmode)) 12678 op0 = copy_to_mode_reg (Pmode, op0); 12679 12680 op1 = fixup_modeless_constant (op1, mode1); 12681 12682 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode) 12683 { 12684 if (!insn_data[icode].operand[1].predicate (op1, mode1)) 12685 op1 = copy_to_mode_reg (mode1, op1); 12686 } 12687 else 12688 { 12689 op1 = copy_to_reg (op1); 12690 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1)); 12691 } 12692 12693 if (!insn_data[icode].operand[2].predicate (op2, mode2)) 12694 op2 = copy_to_mode_reg (mode2, op2); 12695 12696 if (!insn_data[icode].operand[3].predicate (op3, mode3)) 12697 op3 = copy_to_mode_reg (mode3, op3); 12698 12699 if (!insn_data[icode].operand[4].predicate (op4, mode4)) 12700 { 12701 error ("the last argument must be scale 1, 2, 4, 8"); 12702 return const0_rtx; 12703 } 12704 12705 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4); 12706 if (! 
pat) 12707 return const0_rtx; 12708 12709 emit_insn (pat); 12710 return 0; 12711 12712 vec_prefetch_gen: 12713 arg0 = CALL_EXPR_ARG (exp, 0); 12714 arg1 = CALL_EXPR_ARG (exp, 1); 12715 arg2 = CALL_EXPR_ARG (exp, 2); 12716 arg3 = CALL_EXPR_ARG (exp, 3); 12717 arg4 = CALL_EXPR_ARG (exp, 4); 12718 op0 = expand_normal (arg0); 12719 op1 = expand_normal (arg1); 12720 op2 = expand_normal (arg2); 12721 op3 = expand_normal (arg3); 12722 op4 = expand_normal (arg4); 12723 mode0 = insn_data[icode].operand[0].mode; 12724 mode1 = insn_data[icode].operand[1].mode; 12725 mode3 = insn_data[icode].operand[3].mode; 12726 mode4 = insn_data[icode].operand[4].mode; 12727 12728 op0 = fixup_modeless_constant (op0, mode0); 12729 12730 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode) 12731 { 12732 if (!insn_data[icode].operand[0].predicate (op0, mode0)) 12733 op0 = copy_to_mode_reg (mode0, op0); 12734 } 12735 else 12736 { 12737 op0 = copy_to_reg (op0); 12738 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0)); 12739 } 12740 12741 if (!insn_data[icode].operand[1].predicate (op1, mode1)) 12742 op1 = copy_to_mode_reg (mode1, op1); 12743 12744 /* Force memory operand only with base register here. But we 12745 don't want to do it on memory operand for other builtin 12746 functions. */ 12747 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1)); 12748 12749 if (!insn_data[icode].operand[2].predicate (op2, Pmode)) 12750 op2 = copy_to_mode_reg (Pmode, op2); 12751 12752 if (!insn_data[icode].operand[3].predicate (op3, mode3)) 12753 { 12754 error ("the forth argument must be scale 1, 2, 4, 8"); 12755 return const0_rtx; 12756 } 12757 12758 if (!insn_data[icode].operand[4].predicate (op4, mode4)) 12759 { 12760 error ("incorrect hint operand"); 12761 return const0_rtx; 12762 } 12763 12764 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4); 12765 if (! 
pat) 12766 return const0_rtx; 12767 12768 emit_insn (pat); 12769 12770 return 0; 12771 12772 case IX86_BUILTIN_XABORT: 12773 icode = CODE_FOR_xabort; 12774 arg0 = CALL_EXPR_ARG (exp, 0); 12775 op0 = expand_normal (arg0); 12776 mode0 = insn_data[icode].operand[0].mode; 12777 if (!insn_data[icode].operand[0].predicate (op0, mode0)) 12778 { 12779 error ("the argument to %<xabort%> intrinsic must " 12780 "be an 8-bit immediate"); 12781 return const0_rtx; 12782 } 12783 emit_insn (gen_xabort (op0)); 12784 return 0; 12785 12786 case IX86_BUILTIN_RSTORSSP: 12787 case IX86_BUILTIN_CLRSSBSY: 12788 arg0 = CALL_EXPR_ARG (exp, 0); 12789 op0 = expand_normal (arg0); 12790 icode = (fcode == IX86_BUILTIN_RSTORSSP 12791 ? CODE_FOR_rstorssp 12792 : CODE_FOR_clrssbsy); 12793 if (!address_operand (op0, VOIDmode)) 12794 { 12795 op1 = convert_memory_address (Pmode, op0); 12796 op0 = copy_addr_to_reg (op1); 12797 } 12798 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0))); 12799 return 0; 12800 12801 case IX86_BUILTIN_WRSSD: 12802 case IX86_BUILTIN_WRSSQ: 12803 case IX86_BUILTIN_WRUSSD: 12804 case IX86_BUILTIN_WRUSSQ: 12805 arg0 = CALL_EXPR_ARG (exp, 0); 12806 op0 = expand_normal (arg0); 12807 arg1 = CALL_EXPR_ARG (exp, 1); 12808 op1 = expand_normal (arg1); 12809 switch (fcode) 12810 { 12811 case IX86_BUILTIN_WRSSD: 12812 icode = CODE_FOR_wrsssi; 12813 mode = SImode; 12814 break; 12815 case IX86_BUILTIN_WRSSQ: 12816 icode = CODE_FOR_wrssdi; 12817 mode = DImode; 12818 break; 12819 case IX86_BUILTIN_WRUSSD: 12820 icode = CODE_FOR_wrusssi; 12821 mode = SImode; 12822 break; 12823 case IX86_BUILTIN_WRUSSQ: 12824 icode = CODE_FOR_wrussdi; 12825 mode = DImode; 12826 break; 12827 } 12828 op0 = force_reg (mode, op0); 12829 if (!address_operand (op1, VOIDmode)) 12830 { 12831 op2 = convert_memory_address (Pmode, op1); 12832 op1 = copy_addr_to_reg (op2); 12833 } 12834 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1))); 12835 return 0; 12836 12837 case IX86_BUILTIN_VZEROUPPER: 12838 
cfun->machine->has_explicit_vzeroupper = true; 12839 break; 12840 12841 default: 12842 break; 12843 } 12844 12845 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST 12846 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST) 12847 { 12848 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST; 12849 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp, 12850 target); 12851 } 12852 12853 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST 12854 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST) 12855 { 12856 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST; 12857 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL; 12858 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx); 12859 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx); 12860 int masked = 1; 12861 machine_mode mode, wide_mode, nar_mode; 12862 12863 nar_mode = V4SFmode; 12864 mode = V16SFmode; 12865 wide_mode = V64SFmode; 12866 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask; 12867 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz; 12868 12869 switch (fcode) 12870 { 12871 case IX86_BUILTIN_4FMAPS: 12872 fcn = gen_avx5124fmaddps_4fmaddps; 12873 masked = 0; 12874 goto v4fma_expand; 12875 12876 case IX86_BUILTIN_4DPWSSD: 12877 nar_mode = V4SImode; 12878 mode = V16SImode; 12879 wide_mode = V64SImode; 12880 fcn = gen_avx5124vnniw_vp4dpwssd; 12881 masked = 0; 12882 goto v4fma_expand; 12883 12884 case IX86_BUILTIN_4DPWSSDS: 12885 nar_mode = V4SImode; 12886 mode = V16SImode; 12887 wide_mode = V64SImode; 12888 fcn = gen_avx5124vnniw_vp4dpwssds; 12889 masked = 0; 12890 goto v4fma_expand; 12891 12892 case IX86_BUILTIN_4FNMAPS: 12893 fcn = gen_avx5124fmaddps_4fnmaddps; 12894 masked = 0; 12895 goto v4fma_expand; 12896 12897 case IX86_BUILTIN_4FNMAPS_MASK: 12898 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask; 12899 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz; 12900 goto v4fma_expand; 12901 12902 case IX86_BUILTIN_4DPWSSD_MASK: 12903 nar_mode = V4SImode; 12904 mode = V16SImode; 12905 wide_mode = V64SImode; 12906 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask; 12907 
fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz; 12908 goto v4fma_expand; 12909 12910 case IX86_BUILTIN_4DPWSSDS_MASK: 12911 nar_mode = V4SImode; 12912 mode = V16SImode; 12913 wide_mode = V64SImode; 12914 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask; 12915 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz; 12916 goto v4fma_expand; 12917 12918 case IX86_BUILTIN_4FMAPS_MASK: 12919 { 12920 tree args[4]; 12921 rtx ops[4]; 12922 rtx wide_reg; 12923 rtx accum; 12924 rtx addr; 12925 rtx mem; 12926 12927v4fma_expand: 12928 wide_reg = gen_reg_rtx (wide_mode); 12929 for (i = 0; i < 4; i++) 12930 { 12931 args[i] = CALL_EXPR_ARG (exp, i); 12932 ops[i] = expand_normal (args[i]); 12933 12934 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64), 12935 ops[i]); 12936 } 12937 12938 accum = expand_normal (CALL_EXPR_ARG (exp, 4)); 12939 accum = force_reg (mode, accum); 12940 12941 addr = expand_normal (CALL_EXPR_ARG (exp, 5)); 12942 addr = force_reg (Pmode, addr); 12943 12944 mem = gen_rtx_MEM (nar_mode, addr); 12945 12946 target = gen_reg_rtx (mode); 12947 12948 emit_move_insn (target, accum); 12949 12950 if (! masked) 12951 emit_insn (fcn (target, accum, wide_reg, mem)); 12952 else 12953 { 12954 rtx merge, mask; 12955 merge = expand_normal (CALL_EXPR_ARG (exp, 6)); 12956 12957 mask = expand_normal (CALL_EXPR_ARG (exp, 7)); 12958 12959 if (CONST_INT_P (mask)) 12960 mask = fixup_modeless_constant (mask, HImode); 12961 12962 mask = force_reg (HImode, mask); 12963 12964 if (GET_MODE (mask) != HImode) 12965 mask = gen_rtx_SUBREG (HImode, mask, 0); 12966 12967 /* If merge is 0 then we're about to emit z-masked variant. */ 12968 if (const0_operand (merge, mode)) 12969 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask)); 12970 /* If merge is the same as accum then emit merge-masked variant. 
*/ 12971 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4)) 12972 { 12973 merge = force_reg (mode, merge); 12974 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask)); 12975 } 12976 /* Merge with something unknown might happen if we z-mask w/ -O0. */ 12977 else 12978 { 12979 target = gen_reg_rtx (mode); 12980 emit_move_insn (target, merge); 12981 emit_insn (fcn_mask (target, wide_reg, mem, target, mask)); 12982 } 12983 } 12984 return target; 12985 } 12986 12987 case IX86_BUILTIN_4FNMASS: 12988 fcn = gen_avx5124fmaddps_4fnmaddss; 12989 masked = 0; 12990 goto s4fma_expand; 12991 12992 case IX86_BUILTIN_4FMASS: 12993 fcn = gen_avx5124fmaddps_4fmaddss; 12994 masked = 0; 12995 goto s4fma_expand; 12996 12997 case IX86_BUILTIN_4FNMASS_MASK: 12998 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask; 12999 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz; 13000 goto s4fma_expand; 13001 13002 case IX86_BUILTIN_4FMASS_MASK: 13003 { 13004 tree args[4]; 13005 rtx ops[4]; 13006 rtx wide_reg; 13007 rtx accum; 13008 rtx addr; 13009 rtx mem; 13010 13011 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask; 13012 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz; 13013 13014s4fma_expand: 13015 mode = V4SFmode; 13016 wide_reg = gen_reg_rtx (V64SFmode); 13017 for (i = 0; i < 4; i++) 13018 { 13019 rtx tmp; 13020 args[i] = CALL_EXPR_ARG (exp, i); 13021 ops[i] = expand_normal (args[i]); 13022 13023 tmp = gen_reg_rtx (SFmode); 13024 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0)); 13025 13026 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64), 13027 gen_rtx_SUBREG (V16SFmode, tmp, 0)); 13028 } 13029 13030 accum = expand_normal (CALL_EXPR_ARG (exp, 4)); 13031 accum = force_reg (V4SFmode, accum); 13032 13033 addr = expand_normal (CALL_EXPR_ARG (exp, 5)); 13034 addr = force_reg (Pmode, addr); 13035 13036 mem = gen_rtx_MEM (V4SFmode, addr); 13037 13038 target = gen_reg_rtx (V4SFmode); 13039 13040 emit_move_insn (target, accum); 13041 13042 if (! 
masked) 13043 emit_insn (fcn (target, accum, wide_reg, mem)); 13044 else 13045 { 13046 rtx merge, mask; 13047 merge = expand_normal (CALL_EXPR_ARG (exp, 6)); 13048 13049 mask = expand_normal (CALL_EXPR_ARG (exp, 7)); 13050 13051 if (CONST_INT_P (mask)) 13052 mask = fixup_modeless_constant (mask, QImode); 13053 13054 mask = force_reg (QImode, mask); 13055 13056 if (GET_MODE (mask) != QImode) 13057 mask = gen_rtx_SUBREG (QImode, mask, 0); 13058 13059 /* If merge is 0 then we're about to emit z-masked variant. */ 13060 if (const0_operand (merge, mode)) 13061 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask)); 13062 /* If merge is the same as accum then emit merge-masked 13063 variant. */ 13064 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4)) 13065 { 13066 merge = force_reg (mode, merge); 13067 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask)); 13068 } 13069 /* Merge with something unknown might happen if we z-mask 13070 w/ -O0. */ 13071 else 13072 { 13073 target = gen_reg_rtx (mode); 13074 emit_move_insn (target, merge); 13075 emit_insn (fcn_mask (target, wide_reg, mem, target, mask)); 13076 } 13077 } 13078 return target; 13079 } 13080 case IX86_BUILTIN_RDPID: 13081 return ix86_expand_special_args_builtin (bdesc_args + i, exp, 13082 target); 13083 case IX86_BUILTIN_FABSQ: 13084 case IX86_BUILTIN_COPYSIGNQ: 13085 if (!TARGET_SSE) 13086 /* Emit a normal call if SSE isn't available. 
*/ 13087 return expand_call (exp, target, ignore); 13088 /* FALLTHRU */ 13089 default: 13090 return ix86_expand_args_builtin (bdesc_args + i, exp, target); 13091 } 13092 } 13093 13094 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST 13095 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST) 13096 { 13097 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST; 13098 return ix86_expand_sse_comi (bdesc_comi + i, exp, target); 13099 } 13100 13101 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST 13102 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST) 13103 { 13104 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST; 13105 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target); 13106 } 13107 13108 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST 13109 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST) 13110 { 13111 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST; 13112 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target); 13113 } 13114 13115 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST 13116 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST) 13117 { 13118 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST; 13119 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target); 13120 } 13121 13122 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST 13123 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST) 13124 { 13125 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST; 13126 const struct builtin_description *d = bdesc_multi_arg + i; 13127 return ix86_expand_multi_arg_builtin (d->icode, exp, target, 13128 (enum ix86_builtin_func_type) 13129 d->flag, d->comparison); 13130 } 13131 13132 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST 13133 && fcode <= IX86_BUILTIN__BDESC_CET_LAST) 13134 { 13135 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST; 13136 return ix86_expand_special_args_builtin (bdesc_cet + i, exp, 13137 target); 13138 } 13139 13140 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST 13141 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST) 13142 { 13143 i = fcode - 
IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
      return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
					       target);
    }

  gcc_unreachable ();
}

/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
   fill target with val via vec_duplicate.  */

static bool
ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
{
  bool ok;
  rtx_insn *insn;
  rtx dup;

  /* First attempt to recognize VAL as-is.  */
  dup = gen_vec_duplicate (mode, val);
  insn = emit_insn (gen_rtx_SET (target, dup));
  if (recog_memoized (insn) < 0)
    {
      rtx_insn *seq;
      machine_mode innermode = GET_MODE_INNER (mode);
      rtx reg;

      /* If that fails, force VAL into a register.  */

      start_sequence ();
      reg = force_reg (innermode, val);
      if (GET_MODE (reg) != innermode)
	reg = gen_lowpart (innermode, reg);
      /* Patch the already-emitted insn in place instead of emitting a
	 new SET; any insns needed to load the register are spliced in
	 just before it.  */
      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
      seq = get_insns ();
      end_sequence ();
      if (seq)
	emit_insn_before (seq, insn);

      /* The register form is asserted to be recognizable, so this
	 function never actually reports failure.  */
      ok = recog_memoized (insn) >= 0;
      gcc_assert (ok);
    }
  return true;
}

/* Get a vector mode of the same size as the original but with elements
   twice as wide.  This is only guaranteed to apply to integral vectors.  */

static machine_mode
get_mode_wider_vector (machine_mode o)
{
  /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
  machine_mode n = GET_MODE_WIDER_MODE (o).require ();
  /* Same total size, half the element count.  */
  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
  return n;
}

static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);

/* A subroutine of ix86_expand_vector_init.
Store into TARGET a vector
   with all elements equal to VAR.  Return true if successful.  */

static bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
				   rtx target, rtx val)
{
  bool ok;

  switch (mode)
    {
    case E_V2SImode:
    case E_V2SFmode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V4DFmode:
    case E_V4DImode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V2DFmode:
    case E_V2DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V16SFmode:
    case E_V8DFmode:
      /* These modes have a direct vec_duplicate expansion.  */
      return ix86_vector_duplicate_value (mode, target, val);

    case E_V4HImode:
      if (!mmx_ok)
	return false;
      if (TARGET_SSE || TARGET_3DNOW_A)
	{
	  rtx x;

	  /* Duplicate the low HImode part of VAL via a truncate of
	     its SImode lowpart.  */
	  val = gen_lowpart (SImode, val);
	  x = gen_rtx_TRUNCATE (HImode, val);
	  x = gen_rtx_VEC_DUPLICATE (mode, x);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      goto widen;

    case E_V8QImode:
      if (!mmx_ok)
	return false;
      goto widen;

    case E_V8HImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	{
	  /* Broadcast by placing VAL in element 0 of a V4SImode
	     vector and then permuting it across all elements.
	     Shared with the V16QImode case via the label below.  */
	  struct expand_vec_perm_d dperm;
	  rtx tmp1, tmp2;

	permute:
	  memset (&dperm, 0, sizeof (dperm));
	  dperm.target = target;
	  dperm.vmode = mode;
	  dperm.nelt = GET_MODE_NUNITS (mode);
	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
	  dperm.one_operand_p = true;

	  /* Extend to SImode using a paradoxical SUBREG.  */
	  tmp1 = gen_reg_rtx (SImode);
	  emit_move_insn (tmp1, gen_lowpart (SImode, val));

	  /* Insert the SImode value as low element of a V4SImode vector.  */
	  tmp2 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
	  emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));

	  ok = (expand_vec_perm_1 (&dperm)
		|| expand_vec_perm_broadcast_1 (&dperm));
	  gcc_assert (ok);
	  return ok;
	}
      goto widen;

    case E_V16QImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	goto permute;
      goto widen;

    widen:
      /* Replicate the value once into the next wider mode and recurse.  */
      {
	machine_mode smode, wsmode, wvmode;
	rtx x;

	smode = GET_MODE_INNER (mode);
	wvmode = get_mode_wider_vector (mode);
	wsmode = GET_MODE_INNER (wvmode);

	/* Build a wider scalar holding two copies of VAL:
	   (val << bits) | val, computed zero-extended.  */
	val = convert_modes (wsmode, smode, val, true);
	x = expand_simple_binop (wsmode, ASHIFT, val,
				 GEN_INT (GET_MODE_BITSIZE (smode)),
				 NULL_RTX, 1, OPTAB_LIB_WIDEN);
	val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);

	x = gen_reg_rtx (wvmode);
	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
	gcc_assert (ok);
	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
	return ok;
      }

    case E_V16HImode:
    case E_V32QImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  /* Broadcast into a 128-bit half and concatenate it with
	     itself to fill the 256-bit vector.  */
	  machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  gcc_assert (ok);

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    case E_V64QImode:
    case E_V32HImode:
      if (TARGET_AVX512BW)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  /* Likewise: broadcast into a 256-bit half, then concat.  */
	  machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  gcc_assert (ok);

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    default:
      return false;
    }
}

/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   whose ONE_VAR element is VAR, and other elements are zero.  Return true
   if successful.  */

static bool
ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
				     rtx target, rtx var, int one_var)
{
  machine_mode vsimode;
  rtx new_target;
  rtx x, tmp;
  bool use_vector_set = false;
  rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;

  /* First decide whether a direct vector-set insn is usable for this
     mode, and whether a dedicated element-0 setter exists.  */
  switch (mode)
    {
    case E_V2DImode:
      /* For SSE4.1, we normally use vector set.  But if the second
	 element is zero and inter-unit moves are OK, we use movq
	 instead.  */
      use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
			     && one_var == 0));
      break;
    case E_V16QImode:
    case E_V4SImode:
    case E_V4SFmode:
      use_vector_set = TARGET_SSE4_1;
      break;
    case E_V8HImode:
      use_vector_set = TARGET_SSE2;
      break;
    case E_V8QImode:
      use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      break;
    case E_V4HImode:
      use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
      break;
    case E_V32QImode:
    case E_V16HImode:
      use_vector_set = TARGET_AVX;
      break;
    case E_V8SImode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8si_0;
      break;
    case E_V8SFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8sf_0;
      break;
    case E_V4DFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv4df_0;
      break;
    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX && TARGET_64BIT;
      gen_vec_set_0 = gen_vec_setv4di_0;
      break;
    case E_V16SImode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16si_0;
      break;
    case E_V16SFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16sf_0;
      break;
    case E_V8DFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8df_0;
      break;
    case E_V8DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8di_0;
      break;
    default:
      break;
    }

  if (use_vector_set)
    {
      /* Prefer the dedicated element-0 pattern when applicable;
	 otherwise zero TARGET and insert VAR at ONE_VAR.  */
      if (gen_vec_set_0 && one_var == 0)
	{
	  var = force_reg (GET_MODE_INNER (mode), var);
	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
	  return true;
	}
      emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
      var = force_reg (GET_MODE_INNER (mode), var);
      ix86_expand_vector_set (mmx_ok, target, var, one_var);
      return true;
    }

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
      if (one_var != 0)
	return false;
      /* Concatenate VAR with a scalar zero.  */
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
      emit_insn (gen_rtx_SET (target, x));
      return true;

    case E_V4SFmode:
    case E_V4SImode:
      if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
	new_target = gen_reg_rtx (mode);
      else
	new_target = target;
      /* Duplicate VAR, then merge with zero keeping only element 0.  */
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_DUPLICATE (mode, var);
      x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
      emit_insn (gen_rtx_SET (new_target, x));
      if (one_var != 0)
	{
	  /* We need to shuffle the value to the correct position, so
	     create a new pseudo to store the intermediate result.  */

	  /* With SSE2, we can use the integer shuffle insns.  */
	  if (mode != V4SFmode && TARGET_SSE2)
	    {
	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
					    const1_rtx,
					    GEN_INT (one_var == 1 ? 0 : 1),
					    GEN_INT (one_var == 2 ? 0 : 1),
					    GEN_INT (one_var == 3 ? 0 : 1)));
	      if (target != new_target)
		emit_move_insn (target, new_target);
	      return true;
	    }

	  /* Otherwise convert the intermediate result to V4SFmode and
	     use the SSE1 shuffle instructions.  */
	  if (mode != V4SFmode)
	    {
	      tmp = gen_reg_rtx (V4SFmode);
	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
	    }
	  else
	    tmp = new_target;

	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
					  const1_rtx,
					  GEN_INT (one_var == 1 ? 0 : 1),
					  GEN_INT (one_var == 2 ? 0+4 : 1+4),
					  GEN_INT (one_var == 3 ? 0+4 : 1+4)));

	  if (mode != V4SFmode)
	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
	  else if (tmp != target)
	    emit_move_insn (target, tmp);
	}
      else if (target != new_target)
	emit_move_insn (target, new_target);
      return true;

    case E_V8HImode:
    case E_V16QImode:
      vsimode = V4SImode;
      goto widen;
    case E_V4HImode:
    case E_V8QImode:
      if (!mmx_ok)
	return false;
      vsimode = V2SImode;
      goto widen;
    widen:
      if (one_var != 0)
	return false;

      /* Zero extend the variable element to SImode and recurse.  */
      var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);

      x = gen_reg_rtx (vsimode);
      if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
						var, one_var))
	gcc_unreachable ();

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }
}

/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   consisting of the values in VALS.  It is known that all elements
   except ONE_VAR are constants.  Return true if successful.
*/

static bool
ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals, int one_var)
{
  rtx var = XVECEXP (vals, 0, one_var);
  machine_mode wmode;
  rtx const_vec, x;

  /* Build the all-constant vector with a zero in the ONE_VAR slot;
     the strategy is to load that and then overwrite one element.  */
  const_vec = copy_rtx (vals);
  XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
  const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));

  switch (mode)
    {
    case E_V2DFmode:
    case E_V2DImode:
    case E_V2SFmode:
    case E_V2SImode:
      /* For the two element vectors, it's just as easy to use
	 the general case.  */
      return false;

    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      if (!TARGET_64BIT)
	return false;
      /* FALLTHRU */
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V16HImode:
    case E_V32QImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V8HImode:
    case E_V4HImode:
      break;

    case E_V16QImode:
      if (TARGET_SSE4_1)
	break;
      wmode = V8HImode;
      goto widen;
    case E_V8QImode:
      if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
	break;
      wmode = V4HImode;
      goto widen;
    widen:
      /* There's no way to set one QImode entry easily.  Combine
	 the variable value with its adjacent constant value, and
	 promote to an HImode set.  */
      x = XVECEXP (vals, 0, one_var ^ 1);
      if (one_var & 1)
	{
	  /* VAR is the high byte of the HImode pair.  */
	  var = convert_modes (HImode, QImode, var, true);
	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	  x = GEN_INT (INTVAL (x) & 0xff);
	}
      else
	{
	  /* VAR is the low byte; the constant neighbor goes high.  */
	  var = convert_modes (HImode, QImode, var, true);
	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
	}
      if (x != const0_rtx)
	var = expand_simple_binop (HImode, IOR, var, x, var,
				   1, OPTAB_LIB_WIDEN);

      x = gen_reg_rtx (wmode);
      emit_move_insn (x, gen_lowpart (wmode, const_vec));
      ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }

  /* Load the constant vector and overwrite the variable element.  */
  emit_move_insn (target, const_vec);
  ix86_expand_vector_set (mmx_ok, target, var, one_var);
  return true;
}

/* A subroutine of ix86_expand_vector_init_general.  Use vector
   concatenate to handle the most general case: all values variable,
   and none identical.  */

static void
ix86_expand_vector_init_concat (machine_mode mode,
				rtx target, rtx *ops, int n)
{
  machine_mode half_mode = VOIDmode;
  rtx half[2];
  rtvec v;
  int i, j;

  switch (n)
    {
    case 2:
      /* Base case: concatenate the two half-mode operands directly.  */
      switch (mode)
	{
	case E_V16SImode:
	  half_mode = V8SImode;
	  break;
	case E_V16SFmode:
	  half_mode = V8SFmode;
	  break;
	case E_V8DImode:
	  half_mode = V4DImode;
	  break;
	case E_V8DFmode:
	  half_mode = V4DFmode;
	  break;
	case E_V8SImode:
	  half_mode = V4SImode;
	  break;
	case E_V8SFmode:
	  half_mode = V4SFmode;
	  break;
	case E_V4DImode:
	  half_mode = V2DImode;
	  break;
	case E_V4DFmode:
	  half_mode = V2DFmode;
	  break;
	case E_V4SImode:
	  half_mode = V2SImode;
	  break;
	case E_V4SFmode:
	  half_mode = V2SFmode;
	  break;
	case E_V2DImode:
	  half_mode = DImode;
	  break;
	case E_V2SImode:
	  half_mode = SImode;
	  break;
	case E_V2DFmode:
	  half_mode = DFmode;
	  break;
	case E_V2SFmode:
	  half_mode = SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}

      if (!register_operand (ops[1], half_mode))
	ops[1] = force_reg (half_mode, ops[1]);
      if (!register_operand (ops[0], half_mode))
	ops[0] = force_reg (half_mode, ops[0]);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
							  ops[1])));
      break;

    case 4:
      switch (mode)
	{
	case E_V4DImode:
	  half_mode = V2DImode;
	  break;
	case E_V4DFmode:
	  half_mode = V2DFmode;
	  break;
	case E_V4SImode:
	  half_mode = V2SImode;
	  break;
	case E_V4SFmode:
	  half_mode = V2SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 8:
      switch (mode)
	{
	case E_V8DImode:
	  half_mode = V4DImode;
	  break;
	case E_V8DFmode:
	  half_mode = V4DFmode;
	  break;
	case E_V8SImode:
	  half_mode = V4SImode;
	  break;
	case E_V8SFmode:
	  half_mode = V4SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 16:
      switch (mode)
	{
	case E_V16SImode:
	  half_mode = V8SImode;
	  break;
	case E_V16SFmode:
	  half_mode = V8SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

half:
      /* Recursive case: build each half-mode vector from half of the
	 operands, then concatenate the two halves.  */
      /* FIXME: We process inputs backward to help RA.  PR 36222.  */
      i = n - 1;
      for (j = 1; j != -1; j--)
	{
	  half[j] = gen_reg_rtx (half_mode);
	  switch (n >> 1)
	    {
	    case 2:
	      v = gen_rtvec (2, ops[i-1], ops[i]);
	      i -= 2;
	      break;
	    case 4:
	      v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
	      i -= 4;
	      break;
	    case 8:
	      v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
			     ops[i-3], ops[i-2], ops[i-1], ops[i]);
	      i -= 8;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  ix86_expand_vector_init (false, half[j],
				   gen_rtx_PARALLEL (half_mode, v));
	}

      ix86_expand_vector_init_concat (mode, target, half, 2);
      break;

    default:
      gcc_unreachable ();
    }
}

/* A subroutine of ix86_expand_vector_init_general.  Use vector
   interleave to handle the most general case: all values variable,
   and none identical.
*/

static void
ix86_expand_vector_init_interleave (machine_mode mode,
				    rtx target, rtx *ops, int n)
{
  machine_mode first_imode, second_imode, third_imode, inner_mode;
  int i, j;
  rtx op0, op1;
  rtx (*gen_load_even) (rtx, rtx, rtx);
  rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
  rtx (*gen_interleave_second_low) (rtx, rtx, rtx);

  /* Select the element-insert and unpack-low generators plus the
     sequence of progressively wider integer modes for this MODE.
     THIRD_IMODE is only needed on the V16QImode path.  */
  switch (mode)
    {
    case E_V8HImode:
      gen_load_even = gen_vec_setv8hi;
      gen_interleave_first_low = gen_vec_interleave_lowv4si;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      inner_mode = HImode;
      first_imode = V4SImode;
      second_imode = V2DImode;
      third_imode = VOIDmode;
      break;
    case E_V16QImode:
      gen_load_even = gen_vec_setv16qi;
      gen_interleave_first_low = gen_vec_interleave_lowv8hi;
      gen_interleave_second_low = gen_vec_interleave_lowv4si;
      inner_mode = QImode;
      first_imode = V8HImode;
      second_imode = V4SImode;
      third_imode = V2DImode;
      break;
    default:
      gcc_unreachable ();
    }

  /* Pack each pair of input elements (2*i, 2*i+1) into the low part
     of a fresh vector; OPS is reused in place to hold the results.  */
  for (i = 0; i < n; i++)
    {
      /* Extend the odd element to SImode using a paradoxical SUBREG.  */
      op0 = gen_reg_rtx (SImode);
      emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));

      /* Insert the SImode value as low element of V4SImode vector.  */
      op1 = gen_reg_rtx (V4SImode);
      op0 = gen_rtx_VEC_MERGE (V4SImode,
			       gen_rtx_VEC_DUPLICATE (V4SImode,
						      op0),
			       CONST0_RTX (V4SImode),
			       const1_rtx);
      emit_insn (gen_rtx_SET (op1, op0));

      /* Cast the V4SImode vector back to a vector in original mode.  */
      op0 = gen_reg_rtx (mode);
      emit_move_insn (op0, gen_lowpart (mode, op1));

      /* Load even elements into the second position.  */
      emit_insn (gen_load_even (op0,
				force_reg (inner_mode,
					   ops [i + i + 1]),
				const1_rtx));

      /* Cast vector to FIRST_IMODE vector.  */
      ops[i] = gen_reg_rtx (first_imode);
      emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
    }

  /* Interleave low FIRST_IMODE vectors.  */
  for (i = j = 0; i < n; i += 2, j++)
    {
      op0 = gen_reg_rtx (first_imode);
      emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));

      /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
      ops[j] = gen_reg_rtx (second_imode);
      emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
    }

  /* Interleave low SECOND_IMODE vectors.  */
  switch (second_imode)
    {
    case E_V4SImode:
      /* One extra interleave round, then fall through for the final
	 V2DImode round.  */
      for (i = j = 0; i < n / 2; i += 2, j++)
	{
	  op0 = gen_reg_rtx (second_imode);
	  emit_insn (gen_interleave_second_low (op0, ops[i],
						ops[i + 1]));

	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE
	     vector.  */
	  ops[j] = gen_reg_rtx (third_imode);
	  emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
	}
      second_imode = V2DImode;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      /* FALLTHRU */

    case E_V2DImode:
      op0 = gen_reg_rtx (second_imode);
      emit_insn (gen_interleave_second_low (op0, ops[0],
					    ops[1]));

      /* Cast the SECOND_IMODE vector back to a vector in the original
	 mode.  */
      emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
      break;

    default:
      gcc_unreachable ();
    }
}

/* A subroutine of ix86_expand_vector_init.  Handle the most general case:
   all values variable, and none identical.
*/ 13919 13920static void 13921ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode, 13922 rtx target, rtx vals) 13923{ 13924 rtx ops[64], op0, op1, op2, op3, op4, op5; 13925 machine_mode half_mode = VOIDmode; 13926 machine_mode quarter_mode = VOIDmode; 13927 int n, i; 13928 13929 switch (mode) 13930 { 13931 case E_V2SFmode: 13932 case E_V2SImode: 13933 if (!mmx_ok && !TARGET_SSE) 13934 break; 13935 /* FALLTHRU */ 13936 13937 case E_V16SImode: 13938 case E_V16SFmode: 13939 case E_V8DFmode: 13940 case E_V8DImode: 13941 case E_V8SFmode: 13942 case E_V8SImode: 13943 case E_V4DFmode: 13944 case E_V4DImode: 13945 case E_V4SFmode: 13946 case E_V4SImode: 13947 case E_V2DFmode: 13948 case E_V2DImode: 13949 n = GET_MODE_NUNITS (mode); 13950 for (i = 0; i < n; i++) 13951 ops[i] = XVECEXP (vals, 0, i); 13952 ix86_expand_vector_init_concat (mode, target, ops, n); 13953 return; 13954 13955 case E_V2TImode: 13956 for (i = 0; i < 2; i++) 13957 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); 13958 op0 = gen_reg_rtx (V4DImode); 13959 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2); 13960 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); 13961 return; 13962 13963 case E_V4TImode: 13964 for (i = 0; i < 4; i++) 13965 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); 13966 ops[4] = gen_reg_rtx (V4DImode); 13967 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2); 13968 ops[5] = gen_reg_rtx (V4DImode); 13969 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2); 13970 op0 = gen_reg_rtx (V8DImode); 13971 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2); 13972 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); 13973 return; 13974 13975 case E_V32QImode: 13976 half_mode = V16QImode; 13977 goto half; 13978 13979 case E_V16HImode: 13980 half_mode = V8HImode; 13981 goto half; 13982 13983half: 13984 n = GET_MODE_NUNITS (mode); 13985 for (i = 0; i < n; i++) 13986 ops[i] = XVECEXP (vals, 0, i); 13987 op0 = gen_reg_rtx 
(half_mode); 13988 op1 = gen_reg_rtx (half_mode); 13989 ix86_expand_vector_init_interleave (half_mode, op0, ops, 13990 n >> 2); 13991 ix86_expand_vector_init_interleave (half_mode, op1, 13992 &ops [n >> 1], n >> 2); 13993 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1))); 13994 return; 13995 13996 case E_V64QImode: 13997 quarter_mode = V16QImode; 13998 half_mode = V32QImode; 13999 goto quarter; 14000 14001 case E_V32HImode: 14002 quarter_mode = V8HImode; 14003 half_mode = V16HImode; 14004 goto quarter; 14005 14006quarter: 14007 n = GET_MODE_NUNITS (mode); 14008 for (i = 0; i < n; i++) 14009 ops[i] = XVECEXP (vals, 0, i); 14010 op0 = gen_reg_rtx (quarter_mode); 14011 op1 = gen_reg_rtx (quarter_mode); 14012 op2 = gen_reg_rtx (quarter_mode); 14013 op3 = gen_reg_rtx (quarter_mode); 14014 op4 = gen_reg_rtx (half_mode); 14015 op5 = gen_reg_rtx (half_mode); 14016 ix86_expand_vector_init_interleave (quarter_mode, op0, ops, 14017 n >> 3); 14018 ix86_expand_vector_init_interleave (quarter_mode, op1, 14019 &ops [n >> 2], n >> 3); 14020 ix86_expand_vector_init_interleave (quarter_mode, op2, 14021 &ops [n >> 1], n >> 3); 14022 ix86_expand_vector_init_interleave (quarter_mode, op3, 14023 &ops [(n >> 1) | (n >> 2)], n >> 3); 14024 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1))); 14025 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3))); 14026 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5))); 14027 return; 14028 14029 case E_V16QImode: 14030 if (!TARGET_SSE4_1) 14031 break; 14032 /* FALLTHRU */ 14033 14034 case E_V8HImode: 14035 if (!TARGET_SSE2) 14036 break; 14037 14038 /* Don't use ix86_expand_vector_init_interleave if we can't 14039 move from GPR to SSE register directly. 
*/ 14040 if (!TARGET_INTER_UNIT_MOVES_TO_VEC) 14041 break; 14042 14043 n = GET_MODE_NUNITS (mode); 14044 for (i = 0; i < n; i++) 14045 ops[i] = XVECEXP (vals, 0, i); 14046 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1); 14047 return; 14048 14049 case E_V4HImode: 14050 case E_V8QImode: 14051 break; 14052 14053 default: 14054 gcc_unreachable (); 14055 } 14056 14057 { 14058 int i, j, n_elts, n_words, n_elt_per_word; 14059 machine_mode inner_mode; 14060 rtx words[4], shift; 14061 14062 inner_mode = GET_MODE_INNER (mode); 14063 n_elts = GET_MODE_NUNITS (mode); 14064 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD; 14065 n_elt_per_word = n_elts / n_words; 14066 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode)); 14067 14068 for (i = 0; i < n_words; ++i) 14069 { 14070 rtx word = NULL_RTX; 14071 14072 for (j = 0; j < n_elt_per_word; ++j) 14073 { 14074 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1); 14075 elt = convert_modes (word_mode, inner_mode, elt, true); 14076 14077 if (j == 0) 14078 word = elt; 14079 else 14080 { 14081 word = expand_simple_binop (word_mode, ASHIFT, word, shift, 14082 NULL_RTX, 1, OPTAB_LIB_WIDEN); 14083 word = expand_simple_binop (word_mode, IOR, word, elt, 14084 NULL_RTX, 1, OPTAB_LIB_WIDEN); 14085 } 14086 } 14087 14088 words[i] = word; 14089 } 14090 14091 if (n_words == 1) 14092 emit_move_insn (target, gen_lowpart (mode, words[0])); 14093 else if (n_words == 2) 14094 { 14095 rtx tmp = gen_reg_rtx (mode); 14096 emit_clobber (tmp); 14097 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]); 14098 emit_move_insn (gen_highpart (word_mode, tmp), words[1]); 14099 emit_move_insn (target, tmp); 14100 } 14101 else if (n_words == 4) 14102 { 14103 rtx tmp = gen_reg_rtx (V4SImode); 14104 gcc_assert (word_mode == SImode); 14105 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words)); 14106 ix86_expand_vector_init_general (false, V4SImode, tmp, vals); 14107 emit_move_insn (target, gen_lowpart (mode, tmp)); 14108 } 14109 else 14110 
      gcc_unreachable ();
  }
}

/* Initialize vector TARGET via VALS.  Suppress the use of MMX
   instructions unless MMX_OK is true.  */

void
ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int n_elts = GET_MODE_NUNITS (mode);
  int n_var = 0, one_var = -1;
  bool all_same = true, all_const_zero = true;
  int i;
  rtx x;

  /* Handle first initialization from vector elts.  In this case VALS
     holds two half-width subvectors (asserted below) that are simply
     concatenated into TARGET.  */
  if (n_elts != XVECLEN (vals, 0))
    {
      rtx subtarget = target;
      x = XVECEXP (vals, 0, 0);
      gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
      if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
	{
	  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
	  /* For QI/HI/TI element modes there is no direct concat
	     pattern; recast the halves as SI- or DI-element vectors
	     of the same total width and concat those instead.  */
	  if (inner_mode == QImode
	      || inner_mode == HImode
	      || inner_mode == TImode)
	    {
	      unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
	      scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
	      n_bits /= GET_MODE_SIZE (elt_mode);
	      mode = mode_for_vector (elt_mode, n_bits).require ();
	      inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
	      ops[0] = gen_lowpart (inner_mode, ops[0]);
	      ops[1] = gen_lowpart (inner_mode, ops[1]);
	      subtarget = gen_reg_rtx (mode);
	    }
	  ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
	  if (subtarget != target)
	    emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
	  return;
	}
      gcc_unreachable ();
    }

  /* Classify the elements: count the non-constant ones (N_VAR, the
     last such index in ONE_VAR), and record whether all elements are
     identical and/or all constant zero.  */
  for (i = 0; i < n_elts; ++i)
    {
      x = XVECEXP (vals, 0, i);
      if (!(CONST_SCALAR_INT_P (x)
	    || CONST_DOUBLE_P (x)
	    || CONST_FIXED_P (x)))
	n_var++, one_var = i;
      else if (x != CONST0_RTX (inner_mode))
	all_const_zero = false;
      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
	all_same = false;
    }

  /* Constants are best loaded from the constant pool.  */
  if (n_var == 0)
    {
      emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
      return;
    }

  /* If all values are identical, broadcast the value.  */
  if (all_same
      && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
					    XVECEXP (vals, 0, 0)))
    return;

  /* Values where only one field is non-constant are best loaded from
     the pool and overwritten via move later.  */
  if (n_var == 1)
    {
      if (all_const_zero
	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
						  XVECEXP (vals, 0, one_var),
						  one_var))
	return;

      if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
	return;
    }

  ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
}

/* Store scalar VAL into element ELT of vector TARGET.  Suppress the
   use of MMX instructions unless MMX_OK is true.  */

void
ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  machine_mode half_mode;
  bool use_vec_merge = false;
  rtx tmp;
  /* lo/hi half extract and insert generators for the 256-bit modes,
     indexed by [j][i] where J selects the mode (set in the switch
     below) and I selects the low (0) or high (1) half.  */
  static rtx (*gen_extract[6][2]) (rtx, rtx)
    = {
	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
      };
  static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
    = {
	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
      };
  int i, j, n;
  machine_mode mmode = VOIDmode;
  rtx (*gen_blendm) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case E_V2SImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_merge)
	break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (mmx_ok)
	{
	  /* Rebuild the two-element vector: extract the element we
	     keep, then VEC_CONCAT it with VAL in the right order.  */
	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
	  if (elt == 0)
	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
	  else
	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
	  emit_insn (gen_rtx_SET (target, tmp));
	  return;
	}
      break;

    case E_V2DImode:
      use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
      if (use_vec_merge)
	break;

      /* Without SSE4.1 + 64-bit, rebuild the vector as a VEC_CONCAT
	 of VAL with the preserved other element.  */
      tmp = gen_reg_rtx (GET_MODE_INNER (mode));
      ix86_expand_vector_extract (false, tmp, target, 1 - elt);
      if (elt == 0)
	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
      else
	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
      emit_insn (gen_rtx_SET (target, tmp));
      return;

    case E_V2DFmode:
      /* NB: For ELT == 0, use standard scalar operation patterns which
	 preserve the rest of the vector for combiner:

	 (vec_merge:V2DF
	   (vec_duplicate:V2DF (reg:DF))
	   (reg:V2DF)
	   (const_int 1))
       */
      if (elt == 0)
	goto do_vec_merge;

      {
	rtx op0, op1;

	/* For the two element vectors, we implement a VEC_CONCAT with
	   the extraction of the other element.  */

	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);

	if (elt == 0)
	  op0 = val, op1 = tmp;
	else
	  op0 = tmp, op1 = val;

	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
	emit_insn (gen_rtx_SET (target, tmp));
      }
      return;

    case E_V4SFmode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
	break;

      switch (elt)
	{
	case 0:
	  use_vec_merge = true;
	  break;

	case 1:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* target = A A B B */
	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
	  /* target = X A B B */
	  ix86_expand_vector_set (false, target, val, 0);
	  /* target = A X C D */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const1_rtx, const0_rtx,
					  GEN_INT (2+4), GEN_INT (3+4)));
	  return;

	case 2:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* tmp = X B C D */
	  ix86_expand_vector_set (false, tmp, val, 0);
	  /* target = A B X D */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const0_rtx, const1_rtx,
					  GEN_INT (0+4), GEN_INT (3+4)));
	  return;

	case 3:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* tmp = X B C D */
	  ix86_expand_vector_set (false, tmp, val, 0);
	  /* target = A B C X */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const0_rtx, const1_rtx,
					  GEN_INT (2+4), GEN_INT (0+4)));
	  return;

	default:
	  gcc_unreachable ();
	}
      break;

    case E_V4SImode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
	break;

      /* Element 0 handled by vec_merge below.  */
      if (elt == 0)
	{
	  use_vec_merge = true;
	  break;
	}

      if (TARGET_SSE2)
	{
	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
	     store into element 0, then shuffle them back.  */

	  rtx order[4];

	  order[0] = GEN_INT (elt);
	  order[1] = const1_rtx;
	  order[2] = const2_rtx;
	  order[3] = GEN_INT (3);
	  order[elt] = const0_rtx;

	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
					order[1], order[2], order[3]));

	  ix86_expand_vector_set (false, target, val, 0);

	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
					order[1], order[2], order[3]));
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  rtx t = gen_reg_rtx (V4SFmode);
	  emit_move_insn (t, gen_lowpart (V4SFmode, target));
	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
	  emit_move_insn (target, gen_lowpart (mode, t));
	}
      return;

    case E_V8HImode:
      use_vec_merge = TARGET_SSE2;
      break;
    case E_V4HImode:
      use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
      use_vec_merge = TARGET_SSE4_1;
      break;

    case E_V8QImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      break;

    /* 256-bit modes: recurse on one 128-bit half.  J indexes the
       gen_extract/gen_insert tables, N is elements per half.  */
    case E_V32QImode:
      half_mode = V16QImode;
      j = 0;
      n = 16;
      goto half;

    case E_V16HImode:
      half_mode = V8HImode;
      j = 1;
      n = 8;
      goto half;

    case E_V8SImode:
      half_mode = V4SImode;
      j = 2;
      n = 4;
      goto half;

    case E_V4DImode:
      half_mode = V2DImode;
      j = 3;
      n = 2;
      goto half;

    case E_V8SFmode:
      half_mode = V4SFmode;
      j = 4;
      n = 4;
      goto half;

    case E_V4DFmode:
      half_mode = V2DFmode;
      j = 5;
      n = 2;
      goto half;

half:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 1);

      /* Extract the half.  */
      tmp = gen_reg_rtx (half_mode);
      emit_insn (gen_extract[j][i] (tmp, target));

      /* Put val in tmp at elt.  */
      ix86_expand_vector_set (false, tmp, val, elt);

      /* Put it back.  */
      emit_insn (gen_insert[j][i] (target, target, tmp));
      return;

    case E_V8DFmode:
      if (TARGET_AVX512F)
	{
	  mmode = QImode;
	  gen_blendm = gen_avx512f_blendmv8df;
	}
      break;

    case E_V8DImode:
      if (TARGET_AVX512F)
	{
	  mmode = QImode;
	  gen_blendm = gen_avx512f_blendmv8di;
	}
      break;

    case E_V16SFmode:
      if (TARGET_AVX512F)
	{
	  mmode = HImode;
	  gen_blendm = gen_avx512f_blendmv16sf;
	}
      break;

    case E_V16SImode:
      if (TARGET_AVX512F)
	{
	  mmode = HImode;
	  gen_blendm = gen_avx512f_blendmv16si;
	}
      break;

    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  mmode = SImode;
	  gen_blendm = gen_avx512bw_blendmv32hi;
	}
      else if (TARGET_AVX512F)
	{
	  half_mode = E_V8HImode;
	  n = 8;
	  goto quarter;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  mmode = DImode;
	  gen_blendm = gen_avx512bw_blendmv64qi;
	}
      else if (TARGET_AVX512F)
	{
	  half_mode = E_V16QImode;
	  n = 16;
	  goto quarter;
	}
      break;

quarter:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 3);

      {
	/* Extract the quarter.  */
	tmp = gen_reg_rtx (V4SImode);
	rtx tmp2 = gen_lowpart (V16SImode, target);
	rtx mask = gen_reg_rtx (QImode);

	emit_move_insn (mask, constm1_rtx);
	emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
						   tmp, mask));

	tmp2 = gen_reg_rtx (half_mode);
	emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
	tmp = tmp2;

	/* Put val in tmp at elt.  */
	ix86_expand_vector_set (false, tmp, val, elt);

	/* Put it back.  */
	tmp2 = gen_reg_rtx (V16SImode);
	rtx tmp3 = gen_lowpart (V16SImode, target);
	mask = gen_reg_rtx (HImode);
	emit_move_insn (mask, constm1_rtx);
	tmp = gen_lowpart (V4SImode, tmp);
	emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
						  tmp3, mask));
	emit_move_insn (target, gen_lowpart (mode, tmp2));
      }
      return;

    default:
      break;
    }

  if (mmode != VOIDmode)
    {
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
      /* The avx512*_blendm<mode> expanders have different operand order
	 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
	 elements where the mask is set and second input operand otherwise,
	 in {sse,avx}*_*blend* the first input operand is used for elements
	 where the mask is clear and second input operand otherwise.  */
      emit_insn (gen_blendm (target, target, tmp,
			     force_reg (mmode,
					gen_int_mode (HOST_WIDE_INT_1U << elt,
						      mmode))));
    }
  else if (use_vec_merge)
    {
do_vec_merge:
      tmp = gen_rtx_VEC_DUPLICATE (mode, val);
      tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
			       GEN_INT (HOST_WIDE_INT_1U << elt));
      emit_insn (gen_rtx_SET (target, tmp));
    }
  else
    {
      /* No direct insert pattern available: spill the vector to a
	 stack temporary, store the element, and reload.  */
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, target);

      tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
      emit_move_insn (tmp, val);

      emit_move_insn (target, mem);
    }
}

/* Extract element ELT of vector VEC into scalar TARGET.  Suppress the
   use of MMX instructions unless MMX_OK is true.  */

void
ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
{
  machine_mode mode = GET_MODE (vec);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  bool use_vec_extr = false;
  rtx tmp;

  switch (mode)
    {
    case E_V2SImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_extr)
	break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (!mmx_ok)
	break;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
    case E_V2TImode:
    case E_V4TImode:
      use_vec_extr = true;
      break;

    case E_V4SFmode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      /* Without SSE4.1, shuffle the wanted element into position 0
	 and extract from there.  */
      switch (elt)
	{
	case 0:
	  tmp = vec;
	  break;

	case 1:
	case 3:
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
					  GEN_INT (elt), GEN_INT (elt),
					  GEN_INT (elt+4), GEN_INT (elt+4)));
	  break;

	case 2:
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
	  break;

	default:
	  gcc_unreachable ();
	}
      vec = tmp;
      use_vec_extr = true;
      elt = 0;
      break;

    case
    E_V4SImode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      if (TARGET_SSE2)
	{
	  /* Shuffle the wanted element into position 0, then extract
	     from there.  */
	  switch (elt)
	    {
	    case 0:
	      tmp = vec;
	      break;

	    case 1:
	    case 3:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
					    GEN_INT (elt), GEN_INT (elt),
					    GEN_INT (elt), GEN_INT (elt)));
	      break;

	    case 2:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
	      break;

	    default:
	      gcc_unreachable ();
	    }
	  vec = tmp;
	  use_vec_extr = true;
	  elt = 0;
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
				      gen_lowpart (V4SFmode, vec), elt);
	  return;
	}
      break;

    case E_V8HImode:
      use_vec_extr = TARGET_SSE2;
      break;
    case E_V4HImode:
      use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
      use_vec_extr = TARGET_SSE4_1;
      /* Without SSE4.1, element 0 can still be done cheaply as a
	 V4SImode extract followed by taking the QImode lowpart.  */
      if (!use_vec_extr
	  && TARGET_SSE2
	  && elt == 0
	  && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
	{
	  tmp = gen_reg_rtx (SImode);
	  ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
				      0);
	  emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
	  return;
	}
      break;

    /* Wide modes: extract the containing lo/hi half and recurse on it
       with the element index reduced modulo the half size.  */
    case E_V8SFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SFmode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DFmode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    case E_V32QImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V16QImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V16HImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V8HImode);
	  if (elt < 8)
	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
	  return;
	}
      break;

    case E_V8SImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SImode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DImode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V16HImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V32QImode);
	  if (elt < 32)
	    emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 31);
	  return;
	}
      break;

    case E_V16SFmode:
      tmp = gen_reg_rtx (V8SFmode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DFmode:
      tmp = gen_reg_rtx (V4DFmode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V16SImode:
      tmp = gen_reg_rtx (V8SImode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DImode:
      tmp = gen_reg_rtx (V4DImode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V8QImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      /* ??? Could extract the appropriate HImode element and shift.  */
      break;

    default:
      break;
    }

  if (use_vec_extr)
    {
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
      tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);

      /* Let the rtl optimizers know about the zero extension performed.  */
      if (inner_mode == QImode || inner_mode == HImode)
	{
	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
	  target = gen_lowpart (SImode, target);
	}

      emit_insn (gen_rtx_SET (target, tmp))
;
    }
  else
    {
      /* No extract pattern: go through a stack temporary.  */
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, vec);

      tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
      emit_move_insn (target, tmp);
    }
}

/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
   to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
   The upper bits of DEST are undefined, though they shouldn't cause
   exceptions (some bits from src or all zeros are ok).  */

static void
emit_reduc_half (rtx dest, rtx src, int i)
{
  rtx tem, d = dest;
  switch (GET_MODE (src))
    {
    case E_V4SFmode:
      if (i == 128)
	tem = gen_sse_movhlps (dest, src, src);
      else
	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
				   GEN_INT (1 + 4), GEN_INT (1 + 4));
      break;
    case E_V2DFmode:
      tem = gen_vec_interleave_highv2df (dest, src, src);
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
      /* Integer 128-bit modes: a V1TImode logical right shift by I/2
	 bits moves the upper half down.  */
      d = gen_reg_rtx (V1TImode);
      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
				GEN_INT (i / 2));
      break;
    case E_V8SFmode:
      if (i == 256)
	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufps256 (dest, src, src,
				 GEN_INT (i == 128 ?
				 2 + (3 << 2) : 1));
      break;
    case E_V4DFmode:
      if (i == 256)
	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      if (i == 256)
	{
	  /* Swap the 128-bit lanes so the high lane lands low.  */
	  if (GET_MODE (dest) != V4DImode)
	    d = gen_reg_rtx (V4DImode);
	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
				   gen_lowpart (V4DImode, src),
				   const1_rtx);
	}
      else
	{
	  /* Shift within each 128-bit lane.  */
	  d = gen_reg_rtx (V2TImode);
	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
				    GEN_INT (i / 2));
	}
      break;
    case E_V64QImode:
    case E_V32HImode:
      if (i < 64)
	{
	  d = gen_reg_rtx (V4TImode);
	  tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
					GEN_INT (i / 2));
	  break;
	}
      /* FALLTHRU */
    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DImode:
    case E_V8DFmode:
      if (i > 128)
	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
					gen_lowpart (V16SImode, src),
					gen_lowpart (V16SImode, src),
					GEN_INT (0x4 + (i == 512 ? 4 : 0)),
					GEN_INT (0x5 + (i == 512 ? 4 : 0)),
					GEN_INT (0x6 + (i == 512 ? 4 : 0)),
					GEN_INT (0x7 + (i == 512 ? 4 : 0)),
					GEN_INT (0xC), GEN_INT (0xD),
					GEN_INT (0xE), GEN_INT (0xF),
					GEN_INT (0x10), GEN_INT (0x11),
					GEN_INT (0x12), GEN_INT (0x13),
					GEN_INT (0x14), GEN_INT (0x15),
					GEN_INT (0x16), GEN_INT (0x17));
      else
	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
				    gen_lowpart (V16SImode, src),
				    GEN_INT (i == 128 ? 0x2 : 0x1),
				    GEN_INT (0x3),
				    GEN_INT (0x3),
				    GEN_INT (0x3),
				    GEN_INT (i == 128 ? 0x6 : 0x5),
				    GEN_INT (0x7),
				    GEN_INT (0x7),
				    GEN_INT (0x7),
				    GEN_INT (i == 128 ? 0xA : 0x9),
				    GEN_INT (0xB),
				    GEN_INT (0xB),
				    GEN_INT (0xB),
				    GEN_INT (i == 128 ? 0xE : 0xD),
				    GEN_INT (0xF),
				    GEN_INT (0xF),
				    GEN_INT (0xF));
      break;
    default:
      gcc_unreachable ();
    }
  emit_insn (tem);
  /* When the shuffle went through a scratch in a different mode,
     copy the result into DEST.  */
  if (d != dest)
    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
}

/* Expand a vector reduction.  FN is the binary pattern to reduce;
   DEST is the destination; IN is the input vector.  */

void
ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
{
  rtx half, dst, vec = in;
  machine_mode mode = GET_MODE (in);
  int i;

  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
  if (TARGET_SSE4_1
      && mode == V8HImode
      && fn == gen_uminv8hi3)
    {
      emit_insn (gen_sse4_1_phminposuw (dest, in));
      return;
    }

  /* Repeatedly fold the upper half onto the lower half with FN,
     halving the active width each step, until one element remains
     (the final step writes straight into DEST).  */
  for (i = GET_MODE_BITSIZE (mode);
       i > GET_MODE_UNIT_BITSIZE (mode);
       i >>= 1)
    {
      half = gen_reg_rtx (mode);
      emit_reduc_half (half, vec, i);
      if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
	dst = dest;
      else
	dst = gen_reg_rtx (mode);
      emit_insn (fn (dst, half, vec));
      vec = dst;
    }
}

/* Output code to perform a conditional jump to LABEL, if C2 flag in
   FP status register is set.
 */

void
ix86_emit_fp_unordered_jump (rtx label)
{
  rtx reg = gen_reg_rtx (HImode);
  rtx_insn *insn;
  rtx temp;

  /* Fetch the x87 status word into REG.  */
  emit_insn (gen_x86_fnstsw_1 (reg));

  if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
    {
      /* Transfer the status flags into EFLAGS and test for unordered.  */
      emit_insn (gen_x86_sahf_1 (reg));

      temp = gen_rtx_REG (CCmode, FLAGS_REG);
      temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
    }
  else
    {
      /* Test bit 0x04 of the status word's high byte directly.  */
      emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));

      temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
    }

  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
			       gen_rtx_LABEL_REF (VOIDmode, label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
  /* The unordered case is expected to be rare.  */
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  JUMP_LABEL (insn) = label;
}

/* Output code to perform a sinh XFmode calculation:
   sinh(x) = copysign (0.5 * (expm1(|x|) + expm1(|x|)/(expm1(|x|) + 1)), x).  */

void ix86_emit_i387_sinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|) */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}

/* Output code to perform a cosh XFmode calculation:
   cosh(x) = 0.5 * (exp(x) + 1/exp(x)).  */

void ix86_emit_i387_cosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1;

  /* e1 = exp (op1) */
  emit_insn (gen_expxf2 (e1, op1));

  /* e2 = e1 + 1.0 / e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_divxf3 (e2, cst1, e1));
  emit_insn (gen_addxf3 (e2, e1, e2));

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}

/* Output code to perform a tanh XFmode calculation.
 */

void ix86_emit_i387_tanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst2, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (-|2 * op1|) */
  emit_insn (gen_addxf3 (e2, op1, op1));
  emit_insn (gen_absxf2 (e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 2.0) */
  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst2));
  emit_insn (gen_divxf3 (e2, e1, e2));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}

/* Output code to perform an asinh XFmode calculation.
 */

void ix86_emit_i387_asinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
  emit_insn (gen_mulxf3 (e1, op1, op1));
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));
  emit_insn (gen_addxf3 (e2, e2, cst1));

  /* e1 = e1 / e2 */
  emit_insn (gen_divxf3 (e1, e1, e2));

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1))
;

  /* e1 = e1 + |op1| */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_addxf3 (e1, e1, e2));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}

/* Output code to perform an acosh XFmode calculation.
 */

void ix86_emit_i387_acosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));

  /* acosh(x) = log (x + sqrt(x - 1.0) * sqrt(x + 1.0))  */

  /* e2 = sqrt (op1 + 1.0) */
  emit_insn (gen_addxf3 (e2, op1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));

  /* e1 = sqrt (op1 - 1.0) */
  emit_insn (gen_subxf3 (e1, op1, cst1));
  emit_insn (gen_sqrtxf2 (e1, e1));

  /* e1 = e1 * e2 */
  emit_insn (gen_mulxf3 (e1, e1, e2));

  /* e1 = e1 + op1 */
  emit_insn (gen_addxf3 (e1, e1, op1));

  /* op0 = log (e1) */
  emit_insn (gen_logxf2 (op0, e1));
}

/* Output code to perform an atanh XFmode calculation.  */

void ix86_emit_i387_atanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e2 = |op1| */
  emit_insn (gen_absxf2 (e2, op1));

  /* e1 = -(e2 + e2) / (e2 + 1.0) */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e1, e2, cst1));
  emit_insn (gen_addxf3 (e2, e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_divxf3 (e1, e2, e1));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET
(pc_rtx, tmp)); 15312 predict_jump (REG_BR_PROB_BASE * 50 / 100); 15313 JUMP_LABEL (insn) = jump_label; 15314 15315 emit_insn (gen_negxf2 (e2, e2)); 15316 15317 emit_label (jump_label); 15318 LABEL_NUSES (jump_label) = 1; 15319 15320 /* op0 = 0.5 * e2 */ 15321 half = force_reg (XFmode, half); 15322 emit_insn (gen_mulxf3 (op0, e2, half)); 15323} 15324 15325/* Output code to perform a log1p XFmode calculation. */ 15326 15327void ix86_emit_i387_log1p (rtx op0, rtx op1) 15328{ 15329 rtx_code_label *label1 = gen_label_rtx (); 15330 rtx_code_label *label2 = gen_label_rtx (); 15331 15332 rtx tmp = gen_reg_rtx (XFmode); 15333 rtx res = gen_reg_rtx (XFmode); 15334 rtx cst, cstln2, cst1; 15335 rtx_insn *insn; 15336 15337 /* The emit_jump call emits pending stack adjust, make sure it is emitted 15338 before the conditional jump, otherwise the stack adjustment will be 15339 only conditional. */ 15340 do_pending_stack_adjust (); 15341 15342 cst = const_double_from_real_value 15343 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode); 15344 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */ 15345 15346 emit_insn (gen_absxf2 (tmp, op1)); 15347 15348 cst = force_reg (XFmode, cst); 15349 ix86_expand_branch (GE, tmp, cst, label1); 15350 predict_jump (REG_BR_PROB_BASE * 10 / 100); 15351 insn = get_last_insn (); 15352 JUMP_LABEL (insn) = label1; 15353 15354 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2)); 15355 emit_jump (label2); 15356 15357 emit_label (label1); 15358 LABEL_NUSES (label1) = 1; 15359 15360 cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); 15361 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1))); 15362 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2)); 15363 15364 emit_label (label2); 15365 LABEL_NUSES (label2) = 1; 15366 15367 emit_move_insn (op0, res); 15368} 15369 15370/* Emit code for round calculation. 
*/
/* Computes round (OP1) into OP0 using x87 arithmetic:
   round (a) = sgn (a) * floor (|a| + 0.5).
   SFmode/DFmode inputs are first extended to XFmode; the output mode
   selects between frndint-based and lfloor-based rounding and the
   matching negation pattern.  */
void ix86_emit_i387_round (rtx op0, rtx op1)
{
  machine_mode inmode = GET_MODE (op1);
  machine_mode outmode = GET_MODE (op0);
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx res = gen_reg_rtx (outmode);
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx (*floor_insn) (rtx, rtx);
  rtx (*neg_insn) (rtx, rtx);
  rtx_insn *insn;
  rtx tmp;

  /* Widen narrower inputs to XFmode for the x87 computation.  */
  switch (inmode)
    {
    case E_SFmode:
    case E_DFmode:
      tmp = gen_reg_rtx (XFmode);

      emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
      op1 = tmp;
      break;
    case E_XFmode:
      break;
    default:
      gcc_unreachable ();
    }

  /* Pick floor and negate patterns matching the output mode.  */
  switch (outmode)
    {
    case E_SFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negsf2;
      break;
    case E_DFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negdf2;
      break;
    case E_XFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negxf2;
      break;
    case E_HImode:
      floor_insn = gen_lfloorxfhi2;
      neg_insn = gen_neghi2;
      break;
    case E_SImode:
      floor_insn = gen_lfloorxfsi2;
      neg_insn = gen_negsi2;
      break;
    case E_DImode:
      floor_insn = gen_lfloorxfdi2;
      neg_insn = gen_negdi2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */

  /* scratch = fxam(op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = fabs(op1) */
  emit_insn (gen_absxf2 (e1, op1));

  /* e2 = e1 + 0.5 */
  half = force_reg (XFmode, half);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));

  /* res = floor(e2) */
  switch (outmode)
    {
    case E_SFmode:
    case E_DFmode:
      {
	/* Round in XFmode, then truncate to the narrower float mode;
	   the truncation cannot change the value (hence NOOP).  */
	tmp = gen_reg_rtx (XFmode);

	emit_insn (floor_insn (tmp, e2));
	emit_insn (gen_rtx_SET (res,
				gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
						UNSPEC_TRUNC_NOOP)));
      }
      break;
    default:
      emit_insn (floor_insn (res, e2));
    }

  /* flags = signbit(a), from the fxam result stored in scratch.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then res = -res */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (neg_insn (res, res));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, res);
}

/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].
*/

void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
{
  rtx x0, x1, e0, e1;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  x1 = gen_reg_rtx (mode);

  /* One refinement step on the hardware reciprocal estimate:
     a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */

  b = force_reg (mode, b);

  /* x0 = rcp(b) estimate.  Pick the widest estimate available:
     RCP28 (AVX512ER, accurate enough to skip refinement), RCP14
     (512-bit AVX512F), or the base SSE RCP.  */
  if (mode == V16SFmode || mode == V8DFmode)
    {
      if (TARGET_AVX512ER)
	{
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						      UNSPEC_RCP28)));
	  /* res = a * x0 */
	  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
	  return;
	}
      else
	emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						    UNSPEC_RCP14)));
    }
  else
    emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						UNSPEC_RCP)));

  /* e0 = x0 * b */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));

  /* e0 = x0 * e0 */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));

  /* e1 = x0 + x0 */
  emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));

  /* x1 = e1 - e0 */
  emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));

  /* res = a * x1 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
}

/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.
*/

void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
{
  rtx x0, e0, e1, e2, e3, mthree, mhalf;
  REAL_VALUE_TYPE r;
  int unspec;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  e2 = gen_reg_rtx (mode);
  e3 = gen_reg_rtx (mode);

  /* With AVX512ER the rsqrt28 estimate is used directly, with no
     Newton-Raphson refinement step.  */
  if (TARGET_AVX512ER && mode == V16SFmode)
    {
      if (recip)
	/* res = rsqrt28(a) estimate */
	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						     UNSPEC_RSQRT28)));
      else
	{
	  /* x0 = rsqrt28(a) estimate */
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						      UNSPEC_RSQRT28)));
	  /* res = rcp28(x0) estimate */
	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
						       UNSPEC_RCP28)));
	}
      return;
    }

  /* Constants -3.0 and -0.5 used by the refinement formula below.  */
  real_from_integer (&r, VOIDmode, -3, SIGNED);
  mthree = const_double_from_real_value (r, SFmode);

  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
  mhalf = const_double_from_real_value (r, SFmode);
  unspec = UNSPEC_RSQRT;

  if (VECTOR_MODE_P (mode))
    {
      mthree = ix86_build_const_vector (mode, true, mthree);
      mhalf = ix86_build_const_vector (mode, true, mhalf);
      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
      if (GET_MODE_SIZE (mode) == 64)
	unspec = UNSPEC_RSQRT14;
    }

  /* One refinement step on the hardware rsqrt estimate:
     sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
     rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */

  a = force_reg (mode, a);

  /* x0 = rsqrt(a) estimate */
  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
					      unspec)));

  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
  if (!recip)
    {
      rtx zero = force_reg (mode, CONST0_RTX(mode));
      rtx mask;

      /* Handle masked compare.  */
      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
	{
	  mask = gen_reg_rtx (HImode);
	  /* Imm value 0x4 corresponds to not-equal comparison.  */
	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
	}
      else
	{
	  /* Zero out the estimate wherever a == 0.0 using the all-ones
	     compare mask.  */
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
	}
    }

  /* e0 = x0 * a */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
  /* e1 = e0 * x0 */
  emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));

  /* e2 = e1 - 3., computed as e1 + (-3.).  */
  mthree = force_reg (mode, mthree);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));

  mhalf = force_reg (mode, mhalf);
  if (recip)
    /* e3 = -.5 * x0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
  else
    /* e3 = -.5 * e0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
  /* ret = e2 * e3 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
}

/* Expand fabs (OP0) and return a new rtx that holds the result.  The
   mask for masking out the sign-bit is stored in *SMASK, if that is
   non-null.  */

static rtx
ix86_expand_sse_fabs (rtx op0, rtx *smask)
{
  machine_mode vmode, mode = GET_MODE (op0);
  rtx xa, mask;

  xa = gen_reg_rtx (mode);
  /* The sign-bit masks are built as vector constants; pick the vector
     mode corresponding to a scalar input mode.  */
  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    vmode = mode;
  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
  if (!VECTOR_MODE_P (mode))
    {
      /* We need to generate a scalar mode mask in this case.  */
      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
      mask = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (mask, tmp));
    }
  /* xa = op0 with the sign bit cleared.  */
  emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));

  if (smask)
    *smask = mask;

  return xa;
}

/* Expands a comparison of OP0 with OP1 using comparison code CODE,
   swapping the operands if SWAP_OPERANDS is true.  The expanded
   code is a forward jump to a newly created label in case the
   comparison is true.  The generated label rtx is returned.  */
static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
				  bool swap_operands)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx_code_label *label;
  rtx tmp, reg;

  if (swap_operands)
    std::swap (op0, op1);

  label = gen_label_rtx ();
  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
  emit_insn (gen_rtx_SET (reg, tmp));
  tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
  tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  JUMP_LABEL (tmp) = label;

  return label;
}

/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
   using comparison code CODE.  Operands are swapped for the comparison if
   SWAP_OPERANDS is true.  Returns a rtx for the generated mask.
*/ 15700static rtx 15701ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1, 15702 bool swap_operands) 15703{ 15704 rtx (*insn)(rtx, rtx, rtx, rtx); 15705 machine_mode mode = GET_MODE (op0); 15706 rtx mask = gen_reg_rtx (mode); 15707 15708 if (swap_operands) 15709 std::swap (op0, op1); 15710 15711 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse; 15712 15713 emit_insn (insn (mask, op0, op1, 15714 gen_rtx_fmt_ee (code, mode, op0, op1))); 15715 return mask; 15716} 15717 15718/* Expand copysign from SIGN to the positive value ABS_VALUE 15719 storing in RESULT. If MASK is non-null, it shall be a mask to mask out 15720 the sign-bit. */ 15721 15722static void 15723ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask) 15724{ 15725 machine_mode mode = GET_MODE (sign); 15726 rtx sgn = gen_reg_rtx (mode); 15727 if (mask == NULL_RTX) 15728 { 15729 machine_mode vmode; 15730 15731 if (mode == SFmode) 15732 vmode = V4SFmode; 15733 else if (mode == DFmode) 15734 vmode = V2DFmode; 15735 else 15736 vmode = mode; 15737 15738 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false); 15739 if (!VECTOR_MODE_P (mode)) 15740 { 15741 /* We need to generate a scalar mode mask in this case. */ 15742 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); 15743 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp); 15744 mask = gen_reg_rtx (mode); 15745 emit_insn (gen_rtx_SET (mask, tmp)); 15746 } 15747 } 15748 else 15749 mask = gen_rtx_NOT (mode, mask); 15750 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign))); 15751 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn))); 15752} 15753 15754/* Expand SSE sequence for computing lround from OP1 storing 15755 into OP0. 
*/

void
ix86_expand_lround (rtx op0, rtx op1)
{
  /* C code for the stuff we're doing below:
	tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
	return (long)tmp;
   */
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0), i.e. the largest value strictly below
     0.5, so that exact halfway cases still round away from zero.  */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* adj = copysign (0.5, op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj */
  expand_fix (op0, adj, 0);
}

/* Expand SSE2 sequence for computing lfloor or lceil (per DO_FLOOR)
   from OPERAND1 storing into OPERAND0.  */

void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
  /* C code for the stuff we're doing below (for do_floor):
	xi = (long)op1;
	xi -= (double)xi > op1 ? 1 : 0;
	return xi;
   */
  machine_mode fmode = GET_MODE (op1);
  machine_mode imode = GET_MODE (op0);
  rtx ireg, freg, tmp;
  rtx_code_label *label;

  /* reg = (long)op1 */
  ireg = gen_reg_rtx (imode);
  expand_fix (ireg, op1, 0);

  /* freg = (double)reg */
  freg = gen_reg_rtx (fmode);
  expand_float (freg, ireg, 0);

  /* ireg = (freg > op1) ? ireg - 1 : ireg */
  label = ix86_expand_sse_compare_and_jump (UNLE,
					    freg, op1, !do_floor);
  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (ireg, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (op0, ireg);
}

/* Generate and return a rtx of mode MODE for 2**n where n is the number
   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */

static rtx
ix86_gen_TWO52 (machine_mode mode)
{
  REAL_VALUE_TYPE TWO52r;
  rtx TWO52;

  real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
  TWO52 = const_double_from_real_value (TWO52r, mode);
  TWO52 = force_reg (mode, TWO52);

  return TWO52;
}

/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.
   Adding and subtracting 2**mantissa-bits forces rounding to integer
   in the current rounding mode; values with |x| >= 2**52 are already
   integral, hence the early-out comparison.  */

void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	xa = fabs (operand1);
	if (!isless (xa, 2**52))
	  return operand1;
	two52 = 2**52;
	if (flag_rounding_math)
	  {
	    two52 = copysign (two52, operand1);
	    xa = operand1;
	  }
	xa = xa + two52 - two52;
	return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, mask;
  rtx_code_label *label;

  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  TWO52 = ix86_gen_TWO52 (mode);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  if (flag_rounding_math)
    {
      ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
      xa = res;
    }

  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
    xa = ix86_expand_sse_fabs (xa, NULL);

  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

/* Expand SSE2 sequence for computing floor or ceil
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;

     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;

	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0).  The compare mask is
     all-ones where true, so ANDing with 1.0 yields the adjustment.  */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
	tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */
void
ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	xa = xa + TWO52 - TWO52;
	x2 = copysign (xa, x);

     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;

	if (HONOR_SIGNED_ZEROS (mode))
	  x2 = copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = xa + TWO52 - TWO52; */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* xa = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (xa, xa, res, mask);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
	tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

/* Expand SSE sequence for computing trunc
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;
	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* x = (double)(long)x; the fix/float round trip truncates.  */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, res, 0);
  expand_float (res, xi, 0);

  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

/* Expand SSE sequence for computing trunc from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */
void
ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
{
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	xa2 = xa + TWO52 - TWO52;
     Compensate:
	if (xa2 > xa)
	  xa2 -= 1.0;
	x2 = copysign (xa2, x);
	return x2;
   */

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, MINUS,
			     xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
    tmp = ix86_expand_sse_fabs (tmp, NULL);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, tmp, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

/* Expand SSE sequence for computing round
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_round (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	double xa = fabs (x);
	if (!isless (xa, TWO52))
	  return x;
	xa = (double)(long)(xa + nextafter (0.5, 0.0));
	return copysign (xa, x);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, TWO52, xa, xi, half, mask;
  rtx_code_label *label;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  TWO52 = ix86_gen_TWO52 (mode);
  xa = ix86_expand_sse_fabs (res, &mask);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* load nextafter (0.5, 0.0), the largest value strictly below 0.5,
     so exact halfway cases still round away from zero.  */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* xa = xa + 0.5 */
  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa = (double)(int64_t)xa */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, xa, 0);
  expand_float (xa, xi, 0);

  /* res = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */
void
ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), xa2, x2;
	if (!isless (xa, TWO52))
	  return x;
     Using the absolute value and copying back sign makes
     -0.0 -> -0.0 correct.
	xa2 = xa + TWO52 - TWO52;
     Compensate.
	dxa = xa2 - xa;
	if (dxa <= -0.5)
	  xa2 += 1;
	else if (dxa > 0.5)
	  xa2 -= 1;
	x2 = copysign (xa2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* dxa = xa2 - xa; */
  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);

  /* generate 0.5, 1.0 and -0.5 */
  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
			       0, OPTAB_DIRECT);

  /* Compensate.  */
  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}

/* Expand SSE sequence for computing round
   from OP1 storing into OP0 using sse4 round insn.  */
void
ix86_expand_round_sse4 (rtx op0, rtx op1)
{
  machine_mode mode = GET_MODE (op0);
  rtx e1, e2, res, half;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx (*gen_copysign) (rtx, rtx, rtx);
  rtx (*gen_round) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_SFmode:
      gen_copysign = gen_copysignsf3;
      gen_round = gen_sse4_1_roundsf2;
      break;
    case E_DFmode:
      gen_copysign = gen_copysigndf3;
      gen_round = gen_sse4_1_rounddf2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round (a) = trunc (a + copysign (0.5, a)) */

  /* load nextafter (0.5, 0.0), the largest value strictly below 0.5,
     so exact halfway cases still round away from zero.  */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
  half = const_double_from_real_value (pred_half, mode);

  /* e1 = copysign (0.5, op1) */
  e1 = gen_reg_rtx (mode);
  emit_insn (gen_copysign (e1, half, op1));

  /* e2 = op1 + e1 */
  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = trunc (e2) */
  res = gen_reg_rtx (mode);
  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));

  emit_move_insn (op0, res);
}

/* A cached (set (nil)
 (vselect (vconcat (nil) (nil)) (parallel [])))
   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
   insn every time.  */

static GTY(()) rtx_insn *vselect_insn;

/* Initialize vselect_insn.  The insn is built inside a throwaway
   sequence (start_sequence/end_sequence) so it is never emitted into a
   real instruction stream; it serves purely as a reusable scratch
   pattern for recog_memoized queries.  */

static void
init_vselect_insn (void)
{
  unsigned i;
  rtx x;

  /* Allocate the PARALLEL at maximum size; expand_vselect shrinks it
     per query via PUT_NUM_ELEM.  */
  x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
  for (i = 0; i < MAX_VECT_LEN; ++i)
    XVECEXP (x, 0, i) = const0_rtx;
  /* The V2DF/V4DF modes here are placeholders; expand_vselect and
     expand_vselect_vconcat overwrite the modes for each use.  */
  x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
							const0_rtx), x);
  x = gen_rtx_SET (const0_rtx, x);
  start_sequence ();
  vselect_insn = emit_insn (x);
  end_sequence ();
}

/* Construct (set target (vec_select op0 (parallel perm))) and
   return true if that's a valid instruction in the active ISA.

   TARGET receives the result, OP0 is the source vector, PERM holds
   NELT element indices.  When TESTING_P, only validity is checked and
   nothing is emitted.  The cached vselect_insn is mutated in place for
   the recog query and restored to a neutral state before returning.  */

static bool
expand_vselect (rtx target, rtx op0, const unsigned char *perm,
		unsigned nelt, bool testing_p)
{
  unsigned int i;
  rtx x, save_vconcat;
  int icode;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  /* Fill the (parallel [...]) selector with the requested indices.  */
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
  PUT_NUM_ELEM (XVEC (x, 0), nelt);
  for (i = 0; i < nelt; ++i)
    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
  /* Temporarily splice OP0 and TARGET into the cached pattern.  */
  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
  SET_DEST (PATTERN (vselect_insn)) = target;
  icode = recog_memoized (vselect_insn);

  if (icode >= 0 && !testing_p)
    emit_insn (copy_rtx (PATTERN (vselect_insn)));

  /* Restore the scratch insn and force re-recognition next time.  */
  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
  INSN_CODE (vselect_insn) = -1;

  return icode >= 0;
}

/* Similar, but generate a vec_concat from op0 and op1 as well.
 */

static bool
expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
			const unsigned char *perm, unsigned nelt,
			bool testing_p)
{
  machine_mode v2mode;
  rtx x;
  bool ok;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  /* The concatenation needs a mode twice as wide as OP0's; fail if the
     target has no such vector mode.  */
  if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
    return false;
  /* Reuse the cached vconcat inside vselect_insn, then reset its
     operands so no RTL is kept live across calls.  */
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  PUT_MODE (x, v2mode);
  XEXP (x, 0) = op0;
  XEXP (x, 1) = op1;
  ok = expand_vselect (target, x, perm, nelt, testing_p);
  XEXP (x, 0) = const0_rtx;
  XEXP (x, 1) = const0_rtx;
  return ok;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   using movss or movsd.

   Matches permutations that replace only element 0 of one operand with
   element 0 of the other (all remaining elements in order from a single
   operand), which is exactly the movss (V4SF) / movsd (V2DF) merge.  */
static bool
expand_vec_perm_movs (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  rtx x;

  if (d->one_operand_p)
    return false;

  if (!(TARGET_SSE && vmode == V4SFmode)
      && !(TARGET_SSE2 && vmode == V2DFmode))
    return false;

  /* Only the first element is changed.  Element 0 must come from
     element 0 of either operand; elements 1..nelt-1 must then all come
     from the other operand, in order.  */
  if (d->perm[0] != nelt && d->perm[0] != 0)
    return false;
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != i + nelt - d->perm[0])
      return false;

  if (d->testing_p)
    return true;

  /* VEC_MERGE with mask 1: take element 0 from the second rtx operand,
     the rest from the first.  */
  if (d->perm[0] == nelt)
    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
  else
    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));

  emit_insn (gen_rtx_SET (d->target, x));

  return true;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.
 */

static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
{
  machine_mode mmode, vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  unsigned HOST_WIDE_INT mask;
  rtx target, op0, op1, maskop, x;
  rtx rperm[32], vperm;

  if (d->one_operand_p)
    return false;
  /* ISA gate: 64-byte modes need AVX512F (and AVX512BW for sub-dword
     elements), 32-byte integer modes need AVX2, 32-byte float modes
     need AVX, 16-byte modes need SSE4.1.  */
  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
      && (TARGET_AVX512BW
	  || GET_MODE_UNIT_SIZE (vmode) >= 4))
    ;
  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
    ;
  else
    return false;

  /* This is a blend, not a permute.  Elements must stay in their
     respective lanes.  */
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (!(e == i || e == i + nelt))
	return false;
    }

  if (d->testing_p)
    return true;

  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
     decision should be extracted elsewhere, so that we only try that
     sequence once all budget==3 options have been tried.  */
  target = d->target;
  op0 = d->op0;
  op1 = d->op1;
  /* MASK gets one bit per blend-instruction element; a set bit means
     "take this element from op1".  */
  mask = 0;

  switch (vmode)
    {
    case E_V8DFmode:
    case E_V16SFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V2DFmode:
    case E_V4SFmode:
    case E_V8HImode:
    case E_V8SImode:
    case E_V32HImode:
    case E_V64QImode:
    case E_V16SImode:
    case E_V8DImode:
      /* Direct one-bit-per-element immediate or kmask.  */
      for (i = 0; i < nelt; ++i)
	mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
      break;

    case E_V2DImode:
      /* No DImode blend; widen to V8HImode, 4 mask bits per DI.  */
      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
      vmode = V8HImode;
      goto do_subreg;

    case E_V4SImode:
      /* No SImode blend pre-AVX2; widen to V8HImode, 2 bits per SI.  */
      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8HImode;
      goto do_subreg;

    case E_V16QImode:
      /* See if bytes move in pairs so we can use pblendw with
	 an immediate argument, rather than pblendvb with a vector
	 argument.  */
      for (i = 0; i < 16; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  {
	  use_pblendvb:
	    /* Build the byte-wise selector: 0 = from op0, -1 = op1.  */
	    for (i = 0; i < nelt; ++i)
	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);

	  finish_pblendvb:
	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
	    vperm = force_reg (vmode, vperm);

	    if (GET_MODE_SIZE (vmode) == 16)
	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
	    else
	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
	    if (target != d->target)
	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	    return true;
	  }

      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8HImode;
      /* FALLTHRU */

    do_subreg:
      /* Reinterpret everything in the widened VMODE; the final result
	 is copied back to d->target in d->vmode below.  */
      target = gen_reg_rtx (vmode);
      op0 = gen_lowpart (vmode, op0);
      op1 = gen_lowpart (vmode, op1);
      break;

    case E_V32QImode:
      /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
      for (i = 0; i < 32; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;
      /* See if bytes move in quadruplets.  If yes, vpblendd
	 with immediate can be used.  */
      for (i = 0; i < 32; i += 4)
	if (d->perm[i] + 2 != d->perm[i + 2])
	  break;
      if (i < 32)
	{
	  /* See if bytes move the same in both lanes.  If yes,
	     vpblendw with immediate can be used.  */
	  for (i = 0; i < 16; i += 2)
	    if (d->perm[i] + 16 != d->perm[i + 16])
	      goto use_pblendvb;

	  /* Use vpblendw.  */
	  for (i = 0; i < 16; ++i)
	    mask |= (d->perm[i * 2] >= 32) << i;
	  vmode = V16HImode;
	  goto do_subreg;
	}

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 4] >= 32) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V16HImode:
      /* See if words move in pairs.  If yes, vpblendd can be used.  */
      for (i = 0; i < 16; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  break;
      if (i < 16)
	{
	  /* See if words move the same in both lanes.  If not,
	     vpblendvb must be used.  */
	  for (i = 0; i < 8; i++)
	    if (d->perm[i] + 8 != d->perm[i + 8])
	      {
		/* Use vpblendvb.  Expand each word selector to two
		   identical byte selectors.  */
		for (i = 0; i < 32; ++i)
		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);

		vmode = V32QImode;
		nelt = 32;
		target = gen_reg_rtx (vmode);
		op0 = gen_lowpart (vmode, op0);
		op1 = gen_lowpart (vmode, op1);
		goto finish_pblendvb;
	      }

	  /* Use vpblendw.  */
	  for (i = 0; i < 16; ++i)
	    mask |= (d->perm[i] >= 16) << i;
	  break;
	}

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V4DImode:
      /* Use vpblendd; 2 dword mask bits per DI element.  */
      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8SImode;
      goto do_subreg;

    default:
      gcc_unreachable ();
    }

  /* For the AVX512 modes the blend mask lives in a mask register whose
     integer mode depends on the element count; VOIDmode means the mask
     is an immediate operand instead.  */
  switch (vmode)
    {
    case E_V8DFmode:
    case E_V8DImode:
      mmode = QImode;
      break;
    case E_V16SFmode:
    case E_V16SImode:
      mmode = HImode;
      break;
    case E_V32HImode:
      mmode = SImode;
      break;
    case E_V64QImode:
      mmode = DImode;
      break;
    default:
      mmode = VOIDmode;
    }

  if (mmode != VOIDmode)
    maskop = force_reg (mmode, gen_int_mode (mask, mmode));
  else
    maskop = GEN_INT (mask);

  /* This matches five different patterns with the different modes.
 */
  x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
  x = gen_rtx_SET (target, x);
  emit_insn (x);
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of the variable form of vpermilps.

   Note that we will have already failed the immediate input vpermilps,
   which requires that the high and low part shuffle be identical; the
   variable form doesn't require that.

   Only handles the one-operand V8SFmode case under AVX; the selector
   is materialized as a V8SImode constant vector.  */

static bool
expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
{
  rtx rperm[8], vperm;
  unsigned i;

  if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
    return false;

  /* We can only permute within the 128-bit lane.  */
  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];
      if (i < 4 ? e >= 4 : e < 4)
	return false;
    }

  if (d->testing_p)
    return true;

  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];

      /* Within each 128-bit lane, the elements of op0 are numbered
	 from 0 and the elements of op1 are numbered from 4.  */
      if (e >= 8 + 4)
	e -= 8;
      else if (e >= 4)
	e -= 4;

      rperm[i] = GEN_INT (e);
    }

  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
  vperm = force_reg (V8SImode, vperm);
  emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));

  return true;
}

/* Return true if permutation D can be performed as VMODE permutation
   instead.
 */

static bool
valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
{
  unsigned int i, j, chunk;

  /* Both modes must be integer vectors of the same total size.  */
  if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
      || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
      || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
    return false;

  /* Same or finer granularity: any permutation is expressible.  */
  if (GET_MODE_NUNITS (vmode) >= d->nelt)
    return true;

  /* Coarser granularity: each CHUNK-sized group must move as a unit,
     i.e. start chunk-aligned and be consecutive within the chunk.  */
  chunk = d->nelt / GET_MODE_NUNITS (vmode);
  for (i = 0; i < d->nelt; i += chunk)
    if (d->perm[i] & (chunk - 1))
      return false;
    else
      for (j = 1; j < chunk; ++j)
	if (d->perm[i] + j != d->perm[i + j])
	  return false;

  return true;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */

static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
  unsigned i, nelt, eltsz, mask;
  unsigned char perm[64];
  machine_mode vmode = V16QImode;
  rtx rperm[64], vperm, target, op0, op1;

  nelt = d->nelt;

  if (!d->one_operand_p)
    {
      /* Two-operand 16-byte permutes need XOP's vpperm; otherwise the
	 only two-operand case handled here is AVX2 vperm2i128 on whole
	 128-bit halves.  */
      if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
	{
	  if (TARGET_AVX2
	      && valid_perm_using_mode_p (V2TImode, d))
	    {
	      if (d->testing_p)
		return true;

	      /* Use vperm2i128 insn.  The pattern uses
		 V4DImode instead of V2TImode.
 */
	      target = d->target;
	      if (d->vmode != V4DImode)
		target = gen_reg_rtx (V4DImode);
	      op0 = gen_lowpart (V4DImode, d->op0);
	      op1 = gen_lowpart (V4DImode, d->op1);
	      /* Encode which 128-bit half feeds each output half: low
		 nibble for the low half, high nibble for the high.  */
	      rperm[0]
		= GEN_INT ((d->perm[0] / (nelt / 2))
			   | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
	      emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
	      if (target != d->target)
		emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	      return true;
	    }
	  return false;
	}
    }
  else
    {
      if (GET_MODE_SIZE (d->vmode) == 16)
	{
	  if (!TARGET_SSSE3)
	    return false;
	}
      else if (GET_MODE_SIZE (d->vmode) == 32)
	{
	  if (!TARGET_AVX2)
	    return false;

	  /* V4DImode should be already handled through
	     expand_vselect by vpermq instruction.  */
	  gcc_assert (d->vmode != V4DImode);

	  vmode = V32QImode;
	  if (d->vmode == V8SImode
	      || d->vmode == V16HImode
	      || d->vmode == V32QImode)
	    {
	      /* First see if vpermq can be used for
		 V8SImode/V16HImode/V32QImode.  */
	      if (valid_perm_using_mode_p (V4DImode, d))
		{
		  /* Map each qword's source position to a 2-bit index.  */
		  for (i = 0; i < 4; i++)
		    perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
		  if (d->testing_p)
		    return true;
		  target = gen_reg_rtx (V4DImode);
		  if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
				      perm, 4, false))
		    {
		      emit_move_insn (d->target,
				      gen_lowpart (d->vmode, target));
		      return true;
		    }
		  return false;
		}

	      /* Next see if vpermd can be used.  */
	      if (valid_perm_using_mode_p (V8SImode, d))
		vmode = V8SImode;
	    }
	  /* Or if vpermps can be used.  */
	  else if (d->vmode == V8SFmode)
	    vmode = V8SImode;

	  if (vmode == V32QImode)
	    {
	      /* vpshufb only works intra lanes, it is not
		 possible to shuffle bytes in between the lanes.  */
	      for (i = 0; i < nelt; ++i)
		if ((d->perm[i] ^ i) & (nelt / 2))
		  return false;
	    }
	}
      else if (GET_MODE_SIZE (d->vmode) == 64)
	{
	  if (!TARGET_AVX512BW)
	    return false;

	  /* If vpermq didn't work, vpshufb won't work either.  */
	  if (d->vmode == V8DFmode || d->vmode == V8DImode)
	    return false;

	  vmode = V64QImode;
	  if (d->vmode == V16SImode
	      || d->vmode == V32HImode
	      || d->vmode == V64QImode)
	    {
	      /* First see if vpermq can be used for
		 V16SImode/V32HImode/V64QImode.  */
	      if (valid_perm_using_mode_p (V8DImode, d))
		{
		  /* Map each qword's source position to a 3-bit index.  */
		  for (i = 0; i < 8; i++)
		    perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
		  if (d->testing_p)
		    return true;
		  target = gen_reg_rtx (V8DImode);
		  if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
				      perm, 8, false))
		    {
		      emit_move_insn (d->target,
				      gen_lowpart (d->vmode, target));
		      return true;
		    }
		  return false;
		}

	      /* Next see if vpermd can be used.  */
	      if (valid_perm_using_mode_p (V16SImode, d))
		vmode = V16SImode;
	    }
	  /* Or if vpermps can be used.  */
	  else if (d->vmode == V16SFmode)
	    vmode = V16SImode;
	  if (vmode == V64QImode)
	    {
	      /* vpshufb only works intra lanes, it is not
		 possible to shuffle bytes in between the lanes.  */
	      for (i = 0; i < nelt; ++i)
		if ((d->perm[i] ^ i) & (3 * nelt / 4))
		  return false;
	    }
	}
      else
	return false;
    }

  if (d->testing_p)
    return true;

  /* Build the selector constant.  For vpermd/vpermps the selector is
     one dword index per element; otherwise it is byte-granular, with
     each source element index expanded to ELTSZ consecutive bytes.  */
  if (vmode == V8SImode)
    for (i = 0; i < 8; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
  else if (vmode == V16SImode)
    for (i = 0; i < 16; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
  else
    {
      eltsz = GET_MODE_UNIT_SIZE (d->vmode);
      /* MASK wraps the selector indices into the per-instruction
	 addressable range (lane-local for vpshufb variants).  */
      if (!d->one_operand_p)
	mask = 2 * nelt - 1;
      else if (vmode == V16QImode)
	mask = nelt - 1;
      else if (vmode == V64QImode)
	mask = nelt / 4 - 1;
      else
	mask = nelt / 2 - 1;

      for (i = 0; i < nelt; ++i)
	{
	  unsigned j, e = d->perm[i] & mask;
	  for (j = 0; j < eltsz; ++j)
	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
	}
    }

  vperm = gen_rtx_CONST_VECTOR (vmode,
				gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
  vperm = force_reg (vmode, vperm);

  target = d->target;
  if (d->vmode != vmode)
    target = gen_reg_rtx (vmode);
  op0 = gen_lowpart (vmode, d->op0);
  if (d->one_operand_p)
    {
      if (vmode == V16QImode)
	emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
      else if (vmode == V32QImode)
	emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
      else if (vmode == V64QImode)
	emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
      else if (vmode == V8SFmode)
	emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
      else if (vmode == V8SImode)
	emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
      else if (vmode == V16SFmode)
	emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
      else if (vmode == V16SImode)
	emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
      else
	gcc_unreachable ();
    }
  else
    {
      /* Two-operand case: XOP vpperm (16-byte only, gated above).  */
      op1 = gen_lowpart (vmode, d->op1);
      emit_insn (gen_xop_pperm (target, op0, op1, vperm));
    }
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}

/* For V*[QHS]Imode permutations, check if the same permutation
   can't be performed in a 2x, 4x or 8x wider inner mode.

   On success, *ND describes the equivalent permutation in the wider
   mode (recursing until no further widening is possible or the inner
   mode reaches DImode) and true is returned.  D is left untouched; ND
   may alias D only on recursive self-calls.  */

static bool
canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
			      struct expand_vec_perm_d *nd)
{
  int i;
  machine_mode mode = VOIDmode;

  /* The candidate mode with elements twice as wide.  */
  switch (d->vmode)
    {
    case E_V16QImode: mode = V8HImode; break;
    case E_V32QImode: mode = V16HImode; break;
    case E_V64QImode: mode = V32HImode; break;
    case E_V8HImode: mode = V4SImode; break;
    case E_V16HImode: mode = V8SImode; break;
    case E_V32HImode: mode = V16SImode; break;
    case E_V4SImode: mode = V2DImode; break;
    case E_V8SImode: mode = V4DImode; break;
    case E_V16SImode: mode = V8DImode; break;
    default: return false;
    }
  /* Element pairs must be even-aligned and consecutive to merge.  */
  for (i = 0; i < d->nelt; i += 2)
    if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
      return false;
  nd->vmode = mode;
  nd->nelt = d->nelt / 2;
  for (i = 0; i < nd->nelt; i++)
    nd->perm[i] = d->perm[2 * i] / 2;
  /* Keep widening as long as possible (stop at DImode elements).  */
  if (GET_MODE_INNER (mode) != DImode)
    canonicalize_vector_int_perm (nd, nd);
  if (nd != d)
    {
      nd->one_operand_p = d->one_operand_p;
      nd->testing_p = d->testing_p;
      if (d->op0 == d->op1)
	nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
      else
	{
	  nd->op0 = gen_lowpart (nd->vmode, d->op0);
	  nd->op1 = gen_lowpart (nd->vmode, d->op1);
	}
      /* When only testing, avoid allocating a real pseudo.  */
      if (d->testing_p)
	nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
      else
	nd->target = gen_reg_rtx (nd->vmode);
    }
  return true;
}

/* Try to expand one-operand permutation with constant mask.
 */

static bool
ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
{
  machine_mode mode = GET_MODE (d->op0);
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx) = NULL;
  rtx target, op0, mask;
  rtx vec[64];

  /* Only the genuinely one-operand case is handled.  */
  if (!rtx_equal_p (d->op0, d->op1))
    return false;

  if (!TARGET_AVX512F)
    return false;

  /* vpermd/vpermps/vpermq/vpermpd; float modes take an integer-mode
     selector vector (MASKMODE).  */
  switch (mode)
    {
    case E_V16SImode:
      gen = gen_avx512f_permvarv16si;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_permvarv16sf;
      maskmode = V16SImode;
      break;
    case E_V8DImode:
      gen = gen_avx512f_permvarv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_permvarv8df;
      maskmode = V8DImode;
      break;
    default:
      return false;
    }

  /* NB: no testing_p early-out here; callers reach this late in the
     search, and the insn is emitted unconditionally once matched.  */
  target = d->target;
  op0 = d->op0;
  for (int i = 0; i < d->nelt; ++i)
    vec[i] = GEN_INT (d->perm[i]);
  mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
  emit_insn (gen (target, op0, force_reg (maskmode, mask)));
  return true;
}

static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
   in a single instruction.  */

static bool
expand_vec_perm_1 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d nd;

  /* Check plain VEC_SELECT first, because AVX has instructions that could
     match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
     input where SEL+CONCAT may not.
 */
  if (d->one_operand_p)
    {
      int mask = nelt - 1;
      bool identity_perm = true;
      bool broadcast_perm = true;

      /* Reduce all indices modulo NELT and classify the permutation.  */
      for (i = 0; i < nelt; i++)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  if (nd.perm[i] != i)
	    identity_perm = false;
	  if (nd.perm[i])
	    broadcast_perm = false;
	}

      if (identity_perm)
	{
	  /* Identity: a plain move suffices.  */
	  if (!d->testing_p)
	    emit_move_insn (d->target, d->op0);
	  return true;
	}
      else if (broadcast_perm && TARGET_AVX2)
	{
	  /* Use vpbroadcast{b,w,d}.  */
	  rtx (*gen) (rtx, rtx) = NULL;
	  switch (d->vmode)
	    {
	    case E_V64QImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv64qi_1;
	      break;
	    case E_V32QImode:
	      gen = gen_avx2_pbroadcastv32qi_1;
	      break;
	    case E_V32HImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv32hi_1;
	      break;
	    case E_V16HImode:
	      gen = gen_avx2_pbroadcastv16hi_1;
	      break;
	    case E_V16SImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16si_1;
	      break;
	    case E_V8SImode:
	      gen = gen_avx2_pbroadcastv8si_1;
	      break;
	    case E_V16QImode:
	      gen = gen_avx2_pbroadcastv16qi;
	      break;
	    case E_V8HImode:
	      gen = gen_avx2_pbroadcastv8hi;
	      break;
	    case E_V16SFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16sf_1;
	      break;
	    case E_V8SFmode:
	      gen = gen_avx2_vec_dupv8sf_1;
	      break;
	    case E_V8DFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8df_1;
	      break;
	    case E_V8DImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8di_1;
	      break;
	    /* For other modes prefer other shuffles this function creates.  */
	    default: break;
	    }
	  if (gen != NULL)
	    {
	      if (!d->testing_p)
		emit_insn (gen (d->target, d->op0));
	      return true;
	    }
	}

      if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
	return true;

      /* There are plenty of patterns in sse.md that are written for
	 SEL+CONCAT and are not replicated for a single op.  Perhaps
	 that should be changed, to avoid the nastiness here.  */

      /* Recognize interleave style patterns, which means incrementing
	 every other permutation operand.  */
      for (i = 0; i < nelt; i += 2)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
	}
      if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;

      /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
      if (nelt >= 4)
	{
	  for (i = 0; i < nelt; i += 4)
	    {
	      nd.perm[i + 0] = d->perm[i + 0] & mask;
	      nd.perm[i + 1] = d->perm[i + 1] & mask;
	      nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
	      nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
	    }

	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				      d->testing_p))
	    return true;
	}
    }

  /* Try movss/movsd instructions.  */
  if (expand_vec_perm_movs (d))
    return true;

  /* Finally, try the fully general two operand permute.  */
  if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
			      d->testing_p))
    return true;

  /* Recognize interleave style patterns with reversed operands.
 */
  if (!d->one_operand_p)
    {
      /* Flip each index to the other operand and try op1/op0 order.  */
      for (i = 0; i < nelt; ++i)
	{
	  unsigned e = d->perm[i];
	  if (e >= nelt)
	    e -= nelt;
	  else
	    e += nelt;
	  nd.perm[i] = e;
	}

      if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;
    }

  /* Try the SSE4.1 blend variable merge instructions.  */
  if (expand_vec_perm_blend (d))
    return true;

  /* Try one of the AVX vpermil variable permutations.  */
  if (expand_vec_perm_vpermil (d))
    return true;

  /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
     vpshufb, vpermd, vpermps or vpermq variable permutation.  */
  if (expand_vec_perm_pshufb (d))
    return true;

  /* Try the AVX2 vpalignr instruction.  */
  if (expand_vec_perm_palignr (d, true))
    return true;

  /* Try the AVX512F vperm{s,d} instructions.  */
  if (ix86_expand_vec_one_operand_perm_avx512 (d))
    return true;

  /* Try the AVX512F vpermt2/vpermi2 instructions.  */
  if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  Note this recurses into expand_vec_perm_1 with the widened
     descriptor.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }
  return false;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of a pair of pshuflw + pshufhw instructions.

   Only V8HImode one-operand permutations where the low four elements
   come from the low half and the high four from the high half.  */

static bool
expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
{
  unsigned char perm2[MAX_VECT_LEN];
  unsigned i;
  bool ok;

  if (d->vmode != V8HImode || !d->one_operand_p)
    return false;

  /* The two permutations only operate in 64-bit lanes.
 */
  for (i = 0; i < 4; ++i)
    if (d->perm[i] >= 4)
      return false;
  for (i = 4; i < 8; ++i)
    if (d->perm[i] < 4)
      return false;

  if (d->testing_p)
    return true;

  /* Emit the pshuflw.  The memcpy copies 4 selector bytes, which works
     because perm entries are unsigned char.  The high half is left as
     identity here.  */
  memcpy (perm2, d->perm, 4);
  for (i = 4; i < 8; ++i)
    perm2[i] = i;
  ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
  gcc_assert (ok);

  /* Emit the pshufhw, identity on the low half.  */
  memcpy (perm2 + 4, d->perm + 4, 4);
  for (i = 0; i < 4; ++i)
    perm2[i] = i;
  ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
  gcc_assert (ok);

  return true;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSSE3 palignr instruction.  This succeeds
   when all of the elements in PERM fit within one vector and we merely
   need to shift them down so that a single vector permutation has a
   chance to succeed.  If SINGLE_INSN_ONLY_P, succeed if only
   the vpalignr instruction itself can perform the requested permutation.  */

static bool
expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
{
  unsigned i, nelt = d->nelt;
  unsigned min, max, minswap, maxswap;
  bool in_order, ok, swap = false;
  rtx shift, target;
  struct expand_vec_perm_d dcopy;

  /* Even with AVX, palignr only operates on 128-bit vectors,
     in AVX2 palignr operates on both 128-bit lanes.  */
  if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
      && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
    return false;

  /* Find the span of indices used, both as-is (min/max) and with the
     two operands swapped (minswap/maxswap).  For 32-byte modes indices
     are first folded per 128-bit lane.  */
  min = 2 * nelt;
  max = 0;
  minswap = 2 * nelt;
  maxswap = 0;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      unsigned eswap = d->perm[i] ^ nelt;
      if (GET_MODE_SIZE (d->vmode) == 32)
	{
	  e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
	  eswap = e ^ (nelt / 2);
	}
      if (e < min)
	min = e;
      if (e > max)
	max = e;
      if (eswap < minswap)
	minswap = eswap;
      if (eswap > maxswap)
	maxswap = eswap;
    }
  /* The used span must fit within one vector (or one lane's worth for
     32-byte modes) and not already start at 0; otherwise retry with
     the operands swapped before giving up.  */
  if (min == 0
      || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
    {
      if (d->one_operand_p
	  || minswap == 0
	  || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
				   ? nelt / 2 : nelt))
	return false;
      swap = true;
      min = minswap;
      max = maxswap;
    }

  /* Given that we have SSSE3, we know we'll be able to implement the
     single operand permutation after the palignr with pshufb for
     128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
     first.
 */
  if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
    return true;

  dcopy = *d;
  if (swap)
    {
      /* Work on the swapped-operand form chosen above.  */
      dcopy.op0 = d->op1;
      dcopy.op1 = d->op0;
      for (i = 0; i < nelt; ++i)
	dcopy.perm[i] ^= nelt;
    }

  /* Rebase all indices by MIN (the palignr shift amount) and note
     whether the result is already the identity.  */
  in_order = true;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = dcopy.perm[i];
      if (GET_MODE_SIZE (d->vmode) == 32
	  && e >= nelt
	  && (e & (nelt / 2 - 1)) < min)
	e = e - min - (nelt / 2);
      else
	e = e - min;
      if (e != i)
	in_order = false;
      dcopy.perm[i] = e;
    }
  dcopy.one_operand_p = true;

  if (single_insn_only_p && !in_order)
    return false;

  /* For AVX2, test whether we can permute the result in one instruction.  */
  if (d->testing_p)
    {
      if (in_order)
	return true;
      dcopy.op1 = dcopy.op0;
      return expand_vec_perm_1 (&dcopy);
    }

  /* Emit the palignr itself, reinterpreted as TImode/V2TImode.  */
  shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      target = gen_reg_rtx (TImode);
      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
				      gen_lowpart (TImode, dcopy.op0), shift));
    }
  else
    {
      target = gen_reg_rtx (V2TImode);
      emit_insn (gen_avx2_palignrv2ti (target,
				       gen_lowpart (V2TImode, dcopy.op1),
				       gen_lowpart (V2TImode, dcopy.op0),
				       shift));
    }

  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);

  /* Test for the degenerate case where the alignment by itself
     produces the desired permutation.  */
  if (in_order)
    {
      emit_move_insn (d->target, dcopy.op0);
      return true;
    }

  ok = expand_vec_perm_1 (&dcopy);
  /* 128-bit vectors are guaranteed to succeed via pshufb; 32-byte
     modes may legitimately fail.  */
  gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);

  return ok;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSE4_1 pblendv instruction.  Potentially
   reduces permutation from 2 pshufb and or to 1 pshufb and pblendv.  */

static bool
expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
{
  unsigned i, which, nelt = d->nelt;
  struct expand_vec_perm_d dcopy, dcopy1;
  machine_mode vmode = d->vmode;
  bool ok;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
    ;
  else
    return false;

  /* Figure out where permutation elements stay not in their
     respective lanes.  WHICH accumulates bit 1 for displaced op0
     elements and bit 2 for displaced op1 elements.  */
  for (i = 0, which = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (e != i)
	which |= (e < nelt ? 1 : 2);
    }
  /* We can pblend the part where elements stay not in their
     respective lanes only when these elements are all in one
     half of a permutation.
     {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
     lanes, but both 8 and 9 >= 8
     {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
     respective lanes and 8 >= 8, but 2 not.  */
  if (which != 1 && which != 2)
    return false;
  if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
    return true;

  /* First we apply one operand permutation to the part where
     elements stay not in their respective lanes.
*/
  dcopy = *d;
  /* WHICH == 2 means all misplaced elements come from op1,
     otherwise from op0; permute only that operand.  */
  if (which == 2)
    dcopy.op0 = dcopy.op1 = d->op1;
  else
    dcopy.op0 = dcopy.op1 = d->op0;
  if (!d->testing_p)
    dcopy.target = gen_reg_rtx (vmode);
  dcopy.one_operand_p = true;

  for (i = 0; i < nelt; ++i)
    dcopy.perm[i] = d->perm[i] & (nelt - 1);

  ok = expand_vec_perm_1 (&dcopy);
  if (GET_MODE_SIZE (vmode) != 16 && !ok)
    return false;
  else
    gcc_assert (ok);
  if (d->testing_p)
    return true;

  /* Next we put permuted elements into their positions.  */
  dcopy1 = *d;
  if (which == 2)
    dcopy1.op1 = dcopy.target;
  else
    dcopy1.op0 = dcopy.target;

  /* Elements already placed by the pshufb above select NELT + I
     (the permuted operand), in-place elements select I.  */
  for (i = 0; i < nelt; ++i)
    dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);

  ok = expand_vec_perm_blend (&dcopy1);
  gcc_assert (ok);

  return true;
}

static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a two vector permutation into a single vector permutation by using
   an interleave operation to merge the vectors.  */

static bool
expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
  unsigned HOST_WIDE_INT contents;
  unsigned char remap[2 * MAX_VECT_LEN];
  rtx_insn *seq;
  bool ok, same_halves = false;

  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      if (d->one_operand_p)
	return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 32)
    {
      if (!TARGET_AVX)
	return false;
      /* For 32-byte modes allow even d->one_operand_p.
	 The lack of cross-lane shuffling in some instructions
	 might prevent a single insn shuffle.  */
      dfinal = *d;
      dfinal.testing_p = true;
      /* If expand_vec_perm_interleave3 can expand this into
	 a 3 insn sequence, give up and let it be expanded as
	 3 insn sequence.  While that is one insn longer,
	 it doesn't need a memory operand and in the common
	 case that both interleave low and high permutations
	 with the same operands are adjacent needs 4 insns
	 for both after CSE.  */
      if (expand_vec_perm_interleave3 (&dfinal))
	return false;
    }
  else
    return false;

  /* Examine from whence the elements come.  CONTENTS is a bitmask
     with one bit per source element index used by the permutation.  */
  contents = 0;
  for (i = 0; i < nelt; ++i)
    contents |= HOST_WIDE_INT_1U << d->perm[i];

  memset (remap, 0xff, sizeof (remap));
  dremap = *d;

  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      unsigned HOST_WIDE_INT h1, h2, h3, h4;

      /* Split the two input vectors into 4 halves.  */
      h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
      h2 = h1 << nelt2;
      h3 = h2 << nelt2;
      h4 = h3 << nelt2;

      /* If the elements from the low halves use interleave low, and similarly
	 for interleave high.  If the elements are from mis-matched halves, we
	 can use shufps for V4SF/V4SI or do a DImode shuffle.  */
      if ((contents & (h1 | h3)) == contents)
	{
	  /* punpckl* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	    }
	  if (!TARGET_SSE2 && d->vmode == V4SImode)
	    dremap.vmode = V4SFmode;
	}
      else if ((contents & (h2 | h4)) == contents)
	{
	  /* punpckh* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i * 2;
	      remap[i + nelt + nelt2] = i * 2 + 1;
	      dremap.perm[i * 2] = i + nelt2;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
	    }
	  if (!TARGET_SSE2 && d->vmode == V4SImode)
	    dremap.vmode = V4SFmode;
	}
      else if ((contents & (h1 | h4)) == contents)
	{
	  /* shufps */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i;
	      remap[i + nelt + nelt2] = i + nelt2;
	      dremap.perm[i] = i;
	      dremap.perm[i + nelt2] = i + nelt + nelt2;
	    }
	  if (nelt != 4)
	    {
	      /* shufpd */
	      dremap.vmode = V2DImode;
	      dremap.nelt = 2;
	      dremap.perm[0] = 0;
	      dremap.perm[1] = 3;
	    }
	}
      else if ((contents & (h2 | h3)) == contents)
	{
	  /* shufps */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i;
	      remap[i + nelt] = i + nelt2;
	      dremap.perm[i] = i + nelt2;
	      dremap.perm[i + nelt2] = i + nelt;
	    }
	  if (nelt != 4)
	    {
	      /* shufpd */
	      dremap.vmode = V2DImode;
	      dremap.nelt = 2;
	      dremap.perm[0] = 1;
	      dremap.perm[1] = 2;
	    }
	}
      else
	return false;
    }
  else
    {
      unsigned int nelt4 = nelt / 4, nzcnt = 0;
      unsigned HOST_WIDE_INT q[8];
      unsigned int nonzero_halves[4];

      /* Split the two input vectors into 8 quarters.  */
      q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
      for (i = 1; i < 8; ++i)
	q[i] = q[0] << (nelt4 * i);
      for (i = 0; i < 4; ++i)
	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
	  {
	    nonzero_halves[nzcnt] = i;
	    ++nzcnt;
	  }

      if (nzcnt == 1)
	{
	  gcc_assert (d->one_operand_p);
	  nonzero_halves[1] = nonzero_halves[0];
	  same_halves = true;
	}
      else if (d->one_operand_p)
	{
	  gcc_assert (nonzero_halves[0] == 0);
	  gcc_assert (nonzero_halves[1] == 1);
	}

      if (nzcnt <= 2)
	{
	  if (d->perm[0] / nelt2 == nonzero_halves[1])
	    {
	      /* Attempt to increase the likelihood that dfinal
		 shuffle will be intra-lane.  */
	      std::swap (nonzero_halves[0], nonzero_halves[1]);
	    }

	  /* vperm2f128 or vperm2i128.  */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
	      remap[i + nonzero_halves[0] * nelt2] = i;
	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
	    }

	  if (d->vmode != V8SFmode
	      && d->vmode != V4DFmode
	      && d->vmode != V8SImode)
	    {
	      dremap.vmode = V8SImode;
	      dremap.nelt = 8;
	      for (i = 0; i < 4; ++i)
		{
		  dremap.perm[i] = i + nonzero_halves[0] * 4;
		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
		}
	    }
	}
      else if (d->one_operand_p)
	return false;
      else if (TARGET_AVX2
	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
	{
	  /* vpunpckl* */
	  for (i = 0; i < nelt4; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      remap[i + nelt2] = i * 2 + nelt2;
	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	      dremap.perm[i * 2 + nelt2] = i + nelt2;
	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
	    }
	}
      else if (TARGET_AVX2
	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
	{
	  /* vpunpckh* */
	  for (i = 0; i < nelt4; ++i)
	    {
	      remap[i + nelt4] = i * 2;
	      remap[i + nelt + nelt4] = i * 2 + 1;
	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
	      dremap.perm[i * 2] = i + nelt4;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
	    }
	}
      else
	return false;
    }

  /* Use the remapping array set up above to move the elements from their
     swizzled locations into their final destinations.  */
  dfinal = *d;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = remap[d->perm[i]];
      gcc_assert (e < nelt);
      /* If same_halves is true, both halves of the remapped vector are the
	 same.  Avoid cross-lane accesses if possible.  */
      if (same_halves && i >= nelt2)
	{
	  gcc_assert (e < nelt2);
	  dfinal.perm[i] = e + nelt2;
	}
      else
	dfinal.perm[i] = e;
    }
  if (!d->testing_p)
    {
      dremap.target = gen_reg_rtx (dremap.vmode);
      dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
    }
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;

  /* Test if the final remap can be done with a single insn.  For V4SFmode or
     V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.
*/
  start_sequence ();
  ok = expand_vec_perm_1 (&dfinal);
  seq = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  if (dremap.vmode != dfinal.vmode)
    {
      dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
      dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
    }

  /* Emit the interleave first, then the final shuffle captured in SEQ,
     which consumes the interleave's result.  */
  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  emit_insn (seq);
  return true;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a single vector cross-lane permutation into vpermq followed
   by any of the single insn permutations.  */

static bool
expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
  unsigned contents[2];
  bool ok;

  if (!(TARGET_AVX2
	&& (d->vmode == V32QImode || d->vmode == V16HImode)
	&& d->one_operand_p))
    return false;

  /* contents[k] collects which of the four 64-bit quarters of the
     input feed half K of the result.  */
  contents[0] = 0;
  contents[1] = 0;
  for (i = 0; i < nelt2; ++i)
    {
      contents[0] |= 1u << (d->perm[i] / nelt4);
      contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
    }

  /* Each result half can be filled from at most two quarters,
     since vpermq places only two quarters per 128-bit lane.  */
  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
	if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
	  return false;
    }

  if (d->testing_p)
    return true;

  dremap = *d;
  dremap.vmode = V4DImode;
  dremap.nelt = 4;
  dremap.target = gen_reg_rtx (V4DImode);
  dremap.op0 = gen_lowpart (V4DImode, d->op0);
  dremap.op1 = dremap.op0;
  dremap.one_operand_p = true;
  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
	if ((contents[i] & (1u << j)) != 0)
	  dremap.perm[2 * i + cnt++] = j;
      for (; cnt < 2; ++cnt)
dremap.perm[2 * i + cnt] = 0;
    }

  dfinal = *d;
  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;
  for (i = 0, j = 0; i < nelt; ++i)
    {
      if (i == nelt2)
	j = 2;
      dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
      if ((d->perm[i] / nelt4) == dremap.perm[j])
	;
      else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
	dfinal.perm[i] |= nelt4;
      else
	gcc_unreachable ();
    }

  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  ok = expand_vec_perm_1 (&dfinal);
  gcc_assert (ok);

  return true;
}

static bool canonicalize_perm (struct expand_vec_perm_d *d);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
   a vector permutation using two instructions, vperm2f128 resp.
   vperm2i128 followed by any single in-lane permutation.  */

static bool
expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
  bool ok;

  if (!TARGET_AVX
      || GET_MODE_SIZE (d->vmode) != 32
      || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
    return false;

  dsecond = *d;
  dsecond.one_operand_p = false;
  dsecond.testing_p = true;

  /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
     immediate.  For perm < 16 the second permutation uses
     d->op0 as first operand, for perm >= 16 it uses d->op1
     as first operand.  The second operand is the result of
     vperm2[fi]128.  */
  for (perm = 0; perm < 32; perm++)
    {
      /* Ignore permutations which do not move anything cross-lane.  */
      if (perm < 16)
	{
	  /* The second shuffle for e.g. V4DFmode has
	     0123 and ABCD operands.
	     Ignore AB23, as 23 is already in the second lane
	     of the first operand.  */
	  if ((perm & 0xc) == (1 << 2)) continue;
	  /* And 01CD, as 01 is in the first lane of the first
	     operand.  */
	  if ((perm & 3) == 0) continue;
	  /* And 4567, as then the vperm2[fi]128 doesn't change
	     anything on the original 4567 second operand.  */
	  if ((perm & 0xf) == ((3 << 2) | 2)) continue;
	}
      else
	{
	  /* The second shuffle for e.g. V4DFmode has
	     4567 and ABCD operands.
	     Ignore AB67, as 67 is already in the second lane
	     of the first operand.  */
	  if ((perm & 0xc) == (3 << 2)) continue;
	  /* And 45CD, as 45 is in the first lane of the first
	     operand.  */
	  if ((perm & 3) == 2) continue;
	  /* And 0123, as then the vperm2[fi]128 doesn't change
	     anything on the original 0123 first operand.  */
	  if ((perm & 0xf) == (1 << 2)) continue;
	}

      /* Check whether the whole permutation decomposes into PERM
	 (the vperm2[fi]128 lane selection) plus one in-lane shuffle.  */
      for (i = 0; i < nelt; i++)
	{
	  j = d->perm[i] / nelt2;
	  if (j == ((perm >> (2 * (i >= nelt2))) & 3))
	    dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
	  else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
	    dsecond.perm[i] = d->perm[i] & (nelt - 1);
	  else
	    break;
	}

      if (i == nelt)
	{
	  start_sequence ();
	  ok = expand_vec_perm_1 (&dsecond);
	  end_sequence ();
	}
      else
	ok = false;

      if (ok)
	{
	  if (d->testing_p)
	    return true;

	  /* Found a usable second shuffle.  dfirst will be
	     vperm2f128 on d->op0 and d->op1.  */
	  dsecond.testing_p = false;
	  dfirst = *d;
	  dfirst.target = gen_reg_rtx (d->vmode);
	  for (i = 0; i < nelt; i++)
	    dfirst.perm[i] = (i & (nelt2 - 1))
			     + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;

	  canonicalize_perm (&dfirst);
	  ok = expand_vec_perm_1 (&dfirst);
	  gcc_assert (ok);

	  /* And dsecond is some single insn shuffle, taking
	     d->op0 and result of vperm2f128 (if perm < 16) or
	     d->op1 and result of vperm2f128 (otherwise).  */
	  if (perm >= 16)
	    dsecond.op0 = dsecond.op1;
	  dsecond.op1 = dfirst.target;

	  ok = expand_vec_perm_1 (&dsecond);
	  gcc_assert (ok);

	  return true;
	}

      /* For one operand, the only useful vperm2f128 permutation is 0x01
	 aka lanes swap.  */
      if (d->one_operand_p)
	return false;
    }

  return false;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a two vector permutation using 2 intra-lane interleave insns
   and cross-lane shuffle for 32-byte vectors.
*/

static bool
expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt;
  rtx (*gen) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
    ;
  else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
    ;
  else
    return false;

  /* Accept only permutations of the shape produced by an interleave:
     pairs { base + i/2, base + i/2 + nelt } with base 0 (interleave
     low) or nelt/2 (interleave high).  */
  nelt = d->nelt;
  if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
    return false;
  for (i = 0; i < nelt; i += 2)
    if (d->perm[i] != d->perm[0] + i / 2
	|| d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
      return false;

  if (d->testing_p)
    return true;

  switch (d->vmode)
    {
    case E_V32QImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv32qi;
      else
	gen = gen_vec_interleave_lowv32qi;
      break;
    case E_V16HImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv16hi;
      else
	gen = gen_vec_interleave_lowv16hi;
      break;
    case E_V8SImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv8si;
      else
	gen = gen_vec_interleave_lowv8si;
      break;
    case E_V4DImode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv4di;
      else
	gen = gen_vec_interleave_lowv4di;
      break;
    case E_V8SFmode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv8sf;
      else
	gen = gen_vec_interleave_lowv8sf;
      break;
    case E_V4DFmode:
      if (d->perm[0])
	gen = gen_vec_interleave_highv4df;
      else
	gen = gen_vec_interleave_lowv4df;
      break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (d->target, d->op0, d->op1));
  return true;
}

/* A subroutine of ix86_expand_vec_perm_const_1.
Try to implement
   a single vector permutation using a single intra-lane vector
   permutation, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
  rtx_insn *seq;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || !d->one_operand_p)
    return false;

  /* Build the in-lane permutation DFIRST; MSK collects the blend bits
     for the elements that must come from the lane-swapped copy.  */
  dfirst = *d;
  for (i = 0; i < nelt; i++)
    dfirst.perm[i] = 0xff;
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
	return false;
      dfirst.perm[j] = d->perm[i];
      if (j != i)
	msk |= (1 << i);
    }
  for (i = 0; i < nelt; i++)
    if (dfirst.perm[i] == 0xff)
      dfirst.perm[i] = i;

  if (!d->testing_p)
    dfirst.target = gen_reg_rtx (dfirst.vmode);

  start_sequence ();
  ok = expand_vec_perm_1 (&dfirst);
  seq = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq);

  /* DSECOND swaps the two 128-bit lanes of DFIRST's result.  */
  dsecond = *d;
  dsecond.op0 = dfirst.target;
  dsecond.op1 = dfirst.target;
  dsecond.one_operand_p = true;
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  for (i = 0; i < nelt; i++)
    dsecond.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dsecond);
  gcc_assert (ok);

  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
  return true;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
   permutation using two vperm2f128, followed by a vshufpd insn blending
   the two vectors together.  */

static bool
expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  bool ok;

  if (!TARGET_AVX || (d->vmode != V4DFmode))
    return false;

  if (d->testing_p)
    return true;

  dfirst = *d;
  dsecond = *d;
  dthird = *d;

  /* DFIRST/DSECOND each gather one aligned element pair per lane;
     DTHIRD selects the odd/even halves via vshufpd.  */
  dfirst.perm[0] = (d->perm[0] & ~1);
  dfirst.perm[1] = (d->perm[0] & ~1) + 1;
  dfirst.perm[2] = (d->perm[2] & ~1);
  dfirst.perm[3] = (d->perm[2] & ~1) + 1;
  dsecond.perm[0] = (d->perm[1] & ~1);
  dsecond.perm[1] = (d->perm[1] & ~1) + 1;
  dsecond.perm[2] = (d->perm[3] & ~1);
  dsecond.perm[3] = (d->perm[3] & ~1) + 1;
  dthird.perm[0] = (d->perm[0] % 2);
  dthird.perm[1] = (d->perm[1] % 2) + 4;
  dthird.perm[2] = (d->perm[2] % 2) + 2;
  dthird.perm[3] = (d->perm[3] % 2) + 6;

  dfirst.target = gen_reg_rtx (dfirst.vmode);
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  dthird.op0 = dfirst.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = false;

  canonicalize_perm (&dfirst);
  canonicalize_perm (&dsecond);

  ok = expand_vec_perm_1 (&dfirst)
       && expand_vec_perm_1 (&dsecond)
       && expand_vec_perm_1 (&dthird);

  gcc_assert (ok);

  return true;
}

static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);

/* A subroutine of ix86_expand_vec_perm_const_1.
Try to implement
   a two vector permutation using two intra-lane vector
   permutations, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
  rtx_insn *seq1, *seq2;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || d->one_operand_p)
    return false;

  /* Split the permutation into the elements that can stay in their
     lane (DFIRST) and those that must come from the lane-swapped
     copy (DSECOND); MSK collects the blend-mask bits for the latter.  */
  dfirst = *d;
  dsecond = *d;
  for (i = 0; i < nelt; i++)
    {
      dfirst.perm[i] = 0xff;
      dsecond.perm[i] = 0xff;
    }
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (j == i)
	{
	  dfirst.perm[j] = d->perm[i];
	  which1 |= (d->perm[i] < nelt ? 1 : 2);
	}
      else
	{
	  dsecond.perm[j] = d->perm[i];
	  which2 |= (d->perm[i] < nelt ? 1 : 2);
	  msk |= (1U << i);
	}
    }
  if (msk == 0 || msk == (1U << nelt) - 1)
    return false;

  if (!d->testing_p)
    {
      dfirst.target = gen_reg_rtx (dfirst.vmode);
      dsecond.target = gen_reg_rtx (dsecond.vmode);
    }

  /* Fill the unused slots with identity (or identity from the other
     operand if only that operand is referenced).  */
  for (i = 0; i < nelt; i++)
    {
      if (dfirst.perm[i] == 0xff)
	dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
      if (dsecond.perm[i] == 0xff)
	dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
    }
  canonicalize_perm (&dfirst);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dfirst);
  seq1 = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  canonicalize_perm (&dsecond);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dsecond);
  seq2 = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq1);
  emit_insn (seq2);

  /* DTHIRD swaps the two 128-bit lanes of DSECOND's result.  */
  dthird = *d;
  dthird.op0 = dsecond.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = true;
  dthird.target = gen_reg_rtx (dthird.vmode);
  for (i = 0; i < nelt; i++)
    dthird.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dthird);
  gcc_assert (ok);

  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
  return true;
}

/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   permutation with two pshufb insns and an ior.  We should have already
   failed all two instruction sequences.  */

static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
  rtx rperm[2][16], vperm, l, h, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
    return false;
  gcc_assert (!d->one_operand_p);

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.
*/
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i];
      unsigned which = (e >= nelt);
      if (e >= nelt)
	e -= nelt;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
	  rperm[1-which][i*eltsz + j] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
  vperm = force_reg (V16QImode, vperm);

  l = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op0);
  emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
  vperm = force_reg (V16QImode, vperm);

  h = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op1);
  emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));

  /* Combine the two shuffles; lanes zeroed in one mask are filled
     from the other.  */
  op = d->target;
  if (d->vmode != V16QImode)
    op = gen_reg_rtx (V16QImode);
  emit_insn (gen_iorv16qi3 (op, l, h));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}

/* Implement arbitrary permutation of one V32QImode and V16QImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, hp, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || !d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.
If the required element is from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Swap the 128-bit lanes of h into hp.  */
  hp = gen_reg_rtx (V4DImode);
  op = gen_lowpart (V4DImode, h);
  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
				  const1_rtx));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}

/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode and V16QImode operand
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, ior, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  /* Accept only even/odd extraction patterns, lane-agnostic.  */
  for (i = 0; i < d->nelt; ++i)
    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
      return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  In the first permutation mask
     the first quarter will contain indexes for the first half
     of the op0, the second quarter will contain bit 7 set, third quarter
     will contain indexes for the second half of the op0 and the
     last quarter bit 7 set.  In the second permutation mask
     the first quarter will contain bit 7 set, the second quarter
     indexes for the first half of the op1, the third quarter bit 7 set
     and last quarter indexes for the second half of the op1.
     I.e. the first mask e.g. for V32QImode extract even will be:
     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
     (all values masked with 0xf except for -128) and second mask
     for extract even will be
     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = d->perm[i] >= nelt;
      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op1);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  ior = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (ior, l, h));

  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
  op = gen_reg_rtx (V4DImode);
  ior = gen_lowpart (V4DImode, ior);
  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
				  const1_rtx, GEN_INT (3)));
  emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}

/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
   with two "and" and "pack" or two "shift" and "pack" insns.  We should
   have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
{
  rtx op, dop0, dop1, t;
  unsigned i, odd, c, s, nelt = d->nelt;
  bool end_perm = false;
  machine_mode half_mode;
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_pack) (rtx, rtx, rtx);
  rtx (*gen_shift) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;

  switch (d->vmode)
    {
    case E_V8HImode:
      /* Required for "pack".
*/
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V4SImode;
      gen_and = gen_andv4si3;
      gen_pack = gen_sse4_1_packusdw;
      gen_shift = gen_lshrv4si3;
      break;
    case E_V16QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V8HImode;
      gen_and = gen_andv8hi3;
      gen_pack = gen_sse2_packuswb;
      gen_shift = gen_lshrv8hi3;
      break;
    case E_V16HImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V8SImode;
      gen_and = gen_andv8si3;
      gen_pack = gen_avx2_packusdw;
      gen_shift = gen_lshrv8si3;
      end_perm = true;
      break;
    case E_V32QImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xff;
      s = 8;
      half_mode = V16HImode;
      gen_and = gen_andv16hi3;
      gen_pack = gen_avx2_packuswb;
      gen_shift = gen_lshrv16hi3;
      end_perm = true;
      break;
    default:
      /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
	 general shuffles.  */
      return false;
    }

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  /* Even extraction masks off the high halves with AND; odd extraction
     shifts them down.  Either way the pack combines the two results.  */
  dop0 = gen_reg_rtx (half_mode);
  dop1 = gen_reg_rtx (half_mode);
  if (odd == 0)
    {
      t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
      t = force_reg (half_mode, t);
      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
    }
  else
    {
      emit_insn (gen_shift (dop0,
			    gen_lowpart (half_mode, d->op0),
			    GEN_INT (s)));
      emit_insn (gen_shift (dop1,
			    gen_lowpart (half_mode, d->op1),
			    GEN_INT (s)));
    }
  /* In AVX2 for 256 bit case we need to permute pack result.  */
  if (TARGET_AVX2 && end_perm)
    {
      op = gen_reg_rtx (d->vmode);
      t = gen_reg_rtx (V4DImode);
      emit_insn (gen_pack (op, dop0, dop1));
      emit_insn (gen_avx2_permv4di_1 (t,
				      gen_lowpart (V4DImode, op),
				      const0_rtx,
				      const2_rtx,
				      const1_rtx,
				      GEN_INT (3)));
      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
    }
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}

/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V64QI operands
   with two "shifts", two "truncs" and one "concat" insns for "odd"
   and two "truncs" and one concat insn for "even."
   Have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
{
  rtx t1, t2, t3, t4;
  unsigned i, odd, nelt = d->nelt;

  if (!TARGET_AVX512BW
      || d->one_operand_p
      || d->vmode != V64QImode)
    return false;

  /* Check that permutation is even or odd.
*/
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;


  if (odd)
    {
      /* Odd extraction: shift each word right by 8 so the odd bytes
	 land in the even byte positions.  */
      t1 = gen_reg_rtx (V32HImode);
      t2 = gen_reg_rtx (V32HImode);
      emit_insn (gen_lshrv32hi3 (t1,
				 gen_lowpart (V32HImode, d->op0),
				 GEN_INT (8)));
      emit_insn (gen_lshrv32hi3 (t2,
				 gen_lowpart (V32HImode, d->op1),
				 GEN_INT (8)));
    }
  else
    {
      t1 = gen_lowpart (V32HImode, d->op0);
      t2 = gen_lowpart (V32HImode, d->op1);
    }

  /* Truncate each word to its low byte and concatenate the halves.  */
  t3 = gen_reg_rtx (V32QImode);
  t4 = gen_reg_rtx (V32QImode);
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
  emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));

  return true;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
   and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
  rtx t1, t2, t3, t4, t5;

  switch (d->vmode)
    {
    case E_V4DFmode:
      if (d->testing_p)
	break;
      t1 = gen_reg_rtx (V4DFmode);
      t2 = gen_reg_rtx (V4DFmode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an unpck[lh]pd will produce the result required.  */
      if (odd)
	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
      else
	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SFmode:
      {
	int mask = odd ? 0xdd : 0x88;

	if (d->testing_p)
	  break;
	t1 = gen_reg_rtx (V8SFmode);
	t2 = gen_reg_rtx (V8SFmode);
	t3 = gen_reg_rtx (V8SFmode);

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
				      GEN_INT (mask)));

	/* Shuffle the lanes around to produce:
	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
					    GEN_INT (0x3)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));

	/* Shuffle the lanes around to produce:
	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
					    GEN_INT (0x20)));
      }
      break;

    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V4SImode:
      /* These are always directly implementable by expand_vec_perm_1.  */
      gcc_unreachable ();

    case E_V8HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V8HImode);
	  t2 = gen_reg_rtx (V8HImode);
	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
	  if (odd)
	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
	  else
	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
	  emit_insn (t3);
	}
      break;

    case E_V16QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V16HImode:
    case E_V32QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V64QImode:
      return expand_vec_perm_even_odd_trunc (d);

    case E_V4DImode:
      if (!TARGET_AVX2)
	{
	  /* Without AVX2, retry the permutation in the equal-sized
	     float mode, which has richer pre-AVX2 shuffle support.  */
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V4DFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V4DFmode);
	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V4DImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now a vpunpck[lh]qdq will produce the result required.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
      else
	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SImode:
      if (!TARGET_AVX2)
	{
	  /* As for V4DImode above: fall back to the float mode.  */
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V8SFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V8SFmode);
	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V8SImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V8SImode);
      t2 = gen_reg_rtx (V8SImode);
      t3 = gen_reg_rtx (V4DImode);
      t4 = gen_reg_rtx (V4DImode);
      t5 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into
	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x31)));

      /* Swap the 2nd and 3rd position in each lane into
	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));

      /* Now a vpunpck[lh]qdq will produce
	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
					   gen_lowpart (V4DImode, t2));
      else
	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
					  gen_lowpart (V4DImode, t2));
      emit_insn (t3);
      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
      break;

    default:
      gcc_unreachable ();
    }

  return true;
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   extract-even and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
{
  unsigned i, odd, nelt = d->nelt;

  odd = d->perm[0];
  if (odd != 0 && odd != 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  return expand_vec_perm_even_odd_1 (d, odd);
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
   permutations.  We assume that expand_vec_perm_1 has already failed.  */

static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
  machine_mode vmode = d->vmode;
  unsigned char perm2[4];
  rtx op0 = d->op0, dest;
  bool ok;

  switch (vmode)
    {
    case E_V4DFmode:
    case E_V8SFmode:
      /* These are special-cased in sse.md so that we can optionally
	 use the vbroadcast instruction.  They expand to two insns
	 if the input happens to be in a register.  */
      gcc_unreachable ();

    case E_V2DFmode:
    case E_V2DImode:
    case E_V4SFmode:
    case E_V4SImode:
      /* These are always implementable using standard shuffle patterns.  */
      gcc_unreachable ();

    case E_V8HImode:
    case E_V16QImode:
      /* These can be implemented via interleave.
We save one insn by
	 stopping once we have promoted to V4SImode and then use pshufd.  */
      if (d->testing_p)
	return true;
      do
	{
	  rtx dest;
	  /* Self-interleave, doubling the element width each round;
	     low or high interleave is chosen by which half holds ELT.  */
	  rtx (*gen) (rtx, rtx, rtx)
	    = vmode == V16QImode ? gen_vec_interleave_lowv16qi
				 : gen_vec_interleave_lowv8hi;

	  if (elt >= nelt2)
	    {
	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
				       : gen_vec_interleave_highv8hi;
	      /* ELT is renumbered relative to the selected half.  */
	      elt -= nelt2;
	    }
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V4SImode);

      /* Finish with a pshufd replicating the surviving element.  */
      memset (perm2, elt, 4);
      dest = gen_reg_rtx (V4SImode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V64QImode:
    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      /* For AVX2 broadcasts of the first element vpbroadcast* or
	 vpermq should be used by expand_vec_perm_1.  */
      gcc_assert (!TARGET_AVX2 || d->perm[0]);
      return false;

    default:
      gcc_unreachable ();
    }
}

/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   broadcast permutations.  */

static bool
expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
{
  unsigned i, elt, nelt = d->nelt;

  if (!d->one_operand_p)
    return false;

  /* A broadcast repeats one source element in every position.  */
  elt = d->perm[0];
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != elt)
      return false;

  return expand_vec_perm_broadcast_1 (d);
}

/* Implement arbitrary permutations of two V64QImode operands
   with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.
*/
static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
    return false;

  if (d->testing_p)
    return true;

  struct expand_vec_perm_d ds[2];
  rtx rperm[128], vperm, target0, target1;
  unsigned int i, nelt;
  machine_mode vmode;

  nelt = d->nelt;
  vmode = V64QImode;

  /* Set up two word-level (V32HI) sub-permutations over the same
     operands; vperm[it]2w can place any word arbitrarily.  */
  for (i = 0; i < 2; i++)
    {
      ds[i] = *d;
      ds[i].vmode = V32HImode;
      ds[i].nelt = 32;
      ds[i].target = gen_reg_rtx (V32HImode);
      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    }

  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     (ds[1]).  */

  for (i = 0; i < nelt; i++)
    {
      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
      if (i & 1)
	{
	  /* rperm[0..63] masks ds[0]'s result, rperm[64..127] ds[1]'s;
	     constm1_rtx (bit 7 set) zeroes the byte under vpshufb.  */
	  rperm[i] = constm1_rtx;
	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	}
      else
	{
	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	  rperm[i + 64] = constm1_rtx;
	}
    }

  bool ok = expand_vec_perm_1 (&ds[0]);
  gcc_assert (ok);
  ds[0].target = gen_lowpart (V64QImode, ds[0].target);

  ok = expand_vec_perm_1 (&ds[1]);
  gcc_assert (ok);
  ds[1].target = gen_lowpart (V64QImode, ds[1].target);

  /* Byte-correct each word permutation with vpshufb, then OR the
     complementary halves together.  */
  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
  vperm = force_reg (vmode, vperm);
  target0 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
  vperm = force_reg (vmode, vperm);
  target1 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));

  emit_insn (gen_iorv64qi3 (d->target, target0, target1));
  return true;
}

/* Implement arbitrary permutation of two V32QImode and V16QImode operands
   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
   all the shorter instruction sequences.  */

static bool
expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
{
  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
  unsigned int i, nelt, eltsz;
  bool used[4];

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < 32; ++i)
    {
      rperm[0][i] = m128;
      rperm[1][i] = m128;
      rperm[2][i] = m128;
      rperm[3][i] = m128;
    }
  used[0] = false;
  used[1] = false;
  used[2] = false;
  used[3] = false;
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
      /* WHICH indexes rperm: bit 1 = source operand, bit 0 = whether
	 the element crosses a 128-bit lane.  */
      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);

      for (j = 0; j < eltsz; ++j)
	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
      used[which] = true;
    }

  /* Emit the cross-lane vpshufb shuffles (masks 1 and 3), skipping
     any that select no elements.  */
  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i + 1])
	{
	  h[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode,
				    gen_rtvec_v (32, rperm[2 * i + 1]));
      vperm = force_reg (V32QImode, vperm);
      h[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
    }

  /* Swap the 128-bit lanes of h[X].  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] == NULL_RTX)
	continue;
      op = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
				      const2_rtx, GEN_INT (3), const0_rtx,
				      const1_rtx));
      h[i] = gen_lowpart (V32QImode, op);
    }

  /* Emit the same-lane vpshufb shuffles (masks 0 and 2).  */
  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i])
	{
	  l[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
      vperm = force_reg (V32QImode, vperm);
      l[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
    }

  /* Combine the per-operand same-lane and cross-lane results.  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] && l[i])
	{
	  op = gen_reg_rtx (V32QImode);
	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
	  l[i] = op;
	}
      else if (h[i])
	l[i] = h[i];
    }

  gcc_assert (l[0] && l[1]);
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
  return true;
}

/* The guts of ix86_vectorize_vec_perm_const.
With all of the interface bits
   taken care of, perform the expansion in D and return true on success.
   Strategies are tried roughly in order of increasing insn count.  */

static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* Try a single instruction expansion.  */
  if (expand_vec_perm_1 (d))
    return true;

  /* Try sequences of two instructions.  */

  if (expand_vec_perm_pshuflw_pshufhw (d))
    return true;

  if (expand_vec_perm_palignr (d, false))
    return true;

  if (expand_vec_perm_interleave2 (d))
    return true;

  if (expand_vec_perm_broadcast (d))
    return true;

  if (expand_vec_perm_vpermq_perm_1 (d))
    return true;

  if (expand_vec_perm_vperm2f128 (d))
    return true;

  if (expand_vec_perm_pblendv (d))
    return true;

  /* Try sequences of three instructions.  */

  if (expand_vec_perm_even_odd_pack (d))
    return true;

  if (expand_vec_perm_2vperm2f128_vshuf (d))
    return true;

  if (expand_vec_perm_pshufb2 (d))
    return true;

  if (expand_vec_perm_interleave3 (d))
    return true;

  if (expand_vec_perm_vperm2f128_vblend (d))
    return true;

  /* Try sequences of four instructions.  */

  if (expand_vec_perm_even_odd_trunc (d))
    return true;
  if (expand_vec_perm_vpshufb2_vpermq (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    return true;

  if (expand_vec_perm_vpermt2_vpshub2 (d))
    return true;

  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode.  */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps.  */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly...  */

  if (expand_vec_perm_even_odd (d))
    return true;

  /* Even longer sequences.  */
  if (expand_vec_perm_vpshufb4_vpermq2 (d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  struct expand_vec_perm_d nd;
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
  if (expand_vec_perm2_vperm2f128_vblend (d))
    return true;

  return false;
}

/* If a permutation only uses one operand, make it clear.  Returns true
   if the permutation references both operands.  */

static bool
canonicalize_perm (struct expand_vec_perm_d *d)
{
  int i, which, nelt = d->nelt;

  /* WHICH accumulates bit 1 for references to op0, bit 2 for op1.  */
  for (i = which = 0; i < nelt; ++i)
    which |= (d->perm[i] < nelt ? 1 : 2);

  d->one_operand_p = true;
  switch (which)
    {
    default:
      gcc_unreachable();

    case 3:
      if (!rtx_equal_p (d->op0, d->op1))
	{
	  d->one_operand_p = false;
	  break;
	}
      /* The elements of PERM do not suggest that only the first operand
	 is used, but both operands are identical.  Allow easier matching
	 of the permutation by folding the permutation into the single
	 input vector.  */
      /* FALLTHRU */

    case 2:
      /* Only op1 is referenced: renumber indices into op0's range and
	 use op1 as the single operand.  */
      for (i = 0; i < nelt; ++i)
	d->perm[i] &= nelt - 1;
      d->op0 = d->op1;
      break;

    case 1:
      d->op1 = d->op0;
      break;
    }

  return (which == 3);
}

/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.
*/

bool
ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
			       rtx op1, const vec_perm_indices &sel)
{
  struct expand_vec_perm_d d;
  unsigned char perm[MAX_VECT_LEN];
  unsigned int i, nelt, which;
  bool two_args;

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = vmode;
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  /* A NULL TARGET is a query whether the permutation is supported;
     no insns may be emitted in that case.  */
  d.testing_p = !target;

  gcc_assert (sel.length () == nelt);
  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));

  /* Given sufficient ISA support we can just return true here
     for selected vector modes.  */
  switch (d.vmode)
    {
    case E_V16SFmode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V8DFmode:
      if (!TARGET_AVX512F)
	return false;
      /* All implementable with a single vperm[it]2 insn.  */
      if (d.testing_p)
	return true;
      break;
    case E_V32HImode:
      if (!TARGET_AVX512BW)
	return false;
      if (d.testing_p)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V64QImode:
      if (!TARGET_AVX512BW)
	return false;
      if (d.testing_p)
	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
	return true;
      break;
    case E_V8SImode:
    case E_V8SFmode:
    case E_V4DFmode:
    case E_V4DImode:
      if (!TARGET_AVX)
	return false;
      if (d.testing_p && TARGET_AVX512VL)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V16HImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V32QImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V8HImode:
    case E_V16QImode:
      if (!TARGET_SSE2)
	return false;
      /* Fall through.  */
    case E_V4SImode:
    case E_V4SFmode:
      if (!TARGET_SSE)
	return false;
      /* All implementable with a single vpperm insn.  */
      if (d.testing_p && TARGET_XOP)
	return true;
      /* All implementable with 2 pshufb + 1 ior.  */
      if (d.testing_p && TARGET_SSSE3)
	return true;
      break;
    case E_V2DImode:
    case E_V2DFmode:
      if (!TARGET_SSE)
	return false;
      /* All implementable with shufpd or unpck[lh]pd.  */
      if (d.testing_p)
	return true;
      break;
    default:
      return false;
    }

  /* Copy the selector, remembering in WHICH which operands it uses;
     PERM keeps an unmodified copy for the retry below.  */
  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = sel[i];
      gcc_assert (e < 2 * nelt);
      d.perm[i] = e;
      perm[i] = e;
      which |= (e < nelt ? 1 : 2);
    }

  if (d.testing_p)
    {
      /* For all elements from second vector, fold the elements to first.  */
      if (which == 2)
	for (i = 0; i < nelt; ++i)
	  d.perm[i] -= nelt;

      /* Check whether the mask can be applied to the vector type.  */
      d.one_operand_p = (which != 3);

      /* Implementable with shufps or pshufd.  */
      if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
	return true;

      /* Otherwise we have to go through the motions and see if we can
	 figure out how to generate the requested permutation.  */
      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
      if (!d.one_operand_p)
	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

      /* Discard any insns the trial expansion emits.  */
      start_sequence ();
      bool ret = ix86_expand_vec_perm_const_1 (&d);
      end_sequence ();

      return ret;
    }

  two_args = canonicalize_perm (&d);

  if (ix86_expand_vec_perm_const_1 (&d))
    return true;

  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  */
  if (two_args && d.one_operand_p)
    {
      d.one_operand_p = false;
      memcpy (d.perm, perm, sizeof (perm));
      return ix86_expand_vec_perm_const_1 (&d);
    }

  return false;
}

/* Extract the even (ODD == 0) or odd (ODD == 1) elements of OP0
   concatenated with OP1 into TARG.  */

void
ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  for (i = 0; i < nelt; ++i)
    d.perm[i] = i * 2 + odd;

  /* We'll either be able to implement the permutation directly...  */
  if (expand_vec_perm_1 (&d))
    return;

  /* ... or we use the special-case patterns.
*/
  expand_vec_perm_even_odd_1 (&d, odd);
}

/* Interleave the low (HIGH_P false) or high (HIGH_P true) halves of
   OP0 and OP1 into TARG, expressed as a constant permutation.  */

static void
ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt, base;
  bool ok;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      d.perm[i * 2] = i + base;
      d.perm[i * 2 + 1] = i + base + nelt;
    }

  /* Note that for AVX this isn't one instruction.  */
  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}


/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  */

void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  struct expand_vec_perm_d d;
  bool ok, full_interleave;
  bool uns_p = false;
  int i;

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_il = gen_vec_interleave_lowv16qi;
      gen_ih = gen_vec_interleave_highv16qi;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_il = gen_avx2_interleave_lowv32qi;
      gen_ih = gen_avx2_interleave_highv32qi;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_il = gen_avx512bw_interleave_lowv64qi;
      gen_ih = gen_avx512bw_interleave_highv64qi;
      break;
    default:
      gcc_unreachable ();
    }

  op2_l = op2_h = op2;
  switch (code)
    {
    case MULT:
      /* Unpack data such that we've got a source byte in each low byte of
	 each word.  We don't care what goes into the high byte of each word.
	 Rather than trying to get zero in there, most convenient is to let
	 it be a copy of the low byte.  */
      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      full_interleave = qimode == V16QImode;
      break;

    case ASHIFT:
    case LSHIFTRT:
      uns_p = true;
      /* FALLTHRU */
    case ASHIFTRT:
      /* For shifts, widen the bytes to words with the appropriate
	 extension; the shift count (op2) is used unchanged.  */
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      full_interleave = true;
      break;
    default:
      gcc_unreachable ();
    }

  /* Perform the operation.  */
  res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
			       1, OPTAB_DIRECT);
  res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
			       1, OPTAB_DIRECT);
  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* For SSE2, we used a full interleave, so the desired
	 results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
	 extraction is evens but with the second and third quarter swapped.
	 Happily, that is even one insn shorter than even extraction.
	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
	 always first from the first and then from the second source operand,
	 the index bits above the low 4 bits remains the same.
	 Thus, for d.nelt == 32 we want permutation
	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
	 and for d.nelt == 64 we want permutation
	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);

  /* Record the whole-operation equivalence for later RTL passes.  */
  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_fmt_ee (code, qimode, op1, op2));
}

/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   if op is CONST_VECTOR with all odd elements equal to their
   preceding element.  */

static bool
const_vector_equal_evenodd_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  int i, nunits = GET_MODE_NUNITS (mode);
  if (GET_CODE (op) != CONST_VECTOR
      || nunits != CONST_VECTOR_NUNITS (op))
    return false;
  for (i = 0; i < nunits; i += 2)
    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
      return false;
  return true;
}

/* Expand a widening multiply of the even (ODD_P false) or odd (ODD_P
   true) SImode elements of OP1 and OP2 into DEST; UNS_P selects an
   unsigned rather than signed multiply.  */

void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
			       bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.
 */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
	 signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
	{
	  x = force_reg (wmode, CONST0_RTX (wmode));
	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
	  return;
	}

      /* Shift each wide element right by one narrow-element width so the
	 odd narrow elements land in the even slots.  The shift is skipped
	 for a constant vector whose even/odd pairs are already equal.  */
      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      if (!const_vector_equal_evenodd_p (orig_op1))
	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
			    x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
			    x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  /* Pick the widening-multiply-even pattern matching the vector width.  */
  if (mode == V16SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply.  With
	 the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
			 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }
  emit_insn (x);
}

/* Expand into DEST a widening multiply of the high (HIGH_P true) or low
   half of the elements of OP1 and OP2; UNS_P selects an unsigned
   multiply.  */

void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
			    bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
	{
	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
	     shuffle the elements once so that all elements are in the right
	     place for immediate use: { A C B D }.  */
	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	}
      else
	{
	  /* Put the elements into place for the multiply.
 */
	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
	  /* The interleave already selected the requested half, so the
	     even-element multiply below sees the right data.  */
	  high_p = false;
	}
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
	 have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      /* HImode has direct low-part and high-part multiply optabs;
	 interleave the two halves to build the widened products.  */
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
			 uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
			 uns_p ? umul_highpart_optab : smul_highpart_optab,
			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V16SImode:
    case E_V64QImode:
      /* Unpack (sign- or zero-extend per UNS_P) the requested half of
	 each operand into the wide mode, then multiply there.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}

/* Expand a V4SImode vector multiply OP0 = OP1 * OP2 using two unsigned
   even/odd widening multiplies whose low halves are recombined with an
   interleave.  */

void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.
 */

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  /* Attach the equivalent MULT so the optimizers can reason about it.  */
  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}

/* Expand a V2DI/V4DI/V8DI vector multiply OP0 = OP1 * OP2.  Uses the
   AVX512DQ multiply patterns when available, a XOP horizontal-add
   sequence for V2DImode, and otherwise synthesizes the product from
   32x32->64 bit even-element widening multiplies.  */

void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
				    GEN_INT (1),
				    GEN_INT (0),
				    GEN_INT (3),
				    GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
					gen_lowpart (V4SImode, op1),
					gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      /* Select the even-element widening multiply and the matching
	 narrow (SImode-element) mode for this vector width.  */
      if (mode == V2DImode)
	{
	  umul = gen_vec_widen_umult_even_v4si;
	  nmode = V4SImode;
	}
      else if (mode == V4DImode)
	{
	  umul = gen_vec_widen_umult_even_v8si;
	  nmode = V8SImode;
	}
      else if (mode == V8DImode)
	{
	  umul = gen_vec_widen_umult_even_v16si;
	  nmode = V16SImode;
	}
      else
	gcc_unreachable ();


      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_MULT (mode, op1, op2));
}

/* Return true if control transfer instruction INSN
   should be encoded with notrack prefix.
 */

bool
ix86_notrack_prefixed_insn_p (rtx_insn *insn)
{
  /* The notrack prefix is only meaningful when branch CF protection
     (CET indirect-branch tracking) is enabled.  */
  if (!insn || !((flag_cf_protection & CF_BRANCH)))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call.  */
      if (MEM_P (addr)
	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
	return false;
      else
	/* Indirect call: honor the per-call REG_CALL_NOCF_CHECK note.  */
	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
	return false;

      /* Check the jump is a switch table.  */
      rtx_insn *label = as_a<rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
	return false;
      else
	return true;
    }
  return false;
}

/* Calculate integer abs() using only SSE2 instructions.  */

void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
	 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
	 32 and use logical instead of arithmetic right shift (which is
	 unimplemented) and subtract.
 */
      if (TARGET_SSE4_2)
	{
	  tmp0 = gen_reg_rtx (mode);
	  tmp1 = gen_reg_rtx (mode);
	  emit_move_insn (tmp1, CONST0_RTX (mode));
	  if (mode == E_V2DImode)
	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
	  else
	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
	}
      else
	{
	  /* tmp0 = X < 0 ? -1 : 0, built as -(X logical>> 63).  */
	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
					       - 1), NULL, 0, OPTAB_DIRECT);
	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
	}

      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V4SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
				  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V8HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (mode, SMAX, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V16QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
	 as SSE2 provides the PMINUB insn.
 */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  /* The expanders above may have placed the result elsewhere.  */
  if (x != target)
    emit_move_insn (target, x);
}

/* Expand an extract from a vector register through pextr insn.
   Return true if successful.  */

bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  /* operands[2] is the field width in bits, operands[3] the bit
     position of the field within SRC.  */
  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
	return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      /* Extracting from a subreg just offsets the bit position within
	 the underlying register.  */
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx d, pat;

	if (!int_mode_for_size (size, 0).exists (&dstmode))
	  return false;

	/* Map the scalar result mode to the vector mode viewed by the
	   pextr insn, checking the required ISA level.  */
	switch (dstmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V16QImode;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    srcmode = V8HImode;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V4SImode;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V2DImode;
	    break;

	  default:
	    return false;
	  }

	/* Reject extractions from misaligned positions.
 */
	if (pos & (size-1))
	  return false;

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* Construct insn pattern.  */
	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

	/* Let the rtl optimizers know about the zero extension performed.  */
	if (dstmode == QImode || dstmode == HImode)
	  {
	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
	    d = gen_lowpart (SImode, d);
	  }

	emit_insn (gen_rtx_SET (d, pat));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}

/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  /* operands[1] is the field width in bits, operands[2] the bit
     position of the field within DST.  */
  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      /* Inserting into a subreg just offsets the bit position within
	 the underlying register.  */
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx (*pinsr)(rtx, rtx, rtx, rtx);
	rtx d;

	if (!int_mode_for_size (size, 0).exists (&srcmode))
	  return false;

	/* Map the scalar source mode to the vector mode and the pinsr
	   generator, checking the required ISA level.  */
	switch (srcmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V16QImode;
	    pinsr = gen_sse4_1_pinsrb;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    dstmode = V8HImode;
	    pinsr = gen_sse2_pinsrw;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode =
V4SImode;
	    pinsr = gen_sse4_1_pinsrd;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V2DImode;
	    pinsr = gen_sse4_1_pinsrq;
	    break;

	  default:
	    return false;
	  }

	/* Reject insertions to misaligned positions.  */
	if (pos & (size-1))
	  return false;

	if (SUBREG_P (src))
	  {
	    unsigned int srcpos = SUBREG_BYTE (src);

	    if (srcpos > 0)
	      {
		/* Non-lowpart source subreg: extract the relevant field
		   into a fresh register first via pextr.  */
		rtx extr_ops[4];

		extr_ops[0] = gen_reg_rtx (srcmode);
		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
		extr_ops[2] = GEN_INT (size);
		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

		if (!ix86_expand_pextr (extr_ops))
		  return false;

		src = extr_ops[0];
	      }
	    else
	      src = gen_lowpart (srcmode, SUBREG_REG (src));
	  }

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* The pinsr immediate selects the element index, POS / SIZE.  */
	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
			  gen_lowpart (srcmode, src),
			  GEN_INT (1 << (pos / size))));
	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}

/* All CPUs prefer to avoid cross-lane operations so perform reductions
   upper against lower halves up to SSE reg size.  */

machine_mode
ix86_split_reduction (machine_mode mode)
{
  /* Reduce lowpart against highpart until we reach SSE reg width to
     avoid cross-lane operations.
 */
  switch (mode)
    {
    case E_V8DImode:
    case E_V4DImode:
      return V2DImode;
    case E_V16SImode:
    case E_V8SImode:
      return V4SImode;
    case E_V32HImode:
    case E_V16HImode:
      return V8HImode;
    case E_V64QImode:
    case E_V32QImode:
      return V16QImode;
    case E_V16SFmode:
    case E_V8SFmode:
      return V4SFmode;
    case E_V8DFmode:
    case E_V4DFmode:
      return V2DFmode;
    default:
      /* Already at (or below) SSE register width: no further split.  */
      return mode;
    }
}

/* Generate call to __divmoddi4.  LIBFUNC is the library routine, MODE the
   operand mode, OP0/OP1 the dividend and divisor.  The quotient is
   returned by value in *QUOT_P; the remainder is written by the callee
   through a stack slot whose address is passed as the last argument and
   returned in *REM_P.  */

void
ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
			    rtx op0, rtx op1,
			    rtx *quot_p, rtx *rem_p)
{
  rtx rem = assign_386_stack_local (mode, SLOT_TEMP);

  rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
				      mode, op0, mode, op1, mode,
				      XEXP (rem, 0), Pmode);
  *quot_p = quot;
  *rem_p = rem;
}

#include "gt-i386-expand.h"