/* Copyright (C) 1988-2020 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "cgraph.h"
#include "diagnostic.h"
#include "cfgbuild.h"
#include "alias.h"
#include "fold-const.h"
#include "attribs.h"
#include "calls.h"
#include "stor-layout.h"
#include "varasm.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "except.h"
#include "explow.h"
#include "expr.h"
#include "cfgrtl.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "reload.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "tm-constrs.h"
#include "cselib.h"
#include "sched-int.h"
#include "opts.h"
#include "tree-pass.h"
#include "context.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tree-iterator.h"
#include "dbgcnt.h"
#include "case-cfn-macros.h"
#include "dojump.h"
#include "fold-const-call.h"
#include "tree-vrp.h"
#include "tree-ssanames.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "intl.h"
#include "ifcvt.h"
#include "symbol-summary.h"
#include "ipa-prop.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "debug.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"

/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */

void
split_double_mode (machine_mode mode, rtx operands[],
		   int num, rtx lo_half[], rtx hi_half[])
{
  machine_mode half_mode;
  unsigned int byte;
  rtx mem_op = NULL_RTX;
  int mem_num = 0;

  switch (mode)
    {
    case E_TImode:
      half_mode = DImode;
      break;
    case E_DImode:
      half_mode = SImode;
      break;
    case E_P2HImode:
      half_mode = HImode;
      break;
    case E_P2QImode:
      half_mode = QImode;
      break;
    default:
      gcc_unreachable ();
    }

  byte = GET_MODE_SIZE (half_mode);

  while (num--)
    {
      rtx op = operands[num];

      /* simplify_subreg refuses to split volatile memory addresses,
         but we still have to handle them.  */
      if (MEM_P (op))
	{
	  if (mem_op && rtx_equal_p (op, mem_op))
	    {
	      lo_half[num] = lo_half[mem_num];
	      hi_half[num] = hi_half[mem_num];
	    }
	  else
	    {
	      mem_op = op;
	      mem_num = num;
	      lo_half[num] = adjust_address (op, half_mode, 0);
	      hi_half[num] = adjust_address (op, half_mode, byte);
	    }
	}
      else
	{
	  lo_half[num] = simplify_gen_subreg (half_mode, op,
					      GET_MODE (op) == VOIDmode
					      ? mode : GET_MODE (op), 0);
	  hi_half[num] = simplify_gen_subreg (half_mode, op,
					      GET_MODE (op) == VOIDmode
					      ? mode : GET_MODE (op), byte);
	}
    }
}
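
/* A minimal usage sketch (illustrative only, not part of the build):
   given a DImode operand, a caller can obtain its SImode halves with

     rtx lo[1], hi[1];
     split_double_mode (DImode, &operands[0], 1, lo, hi);

   after which lo[0] holds the low word and hi[0] the high word.  The
   operand is assumed to be a REG, offsettable MEM, or constant, and
   "operands" is just a placeholder name.  */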

/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
   for the target.  */

void
ix86_expand_clear (rtx dest)
{
  rtx tmp;

  /* We play register width games, which are only valid after reload.  */
  gcc_assert (reload_completed);

  /* Avoid HImode and its attendant prefix byte.  */
  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    dest = gen_rtx_REG (SImode, REGNO (dest));
  tmp = gen_rtx_SET (dest, const0_rtx);

  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
    {
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    }

  emit_insn (tmp);
}
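
/* For reference, the two shapes ix86_expand_clear chooses between are,
   in AT&T syntax (sketch, assuming DEST is %eax):

     movl $0, %eax        when TARGET_USE_MOV0 and not optimizing for size
     xorl %eax, %eax      otherwise; shorter, but it clobbers the flags,
                          hence the CLOBBER of FLAGS_REG added above.  */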

void
ix86_expand_move (machine_mode mode, rtx operands[])
{
  rtx op0, op1;
  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  op0 = operands[0];
  op1 = operands[1];

  switch (GET_CODE (op1))
    {
    case CONST:
      tmp = XEXP (op1, 0);

      if (GET_CODE (tmp) != PLUS
	  || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
	break;

      op1 = XEXP (tmp, 0);
      addend = XEXP (tmp, 1);
      /* FALLTHRU */

    case SYMBOL_REF:
      model = SYMBOL_REF_TLS_MODEL (op1);

      if (model)
	op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))
	{
	  /* Load the external function address via GOT slot to avoid PLT.  */
	  op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
				(TARGET_64BIT
				 ? UNSPEC_GOTPCREL
				 : UNSPEC_GOT));
	  op1 = gen_rtx_CONST (Pmode, op1);
	  op1 = gen_const_mem (Pmode, op1);
	  set_mem_alias_set (op1, ix86_GOT_alias_set ());
	}
      else
	{
	  tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
	  if (tmp)
	    {
	      op1 = tmp;
	      if (!addend)
		break;
	    }
	  else
	    {
	      op1 = operands[1];
	      break;
	    }
	}

      if (addend)
	{
	  op1 = force_operand (op1, NULL_RTX);
	  op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
				     op0, 1, OPTAB_DIRECT);
	}
      else
	op1 = force_operand (op1, op0);

      if (op1 == op0)
	return;

      op1 = convert_to_mode (mode, op1, 1);

    default:
      break;
    }

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))
    {
      if (TARGET_MACHO && !TARGET_64BIT)
	{
#if TARGET_MACHO
	  /* dynamic-no-pic */
	  if (MACHOPIC_INDIRECT)
	    {
	      rtx temp = (op0 && REG_P (op0) && mode == Pmode)
			 ? op0 : gen_reg_rtx (Pmode);
	      op1 = machopic_indirect_data_reference (op1, temp);
	      if (MACHOPIC_PURE)
		op1 = machopic_legitimize_pic_address (op1, mode,
						       temp == op1 ? 0 : temp);
	    }
	  if (op0 != op1 && GET_CODE (op0) != MEM)
	    {
	      rtx insn = gen_rtx_SET (op0, op1);
	      emit_insn (insn);
	      return;
	    }
	  if (GET_CODE (op0) == MEM)
	    op1 = force_reg (Pmode, op1);
	  else
	    {
	      rtx temp = op0;
	      if (GET_CODE (temp) != REG)
		temp = gen_reg_rtx (Pmode);
	      temp = legitimize_pic_address (op1, temp);
	      if (temp == op0)
		return;
	      op1 = temp;
	    }
	  /* dynamic-no-pic */
#endif
	}
      else
	{
	  if (MEM_P (op0))
	    op1 = force_reg (mode, op1);
	  else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
	    {
	      rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
	      op1 = legitimize_pic_address (op1, reg);
	      if (op0 == op1)
		return;
	      op1 = convert_to_mode (mode, op1, 1);
	    }
	}
    }
  else
    {
      if (MEM_P (op0)
	  && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
	      || !push_operand (op0, mode))
	  && MEM_P (op1))
	op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
	  && ! general_no_elim_operand (op1, mode))
	op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64-bit compilation into a register
	 so that they get CSEd.  */
      if (can_create_pseudo_p ()
	  && (mode == DImode) && TARGET_64BIT
	  && immediate_operand (op1, mode)
	  && !x86_64_zext_immediate_operand (op1, VOIDmode)
	  && !register_operand (op0, mode)
	  && optimize)
	op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ()
	  && CONST_DOUBLE_P (op1))
	{
	  /* If we are loading a floating point constant to a register,
	     force the value to memory now, since we'll get better code
	     out the back end.  */

	  op1 = validize_mem (force_const_mem (mode, op1));
	  if (!register_operand (op0, mode))
	    {
	      rtx temp = gen_reg_rtx (mode);
	      emit_insn (gen_rtx_SET (temp, op1));
	      emit_move_insn (op0, temp);
	      return;
	    }
	}
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
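
/* Illustrative sketch of the symbolic/PIC paths above (not exhaustive):
   with -fPIC on x86-64, loading the address of a global "foo" typically
   becomes either

     movq foo@GOTPCREL(%rip), %rax     for symbols that may be preempted
     leaq foo(%rip), %rax              for locally bound symbols

   while TLS symbols are first rewritten by legitimize_tls_address and
   everything else falls through to the plain SET at the end.  */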

void
ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
     psABI, since the biggest alignment for the IA MCU psABI is 4 bytes.  */
  unsigned int align = (TARGET_IAMCU
			? GET_MODE_BITSIZE (mode)
			: GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register; once we have that information, we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
      && (CONSTANT_P (op1)
	  || (SUBREG_P (op1)
	      && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
	   && !standard_sse_constant_p (op1, mode))
	  /* ix86_expand_vector_move_misalign() does not like constants.  */
	  || (SSE_REG_MODE_P (mode)
	      && MEM_P (op0)
	      && MEM_ALIGN (op0) < align)))
    {
      if (SUBREG_P (op1))
	{
	  machine_mode imode = GET_MODE (SUBREG_REG (op1));
	  rtx r = force_const_mem (imode, SUBREG_REG (op1));
	  if (r)
	    r = validize_mem (r);
	  else
	    r = force_reg (imode, SUBREG_REG (op1));
	  op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
	}
      else
	op1 = validize_mem (force_const_mem (mode, op1));
    }

  /* We need to check memory alignment for SSE modes since attributes
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
	  || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
    {
      rtx tmp[2];

      /* ix86_expand_vector_move_misalign() does not like both
	 arguments in memory.  */
      if (!register_operand (op0, mode)
	  && !register_operand (op1, mode))
	op1 = force_reg (mode, op1);

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);
      return;
    }

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
    {
      emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
      return;
    }

  emit_insn (gen_rtx_SET (op0, op1));
}

/* Split 32-byte AVX unaligned load and store if needed.  */

static void
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
{
  rtx m;
  rtx (*extract) (rtx, rtx, rtx);
  machine_mode mode;

  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))
    {
    case MODE_VECTOR_INT:
    case MODE_INT:
      if (mode != V32QImode)
	{
	  if (!MEM_P (op0))
	    {
	      orig_op0 = op0;
	      op0 = gen_reg_rtx (V32QImode);
	    }
	  else
	    op0 = gen_lowpart (V32QImode, op0);
	  op1 = gen_lowpart (V32QImode, op1);
	  mode = V32QImode;
	}
      break;
    case MODE_VECTOR_FLOAT:
      break;
    default:
      gcc_unreachable ();
    }

  switch (mode)
    {
    default:
      gcc_unreachable ();
    case E_V32QImode:
      extract = gen_avx_vextractf128v32qi;
      mode = V16QImode;
      break;
    case E_V8SFmode:
      extract = gen_avx_vextractf128v8sf;
      mode = V4SFmode;
      break;
    case E_V4DFmode:
      extract = gen_avx_vextractf128v4df;
      mode = V2DFmode;
      break;
    }

  if (MEM_P (op1))
    {
      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);
    }
  else if (MEM_P (op0))
    {
      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));
    }
  else
    gcc_unreachable ();

  if (orig_op0)
    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
}
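
/* Net effect of the split above for an unaligned 256-bit load, sketched
   in AT&T syntax (the store direction is symmetric via vextractf128):

     vmovups     mem, %xmm0
     vinsertf128 $1, mem+16, %ymm0, %ymm0

   This is only a rough picture; the real insns come from the
   mov<mode>_internal and avx_vextractf128 patterns.  */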

/* Implement the movmisalign patterns for SSE.  Non-SSE modes go
   straight to ix86_expand_vector_move.  */
/* Code generation for scalar reg-reg moves of single and double precision data:
     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
       movaps reg, reg
     else
       movss reg, reg
     if (x86_sse_partial_reg_dependency == true)
       movapd reg, reg
     else
       movsd reg, reg

   Code generation for scalar loads of double precision data:
     if (x86_sse_split_regs == true)
       movlpd mem, reg      (gas syntax)
     else
       movsd mem, reg

   Code generation for unaligned packed loads of single precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     if (x86_sse_unaligned_move_optimal)
       movups mem, reg

     if (x86_sse_partial_reg_dependency == true)
       {
         xorps  reg, reg
         movlps mem, reg
         movhps mem+8, reg
       }
     else
       {
         movlps mem, reg
         movhps mem+8, reg
       }

   Code generation for unaligned packed loads of double precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     if (x86_sse_unaligned_move_optimal)
       movupd mem, reg

     if (x86_sse_split_regs == true)
       {
         movlpd mem, reg
         movhpd mem+8, reg
       }
     else
       {
         movsd  mem, reg
         movhpd mem+8, reg
       }
 */

void
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
{
  rtx op0, op1, m;

  op0 = operands[0];
  op1 = operands[1];

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_AVX)
    {
      if (GET_MODE_SIZE (mode) == 32)
	ix86_avx256_split_vector_move_misalign (op0, op1);
      else
	/* Always use 128-bit mov<mode>_internal pattern for AVX.  */
	emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  /* ??? If we have typed data, then it would appear that using
     movdqu is the only way to get unaligned data loaded with
     integer type.  */
  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (MEM_P (op1))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
        {
          rtx zero;

	  /* When SSE registers are split into halves, we can avoid
	     writing to the top half twice.  */
	  if (TARGET_SSE_SPLIT_REGS)
	    {
	      emit_clobber (op0);
	      zero = op0;
	    }
	  else
	    {
	      /* ??? Not sure about the best option for the Intel chips.
		 The following would seem to satisfy; the register is
		 entirely cleared, breaking the dependency chain.  We
		 then store to the upper half, with a dependency depth
		 of one.  A rumor has it that Intel recommends two movsd
		 followed by an unpacklpd, but this is unconfirmed.  And
		 given that the dependency depth of the unpacklpd would
		 still be one, I'm not sure why this would be better.  */
	      zero = CONST0_RTX (V2DFmode);
	    }

	  m = adjust_address (op1, DFmode, 0);
	  emit_insn (gen_sse2_loadlpd (op0, zero, m));
	  m = adjust_address (op1, DFmode, 8);
	  emit_insn (gen_sse2_loadhpd (op0, op0, m));
	}
      else
        {
	  rtx t;

	  if (mode != V4SFmode)
	    t = gen_reg_rtx (V4SFmode);
	  else
	    t = op0;

	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
	    emit_move_insn (t, CONST0_RTX (V4SFmode));
	  else
	    emit_clobber (t);

	  m = adjust_address (op1, V2SFmode, 0);
	  emit_insn (gen_sse_loadlps (t, t, m));
	  m = adjust_address (op1, V2SFmode, 8);
	  emit_insn (gen_sse_loadhps (t, t, m));
	  if (mode != V4SFmode)
	    emit_move_insn (op0, gen_lowpart (mode, t));
	}
    }
  else if (MEM_P (op0))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
	{
	  m = adjust_address (op0, DFmode, 0);
	  emit_insn (gen_sse2_storelpd (m, op1));
	  m = adjust_address (op0, DFmode, 8);
	  emit_insn (gen_sse2_storehpd (m, op1));
	}
      else
	{
	  if (mode != V4SFmode)
	    op1 = gen_lowpart (V4SFmode, op1);

	  m = adjust_address (op0, V2SFmode, 0);
	  emit_insn (gen_sse_storelps (m, op1));
	  m = adjust_address (op0, V2SFmode, 8);
	  emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
	}
    }
  else
    gcc_unreachable ();
}

/* Move bits 64:95 to bits 32:63.  */

void
ix86_move_vector_high_sse_to_mmx (rtx op)
{
  rtx mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4, GEN_INT (0), GEN_INT (2),
					  GEN_INT (0), GEN_INT (0)));
  rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
  rtx insn = gen_rtx_SET (dest, op);
  emit_insn (insn);
}
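
/* The VEC_SELECT built above with selector { 0, 2, 0, 0 } corresponds
   roughly to

     pshufd $0x08, %xmm0, %xmm0

   i.e. element 2 (bits 64:95) lands in lane 1 (bits 32:63) while lane 0
   is preserved, which is all the MMX-sized consumers care about.  */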

/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */

void
ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
					    nunits).require ();
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
						 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
					    nunits).require ();

  /* Generate SSE pack with signed/unsigned saturation.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
						    op1, op2));
  emit_insn (insn);

  ix86_move_vector_high_sse_to_mmx (op0);
}

/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */

void
ix86_split_mmx_punpck (rtx operands[], bool high_p)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op0);
  rtx mask;
  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

  switch (mode)
    {
    case E_V8QImode:
      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (16,
					  GEN_INT (0), GEN_INT (16),
					  GEN_INT (1), GEN_INT (17),
					  GEN_INT (2), GEN_INT (18),
					  GEN_INT (3), GEN_INT (19),
					  GEN_INT (4), GEN_INT (20),
					  GEN_INT (5), GEN_INT (21),
					  GEN_INT (6), GEN_INT (22),
					  GEN_INT (7), GEN_INT (23)));
      break;

    case E_V4HImode:
      sse_mode = V8HImode;
      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (8,
					  GEN_INT (0), GEN_INT (8),
					  GEN_INT (1), GEN_INT (9),
					  GEN_INT (2), GEN_INT (10),
					  GEN_INT (3), GEN_INT (11)));
      break;

    case E_V2SImode:
      sse_mode = V4SImode;
      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4,
					  GEN_INT (0), GEN_INT (4),
					  GEN_INT (1), GEN_INT (5)));
      break;

    default:
      gcc_unreachable ();
    }

  /* Generate SSE punpcklXX.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);
  emit_insn (insn);

  if (high_p)
    {
      /* Move bits 64:127 to bits 0:63.  */
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4, GEN_INT (2), GEN_INT (3),
					  GEN_INT (0), GEN_INT (0)));
      dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
      op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
      insn = gen_rtx_SET (dest, op1);
      emit_insn (insn);
    }
}
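
/* Rough picture of the split for the V4HImode, high_p case (sketch):

     punpcklwd %xmm1, %xmm0          SSE interleave of the two low halves
     pshufd    $0x0e, %xmm0, %xmm0   move bits 64:127 down to bits 0:63

   The exact shuffles follow from the VEC_SELECT masks built above.  */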

/* Helper function of ix86_fixup_binary_operands to canonicalize
   operand order.  Returns true if the operands should be swapped.  */

static bool
ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
			     rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* If the operation is not commutative, we can't do anything.  */
  if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
      && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
    return false;

  /* Highest priority is that src1 should match dst.  */
  if (rtx_equal_p (dst, src1))
    return false;
  if (rtx_equal_p (dst, src2))
    return true;

  /* Next highest priority is that immediate constants come second.  */
  if (immediate_operand (src2, mode))
    return false;
  if (immediate_operand (src1, mode))
    return true;

  /* Lowest priority is that memory references should come second.  */
  if (MEM_P (src2))
    return false;
  if (MEM_P (src1))
    return true;

  return false;
}


/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required.  */

rtx
ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
			    rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    {
      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      std::swap (src1, src2);
    }

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    {
      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))
	{
	  src2 = force_reg (mode, src2);
	  src1 = src2;
	}
      else if (rtx_equal_p (dst, src1))
	src2 = force_reg (mode, src2);
      else
	src1 = force_reg (mode, src1);
    }

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine.  */
  if (code == PLUS
      && GET_MODE_CLASS (mode) == MODE_INT
      && MEM_P (src2))
    src2 = force_reg (mode, src2);

  operands[1] = src1;
  operands[2] = src2;
  return dst;
}
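
/* Illustrative example (not part of the build): for a would-be insn

     (set (mem:SI A) (plus:SI (mem:SI B) (reg:SI r)))

   the commutative operands are swapped so the register comes first, the
   remaining memory source is forced into a register (the address-combine
   case), and since the destination no longer matches the first source a
   fresh pseudo is returned for the caller to copy back into A.  */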

/* Similarly, but assume that the destination has already been
   set up properly.  */

void
ix86_fixup_binary_operands_no_copy (enum rtx_code code,
				    machine_mode mode, rtx operands[])
{
  rtx dst = ix86_fixup_binary_operands (code, mode, operands);
  gcc_assert (dst == operands[0]);
}

/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine than just general_operand, which would allow 3 separate
   memory references (one output, two input) in a single insn.  */

void
ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
			     rtx operands[])
{
  rtx src1, src2, dst, op, clob;

  dst = ix86_fixup_binary_operands (code, mode, operands);
  src1 = operands[1];
  src2 = operands[2];

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (reload_completed
      && code == PLUS
      && !rtx_equal_p (dst, src1))
    {
      /* This is going to be an LEA; avoid splitting it later.  */
      emit_insn (op);
    }
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}
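
/* What normally gets emitted is the familiar flags-clobbering parallel,
   e.g. for an SImode addition (sketch):

     (parallel [(set (reg:SI 0) (plus:SI (reg:SI 0) (reg:SI 1)))
                (clobber (reg:CC FLAGS_REG))])

   Only the post-reload three-operand PLUS, destined to become an lea,
   is emitted without the clobber.  */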

/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */

void
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
				     rtx operands[])
{
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
    {
      op1 = operands[1];
      op2 = operands[2];
    }
  else if (SUBREG_P (operands[2]))
    {
      op1 = operands[2];
      op2 = operands[1];
    }
  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
  if (op1
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (GET_CODE (op2) == CONST_VECTOR
	  || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
	      && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
    {
      rtx dst;
      switch (GET_MODE (SUBREG_REG (op1)))
	{
	case E_V4SFmode:
	case E_V8SFmode:
	case E_V16SFmode:
	case E_V2DFmode:
	case E_V4DFmode:
	case E_V8DFmode:
	  dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
	  if (GET_CODE (op2) == CONST_VECTOR)
	    {
	      op2 = gen_lowpart (GET_MODE (dst), op2);
	      op2 = force_reg (GET_MODE (dst), op2);
	    }
	  else
	    {
	      op1 = operands[1];
	      op2 = SUBREG_REG (operands[2]);
	      if (!vector_operand (op2, GET_MODE (dst)))
		op2 = force_reg (GET_MODE (dst), op2);
	    }
	  op1 = SUBREG_REG (op1);
	  if (!vector_operand (op1, GET_MODE (dst)))
	    op1 = force_reg (GET_MODE (dst), op1);
	  emit_insn (gen_rtx_SET (dst,
				  gen_rtx_fmt_ee (code, GET_MODE (dst),
						  op1, op2)));
	  emit_move_insn (operands[0], gen_lowpart (mode, dst));
	  return;
	default:
	  break;
	}
    }
  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_fmt_ee (code, mode, operands[1],
					  operands[2])));
}

/* Return TRUE or FALSE depending on whether the binary operator meets the
   appropriate constraints.  */

bool
ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
			 rtx operands[3])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    return false;

  /* Canonicalize operand order for commutative operators.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    std::swap (src1, src2);

  /* If the destination is memory, we must have a matching source operand.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    return false;

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    return false;

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    /* Support "andhi/andsi/anddi" as a zero-extending move.  */
    return (code == AND
	    && (mode == HImode
		|| mode == SImode
		|| (TARGET_64BIT && mode == DImode))
	    && satisfies_constraint_L (src2));

  return true;
}

/* Attempt to expand a unary operator.  Make the expansion closer to the
   actual machine than just general_operand, which would allow 2 separate
   memory references (one output, one input) in a single insn.  */

void
ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
			    rtx operands[])
{
  bool matching_memory = false;
  rtx src, dst, op, clob;

  dst = operands[0];
  src = operands[1];

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst))
    {
      if (rtx_equal_p (dst, src))
	matching_memory = true;
      else
	dst = gen_reg_rtx (mode);
    }

  /* When the source operand is memory, the destination must match.  */
  if (MEM_P (src) && !matching_memory)
    src = force_reg (mode, src);

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));

  if (code == NOT)
    emit_insn (op);
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}

/* Predict just emitted jump instruction to be taken with probability PROB.  */

static void
predict_jump (int prob)
{
  rtx_insn *insn = get_last_insn ();
  gcc_assert (JUMP_P (insn));
  add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
}

/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  */

void
ix86_split_idivmod (machine_mode mode, rtx operands[],
		    bool unsigned_p)
{
  rtx_code_label *end_label, *qimode_label;
  rtx div, mod;
  rtx_insn *insn;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);

  operands[2] = force_reg (mode, operands[2]);
  operands[3] = force_reg (mode, operands[3]);

  switch (mode)
    {
    case E_SImode:
      if (GET_MODE (operands[0]) == SImode)
	{
	  if (GET_MODE (operands[1]) == SImode)
	    gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
	  else
	    gen_divmod4_1
	      = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
	}
      else
	gen_divmod4_1
	  = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
      break;

    case E_DImode:
      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
      break;

    default:
      gcc_unreachable ();
    }

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divmod if dividend and divisor are within
     the range [0-255].  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
				 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
			       gen_rtx_LABEL_REF (VOIDmode, qimode_label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divmod.  */
  div = gen_divmod4_1 (operands[0], operands[1],
		       operands[2], operands[3]);
  emit_insn (div);

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

  if (unsigned_p)
    {
      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    }
  else
    {
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    }
  if (mode == SImode)
    {
      if (GET_MODE (operands[0]) != SImode)
	div = gen_rtx_ZERO_EXTEND (DImode, div);
      if (GET_MODE (operands[1]) != SImode)
	mod = gen_rtx_ZERO_EXTEND (DImode, mod);
    }

  /* Extract remainder from AH.  */
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
			       tmp0, GEN_INT (8), GEN_INT (8));
  if (REG_P (operands[1]))
    insn = emit_move_insn (operands[1], tmp1);
  else
    {
      /* Need a new scratch register since the old one has result
	 of 8bit divide.  */
      scratch = gen_reg_rtx (GET_MODE (operands[1]));
      emit_move_insn (scratch, tmp1);
      insn = emit_move_insn (operands[1], scratch);
    }
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_extend_insn
		    (operands[0], tmp1,
		     GET_MODE (operands[0]), QImode, 1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
}
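
/* Control-flow sketch of the split above (illustrative only):

       or      divisor, scratch        scratch = dividend | divisor
       test    $-0x100, scratch
       je      .Lqimode                both operands fit in 8 bits
       div/idiv ...                    full-width divide
       jmp     .Lend
     .Lqimode:
       divb    ...                     8-bit divide, AL = quot, AH = rem
     .Lend:

   The REG_EQUAL notes let later passes still see the DIV/MOD values.  */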

/* Emit x86 binary operand CODE in mode MODE, where the first operand
   matches destination.  RTX includes clobber of FLAGS_REG.  */

void
ix86_emit_binop (enum rtx_code code, machine_mode mode,
		 rtx dst, rtx src)
{
  rtx op, clob;

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));

  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
}
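
/* Example use (sketch): ix86_emit_binop (PLUS, SImode, target, parts.base)
   emits

     (parallel [(set target (plus:SI target parts.base))
                (clobber (reg:CC FLAGS_REG))])

   i.e. a two-address ALU add that clobbers the flags, as used by the lea
   splitter below.  */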

/* Return true if the definition of regno1 is nearest to the insn.  */

static bool
find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
{
  rtx_insn *prev = insn;
  rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));

  if (insn == start)
    return false;
  while (prev && prev != start)
    {
      if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
	{
	  prev = PREV_INSN (prev);
	  continue;
	}
      if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
	return true;
      else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
	return false;
      prev = PREV_INSN (prev);
    }

  /* None of the regs is defined in the bb.  */
  return false;
}

/* Split lea instructions into a sequence of instructions
   which are executed on the ALU to avoid AGU stalls.
   It is assumed that the flags register may be clobbered
   at the position of the lea.  */

void
ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
{
  unsigned int regno0, regno1, regno2;
  struct ix86_address parts;
  rtx target, tmp;
  int ok, adds;

  ok = ix86_decompose_address (operands[1], &parts);
  gcc_assert (ok);

  target = gen_lowpart (mode, operands[0]);

  regno0 = true_regnum (target);
  regno1 = INVALID_REGNUM;
  regno2 = INVALID_REGNUM;

  if (parts.base)
    {
      parts.base = gen_lowpart (mode, parts.base);
      regno1 = true_regnum (parts.base);
    }

  if (parts.index)
    {
      parts.index = gen_lowpart (mode, parts.index);
      regno2 = true_regnum (parts.index);
    }

  if (parts.disp)
    parts.disp = gen_lowpart (mode, parts.disp);

  if (parts.scale > 1)
    {
      /* Case r1 = r1 + ...  */
      if (regno1 == regno0)
	{
	  /* If we have a case r1 = r1 + C * r2 then we
	     should use multiplication which is very
	     expensive.  Assume cost model is wrong if we
	     have such case here.  */
	  gcc_assert (regno2 != regno0);

	  for (adds = parts.scale; adds > 0; adds--)
	    ix86_emit_binop (PLUS, mode, target, parts.index);
	}
      else
	{
	  /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));

	  /* Use shift for scaling.  */
	  ix86_emit_binop (ASHIFT, mode, target,
			   GEN_INT (exact_log2 (parts.scale)));

	  if (parts.base)
	    ix86_emit_binop (PLUS, mode, target, parts.base);

	  if (parts.disp && parts.disp != const0_rtx)
	    ix86_emit_binop (PLUS, mode, target, parts.disp);
	}
    }
  else if (!parts.base && !parts.index)
    {
      gcc_assert(parts.disp);
      emit_insn (gen_rtx_SET (target, parts.disp));
    }
  else
    {
      if (!parts.base)
	{
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));
	}
      else if (!parts.index)
	{
	  if (regno0 != regno1)
	    emit_insn (gen_rtx_SET (target, parts.base));
	}
      else
	{
	  if (regno0 == regno1)
	    tmp = parts.index;
	  else if (regno0 == regno2)
	    tmp = parts.base;
	  else
	    {
	      rtx tmp1;

	      /* Find better operand for SET instruction, depending
		 on which definition is farther from the insn.  */
	      if (find_nearest_reg_def (insn, regno1, regno2))
		tmp = parts.index, tmp1 = parts.base;
	      else
		tmp = parts.base, tmp1 = parts.index;

	      emit_insn (gen_rtx_SET (target, tmp));

	      if (parts.disp && parts.disp != const0_rtx)
		ix86_emit_binop (PLUS, mode, target, parts.disp);

	      ix86_emit_binop (PLUS, mode, target, tmp1);
	      return;
	    }

	  ix86_emit_binop (PLUS, mode, target, tmp);
	}

      if (parts.disp && parts.disp != const0_rtx)
	ix86_emit_binop (PLUS, mode, target, parts.disp);
    }
}
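
/* Worked example (sketch): splitting

     lea 0x4(%rbx,%rcx,4), %rax

   when the AGU is the bottleneck becomes roughly

     mov %rcx, %rax
     shl $2,   %rax
     add %rbx, %rax
     add $0x4, %rax

   with the initial mov elided when %rax already matches one of the
   address registers.  */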

/* Post-reload splitter for converting an SF or DFmode value in an
   SSE register into an unsigned SImode.  */

void
ix86_split_convert_uns_si_sse (rtx operands[])
{
  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */
  if (MEM_P (input))
    {
      if (vecmode == V4SFmode)
	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
      else
	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
    }
  else
    {
      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
	emit_insn (gen_sse_movss (value, value, input));
      else
	emit_insn (gen_sse2_movsd (value, value, input));
    }

  emit_move_insn (large, two31);
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
  else
    emit_insn (gen_sse2_cvttpd2dq (x, value));
  value = x;

  emit_insn (gen_xorv4si3 (value, value, large));
}

static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
						 machine_mode mode, rtx target,
						 rtx var, int one_var);

/* Convert an unsigned DImode value into a DFmode, using only SSE.
   Expects the 64-bit DImode to be supplied in a pair of integral
   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
   -mfpmath=sse, !optimize_size only.  */

void
ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;
  rtx x;

  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)
    {
      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);
    }
  else
    {
      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
    }

  x = gen_rtx_CONST_VECTOR (V4SImode,
			    gen_rtvec (4, GEN_INT (0x43300000UL),
				       GEN_INT (0x45300000UL),
				       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */

  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */
  if (TARGET_SSE3)
    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
  else
    {
      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
    }

  ix86_expand_vector_extract (false, target, fp_xmm, 0);
}

/* Not used, but eases macroization of patterns.  */
void
ix86_expand_convert_uns_sixf_sse (rtx, rtx)
{
  gcc_unreachable ();
}

/* Convert an unsigned SImode value into a DFmode.  Only currently used
   for SSE, but applicable anywhere.  */

void
ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO31r;
  rtx x, fp;

  x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
			   NULL, 1, OPTAB_DIRECT);

  fp = gen_reg_rtx (DFmode);
  emit_insn (gen_floatsidf2 (fp, x));

  real_ldexp (&TWO31r, &dconst1, 31);
  x = const_double_from_real_value (TWO31r, DFmode);

  x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
  if (x != target)
    emit_move_insn (target, x);
}
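
/* Worked example of the bias trick above (illustrative): for the
   unsigned input 0xfffffffe, the SImode PLUS of -2147483648 yields the
   signed value 0x7ffffffe, which converts exactly to 2147483646.0;
   adding the 0x1.0p31 bias back gives 4294967294.0, the desired
   unsigned result.  */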

/* Convert a signed DImode value into a DFmode.  Only used for SSE in
   32-bit mode; otherwise we have a direct convert instruction.  */

void
ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO32r;
  rtx fp_lo, fp_hi, x;

  fp_lo = gen_reg_rtx (DFmode);
  fp_hi = gen_reg_rtx (DFmode);

  emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));

  real_ldexp (&TWO32r, &dconst1, 32);
  x = const_double_from_real_value (TWO32r, DFmode);
  fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);

  ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));

  x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
			   0, OPTAB_DIRECT);
  if (x != target)
    emit_move_insn (target, x);
}

/* Convert an unsigned SImode value into a SFmode, using only SSE.
   For x86_32, -mfpmath=sse, !optimize_size only.  */
void
ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE ONE16r;
  rtx fp_hi, fp_lo, int_hi, int_lo, x;

  real_ldexp (&ONE16r, &dconst1, 16);
  x = const_double_from_real_value (ONE16r, SFmode);
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
				      NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
				      NULL, 0, OPTAB_DIRECT);
  fp_hi = gen_reg_rtx (SFmode);
  fp_lo = gen_reg_rtx (SFmode);
  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
  emit_insn (gen_floatsisf2 (fp_lo, int_lo));
  fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
			       0, OPTAB_DIRECT);
  fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
			       0, OPTAB_DIRECT);
  if (!rtx_equal_p (target, fp_hi))
    emit_move_insn (target, fp_hi);
}

/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
   a vector of unsigned ints VAL to vector of floats TARGET.  */

void
ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
{
  rtx tmp[8];
  REAL_VALUE_TYPE TWO16r;
  machine_mode intmode = GET_MODE (val);
  machine_mode fltmode = GET_MODE (target);
  rtx (*cvt) (rtx, rtx);

  if (intmode == V4SImode)
    cvt = gen_floatv4siv4sf2;
  else
    cvt = gen_floatv8siv8sf2;
  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
  tmp[0] = force_reg (intmode, tmp[0]);
  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
				OPTAB_DIRECT);
  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
				NULL_RTX, 1, OPTAB_DIRECT);
  tmp[3] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[3], tmp[1]));
  tmp[4] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[4], tmp[2]));
  real_ldexp (&TWO16r, &dconst1, 16);
  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
  tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
				OPTAB_DIRECT);
  tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
				OPTAB_DIRECT);
  if (tmp[7] != target)
    emit_move_insn (target, tmp[7]);
}

/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
   pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
   This is done by doing just signed conversion if < 0x1p31, and otherwise by
   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */

rtx
ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
{
  REAL_VALUE_TYPE TWO31r;
  rtx two31r, tmp[4];
  machine_mode mode = GET_MODE (val);
  machine_mode scalarmode = GET_MODE_INNER (mode);
  machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
  rtx (*cmp) (rtx, rtx, rtx, rtx);
  int i;

  for (i = 0; i < 3; i++)
    tmp[i] = gen_reg_rtx (mode);
  real_ldexp (&TWO31r, &dconst1, 31);
  two31r = const_double_from_real_value (TWO31r, scalarmode);
  two31r = ix86_build_const_vector (mode, 1, two31r);
  two31r = force_reg (mode, two31r);
  switch (mode)
    {
    case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    default: gcc_unreachable ();
    }
  tmp[3] = gen_rtx_LE (mode, two31r, val);
  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
				0, OPTAB_DIRECT);
  if (intmode == V4SImode || TARGET_AVX2)
    *xorp = expand_simple_binop (intmode, ASHIFT,
				 gen_lowpart (intmode, tmp[0]),
				 GEN_INT (31), NULL_RTX, 0,
				 OPTAB_DIRECT);
  else
    {
      rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
      two31 = ix86_build_const_vector (intmode, 1, two31);
      *xorp = expand_simple_binop (intmode, AND,
				   gen_lowpart (intmode, tmp[0]),
				   two31, NULL_RTX, 0,
				   OPTAB_DIRECT);
    }
  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
			      0, OPTAB_DIRECT);
}

/* Generate code for floating point ABS or NEG.  */

void
ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
				rtx operands[])
{
  rtx set, dst, src;
  bool use_sse = false;
  bool vector_mode = VECTOR_MODE_P (mode);
  machine_mode vmode = mode;
  rtvec par;

  if (vector_mode)
    use_sse = true;
  else if (mode == TFmode)
    use_sse = true;
  else if (TARGET_SSE_MATH)
    {
      use_sse = SSE_FLOAT_MODE_P (mode);
      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
    }

  dst = operands[0];
  src = operands[1];

  set = gen_rtx_fmt_e (code, mode, src);
  set = gen_rtx_SET (dst, set);

  if (use_sse)
    {
      rtx mask, use, clob;

      /* NEG and ABS performed with SSE use bitwise mask operations.
	 Create the appropriate mask now.  */
      mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
      use = gen_rtx_USE (VOIDmode, mask);
      if (vector_mode)
	par = gen_rtvec (2, set, use);
      else
	{
	  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
	  par = gen_rtvec (3, set, use, clob);
	}
    }
  else
    {
      rtx clob;

      /* Changing the sign of FP values can also be done using the
	 integer unit.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      par = gen_rtvec (2, set, clob);
    }

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
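
/* For the SSE path the mask operation typically ends up as (sketch,
   SFmode, with illustrative constant-pool labels):

     xorps .LC_signbit(%rip), %xmm0       for NEG
     andps .LC_notsignbit(%rip), %xmm0    for ABS

   The USE of the mask records the constant for the later splitters that
   pick the concrete instruction.  */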
1774
1775/* Deconstruct a floating point ABS or NEG operation
1776   with integer registers into integer operations.  */
1777
1778void
1779ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1780			       rtx operands[])
1781{
1782  enum rtx_code absneg_op;
1783  rtx dst, set;
1784
1785  gcc_assert (operands_match_p (operands[0], operands[1]));
1786
1787  switch (mode)
1788    {
1789    case E_SFmode:
1790      dst = gen_lowpart (SImode, operands[0]);
1791
1792      if (code == ABS)
1793	{
1794	  set = gen_int_mode (0x7fffffff, SImode);
1795	  absneg_op = AND;
1796	}
1797      else
1798	{
1799	  set = gen_int_mode (0x80000000, SImode);
1800	  absneg_op = XOR;
1801	}
1802      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1803      break;
1804
1805    case E_DFmode:
1806      if (TARGET_64BIT)
1807	{
1808	  dst = gen_lowpart (DImode, operands[0]);
1809	  dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
1810
1811	  if (code == ABS)
1812	    set = const0_rtx;
1813	  else
1814	    set = gen_rtx_NOT (DImode, dst);
1815	}
1816      else
1817	{
1818	  dst = gen_highpart (SImode, operands[0]);
1819
1820	  if (code == ABS)
1821	    {
1822	      set = gen_int_mode (0x7fffffff, SImode);
1823	      absneg_op = AND;
1824	    }
1825	  else
1826	    {
1827	      set = gen_int_mode (0x80000000, SImode);
1828	      absneg_op = XOR;
1829	    }
1830	  set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1831	}
1832      break;
1833
1834    case E_XFmode:
1835      dst = gen_rtx_REG (SImode,
1836			 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
1837      if (code == ABS)
1838	{
1839	  set = GEN_INT (0x7fff);
1840	  absneg_op = AND;
1841	}
1842      else
1843	{
1844	  set = GEN_INT (0x8000);
1845	  absneg_op = XOR;
1846	}
1847      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1848      break;
1849
1850    default:
1851      gcc_unreachable ();
1852    }
1853
1854  set = gen_rtx_SET (dst, set);
1855
1856  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1857  rtvec par = gen_rtvec (2, set, clob);
1858
1859  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1860}
1861
1862/* Expand a copysign operation.  Special case operand 0 being a constant.  */
1863
1864void
1865ix86_expand_copysign (rtx operands[])
1866{
1867  machine_mode mode, vmode;
1868  rtx dest, op0, op1, mask;
1869
1870  dest = operands[0];
1871  op0 = operands[1];
1872  op1 = operands[2];
1873
1874  mode = GET_MODE (dest);
1875
1876  if (mode == SFmode)
1877    vmode = V4SFmode;
1878  else if (mode == DFmode)
1879    vmode = V2DFmode;
1880  else if (mode == TFmode)
1881    vmode = mode;
1882  else
1883    gcc_unreachable ();
1884
1885  mask = ix86_build_signbit_mask (vmode, 0, 0);
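  /* MASK now holds the sign bit of each element; the patterns emitted
     below compute copysign (a, b) as (a & ~signbit) | (b & signbit).  */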
1886
1887  if (CONST_DOUBLE_P (op0))
1888    {
1889      if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
1890	op0 = simplify_unary_operation (ABS, mode, op0, mode);
1891
1892      if (mode == SFmode || mode == DFmode)
1893	{
1894	  if (op0 == CONST0_RTX (mode))
1895	    op0 = CONST0_RTX (vmode);
1896	  else
1897	    {
1898	      rtx v = ix86_build_const_vector (vmode, false, op0);
1899
1900	      op0 = force_reg (vmode, v);
1901	    }
1902	}
1903      else if (op0 != CONST0_RTX (mode))
1904	op0 = force_reg (mode, op0);
1905
1906      emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
1907    }
1908  else
1909    {
1910      rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);
1911
1912      emit_insn (gen_copysign3_var
1913		 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
1914    }
1915}
1916
1917/* Deconstruct a copysign operation into bit masks.  Operand 0 is known to
1918   be a constant, and so has already been expanded into a vector constant.  */
1919
1920void
1921ix86_split_copysign_const (rtx operands[])
1922{
1923  machine_mode mode, vmode;
1924  rtx dest, op0, mask, x;
1925
1926  dest = operands[0];
1927  op0 = operands[1];
1928  mask = operands[3];
1929
1930  mode = GET_MODE (dest);
1931  vmode = GET_MODE (mask);
1932
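  /* The insn being split ties DEST to the sign-source operand, so ANDing
     DEST with the sign-bit MASK isolates the sign, and ORing in the
     pre-built absolute-value constant OP0 completes the copysign.  */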
1933  dest = lowpart_subreg (vmode, dest, mode);
1934  x = gen_rtx_AND (vmode, dest, mask);
1935  emit_insn (gen_rtx_SET (dest, x));
1936
1937  if (op0 != CONST0_RTX (vmode))
1938    {
1939      x = gen_rtx_IOR (vmode, dest, op0);
1940      emit_insn (gen_rtx_SET (dest, x));
1941    }
1942}
1943
1944/* Deconstruct a copysign operation into bit masks.  Operand 0 is variable,
1945   so we have to do two masks.  */
1946
1947void
1948ix86_split_copysign_var (rtx operands[])
1949{
1950  machine_mode mode, vmode;
1951  rtx dest, scratch, op0, op1, mask, nmask, x;
1952
1953  dest = operands[0];
1954  scratch = operands[1];
1955  op0 = operands[2];
1956  op1 = operands[3];
1957  nmask = operands[4];
1958  mask = operands[5];
1959
1960  mode = GET_MODE (dest);
1961  vmode = GET_MODE (mask);
1962
1963  if (rtx_equal_p (op0, op1))
1964    {
1965      /* Shouldn't happen often (it's useless, obviously), but when it does
1966	 we'd generate incorrect code if we continue below.  */
1967      emit_move_insn (dest, op0);
1968      return;
1969    }
1970
1971  if (REG_P (mask) && REGNO (dest) == REGNO (mask))	/* alternative 0 */
1972    {
1973      gcc_assert (REGNO (op1) == REGNO (scratch));
1974
1975      x = gen_rtx_AND (vmode, scratch, mask);
1976      emit_insn (gen_rtx_SET (scratch, x));
1977
1978      dest = mask;
1979      op0 = lowpart_subreg (vmode, op0, mode);
1980      x = gen_rtx_NOT (vmode, dest);
1981      x = gen_rtx_AND (vmode, x, op0);
1982      emit_insn (gen_rtx_SET (dest, x));
1983    }
1984  else
1985    {
1986      if (REGNO (op1) == REGNO (scratch))		/* alternative 1,3 */
1987	{
1988	  x = gen_rtx_AND (vmode, scratch, mask);
1989	}
1990      else						/* alternative 2,4 */
1991	{
1992          gcc_assert (REGNO (mask) == REGNO (scratch));
1993          op1 = lowpart_subreg (vmode, op1, mode);
1994	  x = gen_rtx_AND (vmode, scratch, op1);
1995	}
1996      emit_insn (gen_rtx_SET (scratch, x));
1997
1998      if (REGNO (op0) == REGNO (dest))			/* alternative 1,2 */
1999	{
2000	  dest = lowpart_subreg (vmode, op0, mode);
2001	  x = gen_rtx_AND (vmode, dest, nmask);
2002	}
2003      else						/* alternative 3,4 */
2004	{
2005          gcc_assert (REGNO (nmask) == REGNO (dest));
2006	  dest = nmask;
2007	  op0 = lowpart_subreg (vmode, op0, mode);
2008	  x = gen_rtx_AND (vmode, dest, op0);
2009	}
2010      emit_insn (gen_rtx_SET (dest, x));
2011    }
2012
2013  x = gen_rtx_IOR (vmode, dest, scratch);
2014  emit_insn (gen_rtx_SET (dest, x));
2015}
2016
2017/* Expand an xorsign operation.  */
2018
2019void
2020ix86_expand_xorsign (rtx operands[])
2021{
2022  machine_mode mode, vmode;
2023  rtx dest, op0, op1, mask;
2024
2025  dest = operands[0];
2026  op0 = operands[1];
2027  op1 = operands[2];
2028
2029  mode = GET_MODE (dest);
2030
2031  if (mode == SFmode)
2032    vmode = V4SFmode;
2033  else if (mode == DFmode)
2034    vmode = V2DFmode;
2035  else
2036    gcc_unreachable ();
2037
2038  mask = ix86_build_signbit_mask (vmode, 0, 0);
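  /* MASK holds the sign bit of each element; xorsign (a, b) computes
     a ^ (b & signbit), i.e. A is negated whenever B is negative.  */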
2039
2040  emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
2041}
2042
2043/* Deconstruct an xorsign operation into bit masks.  */
2044
2045void
2046ix86_split_xorsign (rtx operands[])
2047{
2048  machine_mode mode, vmode;
2049  rtx dest, op0, mask, x;
2050
2051  dest = operands[0];
2052  op0 = operands[1];
2053  mask = operands[3];
2054
2055  mode = GET_MODE (dest);
2056  vmode = GET_MODE (mask);
2057
2058  dest = lowpart_subreg (vmode, dest, mode);
2059  x = gen_rtx_AND (vmode, dest, mask);
2060  emit_insn (gen_rtx_SET (dest, x));
2061
2062  op0 = lowpart_subreg (vmode, op0, mode);
2063  x = gen_rtx_XOR (vmode, dest, op0);
2064  emit_insn (gen_rtx_SET (dest, x));
2065}
2066
2067static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2068
2069void
2070ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2071{
2072  machine_mode mode = GET_MODE (op0);
2073  rtx tmp;
2074
  /* Handle the special case of a vector comparison with a boolean result;
     transform it to use the ptest instruction.  */
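  /* ptest sets ZF iff the bitwise AND of its operands is all zeros, so
     applying it to (op0 ^ op1) with itself yields ZF=1 exactly when the
     two vectors are equal.  */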
2077  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2078    {
2079      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2080      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2081
2082      gcc_assert (code == EQ || code == NE);
      /* Generate XOR since we can't check that one operand is a zero
	 vector.  */
2084      tmp = gen_reg_rtx (mode);
2085      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2086      tmp = gen_lowpart (p_mode, tmp);
2087      emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2088			      gen_rtx_UNSPEC (CCmode,
2089					      gen_rtvec (2, tmp, tmp),
2090					      UNSPEC_PTEST)));
2091      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2092      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2093				  gen_rtx_LABEL_REF (VOIDmode, label),
2094				  pc_rtx);
2095      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2096      return;
2097    }
2098
2099  switch (mode)
2100    {
2101    case E_SFmode:
2102    case E_DFmode:
2103    case E_XFmode:
2104    case E_QImode:
2105    case E_HImode:
2106    case E_SImode:
2107      simple:
2108      tmp = ix86_expand_compare (code, op0, op1);
2109      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2110				  gen_rtx_LABEL_REF (VOIDmode, label),
2111				  pc_rtx);
2112      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2113      return;
2114
2115    case E_DImode:
2116      if (TARGET_64BIT)
2117	goto simple;
      /* For 32-bit targets, a DImode comparison may be performed in
	 SSE registers.  To allow this, avoid splitting into SImode by
	 doing the xor in DImode and then comparing with zero, a
	 pattern recognized by the STV pass.  Don't use the xor form
	 when optimizing for size.  */
2124      if (!optimize_insn_for_size_p ()
2125	  && TARGET_STV
2126	  && (code == EQ || code == NE))
2127	{
2128	  op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2129	  op1 = const0_rtx;
2130	}
2131      /* FALLTHRU */
2132    case E_TImode:
	/* Expand a double-word branch into multiple compare+branch.  */
2134      {
2135	rtx lo[2], hi[2];
2136	rtx_code_label *label2;
2137	enum rtx_code code1, code2, code3;
2138	machine_mode submode;
2139
2140	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2141	  {
2142	    std::swap (op0, op1);
2143	    code = swap_condition (code);
2144	  }
2145
2146	split_double_mode (mode, &op0, 1, lo+0, hi+0);
2147	split_double_mode (mode, &op1, 1, lo+1, hi+1);
2148
2149	submode = mode == DImode ? SImode : DImode;
2150
2151	/* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2152	   avoid two branches.  This costs one extra insn, so disable when
2153	   optimizing for size.  */
2154
2155	if ((code == EQ || code == NE)
2156	    && (!optimize_insn_for_size_p ()
2157	        || hi[1] == const0_rtx || lo[1] == const0_rtx))
2158	  {
2159	    rtx xor0, xor1;
2160
2161	    xor1 = hi[0];
2162	    if (hi[1] != const0_rtx)
2163	      xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2164				   NULL_RTX, 0, OPTAB_WIDEN);
2165
2166	    xor0 = lo[0];
2167	    if (lo[1] != const0_rtx)
2168	      xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2169				   NULL_RTX, 0, OPTAB_WIDEN);
2170
2171	    tmp = expand_binop (submode, ior_optab, xor1, xor0,
2172				NULL_RTX, 0, OPTAB_WIDEN);
2173
2174	    ix86_expand_branch (code, tmp, const0_rtx, label);
2175	    return;
2176	  }
2177
2178	/* Otherwise, if we are doing less-than or greater-or-equal-than,
2179	   op1 is a constant and the low word is zero, then we can just
2180	   examine the high word.  Similarly for low word -1 and
2181	   less-or-equal-than or greater-than.  */
2182
2183	if (CONST_INT_P (hi[1]))
2184	  switch (code)
2185	    {
2186	    case LT: case LTU: case GE: case GEU:
2187	      if (lo[1] == const0_rtx)
2188		{
2189		  ix86_expand_branch (code, hi[0], hi[1], label);
2190		  return;
2191		}
2192	      break;
2193	    case LE: case LEU: case GT: case GTU:
2194	      if (lo[1] == constm1_rtx)
2195		{
2196		  ix86_expand_branch (code, hi[0], hi[1], label);
2197		  return;
2198		}
2199	      break;
2200	    default:
2201	      break;
2202	    }
2203
	/* Emulate comparisons that do not depend on the Zero flag with
	   double-word subtraction.  Note that only the Overflow, Sign
	   and Carry flags are valid, so swap the arguments and condition
	   of comparisons that would otherwise test the Zero flag.  */
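	/* Concretely: compare the low halves, subtract the high halves
	   with borrow into a scratch so that only the flags survive, and
	   branch on the carry (unsigned) or sign/overflow (signed) state
	   of that double-word subtraction.  */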
2208
2209	switch (code)
2210	  {
2211	  case LE: case LEU: case GT: case GTU:
2212	    std::swap (lo[0], lo[1]);
2213	    std::swap (hi[0], hi[1]);
2214	    code = swap_condition (code);
2215	    /* FALLTHRU */
2216
2217	  case LT: case LTU: case GE: case GEU:
2218	    {
2219	      bool uns = (code == LTU || code == GEU);
2220	      rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2221		= uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2222
2223	      if (!nonimmediate_operand (lo[0], submode))
2224		lo[0] = force_reg (submode, lo[0]);
2225	      if (!x86_64_general_operand (lo[1], submode))
2226		lo[1] = force_reg (submode, lo[1]);
2227
2228	      if (!register_operand (hi[0], submode))
2229		hi[0] = force_reg (submode, hi[0]);
2230	      if ((uns && !nonimmediate_operand (hi[1], submode))
2231		  || (!uns && !x86_64_general_operand (hi[1], submode)))
2232		hi[1] = force_reg (submode, hi[1]);
2233
2234	      emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2235
2236	      tmp = gen_rtx_SCRATCH (submode);
2237	      emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2238
2239	      tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2240	      ix86_expand_branch (code, tmp, const0_rtx, label);
2241	      return;
2242	    }
2243
2244	  default:
2245	    break;
2246	  }
2247
2248	/* Otherwise, we need two or three jumps.  */
2249
2250	label2 = gen_label_rtx ();
2251
2252	code1 = code;
2253	code2 = swap_condition (code);
2254	code3 = unsigned_condition (code);
2255
2256	switch (code)
2257	  {
2258	  case LT: case GT: case LTU: case GTU:
2259	    break;
2260
2261	  case LE:   code1 = LT;  code2 = GT;  break;
2262	  case GE:   code1 = GT;  code2 = LT;  break;
2263	  case LEU:  code1 = LTU; code2 = GTU; break;
2264	  case GEU:  code1 = GTU; code2 = LTU; break;
2265
2266	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
2267	  case NE:   code2 = UNKNOWN; break;
2268
2269	  default:
2270	    gcc_unreachable ();
2271	  }
2272
2273	/*
2274	 * a < b =>
2275	 *    if (hi(a) < hi(b)) goto true;
2276	 *    if (hi(a) > hi(b)) goto false;
2277	 *    if (lo(a) < lo(b)) goto true;
2278	 *  false:
2279	 */
2280
2281	if (code1 != UNKNOWN)
2282	  ix86_expand_branch (code1, hi[0], hi[1], label);
2283	if (code2 != UNKNOWN)
2284	  ix86_expand_branch (code2, hi[0], hi[1], label2);
2285
2286	ix86_expand_branch (code3, lo[0], lo[1], label);
2287
2288	if (code2 != UNKNOWN)
2289	  emit_label (label2);
2290	return;
2291      }
2292
2293    default:
2294      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2295      goto simple;
2296    }
2297}
2298
2299/* Figure out whether to use unordered fp comparisons.  */
2300
2301static bool
2302ix86_unordered_fp_compare (enum rtx_code code)
2303{
2304  if (!TARGET_IEEE_FP)
2305    return false;
2306
2307  switch (code)
2308    {
2309    case LT:
2310    case LE:
2311    case GT:
2312    case GE:
2313    case LTGT:
2314      return false;
2315
2316    case EQ:
2317    case NE:
2318
2319    case UNORDERED:
2320    case ORDERED:
2321    case UNLT:
2322    case UNLE:
2323    case UNGT:
2324    case UNGE:
2325    case UNEQ:
2326      return true;
2327
2328    default:
2329      gcc_unreachable ();
2330    }
2331}
2332
/* Return a comparison code that we can do and that is equivalent to
   swap_condition (code), apart possibly from orderedness.
   Never change orderedness if TARGET_IEEE_FP; return UNKNOWN in
   that case if necessary.  */
2337
2338static enum rtx_code
2339ix86_fp_swap_condition (enum rtx_code code)
2340{
2341  switch (code)
2342    {
2343    case GT:                   /* GTU - CF=0 & ZF=0 */
2344      return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2345    case GE:                   /* GEU - CF=0 */
2346      return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2347    case UNLT:                 /* LTU - CF=1 */
2348      return TARGET_IEEE_FP ? UNKNOWN : GT;
2349    case UNLE:                 /* LEU - CF=1 | ZF=1 */
2350      return TARGET_IEEE_FP ? UNKNOWN : GE;
2351    default:
2352      return swap_condition (code);
2353    }
2354}
2355
/* Return the cost of comparison CODE using the best strategy for
   performance.  All following functions use the number of instructions
   as the cost metric.  In the future this should be tweaked to compute
   bytes for optimize_size and to take into account the performance of
   various instructions on various CPUs.  */
2360
2361static int
2362ix86_fp_comparison_cost (enum rtx_code code)
2363{
2364  int arith_cost;
2365
2366  /* The cost of code using bit-twiddling on %ah.  */
2367  switch (code)
2368    {
2369    case UNLE:
2370    case UNLT:
2371    case LTGT:
2372    case GT:
2373    case GE:
2374    case UNORDERED:
2375    case ORDERED:
2376    case UNEQ:
2377      arith_cost = 4;
2378      break;
2379    case LT:
2380    case NE:
2381    case EQ:
2382    case UNGE:
2383      arith_cost = TARGET_IEEE_FP ? 5 : 4;
2384      break;
2385    case LE:
2386    case UNGT:
2387      arith_cost = TARGET_IEEE_FP ? 6 : 4;
2388      break;
2389    default:
2390      gcc_unreachable ();
2391    }
2392
2393  switch (ix86_fp_comparison_strategy (code))
2394    {
2395    case IX86_FPCMP_COMI:
2396      return arith_cost > 4 ? 3 : 2;
2397    case IX86_FPCMP_SAHF:
2398      return arith_cost > 4 ? 4 : 3;
2399    default:
2400      return arith_cost;
2401    }
2402}
2403
2404/* Swap, force into registers, or otherwise massage the two operands
2405   to a fp comparison.  The operands are updated in place; the new
2406   comparison code is returned.  */
2407
2408static enum rtx_code
2409ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2410{
2411  bool unordered_compare = ix86_unordered_fp_compare (code);
2412  rtx op0 = *pop0, op1 = *pop1;
2413  machine_mode op_mode = GET_MODE (op0);
2414  bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
2415
2416  /* All of the unordered compare instructions only work on registers.
2417     The same is true of the fcomi compare instructions.  The XFmode
2418     compare instructions require registers except when comparing
2419     against zero or when converting operand 1 from fixed point to
2420     floating point.  */
2421
2422  if (!is_sse
2423      && (unordered_compare
2424	  || (op_mode == XFmode
2425	      && ! (standard_80387_constant_p (op0) == 1
2426		    || standard_80387_constant_p (op1) == 1)
2427	      && GET_CODE (op1) != FLOAT)
2428	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2429    {
2430      op0 = force_reg (op_mode, op0);
2431      op1 = force_reg (op_mode, op1);
2432    }
2433  else
2434    {
2435      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
2436	 things around if they appear profitable, otherwise force op0
2437	 into a register.  */
2438
2439      if (standard_80387_constant_p (op0) == 0
2440	  || (MEM_P (op0)
2441	      && ! (standard_80387_constant_p (op1) == 0
2442		    || MEM_P (op1))))
2443	{
2444	  enum rtx_code new_code = ix86_fp_swap_condition (code);
2445	  if (new_code != UNKNOWN)
2446	    {
2447	      std::swap (op0, op1);
2448	      code = new_code;
2449	    }
2450	}
2451
2452      if (!REG_P (op0))
2453	op0 = force_reg (op_mode, op0);
2454
2455      if (CONSTANT_P (op1))
2456	{
2457	  int tmp = standard_80387_constant_p (op1);
2458	  if (tmp == 0)
2459	    op1 = validize_mem (force_const_mem (op_mode, op1));
2460	  else if (tmp == 1)
2461	    {
2462	      if (TARGET_CMOVE)
2463		op1 = force_reg (op_mode, op1);
2464	    }
2465	  else
2466	    op1 = force_reg (op_mode, op1);
2467	}
2468    }
2469
2470  /* Try to rearrange the comparison to make it cheaper.  */
2471  if (ix86_fp_comparison_cost (code)
2472      > ix86_fp_comparison_cost (swap_condition (code))
2473      && (REG_P (op1) || can_create_pseudo_p ()))
2474    {
2475      std::swap (op0, op1);
2476      code = swap_condition (code);
2477      if (!REG_P (op0))
2478	op0 = force_reg (op_mode, op0);
2479    }
2480
2481  *pop0 = op0;
2482  *pop1 = op1;
2483  return code;
2484}
2485
2486/* Generate insn patterns to do a floating point compare of OPERANDS.  */
2487
2488static rtx
2489ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2490{
2491  bool unordered_compare = ix86_unordered_fp_compare (code);
2492  machine_mode cmp_mode;
2493  rtx tmp, scratch;
2494
2495  code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2496
2497  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2498  if (unordered_compare)
2499    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2500
2501  /* Do fcomi/sahf based test when profitable.  */
2502  switch (ix86_fp_comparison_strategy (code))
2503    {
2504    case IX86_FPCMP_COMI:
2505      cmp_mode = CCFPmode;
2506      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2507      break;
2508
2509    case IX86_FPCMP_SAHF:
2510      cmp_mode = CCFPmode;
2511      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2512      scratch = gen_reg_rtx (HImode);
2513      emit_insn (gen_rtx_SET (scratch, tmp));
2514      emit_insn (gen_x86_sahf_1 (scratch));
2515      break;
2516
2517    case IX86_FPCMP_ARITH:
2518      cmp_mode = CCNOmode;
2519      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2520      scratch = gen_reg_rtx (HImode);
2521      emit_insn (gen_rtx_SET (scratch, tmp));
2522
2523      /* In the unordered case, we have to check C2 for NaN's, which
2524	 doesn't happen to work out to anything nice combination-wise.
2525	 So do some bit twiddling on the value we've got in AH to come
2526	 up with an appropriate set of condition codes.  */
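      /* In AH the status bits appear as C0 = 0x01, C2 = 0x04 and
	 C3 = 0x40; an unordered result sets all three, which is why
	 masks such as 0x45 (C0|C2|C3) and 0x40 (C3) are tested below.  */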
2527
2528      switch (code)
2529	{
2530	case GT:
2531	case UNGT:
2532	  if (code == GT || !TARGET_IEEE_FP)
2533	    {
2534	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2535	      code = EQ;
2536	    }
2537	  else
2538	    {
2539	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2540	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2541	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2542	      cmp_mode = CCmode;
2543	      code = GEU;
2544	    }
2545	  break;
2546	case LT:
2547	case UNLT:
2548	  if (code == LT && TARGET_IEEE_FP)
2549	    {
2550	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2551	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2552	      cmp_mode = CCmode;
2553	      code = EQ;
2554	    }
2555	  else
2556	    {
2557	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2558	      code = NE;
2559	    }
2560	  break;
2561	case GE:
2562	case UNGE:
2563	  if (code == GE || !TARGET_IEEE_FP)
2564	    {
2565	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2566	      code = EQ;
2567	    }
2568	  else
2569	    {
2570	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2571	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2572	      code = NE;
2573	    }
2574	  break;
2575	case LE:
2576	case UNLE:
2577	  if (code == LE && TARGET_IEEE_FP)
2578	    {
2579	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2580	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2581	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2582	      cmp_mode = CCmode;
2583	      code = LTU;
2584	    }
2585	  else
2586	    {
2587	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2588	      code = NE;
2589	    }
2590	  break;
2591	case EQ:
2592	case UNEQ:
2593	  if (code == EQ && TARGET_IEEE_FP)
2594	    {
2595	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2596	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2597	      cmp_mode = CCmode;
2598	      code = EQ;
2599	    }
2600	  else
2601	    {
2602	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2603	      code = NE;
2604	    }
2605	  break;
2606	case NE:
2607	case LTGT:
2608	  if (code == NE && TARGET_IEEE_FP)
2609	    {
2610	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2611	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2612					     GEN_INT (0x40)));
2613	      code = NE;
2614	    }
2615	  else
2616	    {
2617	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2618	      code = EQ;
2619	    }
2620	  break;
2621
2622	case UNORDERED:
2623	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2624	  code = NE;
2625	  break;
2626	case ORDERED:
2627	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2628	  code = EQ;
2629	  break;
2630
2631	default:
2632	  gcc_unreachable ();
2633	}
      break;
2635
2636    default:
      gcc_unreachable ();
2638    }
2639
2640  /* Return the test that should be put into the flags user, i.e.
2641     the bcc, scc, or cmov instruction.  */
2642  return gen_rtx_fmt_ee (code, VOIDmode,
2643			 gen_rtx_REG (cmp_mode, FLAGS_REG),
2644			 const0_rtx);
2645}
2646
2647/* Generate insn patterns to do an integer compare of OPERANDS.  */
2648
2649static rtx
2650ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2651{
2652  machine_mode cmpmode;
2653  rtx tmp, flags;
2654
2655  cmpmode = SELECT_CC_MODE (code, op0, op1);
2656  flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2657
2658  /* This is very simple, but making the interface the same as in the
2659     FP case makes the rest of the code easier.  */
2660  tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2661  emit_insn (gen_rtx_SET (flags, tmp));
2662
2663  /* Return the test that should be put into the flags user, i.e.
2664     the bcc, scc, or cmov instruction.  */
2665  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2666}
2667
2668static rtx
2669ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2670{
2671  rtx ret;
2672
2673  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2674    ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2675
2676  else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2677    {
2678      gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2679      ret = ix86_expand_fp_compare (code, op0, op1);
2680    }
2681  else
2682    ret = ix86_expand_int_compare (code, op0, op1);
2683
2684  return ret;
2685}
2686
2687void
2688ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2689{
2690  rtx ret;
2691
2692  gcc_assert (GET_MODE (dest) == QImode);
2693
2694  ret = ix86_expand_compare (code, op0, op1);
2695  PUT_MODE (ret, QImode);
2696  emit_insn (gen_rtx_SET (dest, ret));
2697}
2698
/* Expand a comparison setting or clearing the carry flag.  Return true
   when successful and set *POP to the comparison operation.  */
2701static bool
2702ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2703{
2704  machine_mode mode
2705    = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2706
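  /* The goal is to end up with an LTU or GEU test of the carry flag,
     so that the result can be consumed directly by adc/sbb style
     sequences.  */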
2707  /* Do not handle double-mode compares that go through special path.  */
2708  if (mode == (TARGET_64BIT ? TImode : DImode))
2709    return false;
2710
2711  if (SCALAR_FLOAT_MODE_P (mode))
2712    {
2713      rtx compare_op;
2714      rtx_insn *compare_seq;
2715
2716      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2717
      /* Shortcut:  the following common codes never translate
	 into carry flag compares.  */
2720      if (code == EQ || code == NE || code == UNEQ || code == LTGT
2721	  || code == ORDERED || code == UNORDERED)
2722	return false;
2723
      /* These comparisons require the zero flag; swap the operands so
	 they won't.  */
2725      if ((code == GT || code == UNLE || code == LE || code == UNGT)
2726	  && !TARGET_IEEE_FP)
2727	{
2728	  std::swap (op0, op1);
2729	  code = swap_condition (code);
2730	}
2731
      /* Try to expand the comparison and verify that we end up with
	 a carry flag based comparison.  This fails only when we decide
	 to expand the comparison using arithmetic, which is not a
	 common scenario.  */
2736      start_sequence ();
2737      compare_op = ix86_expand_fp_compare (code, op0, op1);
2738      compare_seq = get_insns ();
2739      end_sequence ();
2740
2741      if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2742        code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2743      else
2744	code = GET_CODE (compare_op);
2745
2746      if (code != LTU && code != GEU)
2747	return false;
2748
2749      emit_insn (compare_seq);
2750      *pop = compare_op;
2751      return true;
2752    }
2753
2754  if (!INTEGRAL_MODE_P (mode))
2755    return false;
2756
2757  switch (code)
2758    {
2759    case LTU:
2760    case GEU:
2761      break;
2762
2763    /* Convert a==0 into (unsigned)a<1.  */
2764    case EQ:
2765    case NE:
2766      if (op1 != const0_rtx)
2767	return false;
2768      op1 = const1_rtx;
2769      code = (code == EQ ? LTU : GEU);
2770      break;
2771
    /* Convert a>b into b<a or a>=b+1.  */
2773    case GTU:
2774    case LEU:
2775      if (CONST_INT_P (op1))
2776	{
2777	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
	  /* Bail out on overflow.  We could still swap the operands, but
	     that would force loading the constant into a register.  */
2780	  if (op1 == const0_rtx
2781	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
2782	    return false;
2783	  code = (code == GTU ? GEU : LTU);
2784	}
2785      else
2786	{
2787	  std::swap (op0, op1);
2788	  code = (code == GTU ? LTU : GEU);
2789	}
2790      break;
2791
2792    /* Convert a>=0 into (unsigned)a<0x80000000.  */
2793    case LT:
2794    case GE:
2795      if (mode == DImode || op1 != const0_rtx)
2796	return false;
2797      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2798      code = (code == LT ? GEU : LTU);
2799      break;
2800    case LE:
2801    case GT:
2802      if (mode == DImode || op1 != constm1_rtx)
2803	return false;
2804      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2805      code = (code == LE ? GEU : LTU);
2806      break;
2807
2808    default:
2809      return false;
2810    }
  /* Swapping operands may cause a constant to appear as the first
     operand.  */
2812  if (!nonimmediate_operand (op0, VOIDmode))
2813    {
2814      if (!can_create_pseudo_p ())
2815	return false;
2816      op0 = force_reg (mode, op0);
2817    }
2818  *pop = ix86_expand_compare (code, op0, op1);
2819  gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
2820  return true;
2821}
2822
/* Expand conditional increment or decrement using adc/sbb instructions.
   The default case using setcc followed by a conditional move can be
   done by generic code.  */
2826bool
2827ix86_expand_int_addcc (rtx operands[])
2828{
2829  enum rtx_code code = GET_CODE (operands[1]);
2830  rtx flags;
2831  rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2832  rtx compare_op;
2833  rtx val = const0_rtx;
2834  bool fpcmp = false;
2835  machine_mode mode;
2836  rtx op0 = XEXP (operands[1], 0);
2837  rtx op1 = XEXP (operands[1], 1);
2838
2839  if (operands[3] != const1_rtx
2840      && operands[3] != constm1_rtx)
2841    return false;
2842  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2843     return false;
2844  code = GET_CODE (compare_op);
2845
2846  flags = XEXP (compare_op, 0);
2847
2848  if (GET_MODE (flags) == CCFPmode)
2849    {
2850      fpcmp = true;
2851      code = ix86_fp_compare_code_to_integer (code);
2852    }
2853
2854  if (code != LTU)
2855    {
2856      val = constm1_rtx;
2857      if (fpcmp)
2858	PUT_CODE (compare_op,
2859		  reverse_condition_maybe_unordered
2860		    (GET_CODE (compare_op)));
2861      else
2862	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
2863    }
2864
2865  mode = GET_MODE (operands[0]);
2866
2867  /* Construct either adc or sbb insn.  */
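  /* The carry insn computes operands[0] = operands[2] plus or minus one
     exactly when the original condition holds: either adc/sbb with a
     zero third operand when the carry itself encodes the condition, or
     with -1 after the condition has been reversed above.  */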
2868  if ((code == LTU) == (operands[3] == constm1_rtx))
2869    insn = gen_sub3_carry;
2870  else
2871    insn = gen_add3_carry;
2872
2873  emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2874
2875  return true;
2876}
2877
2878bool
2879ix86_expand_int_movcc (rtx operands[])
2880{
2881  enum rtx_code code = GET_CODE (operands[1]), compare_code;
2882  rtx_insn *compare_seq;
2883  rtx compare_op;
2884  machine_mode mode = GET_MODE (operands[0]);
2885  bool sign_bit_compare_p = false;
2886  rtx op0 = XEXP (operands[1], 0);
2887  rtx op1 = XEXP (operands[1], 1);
2888
2889  if (GET_MODE (op0) == TImode
2890      || (GET_MODE (op0) == DImode
2891	  && !TARGET_64BIT))
2892    return false;
2893
2894  start_sequence ();
2895  compare_op = ix86_expand_compare (code, op0, op1);
2896  compare_seq = get_insns ();
2897  end_sequence ();
2898
2899  compare_code = GET_CODE (compare_op);
2900
2901  if ((op1 == const0_rtx && (code == GE || code == LT))
2902      || (op1 == constm1_rtx && (code == GT || code == LE)))
2903    sign_bit_compare_p = true;
2904
2905  /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2906     HImode insns, we'd be swallowed in word prefix ops.  */
2907
2908  if ((mode != HImode || TARGET_FAST_PREFIX)
2909      && (mode != (TARGET_64BIT ? TImode : DImode))
2910      && CONST_INT_P (operands[2])
2911      && CONST_INT_P (operands[3]))
2912    {
2913      rtx out = operands[0];
2914      HOST_WIDE_INT ct = INTVAL (operands[2]);
2915      HOST_WIDE_INT cf = INTVAL (operands[3]);
2916      HOST_WIDE_INT diff;
2917
2918      diff = ct - cf;
      /* Sign bit compares are better done using shifts than using sbb.  */
2921      if (sign_bit_compare_p
2922	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2923	{
2924	  /* Detect overlap between destination and compare sources.  */
2925	  rtx tmp = out;
2926
2927          if (!sign_bit_compare_p)
2928	    {
2929	      rtx flags;
2930	      bool fpcmp = false;
2931
2932	      compare_code = GET_CODE (compare_op);
2933
2934	      flags = XEXP (compare_op, 0);
2935
2936	      if (GET_MODE (flags) == CCFPmode)
2937		{
2938		  fpcmp = true;
2939		  compare_code
2940		    = ix86_fp_compare_code_to_integer (compare_code);
2941		}
2942
2943	      /* To simplify rest of code, restrict to the GEU case.  */
2944	      if (compare_code == LTU)
2945		{
2946		  std::swap (ct, cf);
2947		  compare_code = reverse_condition (compare_code);
2948		  code = reverse_condition (code);
2949		}
2950	      else
2951		{
2952		  if (fpcmp)
2953		    PUT_CODE (compare_op,
2954			      reverse_condition_maybe_unordered
2955			        (GET_CODE (compare_op)));
2956		  else
2957		    PUT_CODE (compare_op,
2958			      reverse_condition (GET_CODE (compare_op)));
2959		}
2960	      diff = ct - cf;
2961
2962	      if (reg_overlap_mentioned_p (out, op0)
2963		  || reg_overlap_mentioned_p (out, op1))
2964		tmp = gen_reg_rtx (mode);
2965
2966	      if (mode == DImode)
2967		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
2968	      else
		emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
2970						 flags, compare_op));
2971	    }
2972	  else
2973	    {
2974	      if (code == GT || code == GE)
2975		code = reverse_condition (code);
2976	      else
2977		{
2978		  std::swap (ct, cf);
2979		  diff = ct - cf;
2980		}
2981	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
2982	    }
2983
2984	  if (diff == 1)
2985	    {
2986	      /*
2987	       * cmpl op0,op1
2988	       * sbbl dest,dest
2989	       * [addl dest, ct]
2990	       *
2991	       * Size 5 - 8.
2992	       */
2993	      if (ct)
2994		tmp = expand_simple_binop (mode, PLUS,
2995					   tmp, GEN_INT (ct),
2996					   copy_rtx (tmp), 1, OPTAB_DIRECT);
2997	    }
2998	  else if (cf == -1)
2999	    {
3000	      /*
3001	       * cmpl op0,op1
3002	       * sbbl dest,dest
3003	       * orl $ct, dest
3004	       *
3005	       * Size 8.
3006	       */
3007	      tmp = expand_simple_binop (mode, IOR,
3008					 tmp, GEN_INT (ct),
3009					 copy_rtx (tmp), 1, OPTAB_DIRECT);
3010	    }
3011	  else if (diff == -1 && ct)
3012	    {
3013	      /*
3014	       * cmpl op0,op1
3015	       * sbbl dest,dest
3016	       * notl dest
3017	       * [addl dest, cf]
3018	       *
3019	       * Size 8 - 11.
3020	       */
3021	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3022	      if (cf)
3023		tmp = expand_simple_binop (mode, PLUS,
3024					   copy_rtx (tmp), GEN_INT (cf),
3025					   copy_rtx (tmp), 1, OPTAB_DIRECT);
3026	    }
3027	  else
3028	    {
3029	      /*
3030	       * cmpl op0,op1
3031	       * sbbl dest,dest
3032	       * [notl dest]
3033	       * andl cf - ct, dest
3034	       * [addl dest, ct]
3035	       *
3036	       * Size 8 - 11.
3037	       */
3038
3039	      if (cf == 0)
3040		{
3041		  cf = ct;
3042		  ct = 0;
3043		  tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3044		}
3045
3046	      tmp = expand_simple_binop (mode, AND,
3047					 copy_rtx (tmp),
3048					 gen_int_mode (cf - ct, mode),
3049					 copy_rtx (tmp), 1, OPTAB_DIRECT);
3050	      if (ct)
3051		tmp = expand_simple_binop (mode, PLUS,
3052					   copy_rtx (tmp), GEN_INT (ct),
3053					   copy_rtx (tmp), 1, OPTAB_DIRECT);
3054	    }
3055
3056	  if (!rtx_equal_p (tmp, out))
3057	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3058
3059	  return true;
3060	}
3061
3062      if (diff < 0)
3063	{
3064	  machine_mode cmp_mode = GET_MODE (op0);
3065	  enum rtx_code new_code;
3066
3067	  if (SCALAR_FLOAT_MODE_P (cmp_mode))
3068	    {
3069	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3070
3071	      /* We may be reversing a non-trapping
3072		 comparison to a trapping comparison.  */
	      if (HONOR_NANS (cmp_mode) && flag_trapping_math
		  && code != EQ && code != NE
		  && code != ORDERED && code != UNORDERED)
		new_code = UNKNOWN;
	      else
		new_code = reverse_condition_maybe_unordered (code);
3079	    }
3080	  else
3081	    new_code = ix86_reverse_condition (code, cmp_mode);
3082	  if (new_code != UNKNOWN)
3083	    {
3084	      std::swap (ct, cf);
3085	      diff = -diff;
3086	      code = new_code;
3087	    }
3088	}
3089
3090      compare_code = UNKNOWN;
3091      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3092	  && CONST_INT_P (op1))
3093	{
3094	  if (op1 == const0_rtx
3095	      && (code == LT || code == GE))
3096	    compare_code = code;
3097	  else if (op1 == constm1_rtx)
3098	    {
3099	      if (code == LE)
3100		compare_code = LT;
3101	      else if (code == GT)
3102		compare_code = GE;
3103	    }
3104	}
3105
3106      /* Optimize dest = (op0 < 0) ? -1 : cf.  */
3107      if (compare_code != UNKNOWN
3108	  && GET_MODE (op0) == GET_MODE (out)
3109	  && (cf == -1 || ct == -1))
3110	{
3111	  /* If lea code below could be used, only optimize
3112	     if it results in a 2 insn sequence.  */
3113
3114	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3115		 || diff == 3 || diff == 5 || diff == 9)
3116	      || (compare_code == LT && ct == -1)
3117	      || (compare_code == GE && cf == -1))
3118	    {
3119	      /*
3120	       * notl op1	(if necessary)
3121	       * sarl $31, op1
3122	       * orl cf, op1
3123	       */
3124	      if (ct != -1)
3125		{
3126		  cf = ct;
3127		  ct = -1;
3128		  code = reverse_condition (code);
3129		}
3130
3131	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3132
3133	      out = expand_simple_binop (mode, IOR,
3134					 out, GEN_INT (cf),
3135					 out, 1, OPTAB_DIRECT);
3136	      if (out != operands[0])
3137		emit_move_insn (operands[0], out);
3138
3139	      return true;
3140	    }
3141	}
3142
3143
3144      if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3145	   || diff == 3 || diff == 5 || diff == 9)
3146	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3147	  && (mode != DImode
3148	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3149	{
3150	  /*
3151	   * xorl dest,dest
3152	   * cmpl op1,op2
3153	   * setcc dest
3154	   * lea cf(dest*(ct-cf)),dest
3155	   *
3156	   * Size 14.
3157	   *
3158	   * This also catches the degenerate setcc-only case.
3159	   */
3160
3161	  rtx tmp;
3162	  int nops;
3163
3164	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3165
3166	  nops = 0;
	  /* On x86_64 the lea instruction operates on Pmode, so we need
	     to get the arithmetic done in the proper mode to match.  */
3169	  if (diff == 1)
3170	    tmp = copy_rtx (out);
3171	  else
3172	    {
3173	      rtx out1;
3174	      out1 = copy_rtx (out);
3175	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3176	      nops++;
3177	      if (diff & 1)
3178		{
3179		  tmp = gen_rtx_PLUS (mode, tmp, out1);
3180		  nops++;
3181		}
3182	    }
3183	  if (cf != 0)
3184	    {
3185	      tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
3186	      nops++;
3187	    }
3188	  if (!rtx_equal_p (tmp, out))
3189	    {
3190	      if (nops == 1)
3191		out = force_operand (tmp, copy_rtx (out));
3192	      else
3193		emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3194	    }
3195	  if (!rtx_equal_p (out, operands[0]))
3196	    emit_move_insn (operands[0], copy_rtx (out));
3197
3198	  return true;
3199	}
3200
3201      /*
3202       * General case:			Jumpful:
3203       *   xorl dest,dest		cmpl op1, op2
3204       *   cmpl op1, op2		movl ct, dest
3205       *   setcc dest			jcc 1f
3206       *   decl dest			movl cf, dest
3207       *   andl (cf-ct),dest		1:
3208       *   addl ct,dest
3209       *
3210       * Size 20.			Size 14.
3211       *
3212       * This is reasonably steep, but branch mispredict costs are
3213       * high on modern cpus, so consider failing only if optimizing
3214       * for space.
3215       */
3216
3217      if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3218	  && BRANCH_COST (optimize_insn_for_speed_p (),
			  false) >= 2)
3220	{
3221	  if (cf == 0)
3222	    {
3223	      machine_mode cmp_mode = GET_MODE (op0);
3224	      enum rtx_code new_code;
3225
3226	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
3227		{
3228		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3229
3230		  /* We may be reversing a non-trapping
3231		     comparison to a trapping comparison.  */
3232		  if (HONOR_NANS (cmp_mode) && flag_trapping_math
3233		      && code != EQ && code != NE
3234		      && code != ORDERED && code != UNORDERED)
3235		    new_code = UNKNOWN;
3236		  else
3237		    new_code = reverse_condition_maybe_unordered (code);
3238
3239		}
3240	      else
3241		{
3242		  new_code = ix86_reverse_condition (code, cmp_mode);
3243		  if (compare_code != UNKNOWN && new_code != UNKNOWN)
3244		    compare_code = reverse_condition (compare_code);
3245		}
3246
3247	      if (new_code != UNKNOWN)
3248		{
3249		  cf = ct;
3250		  ct = 0;
3251		  code = new_code;
3252		}
3253	    }
3254
3255	  if (compare_code != UNKNOWN)
3256	    {
3257	      /* notl op1	(if needed)
3258		 sarl $31, op1
3259		 andl (cf-ct), op1
3260		 addl ct, op1
3261
3262		 For x < 0 (resp. x <= -1) there will be no notl,
3263		 so if possible swap the constants to get rid of the
3264		 complement.
3265		 True/false will be -1/0 while code below (store flag
3266		 followed by decrement) is 0/-1, so the constants need
3267		 to be exchanged once more.  */
3268
3269	      if (compare_code == GE || !cf)
3270		{
3271		  code = reverse_condition (code);
3272		  compare_code = LT;
3273		}
3274	      else
3275		std::swap (ct, cf);
3276
3277	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3278	    }
3279	  else
3280	    {
3281	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3282
3283	      out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3284					 constm1_rtx,
3285					 copy_rtx (out), 1, OPTAB_DIRECT);
3286	    }
3287
3288	  out = expand_simple_binop (mode, AND, copy_rtx (out),
3289				     gen_int_mode (cf - ct, mode),
3290				     copy_rtx (out), 1, OPTAB_DIRECT);
3291	  if (ct)
3292	    out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3293				       copy_rtx (out), 1, OPTAB_DIRECT);
3294	  if (!rtx_equal_p (out, operands[0]))
3295	    emit_move_insn (operands[0], copy_rtx (out));
3296
3297	  return true;
3298	}
3299    }
3300
3301  if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3302    {
      /* Try a few more things with specific constants and a variable.  */
3304
3305      optab op;
3306      rtx var, orig_out, out, tmp;
3307
3308      if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3309	return false;
3310
3311      /* If one of the two operands is an interesting constant, load a
3312	 constant with the above and mask it in with a logical operation.  */
3313
3314      if (CONST_INT_P (operands[2]))
3315	{
3316	  var = operands[3];
3317	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3318	    operands[3] = constm1_rtx, op = and_optab;
3319	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3320	    operands[3] = const0_rtx, op = ior_optab;
3321	  else
3322	    return false;
3323	}
3324      else if (CONST_INT_P (operands[3]))
3325	{
3326	  var = operands[2];
3327	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3328	    operands[2] = constm1_rtx, op = and_optab;
3329	  else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
3330	    operands[2] = const0_rtx, op = ior_optab;
3331	  else
3332	    return false;
3333	}
3334      else
3335        return false;
3336
3337      orig_out = operands[0];
3338      tmp = gen_reg_rtx (mode);
3339      operands[0] = tmp;
3340
3341      /* Recurse to get the constant loaded.  */
3342      if (!ix86_expand_int_movcc (operands))
3343        return false;
3344
3345      /* Mask in the interesting variable.  */
3346      out = expand_binop (mode, op, var, tmp, orig_out, 0,
3347			  OPTAB_WIDEN);
3348      if (!rtx_equal_p (out, orig_out))
3349	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3350
3351      return true;
3352    }
3353
3354  /*
3355   * For comparison with above,
3356   *
3357   * movl cf,dest
3358   * movl ct,tmp
3359   * cmpl op1,op2
3360   * cmovcc tmp,dest
3361   *
3362   * Size 15.
3363   */
3364
3365  if (! nonimmediate_operand (operands[2], mode))
3366    operands[2] = force_reg (mode, operands[2]);
3367  if (! nonimmediate_operand (operands[3], mode))
3368    operands[3] = force_reg (mode, operands[3]);
3369
3370  if (! register_operand (operands[2], VOIDmode)
3371      && (mode == QImode
3372          || ! register_operand (operands[3], VOIDmode)))
3373    operands[2] = force_reg (mode, operands[2]);
3374
3375  if (mode == QImode
3376      && ! register_operand (operands[3], VOIDmode))
3377    operands[3] = force_reg (mode, operands[3]);
3378
3379  emit_insn (compare_seq);
3380  emit_insn (gen_rtx_SET (operands[0],
3381			  gen_rtx_IF_THEN_ELSE (mode,
3382						compare_op, operands[2],
3383						operands[3])));
3384  return true;
3385}
3386
3387/* Detect conditional moves that exactly match min/max operational
3388   semantics.  Note that this is IEEE safe, as long as we don't
3389   interchange the operands.
3390
3391   Returns FALSE if this conditional move doesn't match a MIN/MAX,
3392   and TRUE if the operation is successful and instructions are emitted.  */
3393
3394static bool
3395ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3396			   rtx cmp_op1, rtx if_true, rtx if_false)
3397{
3398  machine_mode mode;
3399  bool is_min;
3400  rtx tmp;
3401
3402  if (code == LT)
3403    ;
3404  else if (code == UNGE)
3405    std::swap (if_true, if_false);
3406  else
3407    return false;
3408
3409  if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3410    is_min = true;
3411  else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3412    is_min = false;
3413  else
3414    return false;
3415
3416  mode = GET_MODE (dest);
3417
3418  /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3419     but MODE may be a vector mode and thus not appropriate.  */
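  /* SSE min/max instructions are not commutative for NaNs and signed
     zeros (they return the second source operand when the inputs are
     unordered or both are zeros), so use an UNSPEC that keeps the
     operand order fixed.  */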
3420  if (!flag_finite_math_only || flag_signed_zeros)
3421    {
3422      int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3423      rtvec v;
3424
3425      if_true = force_reg (mode, if_true);
3426      v = gen_rtvec (2, if_true, if_false);
3427      tmp = gen_rtx_UNSPEC (mode, v, u);
3428    }
3429  else
3430    {
3431      code = is_min ? SMIN : SMAX;
3432      if (MEM_P (if_true) && MEM_P (if_false))
3433	if_true = force_reg (mode, if_true);
3434      tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3435    }
3436
3437  emit_insn (gen_rtx_SET (dest, tmp));
3438  return true;
3439}
3440
/* Return true if MODE is valid for a vector compare to a mask register;
   the same holds for a conditional vector move with a mask register.  */
3443static bool
3444ix86_valid_mask_cmp_mode (machine_mode mode)
3445{
3446  /* XOP has its own vector conditional movement.  */
3447  if (TARGET_XOP && !TARGET_AVX512F)
3448    return false;
3449
3450  /* AVX512F is needed for mask operation.  */
3451  if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3452    return false;
3453
  /* AVX512BW is needed for vector QI/HImode,
     AVX512VL is needed for 128/256-bit vectors.  */
3456  machine_mode inner_mode = GET_MODE_INNER (mode);
3457  int vector_size = GET_MODE_SIZE (mode);
3458  if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3459    return false;
3460
3461  return vector_size == 64 || TARGET_AVX512VL;
3462}
3463
3464/* Expand an SSE comparison.  Return the register with the result.  */
3465
3466static rtx
3467ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3468		     rtx op_true, rtx op_false)
3469{
3470  machine_mode mode = GET_MODE (dest);
3471  machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3472
  /* In the general case the result of a comparison can differ from the
     operands' type.  */
3474  machine_mode cmp_mode;
3475
3476  /* In AVX512F the result of comparison is an integer mask.  */
3477  bool maskcmp = false;
3478  rtx x;
3479
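  /* With an AVX512 mask comparison the result has one bit per vector
     element, so the comparison mode is an integer mode with as many bits
     as there are elements (QImode when there are 8 or fewer).  */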
3480  if (ix86_valid_mask_cmp_mode (cmp_ops_mode))
3481    {
3482      unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3483      maskcmp = true;
3484      cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3485    }
3486  else
3487    cmp_mode = cmp_ops_mode;
3488
3489  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3490
3491  int (*op1_predicate)(rtx, machine_mode)
3492    = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3493
3494  if (!op1_predicate (cmp_op1, cmp_ops_mode))
3495    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3496
3497  if (optimize
3498      || (maskcmp && cmp_mode != mode)
3499      || (op_true && reg_overlap_mentioned_p (dest, op_true))
3500      || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3501    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3502
3503  if (maskcmp)
3504    {
3505      bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3506      gcc_assert (ok);
3507      return dest;
3508    }
3509
3510  x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3511
3512  if (cmp_mode != mode && !maskcmp)
3513    {
3514      x = force_reg (cmp_ops_mode, x);
3515      convert_move (dest, x, false);
3516    }
3517  else
3518    emit_insn (gen_rtx_SET (dest, x));
3519
3520  return dest;
3521}
3522
3523/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3524   operations.  This is used for both scalar and vector conditional moves.  */
3525
3526void
3527ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3528{
3529  machine_mode mode = GET_MODE (dest);
3530  machine_mode cmpmode = GET_MODE (cmp);
3531
3532  /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
3533  if (rtx_equal_p (op_true, op_false))
3534    {
3535      emit_move_insn (dest, op_true);
3536      return;
3537    }
3538
3539  /* In AVX512F the result of comparison is an integer mask.  */
3540  bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);
3541
3542  rtx t2, t3, x;
3543
  /* If we have an integer mask and an FP value then we need
     to cast the mask to the FP mode.  */
3546  if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3547    {
3548      cmp = force_reg (cmpmode, cmp);
3549      cmp = gen_rtx_SUBREG (mode, cmp, 0);
3550    }
3551
3552  if (maskcmp)
3553    {
3554      /* Using vector move with mask register.  */
3555      cmp = force_reg (cmpmode, cmp);
3556      /* Optimize for mask zero.  */
3557      op_true = (op_true != CONST0_RTX (mode)
3558		 ? force_reg (mode, op_true) : op_true);
3559      op_false = (op_false != CONST0_RTX (mode)
3560		  ? force_reg (mode, op_false) : op_false);
3561      if (op_true == CONST0_RTX (mode))
3562	{
3563	  rtx (*gen_not) (rtx, rtx);
3564	  switch (cmpmode)
3565	    {
3566	    case E_QImode: gen_not = gen_knotqi; break;
3567	    case E_HImode: gen_not = gen_knothi; break;
3568	    case E_SImode: gen_not = gen_knotsi; break;
3569	    case E_DImode: gen_not = gen_knotdi; break;
3570	    default: gcc_unreachable ();
3571	    }
3572	  rtx n = gen_reg_rtx (cmpmode);
3573	  emit_insn (gen_not (n, cmp));
3574	  cmp = n;
	  /* Swap op_true and op_false.  */
3576	  std::swap (op_true, op_false);
3577	}
3578
3579      rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
3580      emit_insn (gen_rtx_SET (dest, vec_merge));
3581      return;
3582    }
3583  else if (vector_all_ones_operand (op_true, mode)
3584	   && op_false == CONST0_RTX (mode))
3585    {
3586      emit_insn (gen_rtx_SET (dest, cmp));
3587      return;
3588    }
3589  else if (op_false == CONST0_RTX (mode))
3590    {
3591      op_true = force_reg (mode, op_true);
3592      x = gen_rtx_AND (mode, cmp, op_true);
3593      emit_insn (gen_rtx_SET (dest, x));
3594      return;
3595    }
3596  else if (op_true == CONST0_RTX (mode))
3597    {
3598      op_false = force_reg (mode, op_false);
3599      x = gen_rtx_NOT (mode, cmp);
3600      x = gen_rtx_AND (mode, x, op_false);
3601      emit_insn (gen_rtx_SET (dest, x));
3602      return;
3603    }
3604  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
3605    {
3606      op_false = force_reg (mode, op_false);
3607      x = gen_rtx_IOR (mode, cmp, op_false);
3608      emit_insn (gen_rtx_SET (dest, x));
3609      return;
3610    }
3611  else if (TARGET_XOP)
3612    {
3613      op_true = force_reg (mode, op_true);
3614
3615      if (!nonimmediate_operand (op_false, mode))
3616	op_false = force_reg (mode, op_false);
3617
3618      emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
3619							  op_true,
3620							  op_false)));
3621      return;
3622    }
3623
3624  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3625  rtx d = dest;
3626
3627  if (!vector_operand (op_true, mode))
3628    op_true = force_reg (mode, op_true);
3629
3630  op_false = force_reg (mode, op_false);
3631
3632  switch (mode)
3633    {
3634    case E_V4SFmode:
3635      if (TARGET_SSE4_1)
3636	gen = gen_sse4_1_blendvps;
3637      break;
3638    case E_V2DFmode:
3639      if (TARGET_SSE4_1)
3640	gen = gen_sse4_1_blendvpd;
3641      break;
3642    case E_SFmode:
3643      if (TARGET_SSE4_1)
3644	{
3645	  gen = gen_sse4_1_blendvss;
3646	  op_true = force_reg (mode, op_true);
3647	}
3648      break;
3649    case E_DFmode:
3650      if (TARGET_SSE4_1)
3651	{
3652	  gen = gen_sse4_1_blendvsd;
3653	  op_true = force_reg (mode, op_true);
3654	}
3655      break;
3656    case E_V16QImode:
3657    case E_V8HImode:
3658    case E_V4SImode:
3659    case E_V2DImode:
3660      if (TARGET_SSE4_1)
3661	{
3662	  gen = gen_sse4_1_pblendvb;
3663	  if (mode != V16QImode)
3664	    d = gen_reg_rtx (V16QImode);
3665	  op_false = gen_lowpart (V16QImode, op_false);
3666	  op_true = gen_lowpart (V16QImode, op_true);
3667	  cmp = gen_lowpart (V16QImode, cmp);
3668	}
3669      break;
3670    case E_V8SFmode:
3671      if (TARGET_AVX)
3672	gen = gen_avx_blendvps256;
3673      break;
3674    case E_V4DFmode:
3675      if (TARGET_AVX)
3676	gen = gen_avx_blendvpd256;
3677      break;
3678    case E_V32QImode:
3679    case E_V16HImode:
3680    case E_V8SImode:
3681    case E_V4DImode:
3682      if (TARGET_AVX2)
3683	{
3684	  gen = gen_avx2_pblendvb;
3685	  if (mode != V32QImode)
3686	    d = gen_reg_rtx (V32QImode);
3687	  op_false = gen_lowpart (V32QImode, op_false);
3688	  op_true = gen_lowpart (V32QImode, op_true);
3689	  cmp = gen_lowpart (V32QImode, cmp);
3690	}
3691      break;
3692
3693    case E_V64QImode:
3694      gen = gen_avx512bw_blendmv64qi;
3695      break;
3696    case E_V32HImode:
3697      gen = gen_avx512bw_blendmv32hi;
3698      break;
3699    case E_V16SImode:
3700      gen = gen_avx512f_blendmv16si;
3701      break;
3702    case E_V8DImode:
3703      gen = gen_avx512f_blendmv8di;
3704      break;
3705    case E_V8DFmode:
3706      gen = gen_avx512f_blendmv8df;
3707      break;
3708    case E_V16SFmode:
3709      gen = gen_avx512f_blendmv16sf;
3710      break;
3711
3712    default:
3713      break;
3714    }
3715
3716  if (gen != NULL)
3717    {
3718      emit_insn (gen (d, op_false, op_true, cmp));
3719      if (d != dest)
3720	emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
3721    }
3722  else
3723    {
3724      op_true = force_reg (mode, op_true);
3725
3726      t2 = gen_reg_rtx (mode);
3727      if (optimize)
3728	t3 = gen_reg_rtx (mode);
3729      else
3730	t3 = dest;
3731
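      /* Compute dest = (op_true & cmp) | (op_false & ~cmp).  */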
3732      x = gen_rtx_AND (mode, op_true, cmp);
3733      emit_insn (gen_rtx_SET (t2, x));
3734
3735      x = gen_rtx_NOT (mode, cmp);
3736      x = gen_rtx_AND (mode, x, op_false);
3737      emit_insn (gen_rtx_SET (t3, x));
3738
3739      x = gen_rtx_IOR (mode, t3, t2);
3740      emit_insn (gen_rtx_SET (dest, x));
3741    }
3742}
3743
3744/* Swap, force into registers, or otherwise massage the two operands
3745   to an sse comparison with a mask result.  Thus we differ a bit from
3746   ix86_prepare_fp_compare_args which expects to produce a flags result.
3747
3748   The DEST operand exists to help determine whether to commute commutative
3749   operators.  The POP0/POP1 operands are updated in place.  The new
3750   comparison code is returned, or UNKNOWN if not implementable.  */
3751
3752static enum rtx_code
3753ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
3754				  rtx *pop0, rtx *pop1)
3755{
3756  switch (code)
3757    {
3758    case LTGT:
3759    case UNEQ:
3760      /* AVX supports all the needed comparisons.  */
3761      if (TARGET_AVX)
3762	break;
3763      /* We have no LTGT as an operator.  We could implement it with
3764	 NE & ORDERED, but this requires an extra temporary.  It's
3765	 not clear that it's worth it.  */
3766      return UNKNOWN;
3767
3768    case LT:
3769    case LE:
3770    case UNGT:
3771    case UNGE:
3772      /* These are supported directly.  */
3773      break;
3774
3775    case EQ:
3776    case NE:
3777    case UNORDERED:
3778    case ORDERED:
3779      /* AVX has 3-operand comparisons; no need to swap anything.  */
3780      if (TARGET_AVX)
3781	break;
3782      /* For commutative operators, try to canonicalize the destination
3783	 operand to be first in the comparison - this helps reload to
3784	 avoid extra moves.  */
3785      if (!dest || !rtx_equal_p (dest, *pop1))
3786	break;
3787      /* FALLTHRU */
3788
3789    case GE:
3790    case GT:
3791    case UNLE:
3792    case UNLT:
3793      /* These are not supported directly before AVX, and furthermore
3794	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
3795	 comparison operands to transform into something that is
3796	 supported.  */
3797      std::swap (*pop0, *pop1);
3798      code = swap_condition (code);
3799      break;
3800
3801    default:
3802      gcc_unreachable ();
3803    }
3804
3805  return code;
3806}
3807
3808/* Expand a floating-point conditional move.  Return true if successful.  */
3809
3810bool
3811ix86_expand_fp_movcc (rtx operands[])
3812{
3813  machine_mode mode = GET_MODE (operands[0]);
3814  enum rtx_code code = GET_CODE (operands[1]);
3815  rtx tmp, compare_op;
3816  rtx op0 = XEXP (operands[1], 0);
3817  rtx op1 = XEXP (operands[1], 1);
3818
3819  if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
3820    {
3821      machine_mode cmode;
3822
3823      /* Since we have no cmove for SSE registers, don't force bad register
3824	 allocation just to gain access to it.  Deny movcc when the
3825	 comparison mode doesn't match the move mode.  */
3826      cmode = GET_MODE (op0);
3827      if (cmode == VOIDmode)
3828	cmode = GET_MODE (op1);
3829      if (cmode != mode)
3830	return false;
3831
3832      code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
3833      if (code == UNKNOWN)
3834	return false;
3835
3836      if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
3837				     operands[2], operands[3]))
3838	return true;
3839
3840      tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
3841				 operands[2], operands[3]);
3842      ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
3843      return true;
3844    }
3845
3846  if (GET_MODE (op0) == TImode
3847      || (GET_MODE (op0) == DImode
3848	  && !TARGET_64BIT))
3849    return false;
3850
3851  /* The floating point conditional move instructions don't directly
3852     support conditions resulting from a signed integer comparison.  */
3853
3854  compare_op = ix86_expand_compare (code, op0, op1);
3855  if (!fcmov_comparison_operator (compare_op, VOIDmode))
3856    {
3857      tmp = gen_reg_rtx (QImode);
3858      ix86_expand_setcc (tmp, code, op0, op1);
3859
3860      compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
3861    }
3862
3863  emit_insn (gen_rtx_SET (operands[0],
3864			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
3865						operands[2], operands[3])));
3866
3867  return true;
3868}
3869
3870/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */
3871
3872static int
3873ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
3874{
3875  switch (code)
3876    {
3877    case EQ:
3878      return 0;
3879    case LT:
3880    case LTU:
3881      return 1;
3882    case LE:
3883    case LEU:
3884      return 2;
3885    case NE:
3886      return 4;
3887    case GE:
3888    case GEU:
3889      return 5;
3890    case GT:
3891    case GTU:
3892      return 6;
3893    default:
3894      gcc_unreachable ();
3895    }
3896}
3897
3898/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */
3899
3900static int
3901ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
3902{
3903  switch (code)
3904    {
3905    case EQ:
3906      return 0x00;
3907    case NE:
3908      return 0x04;
3909    case GT:
3910      return 0x0e;
3911    case LE:
3912      return 0x02;
3913    case GE:
3914      return 0x0d;
3915    case LT:
3916      return 0x01;
3917    case UNLE:
3918      return 0x0a;
3919    case UNLT:
3920      return 0x09;
3921    case UNGE:
3922      return 0x05;
3923    case UNGT:
3924      return 0x06;
3925    case UNEQ:
3926      return 0x18;
3927    case LTGT:
3928      return 0x0c;
3929    case ORDERED:
3930      return 0x07;
3931    case UNORDERED:
3932      return 0x03;
3933    default:
3934      gcc_unreachable ();
3935    }
3936}
3937
3938/* Return immediate value to be used in UNSPEC_PCMP
3939   for comparison CODE in MODE.  */
3940
3941static int
3942ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
3943{
3944  if (FLOAT_MODE_P (mode))
3945    return ix86_fp_cmp_code_to_pcmp_immediate (code);
3946  return ix86_int_cmp_code_to_pcmp_immediate (code);
3947}
3948
3949/* Expand AVX-512 vector comparison.  */
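/* For example, an unsigned LEU comparison of two V16SImode operands is
   emitted as UNSPEC_UNSIGNED_PCMP with immediate 2, which corresponds to
   a vpcmpud instruction producing a 16-bit mask.  */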
3950
3951bool
3952ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
3953{
3954  machine_mode mask_mode = GET_MODE (dest);
3955  machine_mode cmp_mode = GET_MODE (cmp_op0);
3956  rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
3957  int unspec_code;
3958  rtx unspec;
3959
3960  switch (code)
3961    {
3962    case LEU:
3963    case GTU:
3964    case GEU:
3965    case LTU:
3966      unspec_code = UNSPEC_UNSIGNED_PCMP;
3967      break;
3968
3969    default:
3970      unspec_code = UNSPEC_PCMP;
3971    }
3972
3973  unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
3974			   unspec_code);
3975  emit_insn (gen_rtx_SET (dest, unspec));
3976
3977  return true;
3978}
3979
3980/* Expand fp vector comparison.  */
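/* Comparisons that have no direct SSE predicate before AVX are built from
   two simpler compares below: LTGT as ORDERED & NE, and UNEQ as
   UNORDERED | EQ.  */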
3981
3982bool
3983ix86_expand_fp_vec_cmp (rtx operands[])
3984{
3985  enum rtx_code code = GET_CODE (operands[1]);
3986  rtx cmp;
3987
3988  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
3989					   &operands[2], &operands[3]);
3990  if (code == UNKNOWN)
3991    {
3992      rtx temp;
3993      switch (GET_CODE (operands[1]))
3994	{
3995	case LTGT:
3996	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
3997				      operands[3], NULL, NULL);
3998	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
3999				     operands[3], NULL, NULL);
4000	  code = AND;
4001	  break;
4002	case UNEQ:
4003	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4004				      operands[3], NULL, NULL);
4005	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4006				     operands[3], NULL, NULL);
4007	  code = IOR;
4008	  break;
4009	default:
4010	  gcc_unreachable ();
4011	}
4012      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4013				 OPTAB_DIRECT);
4014    }
4015  else
4016    cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4017			       operands[1], operands[2]);
4018
4019  if (operands[0] != cmp)
4020    emit_move_insn (operands[0], cmp);
4021
4022  return true;
4023}
4024
4025static rtx
4026ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4027			 rtx op_true, rtx op_false, bool *negate)
4028{
4029  machine_mode data_mode = GET_MODE (dest);
4030  machine_mode mode = GET_MODE (cop0);
4031  rtx x;
4032
4033  *negate = false;
4034
4035  /* XOP supports all of the comparisons on all 128-bit vector int types.  */
4036  if (TARGET_XOP
4037      && (mode == V16QImode || mode == V8HImode
4038	  || mode == V4SImode || mode == V2DImode))
4039    ;
4040  /* AVX512F supports all of the comparisons
4041     on all 128/256/512-bit vector int types.  */
4042  else if (ix86_valid_mask_cmp_mode (mode))
4043    ;
4044  else
4045    {
4046      /* Canonicalize the comparison to EQ, GT, GTU.  */
4047      switch (code)
4048	{
4049	case EQ:
4050	case GT:
4051	case GTU:
4052	  break;
4053
4054	case NE:
4055	case LE:
4056	case LEU:
4057	  code = reverse_condition (code);
4058	  *negate = true;
4059	  break;
4060
4061	case GE:
4062	case GEU:
4063	  code = reverse_condition (code);
4064	  *negate = true;
4065	  /* FALLTHRU */
4066
4067	case LT:
4068	case LTU:
4069	  std::swap (cop0, cop1);
4070	  code = swap_condition (code);
4071	  break;
4072
4073	default:
4074	  gcc_unreachable ();
4075	}
4076
4077      /* Only SSE4.1/SSE4.2 supports V2DImode.  */
4078      if (mode == V2DImode)
4079	{
4080	  switch (code)
4081	    {
4082	    case EQ:
4083	      /* SSE4.1 supports EQ.  */
4084	      if (!TARGET_SSE4_1)
4085		return NULL;
4086	      break;
4087
4088	    case GT:
4089	    case GTU:
4090	      /* SSE4.2 supports GT/GTU.  */
4091	      if (!TARGET_SSE4_2)
4092		return NULL;
4093	      break;
4094
4095	    default:
4096	      gcc_unreachable ();
4097	    }
4098	}
4099
4100      rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4101      rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4102      if (*negate)
4103	std::swap (optrue, opfalse);
4104
4105      /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4106	 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4107	 min (x, y) == x).  While this adds one instruction (the minimum),
4108	 it removes the need for the two instructions of the negation, as
4109	 the comparison already produces the result in the desired form.
4110	 When using masks, do it for SI/DImode element types, as it is shorter
4111	 than the two subtractions.  */
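      /* For instance, with SSE4.1 an unsigned x <= y on V4SImode can be
	 emitted as pminud followed by pcmpeqd (min (x, y) == x), instead
	 of biasing both operands by the sign bit and negating the result
	 of pcmpgtd.  */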
4112      if ((code != EQ
4113	   && GET_MODE_SIZE (mode) != 64
4114	   && vector_all_ones_operand (opfalse, data_mode)
4115	   && optrue == CONST0_RTX (data_mode))
4116	  || (code == GTU
4117	      && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4118	      /* Don't do it, though, if we are not using integer masks and
4119		 would already end up with the right values in the registers.  */
4120	      && (GET_MODE_SIZE (mode) == 64
4121		  || !vector_all_ones_operand (optrue, data_mode)
4122		  || opfalse != CONST0_RTX (data_mode))))
4123	{
4124	  rtx (*gen) (rtx, rtx, rtx) = NULL;
4125
4126	  switch (mode)
4127	    {
4128	    case E_V16SImode:
4129	      gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4130	      break;
4131	    case E_V8DImode:
4132	      gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4133	      cop0 = force_reg (mode, cop0);
4134	      cop1 = force_reg (mode, cop1);
4135	      break;
4136	    case E_V32QImode:
4137	      if (TARGET_AVX2)
4138		gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4139	      break;
4140	    case E_V16HImode:
4141	      if (TARGET_AVX2)
4142		gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4143	      break;
4144	    case E_V8SImode:
4145	      if (TARGET_AVX2)
4146		gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4147	      break;
4148	    case E_V4DImode:
4149	      if (TARGET_AVX512VL)
4150		{
4151		  gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4152		  cop0 = force_reg (mode, cop0);
4153		  cop1 = force_reg (mode, cop1);
4154		}
4155	      break;
4156	    case E_V16QImode:
4157	      if (code == GTU && TARGET_SSE2)
4158		gen = gen_uminv16qi3;
4159	      else if (code == GT && TARGET_SSE4_1)
4160		gen = gen_sminv16qi3;
4161	      break;
4162	    case E_V8HImode:
4163	      if (code == GTU && TARGET_SSE4_1)
4164		gen = gen_uminv8hi3;
4165	      else if (code == GT && TARGET_SSE2)
4166		gen = gen_sminv8hi3;
4167	      break;
4168	    case E_V4SImode:
4169	      if (TARGET_SSE4_1)
4170		gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4171	      break;
4172	    case E_V2DImode:
4173	      if (TARGET_AVX512VL)
4174		{
4175		  gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4176		  cop0 = force_reg (mode, cop0);
4177		  cop1 = force_reg (mode, cop1);
4178		}
4179	      break;
4180	    default:
4181	      break;
4182	    }
4183
4184	  if (gen)
4185	    {
4186	      rtx tem = gen_reg_rtx (mode);
4187	      if (!vector_operand (cop0, mode))
4188		cop0 = force_reg (mode, cop0);
4189	      if (!vector_operand (cop1, mode))
4190		cop1 = force_reg (mode, cop1);
4191	      *negate = !*negate;
4192	      emit_insn (gen (tem, cop0, cop1));
4193	      cop1 = tem;
4194	      code = EQ;
4195	    }
4196	}
4197
4198      /* Unsigned parallel compare is not supported by the hardware.
4199	 Play some tricks to turn this into a signed comparison
4200	 instead.  */
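      /* For example, a GTU compare on V4SImode becomes a signed GT after
	 subtracting 0x80000000 from both operands, while the byte/word
	 modes instead use an unsigned saturating subtraction and test the
	 result for equality with zero.  */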
4201      if (code == GTU)
4202	{
4203	  cop0 = force_reg (mode, cop0);
4204
4205	  switch (mode)
4206	    {
4207	    case E_V16SImode:
4208	    case E_V8DImode:
4209	    case E_V8SImode:
4210	    case E_V4DImode:
4211	    case E_V4SImode:
4212	    case E_V2DImode:
4213		{
4214		  rtx t1, t2, mask;
4215
4216		  /* Subtract (-(INT MAX) - 1) from both operands to make
4217		     them signed.  */
4218		  mask = ix86_build_signbit_mask (mode, true, false);
4219		  t1 = gen_reg_rtx (mode);
4220		  emit_insn (gen_sub3_insn (t1, cop0, mask));
4221
4222		  t2 = gen_reg_rtx (mode);
4223		  emit_insn (gen_sub3_insn (t2, cop1, mask));
4224
4225		  cop0 = t1;
4226		  cop1 = t2;
4227		  code = GT;
4228		}
4229	      break;
4230
4231	    case E_V64QImode:
4232	    case E_V32HImode:
4233	    case E_V32QImode:
4234	    case E_V16HImode:
4235	    case E_V16QImode:
4236	    case E_V8HImode:
4237	      /* Perform a parallel unsigned saturating subtraction.  */
4238	      x = gen_reg_rtx (mode);
4239	      emit_insn (gen_rtx_SET
4240			 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4241	      cop0 = x;
4242	      cop1 = CONST0_RTX (mode);
4243	      code = EQ;
4244	      *negate = !*negate;
4245	      break;
4246
4247	    default:
4248	      gcc_unreachable ();
4249	    }
4250	}
4251    }
4252
4253  if (*negate)
4254    std::swap (op_true, op_false);
4255
4256  /* Allow the comparison to be done in one mode, but the movcc to
4257     happen in another mode.  */
4258  if (data_mode == mode)
4259    {
4260      x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4261			       op_true, op_false);
4262    }
4263  else
4264    {
4265      gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4266      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4267			       op_true, op_false);
4268      if (GET_MODE (x) == mode)
4269	x = gen_lowpart (data_mode, x);
4270    }
4271
4272  return x;
4273}
4274
4275/* Expand integer vector comparison.  */
4276
4277bool
4278ix86_expand_int_vec_cmp (rtx operands[])
4279{
4280  rtx_code code = GET_CODE (operands[1]);
4281  bool negate = false;
4282  rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4283				     operands[3], NULL, NULL, &negate);
4284
4285  if (!cmp)
4286    return false;
4287
4288  if (negate)
4289    cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4290				   CONST0_RTX (GET_MODE (cmp)),
4291				   NULL, NULL, &negate);
4292
4293  gcc_assert (!negate);
4294
4295  if (operands[0] != cmp)
4296    emit_move_insn (operands[0], cmp);
4297
4298  return true;
4299}
4300
4301/* Expand a floating-point vector conditional move; a vcond operation
4302   rather than a movcc operation.  */
4303
4304bool
4305ix86_expand_fp_vcond (rtx operands[])
4306{
4307  enum rtx_code code = GET_CODE (operands[3]);
4308  rtx cmp;
4309
4310  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4311					   &operands[4], &operands[5]);
4312  if (code == UNKNOWN)
4313    {
4314      rtx temp;
4315      switch (GET_CODE (operands[3]))
4316	{
4317	case LTGT:
4318	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4319				      operands[5], operands[0], operands[0]);
4320	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4321				     operands[5], operands[1], operands[2]);
4322	  code = AND;
4323	  break;
4324	case UNEQ:
4325	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4326				      operands[5], operands[0], operands[0]);
4327	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4328				     operands[5], operands[1], operands[2]);
4329	  code = IOR;
4330	  break;
4331	default:
4332	  gcc_unreachable ();
4333	}
4334      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4335				 OPTAB_DIRECT);
4336      ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4337      return true;
4338    }
4339
4340  if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4341				 operands[5], operands[1], operands[2]))
4342    return true;
4343
4344  cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4345			     operands[1], operands[2]);
4346  ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4347  return true;
4348}
4349
4350/* Expand a signed/unsigned integral vector conditional move.  */
4351
4352bool
4353ix86_expand_int_vcond (rtx operands[])
4354{
4355  machine_mode data_mode = GET_MODE (operands[0]);
4356  machine_mode mode = GET_MODE (operands[4]);
4357  enum rtx_code code = GET_CODE (operands[3]);
4358  bool negate = false;
4359  rtx x, cop0, cop1;
4360
4361  cop0 = operands[4];
4362  cop1 = operands[5];
4363
4364  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4365     and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
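  /* On V4SImode, for instance, these become psrad $31 and psrld $31
     respectively.  */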
4366  if ((code == LT || code == GE)
4367      && data_mode == mode
4368      && cop1 == CONST0_RTX (mode)
4369      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4370      && GET_MODE_UNIT_SIZE (data_mode) > 1
4371      && GET_MODE_UNIT_SIZE (data_mode) <= 8
4372      && (GET_MODE_SIZE (data_mode) == 16
4373	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4374    {
4375      rtx negop = operands[2 - (code == LT)];
4376      int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4377      if (negop == CONST1_RTX (data_mode))
4378	{
4379	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4380					 operands[0], 1, OPTAB_DIRECT);
4381	  if (res != operands[0])
4382	    emit_move_insn (operands[0], res);
4383	  return true;
4384	}
4385      else if (GET_MODE_INNER (data_mode) != DImode
4386	       && vector_all_ones_operand (negop, data_mode))
4387	{
4388	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4389					 operands[0], 0, OPTAB_DIRECT);
4390	  if (res != operands[0])
4391	    emit_move_insn (operands[0], res);
4392	  return true;
4393	}
4394    }
4395
4396  if (!nonimmediate_operand (cop1, mode))
4397    cop1 = force_reg (mode, cop1);
4398  if (!general_operand (operands[1], data_mode))
4399    operands[1] = force_reg (data_mode, operands[1]);
4400  if (!general_operand (operands[2], data_mode))
4401    operands[2] = force_reg (data_mode, operands[2]);
4402
4403  x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4404			       operands[1], operands[2], &negate);
4405
4406  if (!x)
4407    return false;
4408
4409  ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4410			 operands[2-negate]);
4411  return true;
4412}
4413
4414static bool
4415ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4416			      struct expand_vec_perm_d *d)
4417{
4418  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4419     expander, so args are either in d, or in op0, op1 etc.  */
4420  machine_mode mode = GET_MODE (d ? d->op0 : op0);
4421  machine_mode maskmode = mode;
4422  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4423
4424  switch (mode)
4425    {
4426    case E_V8HImode:
4427      if (TARGET_AVX512VL && TARGET_AVX512BW)
4428	gen = gen_avx512vl_vpermt2varv8hi3;
4429      break;
4430    case E_V16HImode:
4431      if (TARGET_AVX512VL && TARGET_AVX512BW)
4432	gen = gen_avx512vl_vpermt2varv16hi3;
4433      break;
4434    case E_V64QImode:
4435      if (TARGET_AVX512VBMI)
4436	gen = gen_avx512bw_vpermt2varv64qi3;
4437      break;
4438    case E_V32HImode:
4439      if (TARGET_AVX512BW)
4440	gen = gen_avx512bw_vpermt2varv32hi3;
4441      break;
4442    case E_V4SImode:
4443      if (TARGET_AVX512VL)
4444	gen = gen_avx512vl_vpermt2varv4si3;
4445      break;
4446    case E_V8SImode:
4447      if (TARGET_AVX512VL)
4448	gen = gen_avx512vl_vpermt2varv8si3;
4449      break;
4450    case E_V16SImode:
4451      if (TARGET_AVX512F)
4452	gen = gen_avx512f_vpermt2varv16si3;
4453      break;
4454    case E_V4SFmode:
4455      if (TARGET_AVX512VL)
4456	{
4457	  gen = gen_avx512vl_vpermt2varv4sf3;
4458	  maskmode = V4SImode;
4459	}
4460      break;
4461    case E_V8SFmode:
4462      if (TARGET_AVX512VL)
4463	{
4464	  gen = gen_avx512vl_vpermt2varv8sf3;
4465	  maskmode = V8SImode;
4466	}
4467      break;
4468    case E_V16SFmode:
4469      if (TARGET_AVX512F)
4470	{
4471	  gen = gen_avx512f_vpermt2varv16sf3;
4472	  maskmode = V16SImode;
4473	}
4474      break;
4475    case E_V2DImode:
4476      if (TARGET_AVX512VL)
4477	gen = gen_avx512vl_vpermt2varv2di3;
4478      break;
4479    case E_V4DImode:
4480      if (TARGET_AVX512VL)
4481	gen = gen_avx512vl_vpermt2varv4di3;
4482      break;
4483    case E_V8DImode:
4484      if (TARGET_AVX512F)
4485	gen = gen_avx512f_vpermt2varv8di3;
4486      break;
4487    case E_V2DFmode:
4488      if (TARGET_AVX512VL)
4489	{
4490	  gen = gen_avx512vl_vpermt2varv2df3;
4491	  maskmode = V2DImode;
4492	}
4493      break;
4494    case E_V4DFmode:
4495      if (TARGET_AVX512VL)
4496	{
4497	  gen = gen_avx512vl_vpermt2varv4df3;
4498	  maskmode = V4DImode;
4499	}
4500      break;
4501    case E_V8DFmode:
4502      if (TARGET_AVX512F)
4503	{
4504	  gen = gen_avx512f_vpermt2varv8df3;
4505	  maskmode = V8DImode;
4506	}
4507      break;
4508    default:
4509      break;
4510    }
4511
4512  if (gen == NULL)
4513    return false;
4514
4515  /* When called from the const expander, build the constant mask vector
4516     from D->perm and take TARGET, OP0 and OP1 from D.  */
4517  if (d)
4518    {
4519      rtx vec[64];
4520      target = d->target;
4521      op0 = d->op0;
4522      op1 = d->op1;
4523      for (int i = 0; i < d->nelt; ++i)
4524	vec[i] = GEN_INT (d->perm[i]);
4525      mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4526    }
4527
4528  emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4529  return true;
4530}
4531
4532/* Expand a variable vector permutation.  */
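/* Each element of MASK selects one element from the concatenation of OP0
   and OP1: index values below the element count pick from OP0, the next
   range picks from OP1, and higher index bits are ignored.  */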
4533
4534void
4535ix86_expand_vec_perm (rtx operands[])
4536{
4537  rtx target = operands[0];
4538  rtx op0 = operands[1];
4539  rtx op1 = operands[2];
4540  rtx mask = operands[3];
4541  rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4542  machine_mode mode = GET_MODE (op0);
4543  machine_mode maskmode = GET_MODE (mask);
4544  int w, e, i;
4545  bool one_operand_shuffle = rtx_equal_p (op0, op1);
4546
4547  /* Number of elements in the vector.  */
4548  w = GET_MODE_NUNITS (mode);
4549  e = GET_MODE_UNIT_SIZE (mode);
4550  gcc_assert (w <= 64);
4551
4552  if (TARGET_AVX512F && one_operand_shuffle)
4553    {
4554      rtx (*gen) (rtx, rtx, rtx) = NULL;
4555      switch (mode)
4556	{
4557	case E_V16SImode:
4558	  gen = gen_avx512f_permvarv16si;
4559	  break;
4560	case E_V16SFmode:
4561	  gen = gen_avx512f_permvarv16sf;
4562	  break;
4563	case E_V8DImode:
4564	  gen = gen_avx512f_permvarv8di;
4565	  break;
4566	case E_V8DFmode:
4567	  gen = gen_avx512f_permvarv8df;
4568	  break;
4569	default:
4570	  break;
4571	}
4572      if (gen != NULL)
4573	{
4574	  emit_insn (gen (target, op0, mask));
4575	  return;
4576	}
4577    }
4578
4579  if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
4580    return;
4581
4582  if (TARGET_AVX2)
4583    {
4584      if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
4585	{
4586	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4587	     a constant shuffle operand.  With a tiny bit of effort we can
4588	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
4589	     unfortunate but there's no avoiding it.
4590	     Similarly, V16HImode has no instruction for variable shuffling,
4591	     while for V32QImode we can, after preparing suitable masks, use
4592	     vpshufb; vpshufb; vpermq; vpor.  */
4593
4594	  if (mode == V16HImode)
4595	    {
4596	      maskmode = mode = V32QImode;
4597	      w = 32;
4598	      e = 1;
4599	    }
4600	  else
4601	    {
4602	      maskmode = mode = V8SImode;
4603	      w = 8;
4604	      e = 4;
4605	    }
4606	  t1 = gen_reg_rtx (maskmode);
4607
4608	  /* Replicate the low bits of the V4DImode mask into V8SImode:
4609	       mask = { A B C D }
4610	       t1 = { A A B B C C D D }.  */
4611	  for (i = 0; i < w / 2; ++i)
4612	    vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
4613	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4614	  vt = force_reg (maskmode, vt);
4615	  mask = gen_lowpart (maskmode, mask);
4616	  if (maskmode == V8SImode)
4617	    emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
4618	  else
4619	    emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
4620
4621	  /* Multiply the shuffle indices by two.  */
4622	  t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
4623				    OPTAB_DIRECT);
4624
4625	  /* Add one to the odd shuffle indices:
4626		t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
4627	  for (i = 0; i < w / 2; ++i)
4628	    {
4629	      vec[i * 2] = const0_rtx;
4630	      vec[i * 2 + 1] = const1_rtx;
4631	    }
4632	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4633	  vt = validize_mem (force_const_mem (maskmode, vt));
4634	  t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
4635				    OPTAB_DIRECT);
4636
4637	  /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
4638	  operands[3] = mask = t1;
4639	  target = gen_reg_rtx (mode);
4640	  op0 = gen_lowpart (mode, op0);
4641	  op1 = gen_lowpart (mode, op1);
4642	}
4643
4644      switch (mode)
4645	{
4646	case E_V8SImode:
4647	  /* The VPERMD and VPERMPS instructions already properly ignore
4648	     the high bits of the shuffle elements.  No need for us to
4649	     perform an AND ourselves.  */
4650	  if (one_operand_shuffle)
4651	    {
4652	      emit_insn (gen_avx2_permvarv8si (target, op0, mask));
4653	      if (target != operands[0])
4654		emit_move_insn (operands[0],
4655				gen_lowpart (GET_MODE (operands[0]), target));
4656	    }
4657	  else
4658	    {
4659	      t1 = gen_reg_rtx (V8SImode);
4660	      t2 = gen_reg_rtx (V8SImode);
4661	      emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
4662	      emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
4663	      goto merge_two;
4664	    }
4665	  return;
4666
4667	case E_V8SFmode:
4668	  mask = gen_lowpart (V8SImode, mask);
4669	  if (one_operand_shuffle)
4670	    emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
4671	  else
4672	    {
4673	      t1 = gen_reg_rtx (V8SFmode);
4674	      t2 = gen_reg_rtx (V8SFmode);
4675	      emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
4676	      emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
4677	      goto merge_two;
4678	    }
4679	  return;
4680
4681        case E_V4SImode:
4682	  /* By combining the two 128-bit input vectors into one 256-bit
4683	     input vector, we can use VPERMD and VPERMPS for the full
4684	     two-operand shuffle.  */
4685	  t1 = gen_reg_rtx (V8SImode);
4686	  t2 = gen_reg_rtx (V8SImode);
4687	  emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
4688	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4689	  emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
4690	  emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
4691	  return;
4692
4693        case E_V4SFmode:
4694	  t1 = gen_reg_rtx (V8SFmode);
4695	  t2 = gen_reg_rtx (V8SImode);
4696	  mask = gen_lowpart (V4SImode, mask);
4697	  emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
4698	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4699	  emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
4700	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
4701	  return;
4702
4703	case E_V32QImode:
4704	  t1 = gen_reg_rtx (V32QImode);
4705	  t2 = gen_reg_rtx (V32QImode);
4706	  t3 = gen_reg_rtx (V32QImode);
4707	  vt2 = GEN_INT (-128);
4708	  vt = gen_const_vec_duplicate (V32QImode, vt2);
4709	  vt = force_reg (V32QImode, vt);
4710	  for (i = 0; i < 32; i++)
4711	    vec[i] = i < 16 ? vt2 : const0_rtx;
4712	  vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
4713	  vt2 = force_reg (V32QImode, vt2);
4714	  /* From mask create two adjusted masks, which contain the same
4715	     bits as mask in the low 7 bits of each vector element.
4716	     The first mask will have the most significant bit clear
4717	     if it requests element from the same 128-bit lane
4718	     and MSB set if it requests element from the other 128-bit lane.
4719	     The second mask will have the opposite values of the MSB,
4720	     and additionally will have its 128-bit lanes swapped.
4721	     E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4722	     t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
4723	     t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4724	     stands for other 12 bytes.  */
4725	  /* The bit that says whether an element is from the same lane or the
4726	     other lane is bit 4, so shift it up by 3 to the MSB position.  */
4727	  t5 = gen_reg_rtx (V4DImode);
4728	  emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
4729				    GEN_INT (3)));
4730	  /* Clear MSB bits from the mask just in case it had them set.  */
4731	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
4732	  /* After this t1 will have MSB set for elements from other lane.  */
4733	  emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
4734	  /* Clear bits other than MSB.  */
4735	  emit_insn (gen_andv32qi3 (t1, t1, vt));
4736	  /* Or in the lower bits from mask into t3.  */
4737	  emit_insn (gen_iorv32qi3 (t3, t1, t2));
4738	  /* And invert MSB bits in t1, so MSB is set for elements from the same
4739	     lane.  */
4740	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
4741	  /* Swap 128-bit lanes in t3.  */
4742	  t6 = gen_reg_rtx (V4DImode);
4743	  emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
4744					  const2_rtx, GEN_INT (3),
4745					  const0_rtx, const1_rtx));
4746	  /* And or in the lower bits from mask into t1.  */
4747	  emit_insn (gen_iorv32qi3 (t1, t1, t2));
4748	  if (one_operand_shuffle)
4749	    {
4750	      /* Each of these shuffles will put 0s in places where an
4751		 element from the other 128-bit lane is needed; otherwise it
4752		 will shuffle in the requested value.  */
4753	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
4754						gen_lowpart (V32QImode, t6)));
4755	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
4756	      /* For t3 the 128-bit lanes are swapped again.  */
4757	      t7 = gen_reg_rtx (V4DImode);
4758	      emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
4759					      const2_rtx, GEN_INT (3),
4760					      const0_rtx, const1_rtx));
4761	      /* And oring both together leads to the result.  */
4762	      emit_insn (gen_iorv32qi3 (target, t1,
4763					gen_lowpart (V32QImode, t7)));
4764	      if (target != operands[0])
4765		emit_move_insn (operands[0],
4766				gen_lowpart (GET_MODE (operands[0]), target));
4767	      return;
4768	    }
4769
4770	  t4 = gen_reg_rtx (V32QImode);
4771	  /* Similar to the one_operand_shuffle code above, but repeated
4772	     for each operand.  The merge_two: code below will merge the
4773	     two results together.  */
4774	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
4775					    gen_lowpart (V32QImode, t6)));
4776	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
4777					    gen_lowpart (V32QImode, t6)));
4778	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
4779	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
4780	  t7 = gen_reg_rtx (V4DImode);
4781	  emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
4782					  const2_rtx, GEN_INT (3),
4783					  const0_rtx, const1_rtx));
4784	  t8 = gen_reg_rtx (V4DImode);
4785	  emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
4786					  const2_rtx, GEN_INT (3),
4787					  const0_rtx, const1_rtx));
4788	  emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
4789	  emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
4790	  t1 = t4;
4791	  t2 = t3;
4792	  goto merge_two;
4793
4794	default:
4795	  gcc_assert (GET_MODE_SIZE (mode) <= 16);
4796	  break;
4797	}
4798    }
4799
4800  if (TARGET_XOP)
4801    {
4802      /* The XOP VPPERM insn supports three inputs.  By ignoring the
4803	 one_operand_shuffle special case, we avoid creating another
4804	 set of constant vectors in memory.  */
4805      one_operand_shuffle = false;
4806
4807      /* mask = mask & {2*w-1, ...} */
4808      vt = GEN_INT (2*w - 1);
4809    }
4810  else
4811    {
4812      /* mask = mask & {w-1, ...} */
4813      vt = GEN_INT (w - 1);
4814    }
4815
4816  vt = gen_const_vec_duplicate (maskmode, vt);
4817  mask = expand_simple_binop (maskmode, AND, mask, vt,
4818			      NULL_RTX, 0, OPTAB_DIRECT);
4819
4820  /* For non-QImode operations, convert the word permutation control
4821     into a byte permutation control.  */
4822  if (mode != V16QImode)
4823    {
4824      mask = expand_simple_binop (maskmode, ASHIFT, mask,
4825				  GEN_INT (exact_log2 (e)),
4826				  NULL_RTX, 0, OPTAB_DIRECT);
4827
4828      /* Convert mask to vector of chars.  */
4829      mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
4830
4831      /* Replicate each of the input bytes into byte positions:
4832	 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4833	 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4834	 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
4835      for (i = 0; i < 16; ++i)
4836	vec[i] = GEN_INT (i/e * e);
4837      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4838      vt = validize_mem (force_const_mem (V16QImode, vt));
4839      if (TARGET_XOP)
4840	emit_insn (gen_xop_pperm (mask, mask, mask, vt));
4841      else
4842	emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
4843
4844      /* Convert it into the byte positions by doing
4845	 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...}  */
4846      for (i = 0; i < 16; ++i)
4847	vec[i] = GEN_INT (i % e);
4848      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4849      vt = validize_mem (force_const_mem (V16QImode, vt));
4850      emit_insn (gen_addv16qi3 (mask, mask, vt));
4851    }
4852
4853  /* The actual shuffle operations all operate on V16QImode.  */
4854  op0 = gen_lowpart (V16QImode, op0);
4855  op1 = gen_lowpart (V16QImode, op1);
4856
4857  if (TARGET_XOP)
4858    {
4859      if (GET_MODE (target) != V16QImode)
4860	target = gen_reg_rtx (V16QImode);
4861      emit_insn (gen_xop_pperm (target, op0, op1, mask));
4862      if (target != operands[0])
4863	emit_move_insn (operands[0],
4864			gen_lowpart (GET_MODE (operands[0]), target));
4865    }
4866  else if (one_operand_shuffle)
4867    {
4868      if (GET_MODE (target) != V16QImode)
4869	target = gen_reg_rtx (V16QImode);
4870      emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
4871      if (target != operands[0])
4872	emit_move_insn (operands[0],
4873			gen_lowpart (GET_MODE (operands[0]), target));
4874    }
4875  else
4876    {
4877      rtx xops[6];
4878      bool ok;
4879
4880      /* Shuffle the two input vectors independently.  */
4881      t1 = gen_reg_rtx (V16QImode);
4882      t2 = gen_reg_rtx (V16QImode);
4883      emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
4884      emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
4885
4886 merge_two:
4887      /* Then merge them together.  The key is whether any given control
4888         element contained a bit set that indicates the second word.  */
4889      mask = operands[3];
4890      vt = GEN_INT (w);
4891      if (maskmode == V2DImode && !TARGET_SSE4_1)
4892	{
4893	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
4894	     more shuffle to convert the V2DI input mask into a V4SI
4895	     input mask.  At that point the masking done by
4896	     ix86_expand_int_vcond will work as desired.  */
4897	  rtx t3 = gen_reg_rtx (V4SImode);
4898	  emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
4899				        const0_rtx, const0_rtx,
4900				        const2_rtx, const2_rtx));
4901	  mask = t3;
4902	  maskmode = V4SImode;
4903	  e = w = 4;
4904	}
4905
4906      vt = gen_const_vec_duplicate (maskmode, vt);
4907      vt = force_reg (maskmode, vt);
4908      mask = expand_simple_binop (maskmode, AND, mask, vt,
4909				  NULL_RTX, 0, OPTAB_DIRECT);
4910
4911      if (GET_MODE (target) != mode)
4912	target = gen_reg_rtx (mode);
4913      xops[0] = target;
4914      xops[1] = gen_lowpart (mode, t2);
4915      xops[2] = gen_lowpart (mode, t1);
4916      xops[3] = gen_rtx_EQ (maskmode, mask, vt);
4917      xops[4] = mask;
4918      xops[5] = vt;
4919      ok = ix86_expand_int_vcond (xops);
4920      gcc_assert (ok);
4921      if (target != operands[0])
4922	emit_move_insn (operands[0],
4923			gen_lowpart (GET_MODE (operands[0]), target));
4924    }
4925}
4926
4927/* Unpack SRC into the next wider integer vector type.  UNSIGNED_P is
4928   true if we should do zero extension, else sign extension.  HIGH_P is
4929   true if we want the N/2 high elements, else the low elements.  */
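/* For example, zero-extending the high half of a V16QImode SRC with
   SSE4.1 shifts the upper 8 bytes down (psrldq by 8) and then uses
   pmovzxbw; without SSE4.1 the same effect comes from interleaving the
   high bytes with a zero vector (punpckhbw).  */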
4930
4931void
4932ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
4933{
4934  machine_mode imode = GET_MODE (src);
4935  rtx tmp;
4936
4937  if (TARGET_SSE4_1)
4938    {
4939      rtx (*unpack)(rtx, rtx);
4940      rtx (*extract)(rtx, rtx) = NULL;
4941      machine_mode halfmode = BLKmode;
4942
4943      switch (imode)
4944	{
4945	case E_V64QImode:
4946	  if (unsigned_p)
4947	    unpack = gen_avx512bw_zero_extendv32qiv32hi2;
4948	  else
4949	    unpack = gen_avx512bw_sign_extendv32qiv32hi2;
4950	  halfmode = V32QImode;
4951	  extract
4952	    = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
4953	  break;
4954	case E_V32QImode:
4955	  if (unsigned_p)
4956	    unpack = gen_avx2_zero_extendv16qiv16hi2;
4957	  else
4958	    unpack = gen_avx2_sign_extendv16qiv16hi2;
4959	  halfmode = V16QImode;
4960	  extract
4961	    = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
4962	  break;
4963	case E_V32HImode:
4964	  if (unsigned_p)
4965	    unpack = gen_avx512f_zero_extendv16hiv16si2;
4966	  else
4967	    unpack = gen_avx512f_sign_extendv16hiv16si2;
4968	  halfmode = V16HImode;
4969	  extract
4970	    = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
4971	  break;
4972	case E_V16HImode:
4973	  if (unsigned_p)
4974	    unpack = gen_avx2_zero_extendv8hiv8si2;
4975	  else
4976	    unpack = gen_avx2_sign_extendv8hiv8si2;
4977	  halfmode = V8HImode;
4978	  extract
4979	    = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
4980	  break;
4981	case E_V16SImode:
4982	  if (unsigned_p)
4983	    unpack = gen_avx512f_zero_extendv8siv8di2;
4984	  else
4985	    unpack = gen_avx512f_sign_extendv8siv8di2;
4986	  halfmode = V8SImode;
4987	  extract
4988	    = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
4989	  break;
4990	case E_V8SImode:
4991	  if (unsigned_p)
4992	    unpack = gen_avx2_zero_extendv4siv4di2;
4993	  else
4994	    unpack = gen_avx2_sign_extendv4siv4di2;
4995	  halfmode = V4SImode;
4996	  extract
4997	    = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
4998	  break;
4999	case E_V16QImode:
5000	  if (unsigned_p)
5001	    unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5002	  else
5003	    unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5004	  break;
5005	case E_V8HImode:
5006	  if (unsigned_p)
5007	    unpack = gen_sse4_1_zero_extendv4hiv4si2;
5008	  else
5009	    unpack = gen_sse4_1_sign_extendv4hiv4si2;
5010	  break;
5011	case E_V4SImode:
5012	  if (unsigned_p)
5013	    unpack = gen_sse4_1_zero_extendv2siv2di2;
5014	  else
5015	    unpack = gen_sse4_1_sign_extendv2siv2di2;
5016	  break;
5017	default:
5018	  gcc_unreachable ();
5019	}
5020
5021      if (GET_MODE_SIZE (imode) >= 32)
5022	{
5023	  tmp = gen_reg_rtx (halfmode);
5024	  emit_insn (extract (tmp, src));
5025	}
5026      else if (high_p)
5027	{
5028	  /* Shift higher 8 bytes to lower 8 bytes.  */
5029	  tmp = gen_reg_rtx (V1TImode);
5030	  emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5031					 GEN_INT (64)));
5032	  tmp = gen_lowpart (imode, tmp);
5033	}
5034      else
5035	tmp = src;
5036
5037      emit_insn (unpack (dest, tmp));
5038    }
5039  else
5040    {
5041      rtx (*unpack)(rtx, rtx, rtx);
5042
5043      switch (imode)
5044	{
5045	case E_V16QImode:
5046	  if (high_p)
5047	    unpack = gen_vec_interleave_highv16qi;
5048	  else
5049	    unpack = gen_vec_interleave_lowv16qi;
5050	  break;
5051	case E_V8HImode:
5052	  if (high_p)
5053	    unpack = gen_vec_interleave_highv8hi;
5054	  else
5055	    unpack = gen_vec_interleave_lowv8hi;
5056	  break;
5057	case E_V4SImode:
5058	  if (high_p)
5059	    unpack = gen_vec_interleave_highv4si;
5060	  else
5061	    unpack = gen_vec_interleave_lowv4si;
5062	  break;
5063	default:
5064	  gcc_unreachable ();
5065	}
5066
5067      if (unsigned_p)
5068	tmp = force_reg (imode, CONST0_RTX (imode));
5069      else
5070	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5071				   src, pc_rtx, pc_rtx);
5072
5073      rtx tmp2 = gen_reg_rtx (imode);
5074      emit_insn (unpack (tmp2, src, tmp));
5075      emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5076    }
5077}
5078
5079/* Split OPERAND into half-mode parts stored in PARTS.  Similar to
5080   split_double_mode, but works for floating point parameters and
5081   non-offsettable memories.  For pushes, it returns just stack offsets;
5082   the values will be saved in the right order.  At most four parts are generated.  */
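/* For example, an XFmode operand on a 32-bit target is split into three
   SImode parts, while a TFmode operand on a 64-bit target is split into
   two DImode parts.  */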
5083
5084static int
5085ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5086{
5087  int size;
5088
5089  if (!TARGET_64BIT)
5090    size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5091  else
5092    size = (GET_MODE_SIZE (mode) + 4) / 8;
5093
5094  gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5095  gcc_assert (size >= 2 && size <= 4);
5096
5097  /* Optimize constant pool references to immediates.  This is used by fp
5098     moves, which force all constants to memory to allow combining.  */
5099  if (MEM_P (operand) && MEM_READONLY_P (operand))
5100    operand = avoid_constant_pool_reference (operand);
5101
5102  if (MEM_P (operand) && !offsettable_memref_p (operand))
5103    {
5104      /* The only non-offsettable memories we handle are pushes.  */
5105      int ok = push_operand (operand, VOIDmode);
5106
5107      gcc_assert (ok);
5108
5109      operand = copy_rtx (operand);
5110      PUT_MODE (operand, word_mode);
5111      parts[0] = parts[1] = parts[2] = parts[3] = operand;
5112      return size;
5113    }
5114
5115  if (GET_CODE (operand) == CONST_VECTOR)
5116    {
5117      scalar_int_mode imode = int_mode_for_mode (mode).require ();
5118      /* Caution: if we looked through a constant pool memory above,
5119	 the operand may actually have a different mode now.  That's
5120	 ok, since we want to pun this all the way back to an integer.  */
5121      operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5122      gcc_assert (operand != NULL);
5123      mode = imode;
5124    }
5125
5126  if (!TARGET_64BIT)
5127    {
5128      if (mode == DImode)
5129	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5130      else
5131	{
5132	  int i;
5133
5134	  if (REG_P (operand))
5135	    {
5136	      gcc_assert (reload_completed);
5137	      for (i = 0; i < size; i++)
5138		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5139	    }
5140	  else if (offsettable_memref_p (operand))
5141	    {
5142	      operand = adjust_address (operand, SImode, 0);
5143	      parts[0] = operand;
5144	      for (i = 1; i < size; i++)
5145		parts[i] = adjust_address (operand, SImode, 4 * i);
5146	    }
5147	  else if (CONST_DOUBLE_P (operand))
5148	    {
5149	      const REAL_VALUE_TYPE *r;
5150	      long l[4];
5151
5152	      r = CONST_DOUBLE_REAL_VALUE (operand);
5153	      switch (mode)
5154		{
5155		case E_TFmode:
5156		  real_to_target (l, r, mode);
5157		  parts[3] = gen_int_mode (l[3], SImode);
5158		  parts[2] = gen_int_mode (l[2], SImode);
5159		  break;
5160		case E_XFmode:
5161		  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5162		     long double may not be 80-bit.  */
5163		  real_to_target (l, r, mode);
5164		  parts[2] = gen_int_mode (l[2], SImode);
5165		  break;
5166		case E_DFmode:
5167		  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5168		  break;
5169		default:
5170		  gcc_unreachable ();
5171		}
5172	      parts[1] = gen_int_mode (l[1], SImode);
5173	      parts[0] = gen_int_mode (l[0], SImode);
5174	    }
5175	  else
5176	    gcc_unreachable ();
5177	}
5178    }
5179  else
5180    {
5181      if (mode == TImode)
5182	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5183      if (mode == XFmode || mode == TFmode)
5184	{
5185	  machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5186	  if (REG_P (operand))
5187	    {
5188	      gcc_assert (reload_completed);
5189	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5190	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5191	    }
5192	  else if (offsettable_memref_p (operand))
5193	    {
5194	      operand = adjust_address (operand, DImode, 0);
5195	      parts[0] = operand;
5196	      parts[1] = adjust_address (operand, upper_mode, 8);
5197	    }
5198	  else if (CONST_DOUBLE_P (operand))
5199	    {
5200	      long l[4];
5201
5202	      real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5203
5204	      /* real_to_target puts 32-bit pieces in each long.  */
5205	      parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5206				       | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5207					  << 32), DImode);
5208
5209	      if (upper_mode == SImode)
5210	        parts[1] = gen_int_mode (l[2], SImode);
5211	      else
5212	        parts[1]
5213		  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5214				  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5215				     << 32), DImode);
5216	    }
5217	  else
5218	    gcc_unreachable ();
5219	}
5220    }
5221
5222  return size;
5223}
5224
5225/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5226   All required insns are emitted here; operands 2-5 are filled in with
5227   the destination parts and operands 6-9 with the source parts, in the
5228   order in which the moves are performed.  */
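/* For example, a DImode move on a 32-bit target is split into two SImode
   moves, ordered so that an address register used by the source is not
   clobbered before it has been read.  */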
5229
5230void
5231ix86_split_long_move (rtx operands[])
5232{
5233  rtx part[2][4];
5234  int nparts, i, j;
5235  int push = 0;
5236  int collisions = 0;
5237  machine_mode mode = GET_MODE (operands[0]);
5238  bool collisionparts[4];
5239
5240  /* The DFmode expanders may ask us to move a double.
5241     For a 64-bit target this is a single move.  By hiding that fact
5242     here we simplify the i386.md splitters.  */
5243  if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5244    {
5245      /* Optimize constant pool references to immediates.  This is used by
5246	 fp moves, which force all constants to memory to allow combining.  */
5247
5248      if (MEM_P (operands[1])
5249	  && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5250	  && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5251	operands[1] = get_pool_constant (XEXP (operands[1], 0));
5252      if (push_operand (operands[0], VOIDmode))
5253	{
5254	  operands[0] = copy_rtx (operands[0]);
5255	  PUT_MODE (operands[0], word_mode);
5256	}
5257      else
5258        operands[0] = gen_lowpart (DImode, operands[0]);
5259      operands[1] = gen_lowpart (DImode, operands[1]);
5260      emit_move_insn (operands[0], operands[1]);
5261      return;
5262    }
5263
5264  /* The only non-offsettable memory we handle is push.  */
5265  if (push_operand (operands[0], VOIDmode))
5266    push = 1;
5267  else
5268    gcc_assert (!MEM_P (operands[0])
5269		|| offsettable_memref_p (operands[0]));
5270
5271  nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5272  ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5273
5274  /* When emitting a push, be careful with source operands on the stack.  */
5275  if (push && MEM_P (operands[1])
5276      && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5277    {
5278      rtx src_base = XEXP (part[1][nparts - 1], 0);
5279
5280      /* Compensate for the stack decrement by 4.  */
5281      if (!TARGET_64BIT && nparts == 3
5282	  && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5283	src_base = plus_constant (Pmode, src_base, 4);
5284
5285      /* src_base refers to the stack pointer and is
5286	 automatically decremented by each emitted push.  */
5287      for (i = 0; i < nparts; i++)
5288	part[1][i] = change_address (part[1][i],
5289				     GET_MODE (part[1][i]), src_base);
5290    }
5291
5292  /* We need to do the copy in the right order in case an address register
5293     of the source overlaps the destination.  */
5294  if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5295    {
5296      rtx tmp;
5297
5298      for (i = 0; i < nparts; i++)
5299	{
5300	  collisionparts[i]
5301	    = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5302	  if (collisionparts[i])
5303	    collisions++;
5304	}
5305
5306      /* Collision in the middle part can be handled by reordering.  */
5307      if (collisions == 1 && nparts == 3 && collisionparts [1])
5308	{
5309	  std::swap (part[0][1], part[0][2]);
5310	  std::swap (part[1][1], part[1][2]);
5311	}
5312      else if (collisions == 1
5313	       && nparts == 4
5314	       && (collisionparts [1] || collisionparts [2]))
5315	{
5316	  if (collisionparts [1])
5317	    {
5318	      std::swap (part[0][1], part[0][2]);
5319	      std::swap (part[1][1], part[1][2]);
5320	    }
5321	  else
5322	    {
5323	      std::swap (part[0][2], part[0][3]);
5324	      std::swap (part[1][2], part[1][3]);
5325	    }
5326	}
5327
5328      /* If there are more collisions, we can't handle it by reordering.
5329	 Do an lea to the last part and use only one colliding move.  */
5330      else if (collisions > 1)
5331	{
5332	  rtx base, addr;
5333
5334	  collisions = 1;
5335
5336	  base = part[0][nparts - 1];
5337
5338	  /* Handle the case when the last part isn't valid for lea.
5339	     Happens in 64-bit mode storing the 12-byte XFmode.  */
5340	  if (GET_MODE (base) != Pmode)
5341	    base = gen_rtx_REG (Pmode, REGNO (base));
5342
5343	  addr = XEXP (part[1][0], 0);
5344	  if (TARGET_TLS_DIRECT_SEG_REFS)
5345	    {
5346	      struct ix86_address parts;
5347	      int ok = ix86_decompose_address (addr, &parts);
5348	      gcc_assert (ok);
5349	      /* It is not valid to use %gs: or %fs: in lea.  */
5350	      gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5351	    }
5352	  emit_insn (gen_rtx_SET (base, addr));
5353	  part[1][0] = replace_equiv_address (part[1][0], base);
5354	  for (i = 1; i < nparts; i++)
5355	    {
5356	      tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5357	      part[1][i] = replace_equiv_address (part[1][i], tmp);
5358	    }
5359	}
5360    }
5361
5362  if (push)
5363    {
5364      if (!TARGET_64BIT)
5365	{
5366	  if (nparts == 3)
5367	    {
5368	      if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5369                emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5370	      emit_move_insn (part[0][2], part[1][2]);
5371	    }
5372	  else if (nparts == 4)
5373	    {
5374	      emit_move_insn (part[0][3], part[1][3]);
5375	      emit_move_insn (part[0][2], part[1][2]);
5376	    }
5377	}
5378      else
5379	{
5380	  /* In 64-bit mode we don't have a 32-bit push available.  If this is
5381	     a register, it is OK - we will just use the larger counterpart.
5382	     We also retype memory - these come from an attempt to avoid a REX
5383	     prefix on moves of the second half of a TFmode value.  */
5384	  if (GET_MODE (part[1][1]) == SImode)
5385	    {
5386	      switch (GET_CODE (part[1][1]))
5387		{
5388		case MEM:
5389		  part[1][1] = adjust_address (part[1][1], DImode, 0);
5390		  break;
5391
5392		case REG:
5393		  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5394		  break;
5395
5396		default:
5397		  gcc_unreachable ();
5398		}
5399
5400	      if (GET_MODE (part[1][0]) == SImode)
5401		part[1][0] = part[1][1];
5402	    }
5403	}
5404      emit_move_insn (part[0][1], part[1][1]);
5405      emit_move_insn (part[0][0], part[1][0]);
5406      return;
5407    }
5408
5409  /* Choose the correct order so we do not overwrite the source before it is copied.  */
5410  if ((REG_P (part[0][0])
5411       && REG_P (part[1][1])
5412       && (REGNO (part[0][0]) == REGNO (part[1][1])
5413	   || (nparts == 3
5414	       && REGNO (part[0][0]) == REGNO (part[1][2]))
5415	   || (nparts == 4
5416	       && REGNO (part[0][0]) == REGNO (part[1][3]))))
5417      || (collisions > 0
5418	  && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5419    {
5420      for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5421	{
5422	  operands[2 + i] = part[0][j];
5423	  operands[6 + i] = part[1][j];
5424	}
5425    }
5426  else
5427    {
5428      for (i = 0; i < nparts; i++)
5429	{
5430	  operands[2 + i] = part[0][i];
5431	  operands[6 + i] = part[1][i];
5432	}
5433    }
5434
5435  /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
5436  if (optimize_insn_for_size_p ())
5437    {
5438      for (j = 0; j < nparts - 1; j++)
5439	if (CONST_INT_P (operands[6 + j])
5440	    && operands[6 + j] != const0_rtx
5441	    && REG_P (operands[2 + j]))
5442	  for (i = j; i < nparts - 1; i++)
5443	    if (CONST_INT_P (operands[7 + i])
5444		&& INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5445	      operands[7 + i] = operands[2 + j];
5446    }
5447
5448  for (i = 0; i < nparts; i++)
5449    emit_move_insn (operands[2 + i], operands[6 + i]);
5450
5451  return;
5452}
5453
5454/* Helper function of ix86_split_ashl used to generate an SImode/DImode
5455   left shift by a constant, either using a single shift or
5456   a sequence of add instructions.  */
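/* For example, a left shift by 2 may be emitted as two add instructions
   when two adds are no more expensive than a shift by a constant and we
   are not optimizing for size.  */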
5457
5458static void
5459ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5460{
5461  if (count == 1
5462      || (count * ix86_cost->add <= ix86_cost->shift_const
5463	  && !optimize_insn_for_size_p ()))
5464    {
5465      while (count-- > 0)
5466	emit_insn (gen_add2_insn (operand, operand));
5467    }
5468  else
5469    {
5470      rtx (*insn)(rtx, rtx, rtx);
5471
5472      insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5473      emit_insn (insn (operand, operand, GEN_INT (count)));
5474    }
5475}
5476
5477void
5478ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5479{
5480  rtx (*gen_ashl3)(rtx, rtx, rtx);
5481  rtx (*gen_shld)(rtx, rtx, rtx);
5482  int half_width = GET_MODE_BITSIZE (mode) >> 1;
5483  machine_mode half_mode;
5484
5485  rtx low[2], high[2];
5486  int count;
5487
5488  if (CONST_INT_P (operands[2]))
5489    {
5490      split_double_mode (mode, operands, 2, low, high);
5491      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5492
5493      if (count >= half_width)
5494	{
5495	  emit_move_insn (high[0], low[1]);
5496	  emit_move_insn (low[0], const0_rtx);
5497
5498	  if (count > half_width)
5499	    ix86_expand_ashl_const (high[0], count - half_width, mode);
5500	}
5501      else
5502	{
5503	  gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5504
5505	  if (!rtx_equal_p (operands[0], operands[1]))
5506	    emit_move_insn (operands[0], operands[1]);
5507
5508	  emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
5509	  ix86_expand_ashl_const (low[0], count, mode);
5510	}
5511      return;
5512    }
5513
5514  split_double_mode (mode, operands, 1, low, high);
5515  half_mode = mode == DImode ? SImode : DImode;
5516
5517  gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5518
5519  if (operands[1] == const1_rtx)
5520    {
5521      /* Assuming we've chosen QImode-capable registers, then 1 << N
5522	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
5523      if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
5524	{
5525	  rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
5526
5527	  ix86_expand_clear (low[0]);
5528	  ix86_expand_clear (high[0]);
5529	  emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
5530
5531	  d = gen_lowpart (QImode, low[0]);
5532	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5533	  s = gen_rtx_EQ (QImode, flags, const0_rtx);
5534	  emit_insn (gen_rtx_SET (d, s));
5535
5536	  d = gen_lowpart (QImode, high[0]);
5537	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5538	  s = gen_rtx_NE (QImode, flags, const0_rtx);
5539	  emit_insn (gen_rtx_SET (d, s));
5540	}
5541
5542      /* Otherwise, we can get the same results by manually performing
5543	 a bit extract operation on bit 5/6, and then performing the two
5544	 shifts.  The two methods of getting 0/1 into low/high are exactly
5545	 the same size.  Avoiding the shift in the bit extract case helps
5546	 pentium4 a bit; no one else seems to care much either way.  */
5547      else
5548	{
5549	  rtx (*gen_lshr3)(rtx, rtx, rtx);
5550	  rtx (*gen_and3)(rtx, rtx, rtx);
5551	  rtx (*gen_xor3)(rtx, rtx, rtx);
5552	  HOST_WIDE_INT bits;
5553	  rtx x;
5554
5555	  if (mode == DImode)
5556	    {
5557	      gen_lshr3 = gen_lshrsi3;
5558	      gen_and3 = gen_andsi3;
5559	      gen_xor3 = gen_xorsi3;
5560	      bits = 5;
5561	    }
5562	  else
5563	    {
5564	      gen_lshr3 = gen_lshrdi3;
5565	      gen_and3 = gen_anddi3;
5566	      gen_xor3 = gen_xordi3;
5567	      bits = 6;
5568	    }
5569
5570	  if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
5571	    x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
5572	  else
5573	    x = gen_lowpart (half_mode, operands[2]);
5574	  emit_insn (gen_rtx_SET (high[0], x));
5575
5576	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
5577	  emit_insn (gen_and3 (high[0], high[0], const1_rtx));
5578	  emit_move_insn (low[0], high[0]);
5579	  emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
5580	}
5581
5582      emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5583      emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
5584      return;
5585    }
5586
5587  if (operands[1] == constm1_rtx)
5588    {
5589      /* For -1 << N, we can avoid the shld instruction, because we
5590	 know that we're shifting 0...31/63 ones into a -1.  */
5591      emit_move_insn (low[0], constm1_rtx);
5592      if (optimize_insn_for_size_p ())
5593	emit_move_insn (high[0], low[0]);
5594      else
5595	emit_move_insn (high[0], constm1_rtx);
5596    }
5597  else
5598    {
5599      gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5600
5601      if (!rtx_equal_p (operands[0], operands[1]))
5602	emit_move_insn (operands[0], operands[1]);
5603
5604      split_double_mode (mode, operands, 1, low, high);
5605      emit_insn (gen_shld (high[0], low[0], operands[2]));
5606    }
5607
5608  emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5609
5610  if (TARGET_CMOVE && scratch)
5611    {
5612      ix86_expand_clear (scratch);
5613      emit_insn (gen_x86_shift_adj_1
5614		 (half_mode, high[0], low[0], operands[2], scratch));
5615    }
5616  else
5617    emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
5618}
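
/* Rough C-level sketch of the constant-count path above, assuming a DImode
   value split into unsigned SImode halves LOW/HIGH on a 32-bit target (the
   TImode case on 64-bit targets is analogous), with 0 <= COUNT < 64:

     if (COUNT >= 32)
       {
         HIGH = LOW << (COUNT - 32);    the low half moves into the high half
         LOW = 0;
       }
     else
       {
         HIGH = (HIGH << COUNT) | (LOW >> (32 - COUNT));    i.e. shld
         LOW = LOW << COUNT;
       }

   For a variable count the code emits shld plus a plain shift and then
   fixes up the COUNT >= 32 case either with a conditional move
   (gen_x86_shift_adj_1) or with a branch (gen_x86_shift_adj_2).  */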
5619
5620void
5621ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
5622{
5623  rtx (*gen_ashr3)(rtx, rtx, rtx)
5624    = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
5625  rtx (*gen_shrd)(rtx, rtx, rtx);
5626  int half_width = GET_MODE_BITSIZE (mode) >> 1;
5627
5628  rtx low[2], high[2];
5629  int count;
5630
5631  if (CONST_INT_P (operands[2]))
5632    {
5633      split_double_mode (mode, operands, 2, low, high);
5634      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5635
5636      if (count == GET_MODE_BITSIZE (mode) - 1)
5637	{
5638	  emit_move_insn (high[0], high[1]);
5639	  emit_insn (gen_ashr3 (high[0], high[0],
5640				GEN_INT (half_width - 1)));
5641	  emit_move_insn (low[0], high[0]);
5642
5643	}
5644      else if (count >= half_width)
5645	{
5646	  emit_move_insn (low[0], high[1]);
5647	  emit_move_insn (high[0], low[0]);
5648	  emit_insn (gen_ashr3 (high[0], high[0],
5649				GEN_INT (half_width - 1)));
5650
5651	  if (count > half_width)
5652	    emit_insn (gen_ashr3 (low[0], low[0],
5653				  GEN_INT (count - half_width)));
5654	}
5655      else
5656	{
5657	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5658
5659	  if (!rtx_equal_p (operands[0], operands[1]))
5660	    emit_move_insn (operands[0], operands[1]);
5661
5662	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5663	  emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
5664	}
5665    }
5666  else
5667    {
5668      machine_mode half_mode;
5669
5670      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5671
      if (!rtx_equal_p (operands[0], operands[1]))
5673	emit_move_insn (operands[0], operands[1]);
5674
5675      split_double_mode (mode, operands, 1, low, high);
5676      half_mode = mode == DImode ? SImode : DImode;
5677
5678      emit_insn (gen_shrd (low[0], high[0], operands[2]));
5679      emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
5680
5681      if (TARGET_CMOVE && scratch)
5682	{
5683	  emit_move_insn (scratch, high[0]);
5684	  emit_insn (gen_ashr3 (scratch, scratch,
5685				GEN_INT (half_width - 1)));
5686	  emit_insn (gen_x86_shift_adj_1
5687		     (half_mode, low[0], high[0], operands[2], scratch));
5688	}
5689      else
5690	emit_insn (gen_x86_shift_adj_3
5691		   (half_mode, low[0], high[0], operands[2]));
5692    }
5693}
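
/* Corresponding sketch for the constant-count arithmetic right shift, with
   the same LOW/HIGH conventions but HIGH treated as signed, so shifts of
   HIGH are sign-filling:

     if (COUNT == 63)
       LOW = HIGH = HIGH >> 31;                    only the sign survives
     else if (COUNT >= 32)
       {
         LOW = HIGH >> (COUNT - 32);
         HIGH = HIGH >> 31;                        sign fill
       }
     else
       {
         LOW = (LOW >> COUNT) | (HIGH << (32 - COUNT));    i.e. shrd
         HIGH = HIGH >> COUNT;
       }
   */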
5694
5695void
5696ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
5697{
5698  rtx (*gen_lshr3)(rtx, rtx, rtx)
5699    = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
5700  rtx (*gen_shrd)(rtx, rtx, rtx);
5701  int half_width = GET_MODE_BITSIZE (mode) >> 1;
5702
5703  rtx low[2], high[2];
5704  int count;
5705
5706  if (CONST_INT_P (operands[2]))
5707    {
5708      split_double_mode (mode, operands, 2, low, high);
5709      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5710
5711      if (count >= half_width)
5712	{
5713	  emit_move_insn (low[0], high[1]);
5714	  ix86_expand_clear (high[0]);
5715
5716	  if (count > half_width)
5717	    emit_insn (gen_lshr3 (low[0], low[0],
5718				  GEN_INT (count - half_width)));
5719	}
5720      else
5721	{
5722	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5723
5724	  if (!rtx_equal_p (operands[0], operands[1]))
5725	    emit_move_insn (operands[0], operands[1]);
5726
5727	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5728	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
5729	}
5730    }
5731  else
5732    {
5733      machine_mode half_mode;
5734
5735      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5736
5737      if (!rtx_equal_p (operands[0], operands[1]))
5738	emit_move_insn (operands[0], operands[1]);
5739
5740      split_double_mode (mode, operands, 1, low, high);
5741      half_mode = mode == DImode ? SImode : DImode;
5742
5743      emit_insn (gen_shrd (low[0], high[0], operands[2]));
5744      emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
5745
5746      if (TARGET_CMOVE && scratch)
5747	{
5748	  ix86_expand_clear (scratch);
5749	  emit_insn (gen_x86_shift_adj_1
5750		     (half_mode, low[0], high[0], operands[2], scratch));
5751	}
5752      else
5753	emit_insn (gen_x86_shift_adj_2
5754		   (half_mode, low[0], high[0], operands[2]));
5755    }
5756}
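
/* And the logical variant, with both halves treated as unsigned, so the
   vacated high half is simply cleared:

     if (COUNT >= 32)
       {
         LOW = HIGH >> (COUNT - 32);
         HIGH = 0;
       }
     else
       {
         LOW = (LOW >> COUNT) | (HIGH << (32 - COUNT));    i.e. shrd
         HIGH = HIGH >> COUNT;
       }
   */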
5757
5758/* Return mode for the memcpy/memset loop counter.  Prefer SImode over
5759   DImode for constant loop counts.  */
5760
5761static machine_mode
5762counter_mode (rtx count_exp)
5763{
5764  if (GET_MODE (count_exp) != VOIDmode)
5765    return GET_MODE (count_exp);
5766  if (!CONST_INT_P (count_exp))
5767    return Pmode;
5768  if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
5769    return DImode;
5770  return SImode;
5771}
5772
/* When ISSETMEM is FALSE, output a simple loop to copy memory pointed to
   by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the
   overall size is COUNT, specified in bytes.  When ISSETMEM is TRUE,
   output the equivalent loop that sets the memory to VALUE (assumed to be
   in MODE).

   The size is rounded down to a whole number of chunks moved at once.
   SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info.  */
5780
5781
5782static void
5783expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
5784			       rtx destptr, rtx srcptr, rtx value,
5785			       rtx count, machine_mode mode, int unroll,
5786			       int expected_size, bool issetmem)
5787{
5788  rtx_code_label *out_label, *top_label;
5789  rtx iter, tmp;
5790  machine_mode iter_mode = counter_mode (count);
5791  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
5792  rtx piece_size = GEN_INT (piece_size_n);
5793  rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
5794  rtx size;
5795  int i;
5796
5797  top_label = gen_label_rtx ();
5798  out_label = gen_label_rtx ();
5799  iter = gen_reg_rtx (iter_mode);
5800
5801  size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
5802			      NULL, 1, OPTAB_DIRECT);
5803  /* Those two should combine.  */
5804  if (piece_size == const1_rtx)
5805    {
5806      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
5807			       true, out_label);
5808      predict_jump (REG_BR_PROB_BASE * 10 / 100);
5809    }
5810  emit_move_insn (iter, const0_rtx);
5811
5812  emit_label (top_label);
5813
5814  tmp = convert_modes (Pmode, iter_mode, iter, true);
5815
  /* This assert could be relaxed - in that case we'd need to compute the
     smallest power of two containing PIECE_SIZE_N and pass it to
     offset_address.  */
5819  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
5820  destmem = offset_address (destmem, tmp, piece_size_n);
5821  destmem = adjust_address (destmem, mode, 0);
5822
5823  if (!issetmem)
5824    {
5825      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
5826      srcmem = adjust_address (srcmem, mode, 0);
5827
      /* When unrolling for chips that reorder memory reads and writes,
	 we can save registers by using a single temporary.
	 Also, using 4 temporaries is overkill in 32-bit mode.  */
5831      if (!TARGET_64BIT && 0)
5832	{
5833	  for (i = 0; i < unroll; i++)
5834	    {
5835	      if (i)
5836		{
5837		  destmem = adjust_address (copy_rtx (destmem), mode,
5838					    GET_MODE_SIZE (mode));
5839		  srcmem = adjust_address (copy_rtx (srcmem), mode,
5840					   GET_MODE_SIZE (mode));
5841		}
5842	      emit_move_insn (destmem, srcmem);
5843	    }
5844	}
5845      else
5846	{
5847	  rtx tmpreg[4];
5848	  gcc_assert (unroll <= 4);
5849	  for (i = 0; i < unroll; i++)
5850	    {
5851	      tmpreg[i] = gen_reg_rtx (mode);
5852	      if (i)
5853		srcmem = adjust_address (copy_rtx (srcmem), mode,
5854					 GET_MODE_SIZE (mode));
5855	      emit_move_insn (tmpreg[i], srcmem);
5856	    }
5857	  for (i = 0; i < unroll; i++)
5858	    {
5859	      if (i)
5860		destmem = adjust_address (copy_rtx (destmem), mode,
5861					  GET_MODE_SIZE (mode));
5862	      emit_move_insn (destmem, tmpreg[i]);
5863	    }
5864	}
5865    }
5866  else
5867    for (i = 0; i < unroll; i++)
5868      {
5869	if (i)
5870	  destmem = adjust_address (copy_rtx (destmem), mode,
5871				    GET_MODE_SIZE (mode));
5872	emit_move_insn (destmem, value);
5873      }
5874
5875  tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
5876			     true, OPTAB_LIB_WIDEN);
5877  if (tmp != iter)
5878    emit_move_insn (iter, tmp);
5879
5880  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
5881			   true, top_label);
5882  if (expected_size != -1)
5883    {
5884      expected_size /= GET_MODE_SIZE (mode) * unroll;
5885      if (expected_size == 0)
5886	predict_jump (0);
5887      else if (expected_size > REG_BR_PROB_BASE)
5888	predict_jump (REG_BR_PROB_BASE - 1);
5889      else
5890        predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
5891		      / expected_size);
5892    }
5893  else
5894    predict_jump (REG_BR_PROB_BASE * 80 / 100);
5895  iter = ix86_zero_extend_to_Pmode (iter);
5896  tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
5897			     true, OPTAB_LIB_WIDEN);
5898  if (tmp != destptr)
5899    emit_move_insn (destptr, tmp);
5900  if (!issetmem)
5901    {
5902      tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
5903				 true, OPTAB_LIB_WIDEN);
5904      if (tmp != srcptr)
5905	emit_move_insn (srcptr, tmp);
5906    }
5907  emit_label (out_label);
5908}
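
/* For reference, the code emitted above corresponds roughly to the
   following C, shown for the copy case with UNROLL == 1; PIECE stands for
   GET_MODE_SIZE (MODE), and the setmem case stores VALUE instead of
   loading from SRC:

     size = count & ~(PIECE - 1);
     iter = 0;
     do
       {
         copy PIECE bytes from src + iter to dest + iter;
         iter += PIECE;
       }
     while (iter < size);
     dest += iter;
     src += iter;

   When PIECE is 1, an extra "if (size == 0) skip the loop" guard is
   emitted in front of the loop; larger unroll factors simply repeat the
   body UNROLL times per iteration.  */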
5909
5910/* Divide COUNTREG by SCALE.  */
5911static rtx
5912scale_counter (rtx countreg, int scale)
5913{
5914  rtx sc;
5915
5916  if (scale == 1)
5917    return countreg;
5918  if (CONST_INT_P (countreg))
5919    return GEN_INT (INTVAL (countreg) / scale);
5920  gcc_assert (REG_P (countreg));
5921
5922  sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
5923			    GEN_INT (exact_log2 (scale)),
5924			    NULL, 1, OPTAB_DIRECT);
5925  return sc;
5926}
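
/* For example, scale_counter (count, 4) yields INTVAL (count) / 4 for a
   constant count and a logical shift right by 2 otherwise; the callers
   here pass a mode size for SCALE, hence a power of two.  */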
5927
/* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM
   argument.  When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For the setmem case, VALUE is ORIG_VALUE promoted to a wider size, where
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have the same meaning as for the previous function.  */
5934
5935static void
5936expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
5937			   rtx destptr, rtx srcptr, rtx value, rtx orig_value,
5938			   rtx count,
5939			   machine_mode mode, bool issetmem)
5940{
5941  rtx destexp;
5942  rtx srcexp;
5943  rtx countreg;
5944  HOST_WIDE_INT rounded_count;
5945
5946  /* If possible, it is shorter to use rep movs.
5947     TODO: Maybe it is better to move this logic to decide_alg.  */
5948  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
5949      && (!issetmem || orig_value == const0_rtx))
5950    mode = SImode;
5951
5952  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
5953    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
5954
5955  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
5956						       GET_MODE_SIZE (mode)));
5957  if (mode != QImode)
5958    {
5959      destexp = gen_rtx_ASHIFT (Pmode, countreg,
5960				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5961      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
5962    }
5963  else
5964    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
5965  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
5966    {
5967      rounded_count
5968	= ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
5969      destmem = shallow_copy_rtx (destmem);
5970      set_mem_size (destmem, rounded_count);
5971    }
5972  else if (MEM_SIZE_KNOWN_P (destmem))
5973    clear_mem_size (destmem);
5974
5975  if (issetmem)
5976    {
5977      value = force_reg (mode, gen_lowpart (mode, value));
5978      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
5979    }
5980  else
5981    {
5982      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
5983	srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
5984      if (mode != QImode)
5985	{
5986	  srcexp = gen_rtx_ASHIFT (Pmode, countreg,
5987				   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5988	  srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
5989	}
5990      else
5991	srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
5992      if (CONST_INT_P (count))
5993	{
5994	  rounded_count
5995	    = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
5996	  srcmem = shallow_copy_rtx (srcmem);
5997	  set_mem_size (srcmem, rounded_count);
5998	}
5999      else
6000	{
6001	  if (MEM_SIZE_KNOWN_P (srcmem))
6002	    clear_mem_size (srcmem);
6003	}
6004      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
6005			      destexp, srcexp));
6006    }
6007}
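
/* At the assembly level the insn emitted above is roughly

       mov  count/chunk, %ecx
       rep  movsl            (movsb/movsq, or stosb/stosl/stosq for setmem)

   with the pointers and the store value pinned to %edi/%esi/%eax by the
   constraints of the rep_mov/rep_stos patterns; DESTEXP/SRCEXP merely
   describe the final pointer values so the RTL dataflow stays correct.  */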
6008
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRCMEM is passed by pointer so that it can be updated on return.
   The return value is the updated DESTMEM.  */
6013static rtx
6014emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
6015	     HOST_WIDE_INT size_to_move)
6016{
6017  rtx dst = destmem, src = *srcmem, adjust, tempreg;
6018  enum insn_code code;
6019  machine_mode move_mode;
6020  int piece_size, i;
6021
  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 not larger than SIZE_TO_MOVE and
     halve it until a move of that size is supported.  */
6025  piece_size = 1 << floor_log2 (size_to_move);
6026  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
6027	 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6028    {
6029      gcc_assert (piece_size > 1);
6030      piece_size >>= 1;
6031    }
6032
6033  /* Find the corresponding vector mode with the same size as MOVE_MODE.
6034     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
6035  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
6036    {
6037      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
6038      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
6039	  || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6040	{
6041	  move_mode = word_mode;
6042	  piece_size = GET_MODE_SIZE (move_mode);
6043	  code = optab_handler (mov_optab, move_mode);
6044	}
6045    }
6046  gcc_assert (code != CODE_FOR_nothing);
6047
6048  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6049  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
6050
  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE of them.  */
6052  gcc_assert (size_to_move % piece_size == 0);
6053  adjust = GEN_INT (piece_size);
6054  for (i = 0; i < size_to_move; i += piece_size)
6055    {
6056      /* We move from memory to memory, so we'll need to do it via
6057	 a temporary register.  */
6058      tempreg = gen_reg_rtx (move_mode);
6059      emit_insn (GEN_FCN (code) (tempreg, src));
6060      emit_insn (GEN_FCN (code) (dst, tempreg));
6061
6062      emit_move_insn (destptr,
6063		      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
6064      emit_move_insn (srcptr,
6065		      gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
6066
6067      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6068					  piece_size);
6069      src = adjust_automodify_address_nv (src, move_mode, srcptr,
6070					  piece_size);
6071    }
6072
  /* Update the SRC rtx and return the updated DST.  */
6074  *srcmem = src;
6075  return dst;
6076}
6077
/* Helper function for the string operations below.  Test whether
   VARIABLE & VALUE is nonzero; if it is zero, jump to the returned
   label (so the code emitted after the call runs only when the tested
   bits are set).  */
6080
6081static rtx_code_label *
6082ix86_expand_aligntest (rtx variable, int value, bool epilogue)
6083{
6084  rtx_code_label *label = gen_label_rtx ();
6085  rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
6086  if (GET_MODE (variable) == DImode)
6087    emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
6088  else
6089    emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
6090  emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
6091			   1, label);
6092  if (epilogue)
6093    predict_jump (REG_BR_PROB_BASE * 50 / 100);
6094  else
6095    predict_jump (REG_BR_PROB_BASE * 90 / 100);
6096  return label;
6097}
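
/* Typical use, as in the epilogues below:

     label = ix86_expand_aligntest (count, 4, true);
     ... emit code for the "bit 2 of COUNT is set" case ...
     emit_label (label);
     LABEL_NUSES (label) = 1;

   i.e. the code emitted between the call and emit_label is executed only
   when (COUNT & 4) != 0.  */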
6098
6099
6100/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
6101
6102static void
6103expand_cpymem_epilogue (rtx destmem, rtx srcmem,
6104			rtx destptr, rtx srcptr, rtx count, int max_size)
6105{
6106  rtx src, dest;
6107  if (CONST_INT_P (count))
6108    {
6109      HOST_WIDE_INT countval = INTVAL (count);
6110      HOST_WIDE_INT epilogue_size = countval % max_size;
6111      int i;
6112
      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'd require a somewhat more complicated epilogue
	 expansion.  */
6116      gcc_assert ((max_size & (max_size - 1)) == 0);
6117      for (i = max_size; i >= 1; i >>= 1)
6118	{
6119	  if (epilogue_size & i)
6120	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6121	}
6122      return;
6123    }
6124  if (max_size > 8)
6125    {
6126      count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
6127				    count, 1, OPTAB_DIRECT);
6128      expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
6129				     count, QImode, 1, 4, false);
6130      return;
6131    }
6132
  /* When single stringop instructions are available, we can cheaply advance
     the dest and src pointers.  Otherwise we save code size by maintaining
     an offset (zero is readily available from the preceding rep operation)
     and using x86 addressing modes.  */
6137  if (TARGET_SINGLE_STRINGOP)
6138    {
6139      if (max_size > 4)
6140	{
6141	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6142	  src = change_address (srcmem, SImode, srcptr);
6143	  dest = change_address (destmem, SImode, destptr);
6144	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
6145	  emit_label (label);
6146	  LABEL_NUSES (label) = 1;
6147	}
6148      if (max_size > 2)
6149	{
6150	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6151	  src = change_address (srcmem, HImode, srcptr);
6152	  dest = change_address (destmem, HImode, destptr);
6153	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
6154	  emit_label (label);
6155	  LABEL_NUSES (label) = 1;
6156	}
6157      if (max_size > 1)
6158	{
6159	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6160	  src = change_address (srcmem, QImode, srcptr);
6161	  dest = change_address (destmem, QImode, destptr);
6162	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
6163	  emit_label (label);
6164	  LABEL_NUSES (label) = 1;
6165	}
6166    }
6167  else
6168    {
6169      rtx offset = force_reg (Pmode, const0_rtx);
6170      rtx tmp;
6171
6172      if (max_size > 4)
6173	{
6174	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6175	  src = change_address (srcmem, SImode, srcptr);
6176	  dest = change_address (destmem, SImode, destptr);
6177	  emit_move_insn (dest, src);
6178	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
6179				     true, OPTAB_LIB_WIDEN);
6180	  if (tmp != offset)
6181	    emit_move_insn (offset, tmp);
6182	  emit_label (label);
6183	  LABEL_NUSES (label) = 1;
6184	}
6185      if (max_size > 2)
6186	{
6187	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6188	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6189	  src = change_address (srcmem, HImode, tmp);
6190	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6191	  dest = change_address (destmem, HImode, tmp);
6192	  emit_move_insn (dest, src);
6193	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
6194				     true, OPTAB_LIB_WIDEN);
6195	  if (tmp != offset)
6196	    emit_move_insn (offset, tmp);
6197	  emit_label (label);
6198	  LABEL_NUSES (label) = 1;
6199	}
6200      if (max_size > 1)
6201	{
6202	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6203	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6204	  src = change_address (srcmem, QImode, tmp);
6205	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6206	  dest = change_address (destmem, QImode, tmp);
6207	  emit_move_insn (dest, src);
6208	  emit_label (label);
6209	  LABEL_NUSES (label) = 1;
6210	}
6211    }
6212}
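
/* With a variable COUNT and, say, MAX_SIZE == 8 on a TARGET_SINGLE_STRINGOP
   machine, the epilogue above boils down to a small jump tree, roughly

     if (count & 4) movsl;
     if (count & 2) movsw;
     if (count & 1) movsb;

   each test being emitted through ix86_expand_aligntest; the non-stringop
   path does the same with ordinary moves addressed via a running offset.  */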
6213
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   The return value is the updated DESTMEM.  */
6218static rtx
6219emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
6220	     HOST_WIDE_INT size_to_move)
6221{
6222  rtx dst = destmem, adjust;
6223  enum insn_code code;
6224  machine_mode move_mode;
6225  int piece_size, i;
6226
  /* Use the mode of PROMOTED_VAL for the stores, narrowing it when
     SIZE_TO_MOVE is smaller than that mode.  */
6230  move_mode = GET_MODE (promoted_val);
6231  if (move_mode == VOIDmode)
6232    move_mode = QImode;
6233  if (size_to_move < GET_MODE_SIZE (move_mode))
6234    {
6235      unsigned int move_bits = size_to_move * BITS_PER_UNIT;
6236      move_mode = int_mode_for_size (move_bits, 0).require ();
6237      promoted_val = gen_lowpart (move_mode, promoted_val);
6238    }
6239  piece_size = GET_MODE_SIZE (move_mode);
6240  code = optab_handler (mov_optab, move_mode);
6241  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
6242
6243  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6244
  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE of them.  */
6246  gcc_assert (size_to_move % piece_size == 0);
6247  adjust = GEN_INT (piece_size);
6248  for (i = 0; i < size_to_move; i += piece_size)
6249    {
6250      if (piece_size <= GET_MODE_SIZE (word_mode))
6251	{
6252	  emit_insn (gen_strset (destptr, dst, promoted_val));
6253	  dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6254					      piece_size);
6255	  continue;
6256	}
6257
6258      emit_insn (GEN_FCN (code) (dst, promoted_val));
6259
6260      emit_move_insn (destptr,
6261		      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
6262
6263      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6264					  piece_size);
6265    }
6266
6267  /* Update DST rtx.  */
6268  return dst;
6269}

/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
6271static void
6272expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
6273				 rtx count, int max_size)
6274{
6275  count = expand_simple_binop (counter_mode (count), AND, count,
6276			       GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
6277  expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
6278				 gen_lowpart (QImode, value), count, QImode,
6279				 1, max_size / 2, true);
6280}
6281
/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
6283static void
6284expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
6285			rtx count, int max_size)
6286{
6287  rtx dest;
6288
6289  if (CONST_INT_P (count))
6290    {
6291      HOST_WIDE_INT countval = INTVAL (count);
6292      HOST_WIDE_INT epilogue_size = countval % max_size;
6293      int i;
6294
      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'd require a somewhat more complicated epilogue
	 expansion.  */
6298      gcc_assert ((max_size & (max_size - 1)) == 0);
6299      for (i = max_size; i >= 1; i >>= 1)
6300	{
6301	  if (epilogue_size & i)
6302	    {
6303	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6304		destmem = emit_memset (destmem, destptr, vec_value, i);
6305	      else
6306		destmem = emit_memset (destmem, destptr, value, i);
6307	    }
6308	}
6309      return;
6310    }
6311  if (max_size > 32)
6312    {
6313      expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
6314      return;
6315    }
6316  if (max_size > 16)
6317    {
6318      rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
6319      if (TARGET_64BIT)
6320	{
6321	  dest = change_address (destmem, DImode, destptr);
6322	  emit_insn (gen_strset (destptr, dest, value));
6323	  dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
6324	  emit_insn (gen_strset (destptr, dest, value));
6325	}
6326      else
6327	{
6328	  dest = change_address (destmem, SImode, destptr);
6329	  emit_insn (gen_strset (destptr, dest, value));
6330	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6331	  emit_insn (gen_strset (destptr, dest, value));
6332	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
6333	  emit_insn (gen_strset (destptr, dest, value));
6334	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
6335	  emit_insn (gen_strset (destptr, dest, value));
6336	}
6337      emit_label (label);
6338      LABEL_NUSES (label) = 1;
6339    }
6340  if (max_size > 8)
6341    {
6342      rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
6343      if (TARGET_64BIT)
6344	{
6345	  dest = change_address (destmem, DImode, destptr);
6346	  emit_insn (gen_strset (destptr, dest, value));
6347	}
6348      else
6349	{
6350	  dest = change_address (destmem, SImode, destptr);
6351	  emit_insn (gen_strset (destptr, dest, value));
6352	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6353	  emit_insn (gen_strset (destptr, dest, value));
6354	}
6355      emit_label (label);
6356      LABEL_NUSES (label) = 1;
6357    }
6358  if (max_size > 4)
6359    {
6360      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6361      dest = change_address (destmem, SImode, destptr);
6362      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
6363      emit_label (label);
6364      LABEL_NUSES (label) = 1;
6365    }
6366  if (max_size > 2)
6367    {
6368      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6369      dest = change_address (destmem, HImode, destptr);
6370      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
6371      emit_label (label);
6372      LABEL_NUSES (label) = 1;
6373    }
6374  if (max_size > 1)
6375    {
6376      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6377      dest = change_address (destmem, QImode, destptr);
6378      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
6379      emit_label (label);
6380      LABEL_NUSES (label) = 1;
6381    }
6382}
6383
/* Decrease COUNTREG by VALUE.  */
6385static void
6386ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
6387{
6388  emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
6389}
6390
/* Depending on ISSETMEM, copy enough from SRCMEM or store enough into DESTMEM
   to align DESTMEM to DESIRED_ALIGNMENT.  The original alignment is ALIGN.
   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
   ignored.
   The return value is the updated DESTMEM.  */
6396
6397static rtx
6398expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
6399				  rtx destptr, rtx srcptr, rtx value,
6400				  rtx vec_value, rtx count, int align,
6401				  int desired_alignment, bool issetmem)
6402{
6403  int i;
6404  for (i = 1; i < desired_alignment; i <<= 1)
6405    {
6406      if (align <= i)
6407	{
6408	  rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
6409	  if (issetmem)
6410	    {
6411	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6412		destmem = emit_memset (destmem, destptr, vec_value, i);
6413	      else
6414		destmem = emit_memset (destmem, destptr, value, i);
6415	    }
6416	  else
6417	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6418	  ix86_adjust_counter (count, i);
6419	  emit_label (label);
6420	  LABEL_NUSES (label) = 1;
6421	  set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
6422	}
6423    }
6424  return destmem;
6425}
6426
/* Test if COUNT & SIZE is nonzero and if so, expand a cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
6430static void
6431expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
6432			       rtx destptr, rtx srcptr,
6433			       rtx value, rtx vec_value,
6434			       rtx count, int size,
6435			       rtx done_label, bool issetmem)
6436{
6437  rtx_code_label *label = ix86_expand_aligntest (count, size, false);
6438  machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
6439  rtx modesize;
6440  int n;
6441
6442  /* If we do not have vector value to copy, we must reduce size.  */
6443  if (issetmem)
6444    {
6445      if (!vec_value)
6446	{
6447	  if (GET_MODE (value) == VOIDmode && size > 8)
6448	    mode = Pmode;
6449	  else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
6450	    mode = GET_MODE (value);
6451	}
6452      else
6453	mode = GET_MODE (vec_value), value = vec_value;
6454    }
6455  else
6456    {
6457      /* Choose appropriate vector mode.  */
6458      if (size >= 32)
6459	mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
6460      else if (size >= 16)
6461	mode = TARGET_SSE ? V16QImode : DImode;
6462      srcmem = change_address (srcmem, mode, srcptr);
6463    }
6464  destmem = change_address (destmem, mode, destptr);
6465  modesize = GEN_INT (GET_MODE_SIZE (mode));
6466  gcc_assert (GET_MODE_SIZE (mode) <= size);
6467  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6468    {
6469      if (issetmem)
6470	emit_move_insn (destmem, gen_lowpart (mode, value));
6471      else
6472	{
6473          emit_move_insn (destmem, srcmem);
6474          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6475	}
6476      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6477    }
6478
6479  destmem = offset_address (destmem, count, 1);
6480  destmem = offset_address (destmem, GEN_INT (-2 * size),
6481			    GET_MODE_SIZE (mode));
6482  if (!issetmem)
6483    {
6484      srcmem = offset_address (srcmem, count, 1);
6485      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
6486			       GET_MODE_SIZE (mode));
6487    }
6488  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6489    {
6490      if (issetmem)
6491	emit_move_insn (destmem, gen_lowpart (mode, value));
6492      else
6493	{
6494	  emit_move_insn (destmem, srcmem);
6495	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6496	}
6497      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6498    }
6499  emit_jump_insn (gen_jump (done_label));
6500  emit_barrier ();
6501
6502  emit_label (label);
6503  LABEL_NUSES (label) = 1;
6504}
6505
/* Handle a small memcpy (up to SIZE, which is supposed to be a small power
   of 2) and get ready for the main memcpy loop by copying the initial
   DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
   DESTPTR/SRCPTR/COUNT so that we can proceed with a loop copying SIZE
   bytes at once.  Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence.  The label is
   created on demand if *DONE_LABEL is NULL.
   MIN_SIZE is the minimal size of the block copied.  This value gets
   adjusted for the new bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates
   whether we will dispatch to a library call for large blocks.
6518
6519   In pseudocode we do:
6520
6521   if (COUNT < SIZE)
6522     {
6523       Assume that SIZE is 4. Bigger sizes are handled analogously
6524       if (COUNT & 4)
6525	 {
6526	    copy 4 bytes from SRCPTR to DESTPTR
6527	    copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6528	    goto done_label
6529	 }
6530       if (!COUNT)
6531	 goto done_label;
6532       copy 1 byte from SRCPTR to DESTPTR
6533       if (COUNT & 2)
6534	 {
6535	    copy 2 bytes from SRCPTR to DESTPTR
6536	    copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6537	 }
6538     }
6539   else
6540     {
6541       copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6542       copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
6543
       OLD_DESTPTR = DESTPTR;
       Align DESTPTR up to DESIRED_ALIGN
       SRCPTR += DESTPTR - OLD_DESTPTR
       COUNT -= DESTPTR - OLD_DESTPTR
6548       if (DYNAMIC_CHECK)
6549	 Round COUNT down to multiple of SIZE
6550       << optional caller supplied zero size guard is here >>
6551       << optional caller supplied dynamic check is here >>
6552       << caller supplied main copy loop is here >>
6553     }
6554   done_label:
6555  */
6556static void
6557expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
6558							    rtx *destptr, rtx *srcptr,
6559							    machine_mode mode,
6560							    rtx value, rtx vec_value,
6561							    rtx *count,
6562							    rtx_code_label **done_label,
6563							    int size,
6564							    int desired_align,
6565							    int align,
6566							    unsigned HOST_WIDE_INT *min_size,
6567							    bool dynamic_check,
6568							    bool issetmem)
6569{
6570  rtx_code_label *loop_label = NULL, *label;
6571  int n;
6572  rtx modesize;
6573  int prolog_size = 0;
6574  rtx mode_value;
6575
  /* Choose the proper value to copy.  */
6577  if (issetmem && VECTOR_MODE_P (mode))
6578    mode_value = vec_value;
6579  else
6580    mode_value = value;
6581  gcc_assert (GET_MODE_SIZE (mode) <= size);
6582
6583  /* See if block is big or small, handle small blocks.  */
6584  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
6585    {
6586      int size2 = size;
6587      loop_label = gen_label_rtx ();
6588
6589      if (!*done_label)
6590	*done_label = gen_label_rtx ();
6591
6592      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
6593			       1, loop_label);
6594      size2 >>= 1;
6595
6596      /* Handle sizes > 3.  */
6597      for (;size2 > 2; size2 >>= 1)
6598	expand_small_cpymem_or_setmem (destmem, srcmem,
6599				       *destptr, *srcptr,
6600				       value, vec_value,
6601				       *count,
6602				       size2, *done_label, issetmem);
6603      /* Nothing to copy?  Jump to DONE_LABEL if so */
6604      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
6605			       1, *done_label);
6606
6607      /* Do a byte copy.  */
6608      destmem = change_address (destmem, QImode, *destptr);
6609      if (issetmem)
6610	emit_move_insn (destmem, gen_lowpart (QImode, value));
6611      else
6612	{
6613          srcmem = change_address (srcmem, QImode, *srcptr);
6614          emit_move_insn (destmem, srcmem);
6615	}
6616
6617      /* Handle sizes 2 and 3.  */
6618      label = ix86_expand_aligntest (*count, 2, false);
6619      destmem = change_address (destmem, HImode, *destptr);
6620      destmem = offset_address (destmem, *count, 1);
6621      destmem = offset_address (destmem, GEN_INT (-2), 2);
6622      if (issetmem)
6623        emit_move_insn (destmem, gen_lowpart (HImode, value));
6624      else
6625	{
6626	  srcmem = change_address (srcmem, HImode, *srcptr);
6627	  srcmem = offset_address (srcmem, *count, 1);
6628	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
6629	  emit_move_insn (destmem, srcmem);
6630	}
6631
6632      emit_label (label);
6633      LABEL_NUSES (label) = 1;
6634      emit_jump_insn (gen_jump (*done_label));
6635      emit_barrier ();
6636    }
6637  else
6638    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
6639		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
6640
6641  /* Start memcpy for COUNT >= SIZE.  */
6642  if (loop_label)
6643    {
6644       emit_label (loop_label);
6645       LABEL_NUSES (loop_label) = 1;
6646    }
6647
6648  /* Copy first desired_align bytes.  */
6649  if (!issetmem)
6650    srcmem = change_address (srcmem, mode, *srcptr);
6651  destmem = change_address (destmem, mode, *destptr);
6652  modesize = GEN_INT (GET_MODE_SIZE (mode));
6653  for (n = 0; prolog_size < desired_align - align; n++)
6654    {
6655      if (issetmem)
6656        emit_move_insn (destmem, mode_value);
6657      else
6658	{
6659          emit_move_insn (destmem, srcmem);
6660          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6661	}
6662      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6663      prolog_size += GET_MODE_SIZE (mode);
6664    }
6665
6666
6667  /* Copy last SIZE bytes.  */
6668  destmem = offset_address (destmem, *count, 1);
6669  destmem = offset_address (destmem,
6670			    GEN_INT (-size - prolog_size),
6671			    1);
6672  if (issetmem)
6673    emit_move_insn (destmem, mode_value);
6674  else
6675    {
6676      srcmem = offset_address (srcmem, *count, 1);
6677      srcmem = offset_address (srcmem,
6678			       GEN_INT (-size - prolog_size),
6679			       1);
6680      emit_move_insn (destmem, srcmem);
6681    }
6682  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
6683    {
6684      destmem = offset_address (destmem, modesize, 1);
6685      if (issetmem)
6686	emit_move_insn (destmem, mode_value);
6687      else
6688	{
6689          srcmem = offset_address (srcmem, modesize, 1);
6690          emit_move_insn (destmem, srcmem);
6691	}
6692    }
6693
6694  /* Align destination.  */
6695  if (desired_align > 1 && desired_align > align)
6696    {
6697      rtx saveddest = *destptr;
6698
6699      gcc_assert (desired_align <= size);
6700      /* Align destptr up, place it to new register.  */
6701      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
6702				      GEN_INT (prolog_size),
6703				      NULL_RTX, 1, OPTAB_DIRECT);
6704      if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
6705	REG_POINTER (*destptr) = 1;
6706      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
6707				      GEN_INT (-desired_align),
6708				      *destptr, 1, OPTAB_DIRECT);
6709      /* See how many bytes we skipped.  */
6710      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
6711				       *destptr,
6712				       saveddest, 1, OPTAB_DIRECT);
6713      /* Adjust srcptr and count.  */
6714      if (!issetmem)
6715	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
6716				       saveddest, *srcptr, 1, OPTAB_DIRECT);
6717      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6718				    saveddest, *count, 1, OPTAB_DIRECT);
6719      /* We copied at most size + prolog_size.  */
6720      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
6721	*min_size
6722	  = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
6723      else
6724	*min_size = 0;
6725
6726      /* Our loops always round down the block size, but for dispatch to
6727         library we need precise value.  */
6728      if (dynamic_check)
6729	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
6730				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
6731    }
6732  else
6733    {
6734      gcc_assert (prolog_size == 0);
6735      /* Decrease count, so we won't end up copying last word twice.  */
6736      if (!CONST_INT_P (*count))
6737	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6738				      constm1_rtx, *count, 1, OPTAB_DIRECT);
6739      else
6740	*count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
6741				      (unsigned HOST_WIDE_INT)size));
6742      if (*min_size)
6743	*min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
6744    }
6745}
6746
6747
6748/* This function is like the previous one, except here we know how many bytes
6749   need to be copied.  That allows us to update alignment not only of DST, which
6750   is returned, but also of SRC, which is passed as a pointer for that
6751   reason.  */
6752static rtx
6753expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
6754					   rtx srcreg, rtx value, rtx vec_value,
6755					   int desired_align, int align_bytes,
6756					   bool issetmem)
6757{
6758  rtx src = NULL;
6759  rtx orig_dst = dst;
6760  rtx orig_src = NULL;
6761  int piece_size = 1;
6762  int copied_bytes = 0;
6763
6764  if (!issetmem)
6765    {
6766      gcc_assert (srcp != NULL);
6767      src = *srcp;
6768      orig_src = src;
6769    }
6770
6771  for (piece_size = 1;
6772       piece_size <= desired_align && copied_bytes < align_bytes;
6773       piece_size <<= 1)
6774    {
6775      if (align_bytes & piece_size)
6776	{
6777	  if (issetmem)
6778	    {
6779	      if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
6780		dst = emit_memset (dst, destreg, vec_value, piece_size);
6781	      else
6782		dst = emit_memset (dst, destreg, value, piece_size);
6783	    }
6784	  else
6785	    dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
6786	  copied_bytes += piece_size;
6787	}
6788    }
6789  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
6790    set_mem_align (dst, desired_align * BITS_PER_UNIT);
6791  if (MEM_SIZE_KNOWN_P (orig_dst))
6792    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
6793
6794  if (!issetmem)
6795    {
6796      int src_align_bytes = get_mem_align_offset (src, desired_align
6797						       * BITS_PER_UNIT);
6798      if (src_align_bytes >= 0)
6799	src_align_bytes = desired_align - src_align_bytes;
6800      if (src_align_bytes >= 0)
6801	{
6802	  unsigned int src_align;
6803	  for (src_align = desired_align; src_align >= 2; src_align >>= 1)
6804	    {
6805	      if ((src_align_bytes & (src_align - 1))
6806		   == (align_bytes & (src_align - 1)))
6807		break;
6808	    }
6809	  if (src_align > (unsigned int) desired_align)
6810	    src_align = desired_align;
6811	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
6812	    set_mem_align (src, src_align * BITS_PER_UNIT);
6813	}
6814      if (MEM_SIZE_KNOWN_P (orig_src))
6815	set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
6816      *srcp = src;
6817    }
6818
6819  return dst;
6820}
6821
6822/* Return true if ALG can be used in current context.
6823   Assume we expand memset if MEMSET is true.  */
6824static bool
6825alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
6826{
6827  if (alg == no_stringop)
6828    return false;
6829  if (alg == vector_loop)
6830    return TARGET_SSE || TARGET_AVX;
6831  /* Algorithms using the rep prefix want at least edi and ecx;
6832     additionally, memset wants eax and memcpy wants esi.  Don't
6833     consider such algorithms if the user has appropriated those
6834     registers for their own purposes, or if we have a non-default
6835     address space, since some string insns cannot override the segment.  */
6836  if (alg == rep_prefix_1_byte
6837      || alg == rep_prefix_4_byte
6838      || alg == rep_prefix_8_byte)
6839    {
6840      if (have_as)
6841	return false;
6842      if (fixed_regs[CX_REG]
6843	  || fixed_regs[DI_REG]
6844	  || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
6845	return false;
6846    }
6847  return true;
6848}
6849
6850/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
6851static enum stringop_alg
6852decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
6853	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
6854	    bool memset, bool zero_memset, bool have_as,
6855	    int *dynamic_check, bool *noalign, bool recur)
6856{
6857  const struct stringop_algs *algs;
6858  bool optimize_for_speed;
6859  int max = 0;
6860  const struct processor_costs *cost;
6861  int i;
6862  bool any_alg_usable_p = false;
6863
6864  *noalign = false;
6865  *dynamic_check = -1;
6866
6867  /* Even if the string operation call is cold, we still might spend a lot
6868     of time processing large blocks.  */
6869  if (optimize_function_for_size_p (cfun)
6870      || (optimize_insn_for_size_p ()
6871 	  && (max_size < 256
6872              || (expected_size != -1 && expected_size < 256))))
6873    optimize_for_speed = false;
6874  else
6875    optimize_for_speed = true;
6876
6877  cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
6878  if (memset)
6879    algs = &cost->memset[TARGET_64BIT != 0];
6880  else
6881    algs = &cost->memcpy[TARGET_64BIT != 0];
6882
6883  /* See maximal size for user defined algorithm.  */
6884  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6885    {
6886      enum stringop_alg candidate = algs->size[i].alg;
6887      bool usable = alg_usable_p (candidate, memset, have_as);
6888      any_alg_usable_p |= usable;
6889
6890      if (candidate != libcall && candidate && usable)
6891	max = algs->size[i].max;
6892    }
6893
  /* If the expected size is not known but the max size is small enough
     that the inline version is a win, set the expected size to the
     middle of the range.  */
6897  if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
6898      && expected_size == -1)
6899    expected_size = min_size / 2 + max_size / 2;
6900
6901  /* If user specified the algorithm, honor it if possible.  */
6902  if (ix86_stringop_alg != no_stringop
6903      && alg_usable_p (ix86_stringop_alg, memset, have_as))
6904    return ix86_stringop_alg;
6905  /* rep; movq or rep; movl is the smallest variant.  */
6906  else if (!optimize_for_speed)
6907    {
6908      *noalign = true;
6909      if (!count || (count & 3) || (memset && !zero_memset))
6910	return alg_usable_p (rep_prefix_1_byte, memset, have_as)
6911	       ? rep_prefix_1_byte : loop_1_byte;
6912      else
6913	return alg_usable_p (rep_prefix_4_byte, memset, have_as)
6914	       ? rep_prefix_4_byte : loop;
6915    }
  /* Very tiny blocks are best handled via the loop; REP is expensive to
     set up.  */
6918  else if (expected_size != -1 && expected_size < 4)
6919    return loop_1_byte;
6920  else if (expected_size != -1)
6921    {
6922      enum stringop_alg alg = libcall;
6923      bool alg_noalign = false;
6924      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6925	{
6926	  /* We get here if the algorithms that were not libcall-based
6927	     were rep-prefix based and we are unable to use rep prefixes
6928	     based on global register usage.  Break out of the loop and
6929	     use the heuristic below.  */
6930	  if (algs->size[i].max == 0)
6931	    break;
6932	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
6933	    {
6934	      enum stringop_alg candidate = algs->size[i].alg;
6935
6936	      if (candidate != libcall
6937		  && alg_usable_p (candidate, memset, have_as))
6938		{
6939		  alg = candidate;
6940		  alg_noalign = algs->size[i].noalign;
6941		}
6942	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6943		 last non-libcall inline algorithm.  */
6944	      if (TARGET_INLINE_ALL_STRINGOPS)
6945		{
6946		  /* When the current size is best to be copied by a libcall,
6947		     but we are still forced to inline, run the heuristic below
6948		     that will pick code for medium sized blocks.  */
6949		  if (alg != libcall)
6950		    {
6951		      *noalign = alg_noalign;
6952		      return alg;
6953		    }
6954		  else if (!any_alg_usable_p)
6955		    break;
6956		}
6957	      else if (alg_usable_p (candidate, memset, have_as))
6958		{
6959		  *noalign = algs->size[i].noalign;
6960		  return candidate;
6961		}
6962	    }
6963	}
6964    }
  /* When asked to inline the call anyway, try to pick a meaningful choice.
     We look for the maximal size of block that is faster to copy by hand
     and take blocks of at most that size, guessing that the average size
     will be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
6972  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
6973      && (algs->unknown_size == libcall
6974	  || !alg_usable_p (algs->unknown_size, memset, have_as)))
6975    {
6976      enum stringop_alg alg;
6977      HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
6978
6979      /* If there aren't any usable algorithms or if recursing already,
6980	 then recursing on smaller sizes or same size isn't going to
6981	 find anything.  Just return the simple byte-at-a-time copy loop.  */
6982      if (!any_alg_usable_p || recur)
6983	{
6984	  /* Pick something reasonable.  */
6985	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
6986	    *dynamic_check = 128;
6987	  return loop_1_byte;
6988	}
6989      alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
6990			zero_memset, have_as, dynamic_check, noalign, true);
6991      gcc_assert (*dynamic_check == -1);
6992      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
6993	*dynamic_check = max;
6994      else
6995	gcc_assert (alg != libcall);
6996      return alg;
6997    }
6998  return (alg_usable_p (algs->unknown_size, memset, have_as)
6999	  ? algs->unknown_size : libcall);
7000}
7001
7002/* Decide on alignment.  We know that the operand is already aligned to ALIGN
7003   (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
7004static int
7005decide_alignment (int align,
7006		  enum stringop_alg alg,
7007		  int expected_size,
7008		  machine_mode move_mode)
7009{
7010  int desired_align = 0;
7011
7012  gcc_assert (alg != no_stringop);
7013
7014  if (alg == libcall)
7015    return 0;
7016  if (move_mode == VOIDmode)
7017    return 0;
7018
7019  desired_align = GET_MODE_SIZE (move_mode);
  /* PentiumPro has special logic triggering for 8-byte aligned blocks,
     copying a whole cacheline at once.  */
7022  if (TARGET_PENTIUMPRO
7023      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
7024    desired_align = 8;
7025
7026  if (optimize_size)
7027    desired_align = 1;
7028  if (desired_align < align)
7029    desired_align = align;
7030  if (expected_size != -1 && expected_size < 4)
7031    desired_align = align;
7032
7033  return desired_align;
7034}
7035
7036
/* Helper function for memset expansion.  For a QImode value 0xXY produce
   0xXYXYXYXY of the width specified by MODE.  This is essentially
   a multiplication by 0x01010101, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   slow multiply.  */
7042static rtx
7043promote_duplicated_reg (machine_mode mode, rtx val)
7044{
7045  machine_mode valmode = GET_MODE (val);
7046  rtx tmp;
7047  int nops = mode == DImode ? 3 : 2;
7048
7049  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
7050  if (val == const0_rtx)
7051    return copy_to_mode_reg (mode, CONST0_RTX (mode));
7052  if (CONST_INT_P (val))
7053    {
7054      HOST_WIDE_INT v = INTVAL (val) & 255;
7055
7056      v |= v << 8;
7057      v |= v << 16;
7058      if (mode == DImode)
7059        v |= (v << 16) << 16;
7060      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
7061    }
7062
7063  if (valmode == VOIDmode)
7064    valmode = QImode;
7065  if (valmode != QImode)
7066    val = gen_lowpart (QImode, val);
7067  if (mode == QImode)
7068    return val;
7069  if (!TARGET_PARTIAL_REG_STALL)
7070    nops--;
7071  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
7072      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
7073      <= (ix86_cost->shift_const + ix86_cost->add) * nops
7074          + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
7075    {
7076      rtx reg = convert_modes (mode, QImode, val, true);
7077      tmp = promote_duplicated_reg (mode, const1_rtx);
7078      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
7079				  OPTAB_DIRECT);
7080    }
7081  else
7082    {
7083      rtx reg = convert_modes (mode, QImode, val, true);
7084
7085      if (!TARGET_PARTIAL_REG_STALL)
7086	if (mode == SImode)
7087	  emit_insn (gen_insvsi_1 (reg, reg));
7088	else
7089	  emit_insn (gen_insvdi_1 (reg, reg));
7090      else
7091	{
7092	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
7093				     NULL, 1, OPTAB_DIRECT);
7094	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
7095				     OPTAB_DIRECT);
7096	}
7097      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
7098			         NULL, 1, OPTAB_DIRECT);
7099      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7100      if (mode == SImode)
7101	return reg;
7102      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
7103				 NULL, 1, OPTAB_DIRECT);
7104      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7105      return reg;
7106    }
7107}
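
/* Worked example: promoting the QImode value 0xab to SImode produces
   0xabababab.  The constant path computes it directly:

     v = 0xab;
     v |= v << 8;       0x0000abab
     v |= v << 16;      0xabababab

   For a non-constant value the same result is obtained either by
   multiplying by 0x01010101 (0x0101010101010101 for DImode) or by the
   equivalent shift-and-or sequence, whichever the cost model prefers.  */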
7108
/* Duplicate value VAL using promote_duplicated_reg into the maximal size that
   will be needed by the main loop copying SIZE_NEEDED chunks and by the
   prologue getting alignment from ALIGN to DESIRED_ALIGN.  */
7112static rtx
7113promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
7114				int align)
7115{
7116  rtx promoted_val;
7117
7118  if (TARGET_64BIT
7119      && (size_needed > 4 || (desired_align > align && desired_align > 4)))
7120    promoted_val = promote_duplicated_reg (DImode, val);
7121  else if (size_needed > 2 || (desired_align > align && desired_align > 2))
7122    promoted_val = promote_duplicated_reg (SImode, val);
7123  else if (size_needed > 1 || (desired_align > align && desired_align > 1))
7124    promoted_val = promote_duplicated_reg (HImode, val);
7125  else
7126    promoted_val = val;
7127
7128  return promoted_val;
7129}
7130
7131/* Copy the address to a Pmode register.  This is used for x32 to
7132   truncate DImode TLS address to a SImode register. */
7133
7134static rtx
7135ix86_copy_addr_to_reg (rtx addr)
7136{
7137  rtx reg;
7138  if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
7139    {
7140      reg = copy_addr_to_reg (addr);
7141      REG_POINTER (reg) = 1;
7142      return reg;
7143    }
7144  else
7145    {
7146      gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
7147      reg = copy_to_mode_reg (DImode, addr);
7148      REG_POINTER (reg) = 1;
7149      return gen_rtx_SUBREG (SImode, reg, 0);
7150    }
7151}
7152
7153/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
7154   operations when profitable.  The code depends upon architecture, block size
7155   and alignment, but always has one of the following overall structures:
7156
7157   Aligned move sequence:
7158
7159     1) Prologue guard: Conditional that jumps up to epilogues for small
7160	blocks that can be handled by the epilogue alone.  This is faster
7161	but also needed for correctness, since the prologue assumes the
7162	block is larger than the desired alignment.
7163
7164	An optional dynamic check for size and a libcall for large
7165	blocks are emitted here too, with -minline-stringops-dynamically.
7166
7167     2) Prologue: copy first few bytes in order to get destination
7168	aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
7169	than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7170	copied.  We emit either a jump tree on power of two sized
7171	blocks, or a byte loop.
7172
7173     3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7174	with the specified algorithm.
7175
7176     4) Epilogue: code copying tail of the block that is too small to be
7177	handled by main body (or up to size guarded by prologue guard).
7178
7179   Misaligned move sequence:
7180
7181     1) Misaligned move prologue/epilogue containing:
7182        a) Prologue handling small memory blocks and jumping to done_label
7183	   (skipped if blocks are known to be large enough)
7184	b) Single move copying the first DESIRED_ALIGN - ALIGN bytes if
7185	   alignment is needed, done by a single possibly misaligned move
7186	   (skipped if alignment is not needed)
7187        c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7188
7189     2) Zero size guard dispatching to done_label, if needed
7190
7191     3) Dispatch to library call, if needed
7192
7193     4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7194	with the specified algorithm.  */
7195bool
7196ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
7197			   rtx align_exp, rtx expected_align_exp,
7198			   rtx expected_size_exp, rtx min_size_exp,
7199			   rtx max_size_exp, rtx probable_max_size_exp,
7200			   bool issetmem)
7201{
7202  rtx destreg;
7203  rtx srcreg = NULL;
7204  rtx_code_label *label = NULL;
7205  rtx tmp;
7206  rtx_code_label *jump_around_label = NULL;
7207  HOST_WIDE_INT align = 1;
7208  unsigned HOST_WIDE_INT count = 0;
7209  HOST_WIDE_INT expected_size = -1;
7210  int size_needed = 0, epilogue_size_needed;
7211  int desired_align = 0, align_bytes = 0;
7212  enum stringop_alg alg;
7213  rtx promoted_val = NULL;
7214  rtx vec_promoted_val = NULL;
7215  bool force_loopy_epilogue = false;
7216  int dynamic_check;
7217  bool need_zero_guard = false;
7218  bool noalign;
7219  machine_mode move_mode = VOIDmode;
7220  machine_mode wider_mode;
7221  int unroll_factor = 1;
7222  /* TODO: Once value ranges are available, fill in proper data.  */
7223  unsigned HOST_WIDE_INT min_size = 0;
7224  unsigned HOST_WIDE_INT max_size = -1;
7225  unsigned HOST_WIDE_INT probable_max_size = -1;
7226  bool misaligned_prologue_used = false;
7227  bool have_as;
7228
7229  if (CONST_INT_P (align_exp))
7230    align = INTVAL (align_exp);
7231  /* i386 can do misaligned access at a reasonably increased cost.  */
7232  if (CONST_INT_P (expected_align_exp)
7233      && INTVAL (expected_align_exp) > align)
7234    align = INTVAL (expected_align_exp);
7235  /* ALIGN is the minimum of destination and source alignment, but we care here
7236     just about destination alignment.  */
7237  else if (!issetmem
7238	   && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
7239    align = MEM_ALIGN (dst) / BITS_PER_UNIT;
7240
7241  if (CONST_INT_P (count_exp))
7242    {
7243      min_size = max_size = probable_max_size = count = expected_size
7244	= INTVAL (count_exp);
7245      /* When COUNT is 0, there is nothing to do.  */
7246      if (!count)
7247	return true;
7248    }
7249  else
7250    {
7251      if (min_size_exp)
7252	min_size = INTVAL (min_size_exp);
7253      if (max_size_exp)
7254	max_size = INTVAL (max_size_exp);
7255      if (probable_max_size_exp)
7256	probable_max_size = INTVAL (probable_max_size_exp);
7257      if (CONST_INT_P (expected_size_exp))
7258	expected_size = INTVAL (expected_size_exp);
7259    }
7260
7261  /* Make sure we don't need to care about overflow later on.  */
7262  if (count > (HOST_WIDE_INT_1U << 30))
7263    return false;
7264
7265  have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
7266  if (!issetmem)
7267    have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
7268
7269  /* Step 0: Decide on preferred algorithm, desired alignment and
7270     size of chunks to be copied by main loop.  */
7271  alg = decide_alg (count, expected_size, min_size, probable_max_size,
7272		    issetmem,
7273		    issetmem && val_exp == const0_rtx, have_as,
7274		    &dynamic_check, &noalign, false);
7275
7276  if (dump_file)
7277    fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
7278	     stringop_alg_names[alg]);
7279
7280  if (alg == libcall)
7281    return false;
7282  gcc_assert (alg != no_stringop);
7283
7284  /* For now the vector version of memset is generated only for memory zeroing,
7285     as creating the promoted vector value is very cheap in this case.  */
7286  if (issetmem && alg == vector_loop && val_exp != const0_rtx)
7287    alg = unrolled_loop;
7288
7289  if (!count)
7290    count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
7291  destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
7292  if (!issetmem)
7293    srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
7294
7295  unroll_factor = 1;
7296  move_mode = word_mode;
7297  switch (alg)
7298    {
7299    case libcall:
7300    case no_stringop:
7301    case last_alg:
7302      gcc_unreachable ();
7303    case loop_1_byte:
7304      need_zero_guard = true;
7305      move_mode = QImode;
7306      break;
7307    case loop:
7308      need_zero_guard = true;
7309      break;
7310    case unrolled_loop:
7311      need_zero_guard = true;
7312      unroll_factor = (TARGET_64BIT ? 4 : 2);
7313      break;
7314    case vector_loop:
7315      need_zero_guard = true;
7316      unroll_factor = 4;
7317      /* Find the widest supported mode.  */
7318      move_mode = word_mode;
7319      while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
7320	     && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
7321	move_mode = wider_mode;
7322
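      /* When the target splits 256-bit accesses into two 128-bit halves,
	 wider vector moves bring no benefit, so cap the move size at
	 128 bits (TImode).  */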
7323      if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
7324	move_mode = TImode;
7325
7326      /* Find the corresponding vector mode with the same size as MOVE_MODE.
7327	 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
7328      if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7329	{
7330	  int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7331	  if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7332	      || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
7333	    move_mode = word_mode;
7334	}
7335      gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
7336      break;
7337    case rep_prefix_8_byte:
7338      move_mode = DImode;
7339      break;
7340    case rep_prefix_4_byte:
7341      move_mode = SImode;
7342      break;
7343    case rep_prefix_1_byte:
7344      move_mode = QImode;
7345      break;
7346    }
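  /* SIZE_NEEDED is the number of bytes processed by one iteration of the
     main loop: the width of the chosen move mode times the unroll factor.  */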
7347  size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
7348  epilogue_size_needed = size_needed;
7349
7350  /* If we are going to emit any library calls conditionally, make sure any
7351     pending stack adjustments happen before the first conditional branch,
7352     otherwise they will be emitted only before the library call and won't
7353     happen on the other branches.  */
7354  if (dynamic_check != -1)
7355    do_pending_stack_adjust ();
7356
7357  desired_align = decide_alignment (align, alg, expected_size, move_mode);
7358  if (!TARGET_ALIGN_STRINGOPS || noalign)
7359    align = desired_align;
7360
7361  /* Step 1: Prologue guard.  */
7362
7363  /* Alignment code needs count to be in register.  */
7364  if (CONST_INT_P (count_exp) && desired_align > align)
7365    {
7366      if (INTVAL (count_exp) > desired_align
7367	  && INTVAL (count_exp) > size_needed)
7368	{
7369	  align_bytes
7370	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
7371	  if (align_bytes <= 0)
7372	    align_bytes = 0;
7373	  else
7374	    align_bytes = desired_align - align_bytes;
7375	}
7376      if (align_bytes == 0)
7377	count_exp = force_reg (counter_mode (count_exp), count_exp);
7378    }
7379  gcc_assert (desired_align >= 1 && align >= 1);
7380
7381  /* Misaligned move sequences handle both prologue and epilogue at once.
7382     Default code generation results in smaller code for large alignments
7383     and also avoids redundant work when sizes are known precisely.  */
7384  misaligned_prologue_used
7385    = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7386       && MAX (desired_align, epilogue_size_needed) <= 32
7387       && desired_align <= epilogue_size_needed
7388       && ((desired_align > align && !align_bytes)
7389	   || (!count && epilogue_size_needed > 1)));
7390
7391  /* Do the cheap promotion to allow better CSE across the
7392     main loop and epilogue (i.e. one load of the big constant in
7393     front of all the code).
7394     For now the misaligned move sequences do not have a fast path
7395     without broadcasting.  */
7396  if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
7397    {
7398      if (alg == vector_loop)
7399	{
7400	  gcc_assert (val_exp == const0_rtx);
7401	  vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
7402	  promoted_val = promote_duplicated_reg_to_size (val_exp,
7403							 GET_MODE_SIZE (word_mode),
7404							 desired_align, align);
7405	}
7406      else
7407	{
7408	  promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7409							 desired_align, align);
7410	}
7411    }
7412  /* Misaligned move sequences handle both prologues and epilogues at once.
7413     Default code generation results in smaller code for large alignments and
7414     also avoids redundant work when sizes are known precisely.  */
7415  if (misaligned_prologue_used)
7416    {
7417      /* The misaligned move prologue handles small blocks by itself.  */
7418      expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7419	   (dst, src, &destreg, &srcreg,
7420	    move_mode, promoted_val, vec_promoted_val,
7421	    &count_exp,
7422	    &jump_around_label,
7423            desired_align < align
7424	    ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
7425	    desired_align, align, &min_size, dynamic_check, issetmem);
7426      if (!issetmem)
7427        src = change_address (src, BLKmode, srcreg);
7428      dst = change_address (dst, BLKmode, destreg);
7429      set_mem_align (dst, desired_align * BITS_PER_UNIT);
7430      epilogue_size_needed = 0;
7431      if (need_zero_guard
7432	  && min_size < (unsigned HOST_WIDE_INT) size_needed)
7433	{
7434	  /* It is possible that we copied enough so the main loop will not
7435	     execute.  */
7436	  gcc_assert (size_needed > 1);
7437	  if (jump_around_label == NULL_RTX)
7438	    jump_around_label = gen_label_rtx ();
7439	  emit_cmp_and_jump_insns (count_exp,
7440				   GEN_INT (size_needed),
7441				   LTU, 0, counter_mode (count_exp), 1, jump_around_label);
7442	  if (expected_size == -1
7443	      || expected_size < (desired_align - align) / 2 + size_needed)
7444	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
7445	  else
7446	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
7447	}
7448    }
7449  /* Ensure that alignment prologue won't copy past end of block.  */
7450  else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
7451    {
7452      epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
7453      /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
7454	 Make sure it is power of 2.  */
7455      epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
7456
7457      /* To improve performance of small blocks, we jump around the VAL
7458	 promoting code.  This means that if the promoted VAL is not constant,
7459	 we might not use it in the epilogue and have to use the byte
7460	 loop variant.  */
7461      if (issetmem && epilogue_size_needed > 2 && !promoted_val)
7462	force_loopy_epilogue = true;
7463      if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7464	  || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7465	{
7466	  /* If main algorithm works on QImode, no epilogue is needed.
7467	     For small sizes just don't align anything.  */
7468	  if (size_needed == 1)
7469	    desired_align = align;
7470	  else
7471	    goto epilogue;
7472	}
7473      else if (!count
7474	       && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7475	{
7476	  label = gen_label_rtx ();
7477	  emit_cmp_and_jump_insns (count_exp,
7478				   GEN_INT (epilogue_size_needed),
7479				   LTU, 0, counter_mode (count_exp), 1, label);
7480	  if (expected_size == -1 || expected_size < epilogue_size_needed)
7481	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
7482	  else
7483	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
7484	}
7485    }
7486
7487  /* Emit code to decide at runtime whether a library call or inline code
7488     should be used.  */
7489  if (dynamic_check != -1)
7490    {
7491      if (!issetmem && CONST_INT_P (count_exp))
7492	{
7493	  if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
7494	    {
7495	      emit_block_copy_via_libcall (dst, src, count_exp);
7496	      count_exp = const0_rtx;
7497	      goto epilogue;
7498	    }
7499	}
7500      else
7501	{
7502	  rtx_code_label *hot_label = gen_label_rtx ();
7503	  if (jump_around_label == NULL_RTX)
7504	    jump_around_label = gen_label_rtx ();
7505	  emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
7506				   LEU, 0, counter_mode (count_exp),
7507				   1, hot_label);
7508	  predict_jump (REG_BR_PROB_BASE * 90 / 100);
7509	  if (issetmem)
7510	    set_storage_via_libcall (dst, count_exp, val_exp);
7511	  else
7512	    emit_block_copy_via_libcall (dst, src, count_exp);
7513	  emit_jump (jump_around_label);
7514	  emit_label (hot_label);
7515	}
7516    }
7517
7518  /* Step 2: Alignment prologue.  */
7519  /* Do the expensive promotion once we branched off the small blocks.  */
7520  if (issetmem && !promoted_val)
7521    promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7522						   desired_align, align);
7523
7524  if (desired_align > align && !misaligned_prologue_used)
7525    {
7526      if (align_bytes == 0)
7527	{
7528	  /* Except for the first move in the prologue, we no longer know
7529	     the constant offset in aliasing info.  It doesn't seem worth
7530	     the pain to maintain it for the first move, so throw away
7531	     the info early.  */
7532	  dst = change_address (dst, BLKmode, destreg);
7533	  if (!issetmem)
7534	    src = change_address (src, BLKmode, srcreg);
7535	  dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
7536					    promoted_val, vec_promoted_val,
7537					    count_exp, align, desired_align,
7538					    issetmem);
7539	  /* At most desired_align - align bytes are copied.  */
7540	  if (min_size < (unsigned)(desired_align - align))
7541	    min_size = 0;
7542	  else
7543	    min_size -= desired_align - align;
7544	}
7545      else
7546	{
7547	  /* If we know how many bytes need to be stored before dst is
7548	     sufficiently aligned, maintain aliasing info accurately.  */
7549	  dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
7550							   srcreg,
7551							   promoted_val,
7552							   vec_promoted_val,
7553							   desired_align,
7554							   align_bytes,
7555							   issetmem);
7556
7557	  count_exp = plus_constant (counter_mode (count_exp),
7558				     count_exp, -align_bytes);
7559	  count -= align_bytes;
7560	  min_size -= align_bytes;
7561	  max_size -= align_bytes;
7562	}
7563      if (need_zero_guard
7564	  && min_size < (unsigned HOST_WIDE_INT) size_needed
7565	  && (count < (unsigned HOST_WIDE_INT) size_needed
7566	      || (align_bytes == 0
7567		  && count < ((unsigned HOST_WIDE_INT) size_needed
7568			      + desired_align - align))))
7569	{
7570	  /* It is possible that we copied enough so the main loop will not
7571	     execute.  */
7572	  gcc_assert (size_needed > 1);
7573	  if (label == NULL_RTX)
7574	    label = gen_label_rtx ();
7575	  emit_cmp_and_jump_insns (count_exp,
7576				   GEN_INT (size_needed),
7577				   LTU, 0, counter_mode (count_exp), 1, label);
7578	  if (expected_size == -1
7579	      || expected_size < (desired_align - align) / 2 + size_needed)
7580	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
7581	  else
7582	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
7583	}
7584    }
7585  if (label && size_needed == 1)
7586    {
7587      emit_label (label);
7588      LABEL_NUSES (label) = 1;
7589      label = NULL;
7590      epilogue_size_needed = 1;
7591      if (issetmem)
7592	promoted_val = val_exp;
7593    }
7594  else if (label == NULL_RTX && !misaligned_prologue_used)
7595    epilogue_size_needed = size_needed;
7596
7597  /* Step 3: Main loop.  */
7598
7599  switch (alg)
7600    {
7601    case libcall:
7602    case no_stringop:
7603    case last_alg:
7604      gcc_unreachable ();
7605    case loop_1_byte:
7606    case loop:
7607    case unrolled_loop:
7608      expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
7609				     count_exp, move_mode, unroll_factor,
7610				     expected_size, issetmem);
7611      break;
7612    case vector_loop:
7613      expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
7614				     vec_promoted_val, count_exp, move_mode,
7615				     unroll_factor, expected_size, issetmem);
7616      break;
7617    case rep_prefix_8_byte:
7618    case rep_prefix_4_byte:
7619    case rep_prefix_1_byte:
7620      expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
7621				       val_exp, count_exp, move_mode, issetmem);
7622      break;
7623    }
7624  /* Properly adjust the offsets of src and dest memory for aliasing.  */
7625  if (CONST_INT_P (count_exp))
7626    {
7627      if (!issetmem)
7628	src = adjust_automodify_address_nv (src, BLKmode, srcreg,
7629					    (count / size_needed) * size_needed);
7630      dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
7631					  (count / size_needed) * size_needed);
7632    }
7633  else
7634    {
7635      if (!issetmem)
7636	src = change_address (src, BLKmode, srcreg);
7637      dst = change_address (dst, BLKmode, destreg);
7638    }
7639
7640  /* Step 4: Epilogue to copy the remaining bytes.  */
7641 epilogue:
7642  if (label)
7643    {
7644      /* When the main loop is done, COUNT_EXP might hold the original count,
7645	 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
7646	 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
7647	 bytes.  Compensate if needed.  */
7648
7649      if (size_needed < epilogue_size_needed)
7650	{
7651	  tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
7652				     GEN_INT (size_needed - 1), count_exp, 1,
7653				     OPTAB_DIRECT);
7654	  if (tmp != count_exp)
7655	    emit_move_insn (count_exp, tmp);
7656	}
7657      emit_label (label);
7658      LABEL_NUSES (label) = 1;
7659    }
7660
7661  if (count_exp != const0_rtx && epilogue_size_needed > 1)
7662    {
7663      if (force_loopy_epilogue)
7664	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
7665					 epilogue_size_needed);
7666      else
7667	{
7668	  if (issetmem)
7669	    expand_setmem_epilogue (dst, destreg, promoted_val,
7670				    vec_promoted_val, count_exp,
7671				    epilogue_size_needed);
7672	  else
7673	    expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
7674				    epilogue_size_needed);
7675	}
7676    }
7677  if (jump_around_label)
7678    emit_label (jump_around_label);
7679  return true;
7680}
7681
7682
7683/* Expand the appropriate insns for doing strlen if not just doing
7684   repnz; scasb
7685
7686   out = result, initialized with the start address
7687   align_rtx = alignment of the address.
7688   scratch = scratch register, initialized with the start address when
7689	not aligned, otherwise undefined
7690
7691   This is just the body.  It needs the initializations mentioned above and
7692   some address computing at the end.  These things are done in i386.md.  */
7693
7694static void
7695ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
7696{
7697  int align;
7698  rtx tmp;
7699  rtx_code_label *align_2_label = NULL;
7700  rtx_code_label *align_3_label = NULL;
7701  rtx_code_label *align_4_label = gen_label_rtx ();
7702  rtx_code_label *end_0_label = gen_label_rtx ();
7703  rtx mem;
7704  rtx tmpreg = gen_reg_rtx (SImode);
7705  rtx scratch = gen_reg_rtx (SImode);
7706  rtx cmp;
7707
7708  align = 0;
7709  if (CONST_INT_P (align_rtx))
7710    align = INTVAL (align_rtx);
7711
7712  /* Loop to check 1..3 bytes for null to get an aligned pointer.  */
7713
7714  /* Is there a known alignment and is it less than 4?  */
7715  if (align < 4)
7716    {
7717      rtx scratch1 = gen_reg_rtx (Pmode);
7718      emit_move_insn (scratch1, out);
7719      /* Is there a known alignment and is it not 2? */
7720      if (align != 2)
7721	{
7722	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
7723	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
7724
7725	  /* Leave just the two lower bits.  */
7726	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
7727				    NULL_RTX, 0, OPTAB_WIDEN);
7728
7729	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7730				   Pmode, 1, align_4_label);
7731	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
7732				   Pmode, 1, align_2_label);
7733	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
7734				   Pmode, 1, align_3_label);
7735	}
7736      else
7737        {
7738	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
7739	     check if it is aligned to a 4-byte boundary.  */
7740
7741	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
7742				    NULL_RTX, 0, OPTAB_WIDEN);
7743
7744	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7745				   Pmode, 1, align_4_label);
7746        }
7747
7748      mem = change_address (src, QImode, out);
7749
7750      /* Now compare the bytes.  */
7751
7752      /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
7753      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
7754			       QImode, 1, end_0_label);
7755
7756      /* Increment the address.  */
7757      emit_insn (gen_add2_insn (out, const1_rtx));
7758
7759      /* Not needed with an alignment of 2.  */
7760      if (align != 2)
7761	{
7762	  emit_label (align_2_label);
7763
7764	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7765				   end_0_label);
7766
7767	  emit_insn (gen_add2_insn (out, const1_rtx));
7768
7769	  emit_label (align_3_label);
7770	}
7771
7772      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7773			       end_0_label);
7774
7775      emit_insn (gen_add2_insn (out, const1_rtx));
7776    }
7777
7778  /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
7779     align this loop; it only makes the program larger and does not help
7780     to speed it up.  */
7781  emit_label (align_4_label);
7782
7783  mem = change_address (src, SImode, out);
7784  emit_move_insn (scratch, mem);
7785  emit_insn (gen_add2_insn (out, GEN_INT (4)));
7786
7787  /* This formula yields a nonzero result iff one of the bytes is zero.
7788     This saves three branches inside the loop and many cycles.  */
7789
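  /* In detail: when no byte of X is zero, subtracting 0x01 from each byte
     never borrows between bytes and sets bit 7 of a byte only if that byte
     was >= 0x81; ANDing with ~X (which has bit 7 set only for bytes
     <= 0x7f) then clears everything.  When X does contain a zero byte, its
     lowest zero byte receives no borrow from below, becomes 0xff after the
     subtraction and has bit 7 set in ~X, so the corresponding 0x80 bit
     survives the final mask.  */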
7790  emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
7791  emit_insn (gen_one_cmplsi2 (scratch, scratch));
7792  emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
7793  emit_insn (gen_andsi3 (tmpreg, tmpreg,
7794			 gen_int_mode (0x80808080, SImode)));
7795  emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
7796			   align_4_label);
7797
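  /* The loop exited, so the last word loaded contains a zero byte: TMPREG
     has a 0x80 bit set at the position of the first zero byte, and OUT
     points just past that word.  Locate the zero byte, using conditional
     moves when available to avoid branches.  */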
7798  if (TARGET_CMOVE)
7799    {
7800       rtx reg = gen_reg_rtx (SImode);
7801       rtx reg2 = gen_reg_rtx (Pmode);
7802       emit_move_insn (reg, tmpreg);
7803       emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
7804
7805       /* If zero is not in the first two bytes, move two bytes forward.  */
7806       emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7807       tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7808       tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7809       emit_insn (gen_rtx_SET (tmpreg,
7810			       gen_rtx_IF_THEN_ELSE (SImode, tmp,
7811						     reg,
7812						     tmpreg)));
7813       /* Emit lea manually to avoid clobbering of flags.  */
7814       emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
7815
7816       tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7817       tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7818       emit_insn (gen_rtx_SET (out,
7819			       gen_rtx_IF_THEN_ELSE (Pmode, tmp,
7820						     reg2,
7821						     out)));
7822    }
7823  else
7824    {
7825       rtx_code_label *end_2_label = gen_label_rtx ();
7826       /* Is zero in the first two bytes? */
7827
7828       emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7829       tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7830       tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
7831       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
7832                            gen_rtx_LABEL_REF (VOIDmode, end_2_label),
7833                            pc_rtx);
7834       tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
7835       JUMP_LABEL (tmp) = end_2_label;
7836
7837       /* Not in the first two.  Move two bytes forward.  */
7838       emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
7839       emit_insn (gen_add2_insn (out, const2_rtx));
7840
7841       emit_label (end_2_label);
7842
7843    }
7844
7845  /* Avoid branch in fixing the byte.  */
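  /* The low byte of TMPREG is 0x80 if the zero byte sits at the even
     position and 0x00 if it sits at the odd one.  Adding the byte to
     itself moves that bit into the carry flag, and OUT - 3 - carry is
     then exactly the address of the zero byte.  */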
7846  tmpreg = gen_lowpart (QImode, tmpreg);
7847  emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
7848  tmp = gen_rtx_REG (CCmode, FLAGS_REG);
7849  cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
7850  emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
7851
7852  emit_label (end_0_label);
7853}
7854
7855/* Expand strlen.  */
7856
7857bool
7858ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
7859{
7860  if (TARGET_UNROLL_STRLEN
7861      && TARGET_INLINE_ALL_STRINGOPS
7862      && eoschar == const0_rtx
7863      && optimize > 1)
7864    {
7865      /* The generic case of the strlen expander is long.  Avoid
7866	 expanding it unless TARGET_INLINE_ALL_STRINGOPS.  */
7867      rtx addr = force_reg (Pmode, XEXP (src, 0));
7868      /* Well, it seems that some optimizer does not combine a call like
7869	 foo(strlen(bar), strlen(bar));
7870	 when the move and the subtraction are done here.  It does calculate
7871	 the length just once when these instructions are done inside
7872	 output_strlen_unroll().  But since &bar[strlen(bar)] is often used
7873	 and we use one fewer register for the lifetime of
7874	 output_strlen_unroll(), this is better.  */
7875
7876      emit_move_insn (out, addr);
7877
7878      ix86_expand_strlensi_unroll_1 (out, src, align);
7879
7880      /* strlensi_unroll_1 returns the address of the zero at the end of
7881	 the string, like memchr(), so compute the length by subtracting
7882	 the start address.  */
7883      emit_insn (gen_sub2_insn (out, addr));
7884      return true;
7885    }
7886  else
7887    return false;
7888}
7889
7890/* For a given symbol (function), construct code to compute the address of its
7891   PLT entry in the large x86-64 PIC model.  */
7892
7893static rtx
7894construct_plt_address (rtx symbol)
7895{
7896  rtx tmp, unspec;
7897
7898  gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
7899  gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
7900  gcc_assert (Pmode == DImode);
7901
7902  tmp = gen_reg_rtx (Pmode);
7903  unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
7904
7905  emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
7906  emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
7907  return tmp;
7908}
7909
7910/* Registers clobbered by SysV calls but preserved by 64-bit MS ABI calls.  */
7911
7912static int const x86_64_ms_sysv_extra_clobbered_registers
7913		 [NUM_X86_64_MS_CLOBBERED_REGS] =
7914{
7915  SI_REG, DI_REG,
7916  XMM6_REG, XMM7_REG,
7917  XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
7918  XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
7919};
7920
7921rtx_insn *
7922ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
7923		  rtx callarg2,
7924		  rtx pop, bool sibcall)
7925{
7926  rtx vec[3];
7927  rtx use = NULL, call;
7928  unsigned int vec_len = 0;
7929  tree fndecl;
7930
7931  if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7932    {
7933      fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
7934      if (fndecl
7935	  && (lookup_attribute ("interrupt",
7936				TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
7937	error ("interrupt service routine cannot be called directly");
7938    }
7939  else
7940    fndecl = NULL_TREE;
7941
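  /* A nonzero POP is the number of argument bytes the callee pops on
     return (callee-pop conventions such as stdcall); no 64-bit ABI does
     this, hence the assert below.  */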
7942  if (pop == const0_rtx)
7943    pop = NULL;
7944  gcc_assert (!TARGET_64BIT || !pop);
7945
7946  rtx addr = XEXP (fnaddr, 0);
7947  if (TARGET_MACHO && !TARGET_64BIT)
7948    {
7949#if TARGET_MACHO
7950      if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7951	fnaddr = machopic_indirect_call_target (fnaddr);
7952#endif
7953    }
7954  else
7955    {
7956      /* Static functions and indirect calls don't need the pic register.  Also,
7957	 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
7958	 it an indirect call.  */
7959      if (flag_pic
7960	  && GET_CODE (addr) == SYMBOL_REF
7961	  && !SYMBOL_REF_LOCAL_P (addr))
7962	{
7963	  if (flag_plt
7964	      && (SYMBOL_REF_DECL (addr) == NULL_TREE
7965		  || !lookup_attribute ("noplt",
7966					DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
7967	    {
7968	      if (!TARGET_64BIT
7969		  || (ix86_cmodel == CM_LARGE_PIC
7970		      && DEFAULT_ABI != MS_ABI))
7971		{
7972		  use_reg (&use, gen_rtx_REG (Pmode,
7973					      REAL_PIC_OFFSET_TABLE_REGNUM));
7974		  if (ix86_use_pseudo_pic_reg ())
7975		    emit_move_insn (gen_rtx_REG (Pmode,
7976						 REAL_PIC_OFFSET_TABLE_REGNUM),
7977				    pic_offset_table_rtx);
7978		}
7979	    }
7980	  else if (!TARGET_PECOFF && !TARGET_MACHO)
7981	    {
7982	      if (TARGET_64BIT
7983		  && ix86_cmodel == CM_LARGE_PIC
7984		  && DEFAULT_ABI != MS_ABI)
7985		{
7986		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
7987					   UNSPEC_GOT);
7988		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
7989		  fnaddr = force_reg (Pmode, fnaddr);
7990		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
7991		}
7992	      else if (TARGET_64BIT)
7993		{
7994		  fnaddr = gen_rtx_UNSPEC (Pmode,
7995					   gen_rtvec (1, addr),
7996					   UNSPEC_GOTPCREL);
7997		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
7998		}
7999	      else
8000		{
8001		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8002					   UNSPEC_GOT);
8003		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8004		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
8005					 fnaddr);
8006		}
8007	      fnaddr = gen_const_mem (Pmode, fnaddr);
8008	      /* Pmode may not be the same as word_mode for x32, which
8009		 doesn't support indirect branch via 32-bit memory slot.
8010		 Since x32 GOT slot is 64 bit with zero upper 32 bits,
8011		 indirect branch via x32 GOT slot is OK.  */
8012	      if (GET_MODE (fnaddr) != word_mode)
8013		fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
8014	      fnaddr = gen_rtx_MEM (QImode, fnaddr);
8015	    }
8016	}
8017    }
8018
8019  /* Skip setting up RAX register for -mskip-rax-setup when there are no
8020     parameters passed in vector registers.  */
8021  if (TARGET_64BIT
8022      && (INTVAL (callarg2) > 0
8023	  || (INTVAL (callarg2) == 0
8024	      && (TARGET_SSE || !flag_skip_rax_setup))))
8025    {
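      /* The System V x86-64 ABI passes the number of vector registers used
	 for a variable-argument call in %al; CALLARG2 holds that count.  */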
8026      rtx al = gen_rtx_REG (QImode, AX_REG);
8027      emit_move_insn (al, callarg2);
8028      use_reg (&use, al);
8029    }
8030
8031  if (ix86_cmodel == CM_LARGE_PIC
8032      && !TARGET_PECOFF
8033      && MEM_P (fnaddr)
8034      && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
8035      && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
8036    fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
8037  /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8038     branch via x32 GOT slot is OK.  */
8039  else if (!(TARGET_X32
8040	     && MEM_P (fnaddr)
8041	     && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
8042	     && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
8043	   && (sibcall
8044	       ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
8045	       : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
8046    {
8047      fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
8048      fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
8049    }
8050
8051  call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
8052
8053  if (retval)
8054    call = gen_rtx_SET (retval, call);
8055  vec[vec_len++] = call;
8056
8057  if (pop)
8058    {
8059      pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
8060      pop = gen_rtx_SET (stack_pointer_rtx, pop);
8061      vec[vec_len++] = pop;
8062    }
8063
8064  if (cfun->machine->no_caller_saved_registers
8065      && (!fndecl
8066	  || (!TREE_THIS_VOLATILE (fndecl)
8067	      && !lookup_attribute ("no_caller_saved_registers",
8068				    TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
8069    {
8070      static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
8071      bool is_64bit_ms_abi = (TARGET_64BIT
8072			      && ix86_function_abi (fndecl) == MS_ABI);
8073      char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
8074
8075      /* If there are no caller-saved registers, add all registers
8076	 that are clobbered by the call which returns.  */
8077      for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
8078	if (!fixed_regs[i]
8079	    && (ix86_call_used_regs[i] == 1
8080		|| (ix86_call_used_regs[i] & c_mask))
8081	    && !STACK_REGNO_P (i)
8082	    && !MMX_REGNO_P (i))
8083	  clobber_reg (&use,
8084		       gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
8085    }
8086  else if (TARGET_64BIT_MS_ABI
8087	   && (!callarg2 || INTVAL (callarg2) != -2))
8088    {
8089      unsigned i;
8090
8091      for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
8092	{
8093	  int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
8094	  machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
8095
8096	  clobber_reg (&use, gen_rtx_REG (mode, regno));
8097	}
8098
8099      /* Set here, but it may get cleared later.  */
8100      if (TARGET_CALL_MS2SYSV_XLOGUES)
8101	{
8102	  if (!TARGET_SSE)
8103	    ;
8104
8105	  /* Don't break hot-patched functions.  */
8106	  else if (ix86_function_ms_hook_prologue (current_function_decl))
8107	    ;
8108
8109	  /* TODO: Cases not yet examined.  */
8110	  else if (flag_split_stack)
8111	    warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8112
8113	  else
8114	    {
8115	      gcc_assert (!reload_completed);
8116	      cfun->machine->call_ms2sysv = true;
8117	    }
8118	}
8119    }
8120
8121  if (TARGET_MACHO && TARGET_64BIT && !sibcall
8122      && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
8123	  || !fndecl || TREE_PUBLIC (fndecl)))
8124    {
8125      /* We allow public functions defined in a TU to bind locally for PIC
8126	 code (the default) on 64bit Mach-O.
8127	 If such functions are not inlined, we cannot tell at compile-time if
8128	 they will be called via the lazy symbol resolver (this can depend on
8129	 options given at link-time).  Therefore, we must assume that the lazy
8130	 resolver could be used which clobbers R11 and R10.  */
8131      clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
8132      clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
8133    }
8134
8135  if (vec_len > 1)
8136    call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
8137  rtx_insn *call_insn = emit_call_insn (call);
8138  if (use)
8139    CALL_INSN_FUNCTION_USAGE (call_insn) = use;
8140
8141  return call_insn;
8142}
8143
8144/* Split a simple return popping POPC bytes from the stack into an indirect
8145   branch with stack adjustment.  */
8146
8147void
8148ix86_split_simple_return_pop_internal (rtx popc)
8149{
8150  struct machine_function *m = cfun->machine;
8151  rtx ecx = gen_rtx_REG (SImode, CX_REG);
8152  rtx_insn *insn;
8153
8154  /* There is no "pascal" calling convention in any 64bit ABI.  */
8155  gcc_assert (!TARGET_64BIT);
8156
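  /* Pop the return address into %ecx, release POPC bytes of arguments from
     the stack, and return via an indirect jump through %ecx, with CFA
     notes keeping the unwind information correct.  */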
8157  insn = emit_insn (gen_pop (ecx));
8158  m->fs.cfa_offset -= UNITS_PER_WORD;
8159  m->fs.sp_offset -= UNITS_PER_WORD;
8160
8161  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
8162  x = gen_rtx_SET (stack_pointer_rtx, x);
8163  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8164  add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
8165  RTX_FRAME_RELATED_P (insn) = 1;
8166
8167  x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
8168  x = gen_rtx_SET (stack_pointer_rtx, x);
8169  insn = emit_insn (x);
8170  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8171  RTX_FRAME_RELATED_P (insn) = 1;
8172
8173  /* Now return address is in ECX.  */
8174  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
8175}
8176
8177/* Errors in the source file can cause expand_expr to return const0_rtx
8178   where we expect a vector.  To avoid crashing, use one of the vector
8179   clear instructions.  */
8180
8181static rtx
8182safe_vector_operand (rtx x, machine_mode mode)
8183{
8184  if (x == const0_rtx)
8185    x = CONST0_RTX (mode);
8186  return x;
8187}
8188
8189/* Subroutine of ix86_expand_builtin to take care of binop insns.  */
8190
8191static rtx
8192ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
8193{
8194  rtx pat;
8195  tree arg0 = CALL_EXPR_ARG (exp, 0);
8196  tree arg1 = CALL_EXPR_ARG (exp, 1);
8197  rtx op0 = expand_normal (arg0);
8198  rtx op1 = expand_normal (arg1);
8199  machine_mode tmode = insn_data[icode].operand[0].mode;
8200  machine_mode mode0 = insn_data[icode].operand[1].mode;
8201  machine_mode mode1 = insn_data[icode].operand[2].mode;
8202
8203  if (VECTOR_MODE_P (mode0))
8204    op0 = safe_vector_operand (op0, mode0);
8205  if (VECTOR_MODE_P (mode1))
8206    op1 = safe_vector_operand (op1, mode1);
8207
8208  if (optimize || !target
8209      || GET_MODE (target) != tmode
8210      || !insn_data[icode].operand[0].predicate (target, tmode))
8211    target = gen_reg_rtx (tmode);
8212
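  /* If the insn wants a TImode operand but the argument was expanded in
     SImode, load it into the low element of a V4SI register and
     reinterpret that register as TImode.  */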
8213  if (GET_MODE (op1) == SImode && mode1 == TImode)
8214    {
8215      rtx x = gen_reg_rtx (V4SImode);
8216      emit_insn (gen_sse2_loadd (x, op1));
8217      op1 = gen_lowpart (TImode, x);
8218    }
8219
8220  if (!insn_data[icode].operand[1].predicate (op0, mode0))
8221    op0 = copy_to_mode_reg (mode0, op0);
8222  if (!insn_data[icode].operand[2].predicate (op1, mode1))
8223    op1 = copy_to_mode_reg (mode1, op1);
8224
8225  pat = GEN_FCN (icode) (target, op0, op1);
8226  if (! pat)
8227    return 0;
8228
8229  emit_insn (pat);
8230
8231  return target;
8232}
8233
8234/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.  */
8235
8236static rtx
8237ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
8238			       enum ix86_builtin_func_type m_type,
8239			       enum rtx_code sub_code)
8240{
8241  rtx pat;
8242  int i;
8243  int nargs;
8244  bool comparison_p = false;
8245  bool tf_p = false;
8246  bool last_arg_constant = false;
8247  int num_memory = 0;
8248  struct {
8249    rtx op;
8250    machine_mode mode;
8251  } args[4];
8252
8253  machine_mode tmode = insn_data[icode].operand[0].mode;
8254
8255  switch (m_type)
8256    {
8257    case MULTI_ARG_4_DF2_DI_I:
8258    case MULTI_ARG_4_DF2_DI_I1:
8259    case MULTI_ARG_4_SF2_SI_I:
8260    case MULTI_ARG_4_SF2_SI_I1:
8261      nargs = 4;
8262      last_arg_constant = true;
8263      break;
8264
8265    case MULTI_ARG_3_SF:
8266    case MULTI_ARG_3_DF:
8267    case MULTI_ARG_3_SF2:
8268    case MULTI_ARG_3_DF2:
8269    case MULTI_ARG_3_DI:
8270    case MULTI_ARG_3_SI:
8271    case MULTI_ARG_3_SI_DI:
8272    case MULTI_ARG_3_HI:
8273    case MULTI_ARG_3_HI_SI:
8274    case MULTI_ARG_3_QI:
8275    case MULTI_ARG_3_DI2:
8276    case MULTI_ARG_3_SI2:
8277    case MULTI_ARG_3_HI2:
8278    case MULTI_ARG_3_QI2:
8279      nargs = 3;
8280      break;
8281
8282    case MULTI_ARG_2_SF:
8283    case MULTI_ARG_2_DF:
8284    case MULTI_ARG_2_DI:
8285    case MULTI_ARG_2_SI:
8286    case MULTI_ARG_2_HI:
8287    case MULTI_ARG_2_QI:
8288      nargs = 2;
8289      break;
8290
8291    case MULTI_ARG_2_DI_IMM:
8292    case MULTI_ARG_2_SI_IMM:
8293    case MULTI_ARG_2_HI_IMM:
8294    case MULTI_ARG_2_QI_IMM:
8295      nargs = 2;
8296      last_arg_constant = true;
8297      break;
8298
8299    case MULTI_ARG_1_SF:
8300    case MULTI_ARG_1_DF:
8301    case MULTI_ARG_1_SF2:
8302    case MULTI_ARG_1_DF2:
8303    case MULTI_ARG_1_DI:
8304    case MULTI_ARG_1_SI:
8305    case MULTI_ARG_1_HI:
8306    case MULTI_ARG_1_QI:
8307    case MULTI_ARG_1_SI_DI:
8308    case MULTI_ARG_1_HI_DI:
8309    case MULTI_ARG_1_HI_SI:
8310    case MULTI_ARG_1_QI_DI:
8311    case MULTI_ARG_1_QI_SI:
8312    case MULTI_ARG_1_QI_HI:
8313      nargs = 1;
8314      break;
8315
8316    case MULTI_ARG_2_DI_CMP:
8317    case MULTI_ARG_2_SI_CMP:
8318    case MULTI_ARG_2_HI_CMP:
8319    case MULTI_ARG_2_QI_CMP:
8320      nargs = 2;
8321      comparison_p = true;
8322      break;
8323
8324    case MULTI_ARG_2_SF_TF:
8325    case MULTI_ARG_2_DF_TF:
8326    case MULTI_ARG_2_DI_TF:
8327    case MULTI_ARG_2_SI_TF:
8328    case MULTI_ARG_2_HI_TF:
8329    case MULTI_ARG_2_QI_TF:
8330      nargs = 2;
8331      tf_p = true;
8332      break;
8333
8334    default:
8335      gcc_unreachable ();
8336    }
8337
8338  if (optimize || !target
8339      || GET_MODE (target) != tmode
8340      || !insn_data[icode].operand[0].predicate (target, tmode))
8341    target = gen_reg_rtx (tmode);
8342  else if (memory_operand (target, tmode))
8343    num_memory++;
8344
8345  gcc_assert (nargs <= 4);
8346
8347  for (i = 0; i < nargs; i++)
8348    {
8349      tree arg = CALL_EXPR_ARG (exp, i);
8350      rtx op = expand_normal (arg);
8351      int adjust = (comparison_p) ? 1 : 0;
8352      machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
8353
8354      if (last_arg_constant && i == nargs - 1)
8355	{
8356	  if (!insn_data[icode].operand[i + 1].predicate (op, mode))
8357	    {
8358	      enum insn_code new_icode = icode;
8359	      switch (icode)
8360		{
8361		case CODE_FOR_xop_vpermil2v2df3:
8362		case CODE_FOR_xop_vpermil2v4sf3:
8363		case CODE_FOR_xop_vpermil2v4df3:
8364		case CODE_FOR_xop_vpermil2v8sf3:
8365		  error ("the last argument must be a 2-bit immediate");
8366		  return gen_reg_rtx (tmode);
8367		case CODE_FOR_xop_rotlv2di3:
8368		  new_icode = CODE_FOR_rotlv2di3;
8369		  goto xop_rotl;
8370		case CODE_FOR_xop_rotlv4si3:
8371		  new_icode = CODE_FOR_rotlv4si3;
8372		  goto xop_rotl;
8373		case CODE_FOR_xop_rotlv8hi3:
8374		  new_icode = CODE_FOR_rotlv8hi3;
8375		  goto xop_rotl;
8376		case CODE_FOR_xop_rotlv16qi3:
8377		  new_icode = CODE_FOR_rotlv16qi3;
8378		xop_rotl:
8379		  if (CONST_INT_P (op))
8380		    {
8381		      int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
8382		      op = GEN_INT (INTVAL (op) & mask);
8383		      gcc_checking_assert
8384			(insn_data[icode].operand[i + 1].predicate (op, mode));
8385		    }
8386		  else
8387		    {
8388		      gcc_checking_assert
8389			(nargs == 2
8390			 && insn_data[new_icode].operand[0].mode == tmode
8391			 && insn_data[new_icode].operand[1].mode == tmode
8392			 && insn_data[new_icode].operand[2].mode == mode
8393			 && insn_data[new_icode].operand[0].predicate
8394			    == insn_data[icode].operand[0].predicate
8395			 && insn_data[new_icode].operand[1].predicate
8396			    == insn_data[icode].operand[1].predicate);
8397		      icode = new_icode;
8398		      goto non_constant;
8399		    }
8400		  break;
8401		default:
8402		  gcc_unreachable ();
8403		}
8404	    }
8405	}
8406      else
8407	{
8408	non_constant:
8409	  if (VECTOR_MODE_P (mode))
8410	    op = safe_vector_operand (op, mode);
8411
8412	  /* If we aren't optimizing, only allow one memory operand to be
8413	     generated.  */
8414	  if (memory_operand (op, mode))
8415	    num_memory++;
8416
8417	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
8418
8419	  if (optimize
8420	      || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
8421	      || num_memory > 1)
8422	    op = force_reg (mode, op);
8423	}
8424
8425      args[i].op = op;
8426      args[i].mode = mode;
8427    }
8428
8429  switch (nargs)
8430    {
8431    case 1:
8432      pat = GEN_FCN (icode) (target, args[0].op);
8433      break;
8434
8435    case 2:
8436      if (tf_p)
8437	pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
8438			       GEN_INT ((int)sub_code));
8439      else if (! comparison_p)
8440	pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
8441      else
8442	{
8443	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
8444				       args[0].op,
8445				       args[1].op);
8446
8447	  pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
8448	}
8449      break;
8450
8451    case 3:
8452      pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
8453      break;
8454
8455    case 4:
8456      pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
8457      break;
8458
8459    default:
8460      gcc_unreachable ();
8461    }
8462
8463  if (! pat)
8464    return 0;
8465
8466  emit_insn (pat);
8467  return target;
8468}
8469
8470/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8471   insns with vec_merge.  */
8472
8473static rtx
8474ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
8475				    rtx target)
8476{
8477  rtx pat;
8478  tree arg0 = CALL_EXPR_ARG (exp, 0);
8479  rtx op1, op0 = expand_normal (arg0);
8480  machine_mode tmode = insn_data[icode].operand[0].mode;
8481  machine_mode mode0 = insn_data[icode].operand[1].mode;
8482
8483  if (optimize || !target
8484      || GET_MODE (target) != tmode
8485      || !insn_data[icode].operand[0].predicate (target, tmode))
8486    target = gen_reg_rtx (tmode);
8487
8488  if (VECTOR_MODE_P (mode0))
8489    op0 = safe_vector_operand (op0, mode0);
8490
8491  if ((optimize && !register_operand (op0, mode0))
8492      || !insn_data[icode].operand[1].predicate (op0, mode0))
8493    op0 = copy_to_mode_reg (mode0, op0);
8494
8495  op1 = op0;
8496  if (!insn_data[icode].operand[2].predicate (op1, mode0))
8497    op1 = copy_to_mode_reg (mode0, op1);
8498
8499  pat = GEN_FCN (icode) (target, op0, op1);
8500  if (! pat)
8501    return 0;
8502  emit_insn (pat);
8503  return target;
8504}
8505
8506/* Subroutine of ix86_expand_builtin to take care of comparison insns.  */
8507
8508static rtx
8509ix86_expand_sse_compare (const struct builtin_description *d,
8510			 tree exp, rtx target, bool swap)
8511{
8512  rtx pat;
8513  tree arg0 = CALL_EXPR_ARG (exp, 0);
8514  tree arg1 = CALL_EXPR_ARG (exp, 1);
8515  rtx op0 = expand_normal (arg0);
8516  rtx op1 = expand_normal (arg1);
8517  rtx op2;
8518  machine_mode tmode = insn_data[d->icode].operand[0].mode;
8519  machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8520  machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8521  enum rtx_code comparison = d->comparison;
8522
8523  if (VECTOR_MODE_P (mode0))
8524    op0 = safe_vector_operand (op0, mode0);
8525  if (VECTOR_MODE_P (mode1))
8526    op1 = safe_vector_operand (op1, mode1);
8527
8528  /* Swap operands if we have a comparison that isn't available in
8529     hardware.  */
8530  if (swap)
8531    std::swap (op0, op1);
8532
8533  if (optimize || !target
8534      || GET_MODE (target) != tmode
8535      || !insn_data[d->icode].operand[0].predicate (target, tmode))
8536    target = gen_reg_rtx (tmode);
8537
8538  if ((optimize && !register_operand (op0, mode0))
8539      || !insn_data[d->icode].operand[1].predicate (op0, mode0))
8540    op0 = copy_to_mode_reg (mode0, op0);
8541  if ((optimize && !register_operand (op1, mode1))
8542      || !insn_data[d->icode].operand[2].predicate (op1, mode1))
8543    op1 = copy_to_mode_reg (mode1, op1);
8544
8545  op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
8546  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8547  if (! pat)
8548    return 0;
8549  emit_insn (pat);
8550  return target;
8551}
8552
8553/* Subroutine of ix86_expand_builtin to take care of comi insns.  */
8554
8555static rtx
8556ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
8557		      rtx target)
8558{
8559  rtx pat;
8560  tree arg0 = CALL_EXPR_ARG (exp, 0);
8561  tree arg1 = CALL_EXPR_ARG (exp, 1);
8562  rtx op0 = expand_normal (arg0);
8563  rtx op1 = expand_normal (arg1);
8564  machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8565  machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8566  enum rtx_code comparison = d->comparison;
8567
8568  if (VECTOR_MODE_P (mode0))
8569    op0 = safe_vector_operand (op0, mode0);
8570  if (VECTOR_MODE_P (mode1))
8571    op1 = safe_vector_operand (op1, mode1);
8572
8573  /* Swap operands if we have a comparison that isn't available in
8574     hardware.  */
8575  if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
8576    std::swap (op0, op1);
8577
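  /* The comi/ucomi pattern only sets the flags; materialize the boolean
     result by zeroing an SImode register and then setting just its low
     QImode part from the flags, so the upper bits are already zero.  */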
8578  target = gen_reg_rtx (SImode);
8579  emit_move_insn (target, const0_rtx);
8580  target = gen_rtx_SUBREG (QImode, target, 0);
8581
8582  if ((optimize && !register_operand (op0, mode0))
8583      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8584    op0 = copy_to_mode_reg (mode0, op0);
8585  if ((optimize && !register_operand (op1, mode1))
8586      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8587    op1 = copy_to_mode_reg (mode1, op1);
8588
8589  pat = GEN_FCN (d->icode) (op0, op1);
8590  if (! pat)
8591    return 0;
8592  emit_insn (pat);
8593  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8594			  gen_rtx_fmt_ee (comparison, QImode,
8595					  SET_DEST (pat),
8596					  const0_rtx)));
8597
8598  return SUBREG_REG (target);
8599}
8600
8601/* Subroutines of ix86_expand_args_builtin to take care of round insns.  */
8602
8603static rtx
8604ix86_expand_sse_round (const struct builtin_description *d, tree exp,
8605		       rtx target)
8606{
8607  rtx pat;
8608  tree arg0 = CALL_EXPR_ARG (exp, 0);
8609  rtx op1, op0 = expand_normal (arg0);
8610  machine_mode tmode = insn_data[d->icode].operand[0].mode;
8611  machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8612
8613  if (optimize || target == 0
8614      || GET_MODE (target) != tmode
8615      || !insn_data[d->icode].operand[0].predicate (target, tmode))
8616    target = gen_reg_rtx (tmode);
8617
8618  if (VECTOR_MODE_P (mode0))
8619    op0 = safe_vector_operand (op0, mode0);
8620
8621  if ((optimize && !register_operand (op0, mode0))
8622      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8623    op0 = copy_to_mode_reg (mode0, op0);
8624
8625  op1 = GEN_INT (d->comparison);
8626
8627  pat = GEN_FCN (d->icode) (target, op0, op1);
8628  if (! pat)
8629    return 0;
8630  emit_insn (pat);
8631  return target;
8632}
8633
8634static rtx
8635ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
8636				     tree exp, rtx target)
8637{
8638  rtx pat;
8639  tree arg0 = CALL_EXPR_ARG (exp, 0);
8640  tree arg1 = CALL_EXPR_ARG (exp, 1);
8641  rtx op0 = expand_normal (arg0);
8642  rtx op1 = expand_normal (arg1);
8643  rtx op2;
8644  machine_mode tmode = insn_data[d->icode].operand[0].mode;
8645  machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8646  machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8647
8648  if (optimize || target == 0
8649      || GET_MODE (target) != tmode
8650      || !insn_data[d->icode].operand[0].predicate (target, tmode))
8651    target = gen_reg_rtx (tmode);
8652
8653  op0 = safe_vector_operand (op0, mode0);
8654  op1 = safe_vector_operand (op1, mode1);
8655
8656  if ((optimize && !register_operand (op0, mode0))
8657      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8658    op0 = copy_to_mode_reg (mode0, op0);
8659  if ((optimize && !register_operand (op1, mode1))
8660      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8661    op1 = copy_to_mode_reg (mode1, op1);
8662
8663  op2 = GEN_INT (d->comparison);
8664
8665  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8666  if (! pat)
8667    return 0;
8668  emit_insn (pat);
8669  return target;
8670}
8671
8672/* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
8673
8674static rtx
8675ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
8676		       rtx target)
8677{
8678  rtx pat;
8679  tree arg0 = CALL_EXPR_ARG (exp, 0);
8680  tree arg1 = CALL_EXPR_ARG (exp, 1);
8681  rtx op0 = expand_normal (arg0);
8682  rtx op1 = expand_normal (arg1);
8683  machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8684  machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8685  enum rtx_code comparison = d->comparison;
8686
8687  if (VECTOR_MODE_P (mode0))
8688    op0 = safe_vector_operand (op0, mode0);
8689  if (VECTOR_MODE_P (mode1))
8690    op1 = safe_vector_operand (op1, mode1);
8691
8692  target = gen_reg_rtx (SImode);
8693  emit_move_insn (target, const0_rtx);
8694  target = gen_rtx_SUBREG (QImode, target, 0);
8695
8696  if ((optimize && !register_operand (op0, mode0))
8697      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8698    op0 = copy_to_mode_reg (mode0, op0);
8699  if ((optimize && !register_operand (op1, mode1))
8700      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8701    op1 = copy_to_mode_reg (mode1, op1);
8702
8703  pat = GEN_FCN (d->icode) (op0, op1);
8704  if (! pat)
8705    return 0;
8706  emit_insn (pat);
8707  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8708			  gen_rtx_fmt_ee (comparison, QImode,
8709					  SET_DEST (pat),
8710					  const0_rtx)));
8711
8712  return SUBREG_REG (target);
8713}
8714
8715/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */
8716
8717static rtx
8718ix86_expand_sse_pcmpestr (const struct builtin_description *d,
8719			  tree exp, rtx target)
8720{
8721  rtx pat;
8722  tree arg0 = CALL_EXPR_ARG (exp, 0);
8723  tree arg1 = CALL_EXPR_ARG (exp, 1);
8724  tree arg2 = CALL_EXPR_ARG (exp, 2);
8725  tree arg3 = CALL_EXPR_ARG (exp, 3);
8726  tree arg4 = CALL_EXPR_ARG (exp, 4);
8727  rtx scratch0, scratch1;
8728  rtx op0 = expand_normal (arg0);
8729  rtx op1 = expand_normal (arg1);
8730  rtx op2 = expand_normal (arg2);
8731  rtx op3 = expand_normal (arg3);
8732  rtx op4 = expand_normal (arg4);
8733  machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
8734
8735  tmode0 = insn_data[d->icode].operand[0].mode;
8736  tmode1 = insn_data[d->icode].operand[1].mode;
8737  modev2 = insn_data[d->icode].operand[2].mode;
8738  modei3 = insn_data[d->icode].operand[3].mode;
8739  modev4 = insn_data[d->icode].operand[4].mode;
8740  modei5 = insn_data[d->icode].operand[5].mode;
8741  modeimm = insn_data[d->icode].operand[6].mode;
8742
8743  if (VECTOR_MODE_P (modev2))
8744    op0 = safe_vector_operand (op0, modev2);
8745  if (VECTOR_MODE_P (modev4))
8746    op2 = safe_vector_operand (op2, modev4);
8747
8748  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8749    op0 = copy_to_mode_reg (modev2, op0);
8750  if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
8751    op1 = copy_to_mode_reg (modei3, op1);
8752  if ((optimize && !register_operand (op2, modev4))
8753      || !insn_data[d->icode].operand[4].predicate (op2, modev4))
8754    op2 = copy_to_mode_reg (modev4, op2);
8755  if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
8756    op3 = copy_to_mode_reg (modei5, op3);
8757
8758  if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
8759    {
8760      error ("the fifth argument must be an 8-bit immediate");
8761      return const0_rtx;
8762    }
8763
8764  if (d->code == IX86_BUILTIN_PCMPESTRI128)
8765    {
8766      if (optimize || !target
8767	  || GET_MODE (target) != tmode0
8768	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8769	target = gen_reg_rtx (tmode0);
8770
8771      scratch1 = gen_reg_rtx (tmode1);
8772
8773      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
8774    }
8775  else if (d->code == IX86_BUILTIN_PCMPESTRM128)
8776    {
8777      if (optimize || !target
8778	  || GET_MODE (target) != tmode1
8779	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8780	target = gen_reg_rtx (tmode1);
8781
8782      scratch0 = gen_reg_rtx (tmode0);
8783
8784      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
8785    }
8786  else
8787    {
8788      gcc_assert (d->flag);
8789
8790      scratch0 = gen_reg_rtx (tmode0);
8791      scratch1 = gen_reg_rtx (tmode1);
8792
8793      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
8794    }
8795
8796  if (! pat)
8797    return 0;
8798
8799  emit_insn (pat);
8800
8801  if (d->flag)
8802    {
8803      target = gen_reg_rtx (SImode);
8804      emit_move_insn (target, const0_rtx);
8805      target = gen_rtx_SUBREG (QImode, target, 0);
8806
8807      emit_insn
8808	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8809		      gen_rtx_fmt_ee (EQ, QImode,
8810				      gen_rtx_REG ((machine_mode) d->flag,
8811						   FLAGS_REG),
8812				      const0_rtx)));
8813      return SUBREG_REG (target);
8814    }
8815  else
8816    return target;
8817}
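
/* Example (schematic): the SSE4.2 intrinsic _mm_cmpestri from smmintrin.h
   is normally implemented via a builtin that reaches this function as
   IX86_BUILTIN_PCMPESTRI128 with five call arguments: two V16QI string
   operands, their two explicit lengths, and the 8-bit immediate control
   word.  The PCMPESTRM128 variant returns the mask register instead, and
   the remaining flag-returning variants come back as a single
   condition-code bit via the d->flag path above.  */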
8818
8819
8820/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.  */
8821
8822static rtx
8823ix86_expand_sse_pcmpistr (const struct builtin_description *d,
8824			  tree exp, rtx target)
8825{
8826  rtx pat;
8827  tree arg0 = CALL_EXPR_ARG (exp, 0);
8828  tree arg1 = CALL_EXPR_ARG (exp, 1);
8829  tree arg2 = CALL_EXPR_ARG (exp, 2);
8830  rtx scratch0, scratch1;
8831  rtx op0 = expand_normal (arg0);
8832  rtx op1 = expand_normal (arg1);
8833  rtx op2 = expand_normal (arg2);
8834  machine_mode tmode0, tmode1, modev2, modev3, modeimm;
8835
8836  tmode0 = insn_data[d->icode].operand[0].mode;
8837  tmode1 = insn_data[d->icode].operand[1].mode;
8838  modev2 = insn_data[d->icode].operand[2].mode;
8839  modev3 = insn_data[d->icode].operand[3].mode;
8840  modeimm = insn_data[d->icode].operand[4].mode;
8841
8842  if (VECTOR_MODE_P (modev2))
8843    op0 = safe_vector_operand (op0, modev2);
8844  if (VECTOR_MODE_P (modev3))
8845    op1 = safe_vector_operand (op1, modev3);
8846
8847  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8848    op0 = copy_to_mode_reg (modev2, op0);
8849  if ((optimize && !register_operand (op1, modev3))
8850      || !insn_data[d->icode].operand[3].predicate (op1, modev3))
8851    op1 = copy_to_mode_reg (modev3, op1);
8852
8853  if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
8854    {
8855      error ("the third argument must be an 8-bit immediate");
8856      return const0_rtx;
8857    }
8858
8859  if (d->code == IX86_BUILTIN_PCMPISTRI128)
8860    {
8861      if (optimize || !target
8862	  || GET_MODE (target) != tmode0
8863	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8864	target = gen_reg_rtx (tmode0);
8865
8866      scratch1 = gen_reg_rtx (tmode1);
8867
8868      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
8869    }
8870  else if (d->code == IX86_BUILTIN_PCMPISTRM128)
8871    {
8872      if (optimize || !target
8873	  || GET_MODE (target) != tmode1
8874	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8875	target = gen_reg_rtx (tmode1);
8876
8877      scratch0 = gen_reg_rtx (tmode0);
8878
8879      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
8880    }
8881  else
8882    {
8883      gcc_assert (d->flag);
8884
8885      scratch0 = gen_reg_rtx (tmode0);
8886      scratch1 = gen_reg_rtx (tmode1);
8887
8888      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
8889    }
8890
8891  if (! pat)
8892    return 0;
8893
8894  emit_insn (pat);
8895
8896  if (d->flag)
8897    {
8898      target = gen_reg_rtx (SImode);
8899      emit_move_insn (target, const0_rtx);
8900      target = gen_rtx_SUBREG (QImode, target, 0);
8901
8902      emit_insn
8903	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8904		      gen_rtx_fmt_ee (EQ, QImode,
8905				      gen_rtx_REG ((machine_mode) d->flag,
8906						   FLAGS_REG),
8907				      const0_rtx)));
8908      return SUBREG_REG (target);
8909    }
8910  else
8911    return target;
8912}
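
/* In the flag-returning case above, d->flag (cast to a machine_mode)
   names the CC mode of the flag bit the builtin exposes; the result is
   materialized by comparing that flags register against zero into the
   low byte of a zeroed SImode pseudo -- the same idiom used by
   ix86_expand_sse_ptest.  */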
8913
8914/* Fix up modeless constants to fit the required mode.  */
8915
8916static rtx
8917fixup_modeless_constant (rtx x, machine_mode mode)
8918{
8919  if (GET_MODE (x) == VOIDmode)
8920    x = convert_to_mode (mode, x, 1);
8921  return x;
8922}
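
/* Example trigger: a literal mask argument such as 0xff expands to a
   CONST_INT, which carries VOIDmode.  The wrapper above converts such
   modeless constants to the operand mode the insn expects; rtxes that
   already have a mode are returned unchanged.  */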
8923
8924/* Subroutine of ix86_expand_builtin to take care of insns with a
8925   variable number of operands.  */
8926
8927static rtx
8928ix86_expand_args_builtin (const struct builtin_description *d,
8929			  tree exp, rtx target)
8930{
8931  rtx pat, real_target;
8932  unsigned int i, nargs;
8933  unsigned int nargs_constant = 0;
8934  unsigned int mask_pos = 0;
8935  int num_memory = 0;
8936  struct
8937    {
8938      rtx op;
8939      machine_mode mode;
8940    } args[6];
8941  bool second_arg_count = false;
8942  enum insn_code icode = d->icode;
8943  const struct insn_data_d *insn_p = &insn_data[icode];
8944  machine_mode tmode = insn_p->operand[0].mode;
8945  machine_mode rmode = VOIDmode;
8946  bool swap = false;
8947  enum rtx_code comparison = d->comparison;
8948
8949  switch ((enum ix86_builtin_func_type) d->flag)
8950    {
8951    case V2DF_FTYPE_V2DF_ROUND:
8952    case V4DF_FTYPE_V4DF_ROUND:
8953    case V8DF_FTYPE_V8DF_ROUND:
8954    case V4SF_FTYPE_V4SF_ROUND:
8955    case V8SF_FTYPE_V8SF_ROUND:
8956    case V16SF_FTYPE_V16SF_ROUND:
8957    case V4SI_FTYPE_V4SF_ROUND:
8958    case V8SI_FTYPE_V8SF_ROUND:
8959    case V16SI_FTYPE_V16SF_ROUND:
8960      return ix86_expand_sse_round (d, exp, target);
8961    case V4SI_FTYPE_V2DF_V2DF_ROUND:
8962    case V8SI_FTYPE_V4DF_V4DF_ROUND:
8963    case V16SI_FTYPE_V8DF_V8DF_ROUND:
8964      return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
8965    case INT_FTYPE_V8SF_V8SF_PTEST:
8966    case INT_FTYPE_V4DI_V4DI_PTEST:
8967    case INT_FTYPE_V4DF_V4DF_PTEST:
8968    case INT_FTYPE_V4SF_V4SF_PTEST:
8969    case INT_FTYPE_V2DI_V2DI_PTEST:
8970    case INT_FTYPE_V2DF_V2DF_PTEST:
8971      return ix86_expand_sse_ptest (d, exp, target);
8972    case FLOAT128_FTYPE_FLOAT128:
8973    case FLOAT_FTYPE_FLOAT:
8974    case INT_FTYPE_INT:
8975    case UINT_FTYPE_UINT:
8976    case UINT16_FTYPE_UINT16:
8977    case UINT64_FTYPE_INT:
8978    case UINT64_FTYPE_UINT64:
8979    case INT64_FTYPE_INT64:
8980    case INT64_FTYPE_V4SF:
8981    case INT64_FTYPE_V2DF:
8982    case INT_FTYPE_V16QI:
8983    case INT_FTYPE_V8QI:
8984    case INT_FTYPE_V8SF:
8985    case INT_FTYPE_V4DF:
8986    case INT_FTYPE_V4SF:
8987    case INT_FTYPE_V2DF:
8988    case INT_FTYPE_V32QI:
8989    case V16QI_FTYPE_V16QI:
8990    case V8SI_FTYPE_V8SF:
8991    case V8SI_FTYPE_V4SI:
8992    case V8HI_FTYPE_V8HI:
8993    case V8HI_FTYPE_V16QI:
8994    case V8QI_FTYPE_V8QI:
8995    case V8SF_FTYPE_V8SF:
8996    case V8SF_FTYPE_V8SI:
8997    case V8SF_FTYPE_V4SF:
8998    case V8SF_FTYPE_V8HI:
8999    case V4SI_FTYPE_V4SI:
9000    case V4SI_FTYPE_V16QI:
9001    case V4SI_FTYPE_V4SF:
9002    case V4SI_FTYPE_V8SI:
9003    case V4SI_FTYPE_V8HI:
9004    case V4SI_FTYPE_V4DF:
9005    case V4SI_FTYPE_V2DF:
9006    case V4HI_FTYPE_V4HI:
9007    case V4DF_FTYPE_V4DF:
9008    case V4DF_FTYPE_V4SI:
9009    case V4DF_FTYPE_V4SF:
9010    case V4DF_FTYPE_V2DF:
9011    case V4SF_FTYPE_V4SF:
9012    case V4SF_FTYPE_V4SI:
9013    case V4SF_FTYPE_V8SF:
9014    case V4SF_FTYPE_V4DF:
9015    case V4SF_FTYPE_V8HI:
9016    case V4SF_FTYPE_V2DF:
9017    case V2DI_FTYPE_V2DI:
9018    case V2DI_FTYPE_V16QI:
9019    case V2DI_FTYPE_V8HI:
9020    case V2DI_FTYPE_V4SI:
9021    case V2DF_FTYPE_V2DF:
9022    case V2DF_FTYPE_V4SI:
9023    case V2DF_FTYPE_V4DF:
9024    case V2DF_FTYPE_V4SF:
9025    case V2DF_FTYPE_V2SI:
9026    case V2SI_FTYPE_V2SI:
9027    case V2SI_FTYPE_V4SF:
9028    case V2SI_FTYPE_V2SF:
9029    case V2SI_FTYPE_V2DF:
9030    case V2SF_FTYPE_V2SF:
9031    case V2SF_FTYPE_V2SI:
9032    case V32QI_FTYPE_V32QI:
9033    case V32QI_FTYPE_V16QI:
9034    case V16HI_FTYPE_V16HI:
9035    case V16HI_FTYPE_V8HI:
9036    case V8SI_FTYPE_V8SI:
9037    case V16HI_FTYPE_V16QI:
9038    case V8SI_FTYPE_V16QI:
9039    case V4DI_FTYPE_V16QI:
9040    case V8SI_FTYPE_V8HI:
9041    case V4DI_FTYPE_V8HI:
9042    case V4DI_FTYPE_V4SI:
9043    case V4DI_FTYPE_V2DI:
9044    case UQI_FTYPE_UQI:
9045    case UHI_FTYPE_UHI:
9046    case USI_FTYPE_USI:
9047    case USI_FTYPE_UQI:
9048    case USI_FTYPE_UHI:
9049    case UDI_FTYPE_UDI:
9050    case UHI_FTYPE_V16QI:
9051    case USI_FTYPE_V32QI:
9052    case UDI_FTYPE_V64QI:
9053    case V16QI_FTYPE_UHI:
9054    case V32QI_FTYPE_USI:
9055    case V64QI_FTYPE_UDI:
9056    case V8HI_FTYPE_UQI:
9057    case V16HI_FTYPE_UHI:
9058    case V32HI_FTYPE_USI:
9059    case V4SI_FTYPE_UQI:
9060    case V8SI_FTYPE_UQI:
9061    case V4SI_FTYPE_UHI:
9062    case V8SI_FTYPE_UHI:
9063    case UQI_FTYPE_V8HI:
9064    case UHI_FTYPE_V16HI:
9065    case USI_FTYPE_V32HI:
9066    case UQI_FTYPE_V4SI:
9067    case UQI_FTYPE_V8SI:
9068    case UHI_FTYPE_V16SI:
9069    case UQI_FTYPE_V2DI:
9070    case UQI_FTYPE_V4DI:
9071    case UQI_FTYPE_V8DI:
9072    case V16SI_FTYPE_UHI:
9073    case V2DI_FTYPE_UQI:
9074    case V4DI_FTYPE_UQI:
9075    case V16SI_FTYPE_INT:
9076    case V16SF_FTYPE_V8SF:
9077    case V16SI_FTYPE_V8SI:
9078    case V16SF_FTYPE_V4SF:
9079    case V16SI_FTYPE_V4SI:
9080    case V16SI_FTYPE_V16SF:
9081    case V16SI_FTYPE_V16SI:
9082    case V64QI_FTYPE_V64QI:
9083    case V32HI_FTYPE_V32HI:
9084    case V16SF_FTYPE_V16SF:
9085    case V8DI_FTYPE_UQI:
9086    case V8DI_FTYPE_V8DI:
9087    case V8DF_FTYPE_V4DF:
9088    case V8DF_FTYPE_V2DF:
9089    case V8DF_FTYPE_V8DF:
9090    case V4DI_FTYPE_V4DI:
9091    case V16HI_FTYPE_V16SF:
9092    case V8HI_FTYPE_V8SF:
9093    case V8HI_FTYPE_V4SF:
9094      nargs = 1;
9095      break;
9096    case V4SF_FTYPE_V4SF_VEC_MERGE:
9097    case V2DF_FTYPE_V2DF_VEC_MERGE:
9098      return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
9099    case FLOAT128_FTYPE_FLOAT128_FLOAT128:
9100    case V16QI_FTYPE_V16QI_V16QI:
9101    case V16QI_FTYPE_V8HI_V8HI:
9102    case V16SF_FTYPE_V16SF_V16SF:
9103    case V8QI_FTYPE_V8QI_V8QI:
9104    case V8QI_FTYPE_V4HI_V4HI:
9105    case V8HI_FTYPE_V8HI_V8HI:
9106    case V8HI_FTYPE_V16QI_V16QI:
9107    case V8HI_FTYPE_V4SI_V4SI:
9108    case V8SF_FTYPE_V8SF_V8SF:
9109    case V8SF_FTYPE_V8SF_V8SI:
9110    case V8DF_FTYPE_V8DF_V8DF:
9111    case V4SI_FTYPE_V4SI_V4SI:
9112    case V4SI_FTYPE_V8HI_V8HI:
9113    case V4SI_FTYPE_V2DF_V2DF:
9114    case V4HI_FTYPE_V4HI_V4HI:
9115    case V4HI_FTYPE_V8QI_V8QI:
9116    case V4HI_FTYPE_V2SI_V2SI:
9117    case V4DF_FTYPE_V4DF_V4DF:
9118    case V4DF_FTYPE_V4DF_V4DI:
9119    case V4SF_FTYPE_V4SF_V4SF:
9120    case V4SF_FTYPE_V4SF_V4SI:
9121    case V4SF_FTYPE_V4SF_V2SI:
9122    case V4SF_FTYPE_V4SF_V2DF:
9123    case V4SF_FTYPE_V4SF_UINT:
9124    case V4SF_FTYPE_V4SF_DI:
9125    case V4SF_FTYPE_V4SF_SI:
9126    case V2DI_FTYPE_V2DI_V2DI:
9127    case V2DI_FTYPE_V16QI_V16QI:
9128    case V2DI_FTYPE_V4SI_V4SI:
9129    case V2DI_FTYPE_V2DI_V16QI:
9130    case V2SI_FTYPE_V2SI_V2SI:
9131    case V2SI_FTYPE_V4HI_V4HI:
9132    case V2SI_FTYPE_V2SF_V2SF:
9133    case V2DF_FTYPE_V2DF_V2DF:
9134    case V2DF_FTYPE_V2DF_V4SF:
9135    case V2DF_FTYPE_V2DF_V2DI:
9136    case V2DF_FTYPE_V2DF_DI:
9137    case V2DF_FTYPE_V2DF_SI:
9138    case V2DF_FTYPE_V2DF_UINT:
9139    case V2SF_FTYPE_V2SF_V2SF:
9140    case V1DI_FTYPE_V1DI_V1DI:
9141    case V1DI_FTYPE_V8QI_V8QI:
9142    case V1DI_FTYPE_V2SI_V2SI:
9143    case V32QI_FTYPE_V16HI_V16HI:
9144    case V16HI_FTYPE_V8SI_V8SI:
9145    case V64QI_FTYPE_V64QI_V64QI:
9146    case V32QI_FTYPE_V32QI_V32QI:
9147    case V16HI_FTYPE_V32QI_V32QI:
9148    case V16HI_FTYPE_V16HI_V16HI:
9149    case V8SI_FTYPE_V4DF_V4DF:
9150    case V8SI_FTYPE_V8SI_V8SI:
9151    case V8SI_FTYPE_V16HI_V16HI:
9152    case V4DI_FTYPE_V4DI_V4DI:
9153    case V4DI_FTYPE_V8SI_V8SI:
9154    case V8DI_FTYPE_V64QI_V64QI:
9155      if (comparison == UNKNOWN)
9156	return ix86_expand_binop_builtin (icode, exp, target);
9157      nargs = 2;
9158      break;
9159    case V4SF_FTYPE_V4SF_V4SF_SWAP:
9160    case V2DF_FTYPE_V2DF_V2DF_SWAP:
9161      gcc_assert (comparison != UNKNOWN);
9162      nargs = 2;
9163      swap = true;
9164      break;
9165    case V16HI_FTYPE_V16HI_V8HI_COUNT:
9166    case V16HI_FTYPE_V16HI_SI_COUNT:
9167    case V8SI_FTYPE_V8SI_V4SI_COUNT:
9168    case V8SI_FTYPE_V8SI_SI_COUNT:
9169    case V4DI_FTYPE_V4DI_V2DI_COUNT:
9170    case V4DI_FTYPE_V4DI_INT_COUNT:
9171    case V8HI_FTYPE_V8HI_V8HI_COUNT:
9172    case V8HI_FTYPE_V8HI_SI_COUNT:
9173    case V4SI_FTYPE_V4SI_V4SI_COUNT:
9174    case V4SI_FTYPE_V4SI_SI_COUNT:
9175    case V4HI_FTYPE_V4HI_V4HI_COUNT:
9176    case V4HI_FTYPE_V4HI_SI_COUNT:
9177    case V2DI_FTYPE_V2DI_V2DI_COUNT:
9178    case V2DI_FTYPE_V2DI_SI_COUNT:
9179    case V2SI_FTYPE_V2SI_V2SI_COUNT:
9180    case V2SI_FTYPE_V2SI_SI_COUNT:
9181    case V1DI_FTYPE_V1DI_V1DI_COUNT:
9182    case V1DI_FTYPE_V1DI_SI_COUNT:
9183      nargs = 2;
9184      second_arg_count = true;
9185      break;
9186    case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
9187    case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
9188    case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
9189    case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
9190    case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
9191    case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
9192    case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
9193    case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
9194    case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
9195    case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
9196    case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
9197    case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
9198    case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
9199    case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
9200    case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
9201    case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
9202    case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
9203    case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
9204      nargs = 4;
9205      second_arg_count = true;
9206      break;
9207    case UINT64_FTYPE_UINT64_UINT64:
9208    case UINT_FTYPE_UINT_UINT:
9209    case UINT_FTYPE_UINT_USHORT:
9210    case UINT_FTYPE_UINT_UCHAR:
9211    case UINT16_FTYPE_UINT16_INT:
9212    case UINT8_FTYPE_UINT8_INT:
9213    case UQI_FTYPE_UQI_UQI:
9214    case UHI_FTYPE_UHI_UHI:
9215    case USI_FTYPE_USI_USI:
9216    case UDI_FTYPE_UDI_UDI:
9217    case V16SI_FTYPE_V8DF_V8DF:
9218    case V32HI_FTYPE_V16SF_V16SF:
9219    case V16HI_FTYPE_V8SF_V8SF:
9220    case V8HI_FTYPE_V4SF_V4SF:
9221    case V16HI_FTYPE_V16SF_UHI:
9222    case V8HI_FTYPE_V8SF_UQI:
9223    case V8HI_FTYPE_V4SF_UQI:
9224      nargs = 2;
9225      break;
9226    case V2DI_FTYPE_V2DI_INT_CONVERT:
9227      nargs = 2;
9228      rmode = V1TImode;
9229      nargs_constant = 1;
9230      break;
9231    case V4DI_FTYPE_V4DI_INT_CONVERT:
9232      nargs = 2;
9233      rmode = V2TImode;
9234      nargs_constant = 1;
9235      break;
9236    case V8DI_FTYPE_V8DI_INT_CONVERT:
9237      nargs = 2;
9238      rmode = V4TImode;
9239      nargs_constant = 1;
9240      break;
9241    case V8HI_FTYPE_V8HI_INT:
9242    case V8HI_FTYPE_V8SF_INT:
9243    case V16HI_FTYPE_V16SF_INT:
9244    case V8HI_FTYPE_V4SF_INT:
9245    case V8SF_FTYPE_V8SF_INT:
9246    case V4SF_FTYPE_V16SF_INT:
9247    case V16SF_FTYPE_V16SF_INT:
9248    case V4SI_FTYPE_V4SI_INT:
9249    case V4SI_FTYPE_V8SI_INT:
9250    case V4HI_FTYPE_V4HI_INT:
9251    case V4DF_FTYPE_V4DF_INT:
9252    case V4DF_FTYPE_V8DF_INT:
9253    case V4SF_FTYPE_V4SF_INT:
9254    case V4SF_FTYPE_V8SF_INT:
9255    case V2DI_FTYPE_V2DI_INT:
9256    case V2DF_FTYPE_V2DF_INT:
9257    case V2DF_FTYPE_V4DF_INT:
9258    case V16HI_FTYPE_V16HI_INT:
9259    case V8SI_FTYPE_V8SI_INT:
9260    case V16SI_FTYPE_V16SI_INT:
9261    case V4SI_FTYPE_V16SI_INT:
9262    case V4DI_FTYPE_V4DI_INT:
9263    case V2DI_FTYPE_V4DI_INT:
9264    case V4DI_FTYPE_V8DI_INT:
9265    case UQI_FTYPE_UQI_UQI_CONST:
9266    case UHI_FTYPE_UHI_UQI:
9267    case USI_FTYPE_USI_UQI:
9268    case UDI_FTYPE_UDI_UQI:
9269      nargs = 2;
9270      nargs_constant = 1;
9271      break;
9272    case V16QI_FTYPE_V16QI_V16QI_V16QI:
9273    case V8SF_FTYPE_V8SF_V8SF_V8SF:
9274    case V4DF_FTYPE_V4DF_V4DF_V4DF:
9275    case V4SF_FTYPE_V4SF_V4SF_V4SF:
9276    case V2DF_FTYPE_V2DF_V2DF_V2DF:
9277    case V32QI_FTYPE_V32QI_V32QI_V32QI:
9278    case UHI_FTYPE_V16SI_V16SI_UHI:
9279    case UQI_FTYPE_V8DI_V8DI_UQI:
9280    case V16HI_FTYPE_V16SI_V16HI_UHI:
9281    case V16QI_FTYPE_V16SI_V16QI_UHI:
9282    case V16QI_FTYPE_V8DI_V16QI_UQI:
9283    case V16SF_FTYPE_V16SF_V16SF_UHI:
9284    case V16SF_FTYPE_V4SF_V16SF_UHI:
9285    case V16SI_FTYPE_SI_V16SI_UHI:
9286    case V16SI_FTYPE_V16HI_V16SI_UHI:
9287    case V16SI_FTYPE_V16QI_V16SI_UHI:
9288    case V8SF_FTYPE_V4SF_V8SF_UQI:
9289    case V4DF_FTYPE_V2DF_V4DF_UQI:
9290    case V8SI_FTYPE_V4SI_V8SI_UQI:
9291    case V8SI_FTYPE_SI_V8SI_UQI:
9292    case V4SI_FTYPE_V4SI_V4SI_UQI:
9293    case V4SI_FTYPE_SI_V4SI_UQI:
9294    case V4DI_FTYPE_V2DI_V4DI_UQI:
9295    case V4DI_FTYPE_DI_V4DI_UQI:
9296    case V2DI_FTYPE_V2DI_V2DI_UQI:
9297    case V2DI_FTYPE_DI_V2DI_UQI:
9298    case V64QI_FTYPE_V64QI_V64QI_UDI:
9299    case V64QI_FTYPE_V16QI_V64QI_UDI:
9300    case V64QI_FTYPE_QI_V64QI_UDI:
9301    case V32QI_FTYPE_V32QI_V32QI_USI:
9302    case V32QI_FTYPE_V16QI_V32QI_USI:
9303    case V32QI_FTYPE_QI_V32QI_USI:
9304    case V16QI_FTYPE_V16QI_V16QI_UHI:
9305    case V16QI_FTYPE_QI_V16QI_UHI:
9306    case V32HI_FTYPE_V8HI_V32HI_USI:
9307    case V32HI_FTYPE_HI_V32HI_USI:
9308    case V16HI_FTYPE_V8HI_V16HI_UHI:
9309    case V16HI_FTYPE_HI_V16HI_UHI:
9310    case V8HI_FTYPE_V8HI_V8HI_UQI:
9311    case V8HI_FTYPE_HI_V8HI_UQI:
9312    case V8SF_FTYPE_V8HI_V8SF_UQI:
9313    case V4SF_FTYPE_V8HI_V4SF_UQI:
9314    case V8SI_FTYPE_V8SF_V8SI_UQI:
9315    case V4SI_FTYPE_V4SF_V4SI_UQI:
9316    case V4DI_FTYPE_V4SF_V4DI_UQI:
9317    case V2DI_FTYPE_V4SF_V2DI_UQI:
9318    case V4SF_FTYPE_V4DI_V4SF_UQI:
9319    case V4SF_FTYPE_V2DI_V4SF_UQI:
9320    case V4DF_FTYPE_V4DI_V4DF_UQI:
9321    case V2DF_FTYPE_V2DI_V2DF_UQI:
9322    case V16QI_FTYPE_V8HI_V16QI_UQI:
9323    case V16QI_FTYPE_V16HI_V16QI_UHI:
9324    case V16QI_FTYPE_V4SI_V16QI_UQI:
9325    case V16QI_FTYPE_V8SI_V16QI_UQI:
9326    case V8HI_FTYPE_V4SI_V8HI_UQI:
9327    case V8HI_FTYPE_V8SI_V8HI_UQI:
9328    case V16QI_FTYPE_V2DI_V16QI_UQI:
9329    case V16QI_FTYPE_V4DI_V16QI_UQI:
9330    case V8HI_FTYPE_V2DI_V8HI_UQI:
9331    case V8HI_FTYPE_V4DI_V8HI_UQI:
9332    case V4SI_FTYPE_V2DI_V4SI_UQI:
9333    case V4SI_FTYPE_V4DI_V4SI_UQI:
9334    case V32QI_FTYPE_V32HI_V32QI_USI:
9335    case UHI_FTYPE_V16QI_V16QI_UHI:
9336    case USI_FTYPE_V32QI_V32QI_USI:
9337    case UDI_FTYPE_V64QI_V64QI_UDI:
9338    case UQI_FTYPE_V8HI_V8HI_UQI:
9339    case UHI_FTYPE_V16HI_V16HI_UHI:
9340    case USI_FTYPE_V32HI_V32HI_USI:
9341    case UQI_FTYPE_V4SI_V4SI_UQI:
9342    case UQI_FTYPE_V8SI_V8SI_UQI:
9343    case UQI_FTYPE_V2DI_V2DI_UQI:
9344    case UQI_FTYPE_V4DI_V4DI_UQI:
9345    case V4SF_FTYPE_V2DF_V4SF_UQI:
9346    case V4SF_FTYPE_V4DF_V4SF_UQI:
9347    case V16SI_FTYPE_V16SI_V16SI_UHI:
9348    case V16SI_FTYPE_V4SI_V16SI_UHI:
9349    case V2DI_FTYPE_V4SI_V2DI_UQI:
9350    case V2DI_FTYPE_V8HI_V2DI_UQI:
9351    case V2DI_FTYPE_V16QI_V2DI_UQI:
9352    case V4DI_FTYPE_V4DI_V4DI_UQI:
9353    case V4DI_FTYPE_V4SI_V4DI_UQI:
9354    case V4DI_FTYPE_V8HI_V4DI_UQI:
9355    case V4DI_FTYPE_V16QI_V4DI_UQI:
9356    case V4DI_FTYPE_V4DF_V4DI_UQI:
9357    case V2DI_FTYPE_V2DF_V2DI_UQI:
9358    case V4SI_FTYPE_V4DF_V4SI_UQI:
9359    case V4SI_FTYPE_V2DF_V4SI_UQI:
9360    case V4SI_FTYPE_V8HI_V4SI_UQI:
9361    case V4SI_FTYPE_V16QI_V4SI_UQI:
9362    case V4DI_FTYPE_V4DI_V4DI_V4DI:
9363    case V8DF_FTYPE_V2DF_V8DF_UQI:
9364    case V8DF_FTYPE_V4DF_V8DF_UQI:
9365    case V8DF_FTYPE_V8DF_V8DF_UQI:
9366    case V8SF_FTYPE_V8SF_V8SF_UQI:
9367    case V8SF_FTYPE_V8SI_V8SF_UQI:
9368    case V4DF_FTYPE_V4DF_V4DF_UQI:
9369    case V4SF_FTYPE_V4SF_V4SF_UQI:
9370    case V2DF_FTYPE_V2DF_V2DF_UQI:
9371    case V2DF_FTYPE_V4SF_V2DF_UQI:
9372    case V2DF_FTYPE_V4SI_V2DF_UQI:
9373    case V4SF_FTYPE_V4SI_V4SF_UQI:
9374    case V4DF_FTYPE_V4SF_V4DF_UQI:
9375    case V4DF_FTYPE_V4SI_V4DF_UQI:
9376    case V8SI_FTYPE_V8SI_V8SI_UQI:
9377    case V8SI_FTYPE_V8HI_V8SI_UQI:
9378    case V8SI_FTYPE_V16QI_V8SI_UQI:
9379    case V8DF_FTYPE_V8SI_V8DF_UQI:
9380    case V8DI_FTYPE_DI_V8DI_UQI:
9381    case V16SF_FTYPE_V8SF_V16SF_UHI:
9382    case V16SI_FTYPE_V8SI_V16SI_UHI:
9383    case V16HI_FTYPE_V16HI_V16HI_UHI:
9384    case V8HI_FTYPE_V16QI_V8HI_UQI:
9385    case V16HI_FTYPE_V16QI_V16HI_UHI:
9386    case V32HI_FTYPE_V32HI_V32HI_USI:
9387    case V32HI_FTYPE_V32QI_V32HI_USI:
9388    case V8DI_FTYPE_V16QI_V8DI_UQI:
9389    case V8DI_FTYPE_V2DI_V8DI_UQI:
9390    case V8DI_FTYPE_V4DI_V8DI_UQI:
9391    case V8DI_FTYPE_V8DI_V8DI_UQI:
9392    case V8DI_FTYPE_V8HI_V8DI_UQI:
9393    case V8DI_FTYPE_V8SI_V8DI_UQI:
9394    case V8HI_FTYPE_V8DI_V8HI_UQI:
9395    case V8SI_FTYPE_V8DI_V8SI_UQI:
9396    case V4SI_FTYPE_V4SI_V4SI_V4SI:
9397    case V16SI_FTYPE_V16SI_V16SI_V16SI:
9398    case V8DI_FTYPE_V8DI_V8DI_V8DI:
9399    case V32HI_FTYPE_V32HI_V32HI_V32HI:
9400    case V2DI_FTYPE_V2DI_V2DI_V2DI:
9401    case V16HI_FTYPE_V16HI_V16HI_V16HI:
9402    case V8SI_FTYPE_V8SI_V8SI_V8SI:
9403    case V8HI_FTYPE_V8HI_V8HI_V8HI:
9404    case V32HI_FTYPE_V16SF_V16SF_USI:
9405    case V16HI_FTYPE_V8SF_V8SF_UHI:
9406    case V8HI_FTYPE_V4SF_V4SF_UQI:
9407    case V16HI_FTYPE_V16SF_V16HI_UHI:
9408    case V8HI_FTYPE_V8SF_V8HI_UQI:
9409    case V8HI_FTYPE_V4SF_V8HI_UQI:
9410    case V16SF_FTYPE_V16SF_V32HI_V32HI:
9411    case V8SF_FTYPE_V8SF_V16HI_V16HI:
9412    case V4SF_FTYPE_V4SF_V8HI_V8HI:
9413      nargs = 3;
9414      break;
9415    case V32QI_FTYPE_V32QI_V32QI_INT:
9416    case V16HI_FTYPE_V16HI_V16HI_INT:
9417    case V16QI_FTYPE_V16QI_V16QI_INT:
9418    case V4DI_FTYPE_V4DI_V4DI_INT:
9419    case V8HI_FTYPE_V8HI_V8HI_INT:
9420    case V8SI_FTYPE_V8SI_V8SI_INT:
9421    case V8SI_FTYPE_V8SI_V4SI_INT:
9422    case V8SF_FTYPE_V8SF_V8SF_INT:
9423    case V8SF_FTYPE_V8SF_V4SF_INT:
9424    case V4SI_FTYPE_V4SI_V4SI_INT:
9425    case V4DF_FTYPE_V4DF_V4DF_INT:
9426    case V16SF_FTYPE_V16SF_V16SF_INT:
9427    case V16SF_FTYPE_V16SF_V4SF_INT:
9428    case V16SI_FTYPE_V16SI_V4SI_INT:
9429    case V4DF_FTYPE_V4DF_V2DF_INT:
9430    case V4SF_FTYPE_V4SF_V4SF_INT:
9431    case V2DI_FTYPE_V2DI_V2DI_INT:
9432    case V4DI_FTYPE_V4DI_V2DI_INT:
9433    case V2DF_FTYPE_V2DF_V2DF_INT:
9434    case UQI_FTYPE_V8DI_V8UDI_INT:
9435    case UQI_FTYPE_V8DF_V8DF_INT:
9436    case UQI_FTYPE_V2DF_V2DF_INT:
9437    case UQI_FTYPE_V4SF_V4SF_INT:
9438    case UHI_FTYPE_V16SI_V16SI_INT:
9439    case UHI_FTYPE_V16SF_V16SF_INT:
9440    case V64QI_FTYPE_V64QI_V64QI_INT:
9441    case V32HI_FTYPE_V32HI_V32HI_INT:
9442    case V16SI_FTYPE_V16SI_V16SI_INT:
9443    case V8DI_FTYPE_V8DI_V8DI_INT:
9444      nargs = 3;
9445      nargs_constant = 1;
9446      break;
9447    case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
9448      nargs = 3;
9449      rmode = V4DImode;
9450      nargs_constant = 1;
9451      break;
9452    case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
9453      nargs = 3;
9454      rmode = V2DImode;
9455      nargs_constant = 1;
9456      break;
9457    case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
9458      nargs = 3;
9459      rmode = DImode;
9460      nargs_constant = 1;
9461      break;
9462    case V2DI_FTYPE_V2DI_UINT_UINT:
9463      nargs = 3;
9464      nargs_constant = 2;
9465      break;
9466    case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
9467      nargs = 3;
9468      rmode = V8DImode;
9469      nargs_constant = 1;
9470      break;
9471    case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
9472      nargs = 5;
9473      rmode = V8DImode;
9474      mask_pos = 2;
9475      nargs_constant = 1;
9476      break;
9477    case QI_FTYPE_V8DF_INT_UQI:
9478    case QI_FTYPE_V4DF_INT_UQI:
9479    case QI_FTYPE_V2DF_INT_UQI:
9480    case HI_FTYPE_V16SF_INT_UHI:
9481    case QI_FTYPE_V8SF_INT_UQI:
9482    case QI_FTYPE_V4SF_INT_UQI:
9483    case V4SI_FTYPE_V4SI_V4SI_UHI:
9484    case V8SI_FTYPE_V8SI_V8SI_UHI:
9485      nargs = 3;
9486      mask_pos = 1;
9487      nargs_constant = 1;
9488      break;
9489    case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
9490      nargs = 5;
9491      rmode = V4DImode;
9492      mask_pos = 2;
9493      nargs_constant = 1;
9494      break;
9495    case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
9496      nargs = 5;
9497      rmode = V2DImode;
9498      mask_pos = 2;
9499      nargs_constant = 1;
9500      break;
9501    case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
9502    case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
9503    case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
9504    case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
9505    case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
9506    case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
9507    case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
9508    case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
9509    case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
9510    case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
9511    case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
9512    case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
9513    case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
9514    case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
9515    case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
9516    case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
9517    case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
9518    case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
9519    case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
9520    case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
9521    case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
9522    case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
9523    case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
9524    case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
9525    case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
9526    case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
9527    case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
9528    case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
9529    case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
9530    case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
9531    case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
9532    case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
9533    case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
9534    case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
9535    case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
9536    case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
9537    case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
9538    case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
9539    case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
9540    case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
9541    case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
9542    case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
9543    case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
9544    case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
9545    case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
9546    case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
9547    case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
9548    case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
9549    case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
9550    case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
9551    case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
9552    case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
9553    case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
9554    case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
9555      nargs = 4;
9556      break;
9557    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
9558    case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
9559    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
9560    case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
9561    case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
9562      nargs = 4;
9563      nargs_constant = 1;
9564      break;
9565    case UQI_FTYPE_V4DI_V4DI_INT_UQI:
9566    case UQI_FTYPE_V8SI_V8SI_INT_UQI:
9567    case QI_FTYPE_V4DF_V4DF_INT_UQI:
9568    case QI_FTYPE_V8SF_V8SF_INT_UQI:
9569    case UQI_FTYPE_V2DI_V2DI_INT_UQI:
9570    case UQI_FTYPE_V4SI_V4SI_INT_UQI:
9571    case UQI_FTYPE_V2DF_V2DF_INT_UQI:
9572    case UQI_FTYPE_V4SF_V4SF_INT_UQI:
9573    case UDI_FTYPE_V64QI_V64QI_INT_UDI:
9574    case USI_FTYPE_V32QI_V32QI_INT_USI:
9575    case UHI_FTYPE_V16QI_V16QI_INT_UHI:
9576    case USI_FTYPE_V32HI_V32HI_INT_USI:
9577    case UHI_FTYPE_V16HI_V16HI_INT_UHI:
9578    case UQI_FTYPE_V8HI_V8HI_INT_UQI:
9579      nargs = 4;
9580      mask_pos = 1;
9581      nargs_constant = 1;
9582      break;
9583    case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
9584      nargs = 4;
9585      nargs_constant = 2;
9586      break;
9587    case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
9588    case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
9589    case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
9590    case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
9591    case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
9592      nargs = 4;
9593      break;
9594    case UQI_FTYPE_V8DI_V8DI_INT_UQI:
9595    case UHI_FTYPE_V16SI_V16SI_INT_UHI:
9596      mask_pos = 1;
9597      nargs = 4;
9598      nargs_constant = 1;
9599      break;
9600    case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
9601    case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
9602    case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
9603    case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
9604    case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
9605    case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
9606    case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
9607    case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
9608    case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
9609    case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
9610    case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
9611    case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
9612    case V32HI_FTYPE_V32HI_INT_V32HI_USI:
9613    case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
9614    case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
9615    case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
9616    case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
9617    case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
9618    case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
9619    case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
9620    case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
9621    case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
9622    case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
9623    case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
9624    case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
9625    case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
9626    case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
9627    case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
9628    case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
9629    case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
9630      nargs = 4;
9631      mask_pos = 2;
9632      nargs_constant = 1;
9633      break;
9634    case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
9635    case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
9636    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
9637    case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
9638    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
9639    case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
9640    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
9641    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
9642    case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
9643    case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
9644    case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
9645    case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
9646    case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
9647    case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
9648    case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
9649    case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
9650    case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
9651    case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
9652    case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
9653    case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
9654    case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
9655    case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
9656    case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
9657    case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
9658    case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
9659    case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
9660    case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
9661      nargs = 5;
9662      mask_pos = 2;
9663      nargs_constant = 1;
9664      break;
9665    case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
9666    case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
9667    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
9668    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
9669    case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
9670    case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
9671    case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
9672    case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
9673    case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
9674    case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
9675      nargs = 5;
9676      mask_pos = 1;
9677      nargs_constant = 1;
9678      break;
9679    case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
9680    case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
9681    case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
9682    case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
9683    case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
9684    case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
9685    case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
9686    case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
9687    case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
9688    case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
9689    case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
9690    case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
9691      nargs = 5;
9692      mask_pos = 1;
9693      nargs_constant = 2;
9694      break;
9695
9696    default:
9697      gcc_unreachable ();
9698    }
9699
9700  gcc_assert (nargs <= ARRAY_SIZE (args));
9701
9702  if (comparison != UNKNOWN)
9703    {
9704      gcc_assert (nargs == 2);
9705      return ix86_expand_sse_compare (d, exp, target, swap);
9706    }
9707
9708  if (rmode == VOIDmode || rmode == tmode)
9709    {
9710      if (optimize
9711	  || target == 0
9712	  || GET_MODE (target) != tmode
9713	  || !insn_p->operand[0].predicate (target, tmode))
9714	target = gen_reg_rtx (tmode);
9715      else if (memory_operand (target, tmode))
9716	num_memory++;
9717      real_target = target;
9718    }
9719  else
9720    {
9721      real_target = gen_reg_rtx (tmode);
9722      target = lowpart_subreg (rmode, real_target, tmode);
9723    }
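
  /* From here on, real_target is where the insn pattern itself writes
     (mode tmode), while target is what we hand back to the caller; when
     rmode differs from tmode the two are the same pseudo viewed through
     a lowpart subreg, as set up just above.  */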
9724
9725  for (i = 0; i < nargs; i++)
9726    {
9727      tree arg = CALL_EXPR_ARG (exp, i);
9728      rtx op = expand_normal (arg);
9729      machine_mode mode = insn_p->operand[i + 1].mode;
9730      bool match = insn_p->operand[i + 1].predicate (op, mode);
9731
9732      if (second_arg_count && i == 1)
9733	{
9734	  /* SIMD shift insns take either an 8-bit immediate or a
9735	     register as the count, but the builtin functions take an
9736	     int; if the count operand doesn't match, put it in a
9737	     register.  The instructions use a 64-bit count: if op is
9738	     only 32-bit, zero-extend it, since negative shift counts
9739	     are undefined behavior and zero-extension is more
9740	     efficient.  */
9741	  if (!match)
9742	    {
9743	      if (SCALAR_INT_MODE_P (GET_MODE (op)))
9744		op = convert_modes (mode, GET_MODE (op), op, 1);
9745	      else
9746		op = lowpart_subreg (mode, op, GET_MODE (op));
9747	      if (!insn_p->operand[i + 1].predicate (op, mode))
9748		op = copy_to_reg (op);
9749	    }
9750	}
9751      else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
9752	       || (!mask_pos && (nargs - i) <= nargs_constant))
9753	{
9754	  if (!match)
9755	    switch (icode)
9756	      {
9757	      case CODE_FOR_avx_vinsertf128v4di:
9758	      case CODE_FOR_avx_vextractf128v4di:
9759		error ("the last argument must be a 1-bit immediate");
9760		return const0_rtx;
9761
9762	      case CODE_FOR_avx512f_cmpv8di3_mask:
9763	      case CODE_FOR_avx512f_cmpv16si3_mask:
9764	      case CODE_FOR_avx512f_ucmpv8di3_mask:
9765	      case CODE_FOR_avx512f_ucmpv16si3_mask:
9766	      case CODE_FOR_avx512vl_cmpv4di3_mask:
9767	      case CODE_FOR_avx512vl_cmpv8si3_mask:
9768	      case CODE_FOR_avx512vl_ucmpv4di3_mask:
9769	      case CODE_FOR_avx512vl_ucmpv8si3_mask:
9770	      case CODE_FOR_avx512vl_cmpv2di3_mask:
9771	      case CODE_FOR_avx512vl_cmpv4si3_mask:
9772	      case CODE_FOR_avx512vl_ucmpv2di3_mask:
9773	      case CODE_FOR_avx512vl_ucmpv4si3_mask:
9774		error ("the last argument must be a 3-bit immediate");
9775		return const0_rtx;
9776
9777	      case CODE_FOR_sse4_1_roundsd:
9778	      case CODE_FOR_sse4_1_roundss:
9779
9780	      case CODE_FOR_sse4_1_roundpd:
9781	      case CODE_FOR_sse4_1_roundps:
9782	      case CODE_FOR_avx_roundpd256:
9783	      case CODE_FOR_avx_roundps256:
9784
9785	      case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
9786	      case CODE_FOR_sse4_1_roundps_sfix:
9787	      case CODE_FOR_avx_roundpd_vec_pack_sfix256:
9788	      case CODE_FOR_avx_roundps_sfix256:
9789
9790	      case CODE_FOR_sse4_1_blendps:
9791	      case CODE_FOR_avx_blendpd256:
9792	      case CODE_FOR_avx_vpermilv4df:
9793	      case CODE_FOR_avx_vpermilv4df_mask:
9794	      case CODE_FOR_avx512f_getmantv8df_mask:
9795	      case CODE_FOR_avx512f_getmantv16sf_mask:
9796	      case CODE_FOR_avx512vl_getmantv8sf_mask:
9797	      case CODE_FOR_avx512vl_getmantv4df_mask:
9798	      case CODE_FOR_avx512vl_getmantv4sf_mask:
9799	      case CODE_FOR_avx512vl_getmantv2df_mask:
9800	      case CODE_FOR_avx512dq_rangepv8df_mask_round:
9801	      case CODE_FOR_avx512dq_rangepv16sf_mask_round:
9802	      case CODE_FOR_avx512dq_rangepv4df_mask:
9803	      case CODE_FOR_avx512dq_rangepv8sf_mask:
9804	      case CODE_FOR_avx512dq_rangepv2df_mask:
9805	      case CODE_FOR_avx512dq_rangepv4sf_mask:
9806	      case CODE_FOR_avx_shufpd256_mask:
9807		error ("the last argument must be a 4-bit immediate");
9808		return const0_rtx;
9809
9810	      case CODE_FOR_sha1rnds4:
9811	      case CODE_FOR_sse4_1_blendpd:
9812	      case CODE_FOR_avx_vpermilv2df:
9813	      case CODE_FOR_avx_vpermilv2df_mask:
9814	      case CODE_FOR_xop_vpermil2v2df3:
9815	      case CODE_FOR_xop_vpermil2v4sf3:
9816	      case CODE_FOR_xop_vpermil2v4df3:
9817	      case CODE_FOR_xop_vpermil2v8sf3:
9818	      case CODE_FOR_avx512f_vinsertf32x4_mask:
9819	      case CODE_FOR_avx512f_vinserti32x4_mask:
9820	      case CODE_FOR_avx512f_vextractf32x4_mask:
9821	      case CODE_FOR_avx512f_vextracti32x4_mask:
9822	      case CODE_FOR_sse2_shufpd:
9823	      case CODE_FOR_sse2_shufpd_mask:
9824	      case CODE_FOR_avx512dq_shuf_f64x2_mask:
9825	      case CODE_FOR_avx512dq_shuf_i64x2_mask:
9826	      case CODE_FOR_avx512vl_shuf_i32x4_mask:
9827	      case CODE_FOR_avx512vl_shuf_f32x4_mask:
9828		error ("the last argument must be a 2-bit immediate");
9829		return const0_rtx;
9830
9831	      case CODE_FOR_avx_vextractf128v4df:
9832	      case CODE_FOR_avx_vextractf128v8sf:
9833	      case CODE_FOR_avx_vextractf128v8si:
9834	      case CODE_FOR_avx_vinsertf128v4df:
9835	      case CODE_FOR_avx_vinsertf128v8sf:
9836	      case CODE_FOR_avx_vinsertf128v8si:
9837	      case CODE_FOR_avx512f_vinsertf64x4_mask:
9838	      case CODE_FOR_avx512f_vinserti64x4_mask:
9839	      case CODE_FOR_avx512f_vextractf64x4_mask:
9840	      case CODE_FOR_avx512f_vextracti64x4_mask:
9841	      case CODE_FOR_avx512dq_vinsertf32x8_mask:
9842	      case CODE_FOR_avx512dq_vinserti32x8_mask:
9843	      case CODE_FOR_avx512vl_vinsertv4df:
9844	      case CODE_FOR_avx512vl_vinsertv4di:
9845	      case CODE_FOR_avx512vl_vinsertv8sf:
9846	      case CODE_FOR_avx512vl_vinsertv8si:
9847		error ("the last argument must be a 1-bit immediate");
9848		return const0_rtx;
9849
9850	      case CODE_FOR_avx_vmcmpv2df3:
9851	      case CODE_FOR_avx_vmcmpv4sf3:
9852	      case CODE_FOR_avx_cmpv2df3:
9853	      case CODE_FOR_avx_cmpv4sf3:
9854	      case CODE_FOR_avx_cmpv4df3:
9855	      case CODE_FOR_avx_cmpv8sf3:
9856	      case CODE_FOR_avx512f_cmpv8df3_mask:
9857	      case CODE_FOR_avx512f_cmpv16sf3_mask:
9858	      case CODE_FOR_avx512f_vmcmpv2df3_mask:
9859	      case CODE_FOR_avx512f_vmcmpv4sf3_mask:
9860		error ("the last argument must be a 5-bit immediate");
9861		return const0_rtx;
9862
9863	      default:
9864		switch (nargs_constant)
9865		  {
9866		  case 2:
9867		    if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
9868			|| (!mask_pos && (nargs - i) == nargs_constant))
9869		      {
9870			error ("the next to last argument must be an 8-bit immediate");
9871			break;
9872		      }
9873		    /* FALLTHRU */
9874		  case 1:
9875		    error ("the last argument must be an 8-bit immediate");
9876		    break;
9877		  default:
9878		    gcc_unreachable ();
9879		  }
9880		return const0_rtx;
9881	      }
9882	}
9883      else
9884	{
9885	  if (VECTOR_MODE_P (mode))
9886	    op = safe_vector_operand (op, mode);
9887
9888	  /* Keep track of memory operands: when not optimizing, only one
9889	     memory operand may be generated (extras are forced into
9890	     registers below).  */
9890	  if (memory_operand (op, mode))
9891	    num_memory++;
9892
9893	  op = fixup_modeless_constant (op, mode);
9894
9895	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
9896	    {
9897	      if (optimize || !match || num_memory > 1)
9898		op = copy_to_mode_reg (mode, op);
9899	    }
9900	  else
9901	    {
9902	      op = copy_to_reg (op);
9903	      op = lowpart_subreg (mode, op, GET_MODE (op));
9904	    }
9905	}
9906
9907      args[i].op = op;
9908      args[i].mode = mode;
9909    }
9910
9911  switch (nargs)
9912    {
9913    case 1:
9914      pat = GEN_FCN (icode) (real_target, args[0].op);
9915      break;
9916    case 2:
9917      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
9918      break;
9919    case 3:
9920      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9921			     args[2].op);
9922      break;
9923    case 4:
9924      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9925			     args[2].op, args[3].op);
9926      break;
9927    case 5:
9928      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9929			     args[2].op, args[3].op, args[4].op);
9930      break;
9931    case 6:
9932      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9933			     args[2].op, args[3].op, args[4].op,
9934			     args[5].op);
9935      break;
9936    default:
9937      gcc_unreachable ();
9938    }
9939
9940  if (! pat)
9941    return 0;
9942
9943  emit_insn (pat);
9944  return target;
9945}
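
/* A worked example of the flow above (schematic): for a two-operand
   builtin whose type is, say, V4SF_FTYPE_V4SF_V4SF_INT, the switch sets
   nargs = 3 and nargs_constant = 1, the first two call arguments are
   copied into registers when the insn predicates reject them (or when
   optimizing), the third must satisfy the immediate predicate or one of
   the "must be an N-bit immediate" diagnostics above fires, and finally
   GEN_FCN (icode) is called with the target and the prepared operands.  */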
9946
9947/* Transform a pattern of the following layout:
9948     (set A
9949       (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
9950   into:
9951     (set A B)
9952   i.e. strip the embedded-rounding wrapper from the SET source.  */
9953
9954static rtx
9955ix86_erase_embedded_rounding (rtx pat)
9956{
9957  if (GET_CODE (pat) == INSN)
9958    pat = PATTERN (pat);
9959
9960  gcc_assert (GET_CODE (pat) == SET);
9961  rtx src = SET_SRC (pat);
9962  gcc_assert (XVECLEN (src, 0) == 2);
9963  rtx p0 = XVECEXP (src, 0, 0);
9964  gcc_assert (GET_CODE (src) == UNSPEC
9965	      && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
9966  rtx res = gen_rtx_SET (SET_DEST (pat), p0);
9967  return res;
9968}
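
/* For instance (schematically), an insn of the form

     (set (reg:CCFP flags)
	  (unspec [(compare:CCFP (reg:V4SF x) (reg:V4SF y))
		   (const_int R)]
		  UNSPEC_EMBEDDED_ROUNDING))

   is rewritten to

     (set (reg:CCFP flags)
	  (compare:CCFP (reg:V4SF x) (reg:V4SF y)))

   i.e. the first unspec element replaces the whole unspec, and the
   second element (the rounding immediate R) is simply dropped.  */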
9969
9970/* Subroutine of ix86_expand_round_builtin to take care of comi insns
9971   with rounding.  */
9972static rtx
9973ix86_expand_sse_comi_round (const struct builtin_description *d,
9974			    tree exp, rtx target)
9975{
9976  rtx pat, set_dst;
9977  tree arg0 = CALL_EXPR_ARG (exp, 0);
9978  tree arg1 = CALL_EXPR_ARG (exp, 1);
9979  tree arg2 = CALL_EXPR_ARG (exp, 2);
9980  tree arg3 = CALL_EXPR_ARG (exp, 3);
9981  rtx op0 = expand_normal (arg0);
9982  rtx op1 = expand_normal (arg1);
9983  rtx op2 = expand_normal (arg2);
9984  rtx op3 = expand_normal (arg3);
9985  enum insn_code icode = d->icode;
9986  const struct insn_data_d *insn_p = &insn_data[icode];
9987  machine_mode mode0 = insn_p->operand[0].mode;
9988  machine_mode mode1 = insn_p->operand[1].mode;
9989
9990  /* See avxintrin.h for values.  */
9991  static const enum rtx_code comparisons[32] =
9992    {
9993      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
9994      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
9995      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
9996      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
9997    };
9998  static const bool ordereds[32] =
9999    {
10000      true,  true,  true,  false, false, false, false, true,
10001      false, false, false, true,  true,  true,  true,  false,
10002      true,  true,  true,  false, false, false, false, true,
10003      false, false, false, true,  true,  true,  true,  false
10004    };
10005  static const bool non_signalings[32] =
10006    {
10007      true,  false, false, true,  true,  false, false, true,
10008      true,  false, false, true,  true,  false, false, true,
10009      false, true,  true,  false, false, true,  true,  false,
10010      false, true,  true,  false, false, true,  true,  false
10011    };
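
  /* For example, assuming the _CMP_* encoding from avxintrin.h:
     op2 == 0 (_CMP_EQ_OQ)  selects EQ, ordered, non-signaling;
     op2 == 1 (_CMP_LT_OS)  selects LT, ordered, signaling;
     op2 == 17 (_CMP_LT_OQ) selects LT, ordered, non-signaling.  */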
10012
10013  if (!CONST_INT_P (op2))
10014    {
10015      error ("the third argument must be comparison constant");
10016      return const0_rtx;
10017    }
10018  if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
10019    {
10020      error ("incorrect comparison mode");
10021      return const0_rtx;
10022    }
10023
10024  if (!insn_p->operand[2].predicate (op3, SImode))
10025    {
10026      error ("incorrect rounding operand");
10027      return const0_rtx;
10028    }
10029
10030  if (VECTOR_MODE_P (mode0))
10031    op0 = safe_vector_operand (op0, mode0);
10032  if (VECTOR_MODE_P (mode1))
10033    op1 = safe_vector_operand (op1, mode1);
10034
10035  enum rtx_code comparison = comparisons[INTVAL (op2)];
10036  bool ordered = ordereds[INTVAL (op2)];
10037  bool non_signaling = non_signalings[INTVAL (op2)];
10038  rtx const_val = const0_rtx;
10039
10040  bool check_unordered = false;
10041  machine_mode mode = CCFPmode;
10042  switch (comparison)
10043    {
10044    case ORDERED:
10045      if (!ordered)
10046	{
10047	  /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
10048	  if (!non_signaling)
10049	    ordered = true;
10050	  mode = CCSmode;
10051	}
10052      else
10053	{
10054	  /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S.  */
10055	  if (non_signaling)
10056	    ordered = false;
10057	  mode = CCPmode;
10058	}
10059      comparison = NE;
10060      break;
10061    case UNORDERED:
10062      if (ordered)
10063	{
10064	  /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS.  */
10065	  if (non_signaling)
10066	    ordered = false;
10067	  mode = CCSmode;
10068	}
10069      else
10070	{
10071	  /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S.  */
10072	  if (!non_signaling)
10073	    ordered = true;
10074	  mode = CCPmode;
10075	}
10076      comparison = EQ;
10077      break;
10078
10079    case LE:	/* -> GE  */
10080    case LT:	/* -> GT  */
10081    case UNGE:	/* -> UNLE  */
10082    case UNGT:	/* -> UNLT  */
10083      std::swap (op0, op1);
10084      comparison = swap_condition (comparison);
10085      /* FALLTHRU */
10086    case GT:
10087    case GE:
10088    case UNEQ:
10089    case UNLT:
10090    case UNLE:
10091    case LTGT:
10092      /* These are supported by CCFPmode.  NB: Use ordered/signaling
10093	 COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
10094	 with NAN operands.  */
10095      if (ordered == non_signaling)
10096	ordered = !ordered;
10097      break;
10098    case EQ:
10099      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
10100	 _CMP_EQ_OQ/_CMP_EQ_OS.  */
10101      check_unordered = true;
10102      mode = CCZmode;
10103      break;
10104    case NE:
10105      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
10106	 _CMP_NEQ_UQ/_CMP_NEQ_US.  */
10107      gcc_assert (!ordered);
10108      check_unordered = true;
10109      mode = CCZmode;
10110      const_val = const1_rtx;
10111      break;
10112    default:
10113      gcc_unreachable ();
10114    }
10115
10116  target = gen_reg_rtx (SImode);
10117  emit_move_insn (target, const_val);
10118  target = gen_rtx_SUBREG (QImode, target, 0);
10119
10120  if ((optimize && !register_operand (op0, mode0))
10121      || !insn_p->operand[0].predicate (op0, mode0))
10122    op0 = copy_to_mode_reg (mode0, op0);
10123  if ((optimize && !register_operand (op1, mode1))
10124      || !insn_p->operand[1].predicate (op1, mode1))
10125    op1 = copy_to_mode_reg (mode1, op1);
10126
10127  /*
10128     1. COMI: ordered and signaling.
10129     2. UCOMI: unordered and non-signaling.
10130   */
10131  if (non_signaling)
10132    icode = (icode == CODE_FOR_sse_comi_round
10133	     ? CODE_FOR_sse_ucomi_round
10134	     : CODE_FOR_sse2_ucomi_round);
10135
10136  pat = GEN_FCN (icode) (op0, op1, op3);
10137  if (! pat)
10138    return 0;
10139
10140  /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
10141  if (INTVAL (op3) == NO_ROUND)
10142    {
10143      pat = ix86_erase_embedded_rounding (pat);
10144      if (! pat)
10145	return 0;
10146
10147      set_dst = SET_DEST (pat);
10148    }
10149  else
10150    {
10151      gcc_assert (GET_CODE (pat) == SET);
10152      set_dst = SET_DEST (pat);
10153    }
10154
10155  emit_insn (pat);
10156
10157  rtx_code_label *label = NULL;
10158
10159  /* NB: For ordered EQ or unordered NE, checking ZF alone isn't
10160     sufficient with NAN operands.  */
10161  if (check_unordered)
10162    {
10163      gcc_assert (comparison == EQ || comparison == NE);
10164
10165      rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10166      label = gen_label_rtx ();
10167      rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10168      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10169				  gen_rtx_LABEL_REF (VOIDmode, label),
10170				  pc_rtx);
10171      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10172    }
10173
10174  /* NB: The comparison sets CCFPmode, but we check a different CC mode
10175     which is a subset of CCFPmode.  */
10176  if (GET_MODE (set_dst) != mode)
10177    {
10178      gcc_assert (mode == CCAmode || mode == CCCmode
10179		  || mode == CCOmode || mode == CCPmode
10180		  || mode == CCSmode || mode == CCZmode);
10181      set_dst = gen_rtx_REG (mode, FLAGS_REG);
10182    }
10183
10184  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10185			  gen_rtx_fmt_ee (comparison, QImode,
10186					  set_dst,
10187					  const0_rtx)));
10188
10189  if (label)
10190    emit_label (label);
10191
10192  return SUBREG_REG (target);
10193}
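
/* Net effect, e.g. for an ordered EQ comparison (schematic): target is
   preset to const_val, a COMI/UCOMI of op0 and op1 is emitted, and when
   check_unordered is set a jump over the final setcc is taken on the
   UNORDERED (PF) condition, so that with a NaN operand the preset value
   (0 for EQ, 1 for NE) is what the builtin returns.  */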
10194
10195static rtx
10196ix86_expand_round_builtin (const struct builtin_description *d,
10197			   tree exp, rtx target)
10198{
10199  rtx pat;
10200  unsigned int i, nargs;
10201  struct
10202    {
10203      rtx op;
10204      machine_mode mode;
10205    } args[6];
10206  enum insn_code icode = d->icode;
10207  const struct insn_data_d *insn_p = &insn_data[icode];
10208  machine_mode tmode = insn_p->operand[0].mode;
10209  unsigned int nargs_constant = 0;
10210  unsigned int redundant_embed_rnd = 0;
10211
10212  switch ((enum ix86_builtin_func_type) d->flag)
10213    {
10214    case UINT64_FTYPE_V2DF_INT:
10215    case UINT64_FTYPE_V4SF_INT:
10216    case UINT_FTYPE_V2DF_INT:
10217    case UINT_FTYPE_V4SF_INT:
10218    case INT64_FTYPE_V2DF_INT:
10219    case INT64_FTYPE_V4SF_INT:
10220    case INT_FTYPE_V2DF_INT:
10221    case INT_FTYPE_V4SF_INT:
10222      nargs = 2;
10223      break;
10224    case V4SF_FTYPE_V4SF_UINT_INT:
10225    case V4SF_FTYPE_V4SF_UINT64_INT:
10226    case V2DF_FTYPE_V2DF_UINT64_INT:
10227    case V4SF_FTYPE_V4SF_INT_INT:
10228    case V4SF_FTYPE_V4SF_INT64_INT:
10229    case V2DF_FTYPE_V2DF_INT64_INT:
10230    case V4SF_FTYPE_V4SF_V4SF_INT:
10231    case V2DF_FTYPE_V2DF_V2DF_INT:
10232    case V4SF_FTYPE_V4SF_V2DF_INT:
10233    case V2DF_FTYPE_V2DF_V4SF_INT:
10234      nargs = 3;
10235      break;
10236    case V8SF_FTYPE_V8DF_V8SF_QI_INT:
10237    case V8DF_FTYPE_V8DF_V8DF_QI_INT:
10238    case V8SI_FTYPE_V8DF_V8SI_QI_INT:
10239    case V8DI_FTYPE_V8DF_V8DI_QI_INT:
10240    case V8SF_FTYPE_V8DI_V8SF_QI_INT:
10241    case V8DF_FTYPE_V8DI_V8DF_QI_INT:
10242    case V16SF_FTYPE_V16SF_V16SF_HI_INT:
10243    case V8DI_FTYPE_V8SF_V8DI_QI_INT:
10244    case V16SF_FTYPE_V16SI_V16SF_HI_INT:
10245    case V16SI_FTYPE_V16SF_V16SI_HI_INT:
10246    case V8DF_FTYPE_V8SF_V8DF_QI_INT:
10247    case V16SF_FTYPE_V16HI_V16SF_HI_INT:
10248    case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
10249    case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
10250      nargs = 4;
10251      break;
10252    case V4SF_FTYPE_V4SF_V4SF_INT_INT:
10253    case V2DF_FTYPE_V2DF_V2DF_INT_INT:
10254      nargs_constant = 2;
10255      nargs = 4;
10256      break;
10257    case INT_FTYPE_V4SF_V4SF_INT_INT:
10258    case INT_FTYPE_V2DF_V2DF_INT_INT:
10259      return ix86_expand_sse_comi_round (d, exp, target);
10260    case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
10261    case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
10262    case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
10263    case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
10264    case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
10265    case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
10266    case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
10267    case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
10268      nargs = 5;
10269      break;
10270    case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
10271    case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
10272      nargs_constant = 4;
10273      nargs = 5;
10274      break;
10275    case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
10276    case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
10277    case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
10278    case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
10279      nargs_constant = 3;
10280      nargs = 5;
10281      break;
10282    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
10283    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
10284    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
10285    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
10286    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
10287    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
10288      nargs = 6;
10289      nargs_constant = 4;
10290      break;
10291    case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
10292    case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
10293    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
10294    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
10295      nargs = 6;
10296      nargs_constant = 3;
10297      break;
10298    default:
10299      gcc_unreachable ();
10300    }
10301  gcc_assert (nargs <= ARRAY_SIZE (args));
10302
10303  if (optimize
10304      || target == 0
10305      || GET_MODE (target) != tmode
10306      || !insn_p->operand[0].predicate (target, tmode))
10307    target = gen_reg_rtx (tmode);
10308
10309  for (i = 0; i < nargs; i++)
10310    {
10311      tree arg = CALL_EXPR_ARG (exp, i);
10312      rtx op = expand_normal (arg);
10313      machine_mode mode = insn_p->operand[i + 1].mode;
10314      bool match = insn_p->operand[i + 1].predicate (op, mode);
10315
10316      if (i == nargs - nargs_constant)
10317	{
10318	  if (!match)
10319	    {
10320	      switch (icode)
10321		{
10322		case CODE_FOR_avx512f_getmantv8df_mask_round:
10323		case CODE_FOR_avx512f_getmantv16sf_mask_round:
10324		case CODE_FOR_avx512f_vgetmantv2df_round:
10325		case CODE_FOR_avx512f_vgetmantv2df_mask_round:
10326		case CODE_FOR_avx512f_vgetmantv4sf_round:
10327		case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
10328		  error ("the immediate argument must be a 4-bit immediate");
10329		  return const0_rtx;
10330		case CODE_FOR_avx512f_cmpv8df3_mask_round:
10331		case CODE_FOR_avx512f_cmpv16sf3_mask_round:
10332		case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
10333		case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
10334		  error ("the immediate argument must be a 5-bit immediate");
10335		  return const0_rtx;
10336		default:
10337		  error ("the immediate argument must be an 8-bit immediate");
10338		  return const0_rtx;
10339		}
10340	    }
10341	}
10342      else if (i == nargs - 1)
10343	{
10344	  if (!insn_p->operand[nargs].predicate (op, SImode))
10345	    {
10346	      error ("incorrect rounding operand");
10347	      return const0_rtx;
10348	    }
10349
10350	  /* If there is no rounding, use the normal version of the pattern.  */
10351	  if (INTVAL (op) == NO_ROUND)
10352	    redundant_embed_rnd = 1;
10353	}
10354      else
10355	{
10356	  if (VECTOR_MODE_P (mode))
10357	    op = safe_vector_operand (op, mode);
10358
10359	  op = fixup_modeless_constant (op, mode);
10360
10361	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10362	    {
10363	      if (optimize || !match)
10364		op = copy_to_mode_reg (mode, op);
10365	    }
10366	  else
10367	    {
10368	      op = copy_to_reg (op);
10369	      op = lowpart_subreg (mode, op, GET_MODE (op));
10370	    }
10371	}
10372
10373      args[i].op = op;
10374      args[i].mode = mode;
10375    }
10376
10377  switch (nargs)
10378    {
10379    case 1:
10380      pat = GEN_FCN (icode) (target, args[0].op);
10381      break;
10382    case 2:
10383      pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10384      break;
10385    case 3:
10386      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10387			     args[2].op);
10388      break;
10389    case 4:
10390      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10391			     args[2].op, args[3].op);
10392      break;
10393    case 5:
10394      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10395			     args[2].op, args[3].op, args[4].op);
10396      break;
10397    case 6:
10398      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10399			     args[2].op, args[3].op, args[4].op,
10400			     args[5].op);
10401      break;
10402    default:
10403      gcc_unreachable ();
10404    }
10405
10406  if (!pat)
10407    return 0;
10408
10409  if (redundant_embed_rnd)
10410    pat = ix86_erase_embedded_rounding (pat);
10411
10412  emit_insn (pat);
10413  return target;
10414}
10415
10416/* Subroutine of ix86_expand_builtin to take care of special insns
   with a variable number of operands.  */
10418
10419static rtx
10420ix86_expand_special_args_builtin (const struct builtin_description *d,
10421				  tree exp, rtx target)
10422{
10423  tree arg;
10424  rtx pat, op;
10425  unsigned int i, nargs, arg_adjust, memory;
10426  bool aligned_mem = false;
10427  struct
10428    {
10429      rtx op;
10430      machine_mode mode;
10431    } args[3];
10432  enum insn_code icode = d->icode;
10433  bool last_arg_constant = false;
10434  const struct insn_data_d *insn_p = &insn_data[icode];
10435  machine_mode tmode = insn_p->operand[0].mode;
10436  enum { load, store } klass;
10437
10438  switch ((enum ix86_builtin_func_type) d->flag)
10439    {
10440    case VOID_FTYPE_VOID:
10441      emit_insn (GEN_FCN (icode) (target));
10442      return 0;
10443    case VOID_FTYPE_UINT64:
10444    case VOID_FTYPE_UNSIGNED:
10445      nargs = 0;
10446      klass = store;
10447      memory = 0;
10448      break;
10449
10450    case INT_FTYPE_VOID:
10451    case USHORT_FTYPE_VOID:
10452    case UINT64_FTYPE_VOID:
10453    case UINT_FTYPE_VOID:
10454    case UNSIGNED_FTYPE_VOID:
10455      nargs = 0;
10456      klass = load;
10457      memory = 0;
10458      break;
10459    case UINT64_FTYPE_PUNSIGNED:
10460    case V2DI_FTYPE_PV2DI:
10461    case V4DI_FTYPE_PV4DI:
10462    case V32QI_FTYPE_PCCHAR:
10463    case V16QI_FTYPE_PCCHAR:
10464    case V8SF_FTYPE_PCV4SF:
10465    case V8SF_FTYPE_PCFLOAT:
10466    case V4SF_FTYPE_PCFLOAT:
10467    case V4DF_FTYPE_PCV2DF:
10468    case V4DF_FTYPE_PCDOUBLE:
10469    case V2DF_FTYPE_PCDOUBLE:
10470    case VOID_FTYPE_PVOID:
10471    case V8DI_FTYPE_PV8DI:
10472      nargs = 1;
10473      klass = load;
10474      memory = 0;
10475      switch (icode)
10476	{
10477	case CODE_FOR_sse4_1_movntdqa:
10478	case CODE_FOR_avx2_movntdqa:
10479	case CODE_FOR_avx512f_movntdqa:
10480	  aligned_mem = true;
10481	  break;
10482	default:
10483	  break;
10484	}
10485      break;
10486    case VOID_FTYPE_PV2SF_V4SF:
10487    case VOID_FTYPE_PV8DI_V8DI:
10488    case VOID_FTYPE_PV4DI_V4DI:
10489    case VOID_FTYPE_PV2DI_V2DI:
10490    case VOID_FTYPE_PCHAR_V32QI:
10491    case VOID_FTYPE_PCHAR_V16QI:
10492    case VOID_FTYPE_PFLOAT_V16SF:
10493    case VOID_FTYPE_PFLOAT_V8SF:
10494    case VOID_FTYPE_PFLOAT_V4SF:
10495    case VOID_FTYPE_PDOUBLE_V8DF:
10496    case VOID_FTYPE_PDOUBLE_V4DF:
10497    case VOID_FTYPE_PDOUBLE_V2DF:
10498    case VOID_FTYPE_PLONGLONG_LONGLONG:
10499    case VOID_FTYPE_PULONGLONG_ULONGLONG:
10500    case VOID_FTYPE_PUNSIGNED_UNSIGNED:
10501    case VOID_FTYPE_PINT_INT:
10502      nargs = 1;
10503      klass = store;
10504      /* Reserve memory operand for target.  */
10505      memory = ARRAY_SIZE (args);
10506      switch (icode)
10507	{
10508	/* These builtins and instructions require the memory
10509	   to be properly aligned.  */
10510	case CODE_FOR_avx_movntv4di:
10511	case CODE_FOR_sse2_movntv2di:
10512	case CODE_FOR_avx_movntv8sf:
10513	case CODE_FOR_sse_movntv4sf:
10514	case CODE_FOR_sse4a_vmmovntv4sf:
10515	case CODE_FOR_avx_movntv4df:
10516	case CODE_FOR_sse2_movntv2df:
10517	case CODE_FOR_sse4a_vmmovntv2df:
10518	case CODE_FOR_sse2_movntidi:
10519	case CODE_FOR_sse_movntq:
10520	case CODE_FOR_sse2_movntisi:
10521	case CODE_FOR_avx512f_movntv16sf:
10522	case CODE_FOR_avx512f_movntv8df:
10523	case CODE_FOR_avx512f_movntv8di:
10524	  aligned_mem = true;
10525	  break;
10526	default:
10527	  break;
10528	}
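      /* Illustrative only: a non-temporal store intrinsic such as
	 _mm256_stream_pd is expected to come through one of the movnt
	 patterns above; the underlying instruction faults on a
	 misaligned operand, hence aligned_mem, which makes the MEM
	 built for the store below advertise at least the mode
	 alignment.  */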
10529      break;
    case VOID_FTYPE_PVOID_PCVOID:
      nargs = 1;
      klass = store;
      memory = 0;
      break;
10536    case V4SF_FTYPE_V4SF_PCV2SF:
10537    case V2DF_FTYPE_V2DF_PCDOUBLE:
10538      nargs = 2;
10539      klass = load;
10540      memory = 1;
10541      break;
10542    case V8SF_FTYPE_PCV8SF_V8SI:
10543    case V4DF_FTYPE_PCV4DF_V4DI:
10544    case V4SF_FTYPE_PCV4SF_V4SI:
10545    case V2DF_FTYPE_PCV2DF_V2DI:
10546    case V8SI_FTYPE_PCV8SI_V8SI:
10547    case V4DI_FTYPE_PCV4DI_V4DI:
10548    case V4SI_FTYPE_PCV4SI_V4SI:
10549    case V2DI_FTYPE_PCV2DI_V2DI:
10550    case VOID_FTYPE_INT_INT64:
10551      nargs = 2;
10552      klass = load;
10553      memory = 0;
10554      break;
10555    case VOID_FTYPE_PV8DF_V8DF_UQI:
10556    case VOID_FTYPE_PV4DF_V4DF_UQI:
10557    case VOID_FTYPE_PV2DF_V2DF_UQI:
10558    case VOID_FTYPE_PV16SF_V16SF_UHI:
10559    case VOID_FTYPE_PV8SF_V8SF_UQI:
10560    case VOID_FTYPE_PV4SF_V4SF_UQI:
10561    case VOID_FTYPE_PV8DI_V8DI_UQI:
10562    case VOID_FTYPE_PV4DI_V4DI_UQI:
10563    case VOID_FTYPE_PV2DI_V2DI_UQI:
10564    case VOID_FTYPE_PV16SI_V16SI_UHI:
10565    case VOID_FTYPE_PV8SI_V8SI_UQI:
10566    case VOID_FTYPE_PV4SI_V4SI_UQI:
10567    case VOID_FTYPE_PV64QI_V64QI_UDI:
10568    case VOID_FTYPE_PV32HI_V32HI_USI:
10569    case VOID_FTYPE_PV32QI_V32QI_USI:
10570    case VOID_FTYPE_PV16QI_V16QI_UHI:
10571    case VOID_FTYPE_PV16HI_V16HI_UHI:
10572    case VOID_FTYPE_PV8HI_V8HI_UQI:
10573      switch (icode)
10574	{
10575	/* These builtins and instructions require the memory
10576	   to be properly aligned.  */
10577	case CODE_FOR_avx512f_storev16sf_mask:
10578	case CODE_FOR_avx512f_storev16si_mask:
10579	case CODE_FOR_avx512f_storev8df_mask:
10580	case CODE_FOR_avx512f_storev8di_mask:
10581	case CODE_FOR_avx512vl_storev8sf_mask:
10582	case CODE_FOR_avx512vl_storev8si_mask:
10583	case CODE_FOR_avx512vl_storev4df_mask:
10584	case CODE_FOR_avx512vl_storev4di_mask:
10585	case CODE_FOR_avx512vl_storev4sf_mask:
10586	case CODE_FOR_avx512vl_storev4si_mask:
10587	case CODE_FOR_avx512vl_storev2df_mask:
10588	case CODE_FOR_avx512vl_storev2di_mask:
10589	  aligned_mem = true;
10590	  break;
10591	default:
10592	  break;
10593	}
10594      /* FALLTHRU */
10595    case VOID_FTYPE_PV8SF_V8SI_V8SF:
10596    case VOID_FTYPE_PV4DF_V4DI_V4DF:
10597    case VOID_FTYPE_PV4SF_V4SI_V4SF:
10598    case VOID_FTYPE_PV2DF_V2DI_V2DF:
10599    case VOID_FTYPE_PV8SI_V8SI_V8SI:
10600    case VOID_FTYPE_PV4DI_V4DI_V4DI:
10601    case VOID_FTYPE_PV4SI_V4SI_V4SI:
10602    case VOID_FTYPE_PV2DI_V2DI_V2DI:
10603    case VOID_FTYPE_PV8SI_V8DI_UQI:
10604    case VOID_FTYPE_PV8HI_V8DI_UQI:
10605    case VOID_FTYPE_PV16HI_V16SI_UHI:
10606    case VOID_FTYPE_PV16QI_V8DI_UQI:
10607    case VOID_FTYPE_PV16QI_V16SI_UHI:
10608    case VOID_FTYPE_PV4SI_V4DI_UQI:
10609    case VOID_FTYPE_PV4SI_V2DI_UQI:
10610    case VOID_FTYPE_PV8HI_V4DI_UQI:
10611    case VOID_FTYPE_PV8HI_V2DI_UQI:
10612    case VOID_FTYPE_PV8HI_V8SI_UQI:
10613    case VOID_FTYPE_PV8HI_V4SI_UQI:
10614    case VOID_FTYPE_PV16QI_V4DI_UQI:
10615    case VOID_FTYPE_PV16QI_V2DI_UQI:
10616    case VOID_FTYPE_PV16QI_V8SI_UQI:
10617    case VOID_FTYPE_PV16QI_V4SI_UQI:
10618    case VOID_FTYPE_PCHAR_V64QI_UDI:
10619    case VOID_FTYPE_PCHAR_V32QI_USI:
10620    case VOID_FTYPE_PCHAR_V16QI_UHI:
10621    case VOID_FTYPE_PSHORT_V32HI_USI:
10622    case VOID_FTYPE_PSHORT_V16HI_UHI:
10623    case VOID_FTYPE_PSHORT_V8HI_UQI:
10624    case VOID_FTYPE_PINT_V16SI_UHI:
10625    case VOID_FTYPE_PINT_V8SI_UQI:
10626    case VOID_FTYPE_PINT_V4SI_UQI:
10627    case VOID_FTYPE_PINT64_V8DI_UQI:
10628    case VOID_FTYPE_PINT64_V4DI_UQI:
10629    case VOID_FTYPE_PINT64_V2DI_UQI:
10630    case VOID_FTYPE_PDOUBLE_V8DF_UQI:
10631    case VOID_FTYPE_PDOUBLE_V4DF_UQI:
10632    case VOID_FTYPE_PDOUBLE_V2DF_UQI:
10633    case VOID_FTYPE_PFLOAT_V16SF_UHI:
10634    case VOID_FTYPE_PFLOAT_V8SF_UQI:
10635    case VOID_FTYPE_PFLOAT_V4SF_UQI:
10636    case VOID_FTYPE_PV32QI_V32HI_USI:
10637    case VOID_FTYPE_PV16QI_V16HI_UHI:
10638    case VOID_FTYPE_PV8QI_V8HI_UQI:
10639      nargs = 2;
10640      klass = store;
10641      /* Reserve memory operand for target.  */
10642      memory = ARRAY_SIZE (args);
10643      break;
10644    case V4SF_FTYPE_PCV4SF_V4SF_UQI:
10645    case V8SF_FTYPE_PCV8SF_V8SF_UQI:
10646    case V16SF_FTYPE_PCV16SF_V16SF_UHI:
10647    case V4SI_FTYPE_PCV4SI_V4SI_UQI:
10648    case V8SI_FTYPE_PCV8SI_V8SI_UQI:
10649    case V16SI_FTYPE_PCV16SI_V16SI_UHI:
10650    case V2DF_FTYPE_PCV2DF_V2DF_UQI:
10651    case V4DF_FTYPE_PCV4DF_V4DF_UQI:
10652    case V8DF_FTYPE_PCV8DF_V8DF_UQI:
10653    case V2DI_FTYPE_PCV2DI_V2DI_UQI:
10654    case V4DI_FTYPE_PCV4DI_V4DI_UQI:
10655    case V8DI_FTYPE_PCV8DI_V8DI_UQI:
10656    case V64QI_FTYPE_PCV64QI_V64QI_UDI:
10657    case V32HI_FTYPE_PCV32HI_V32HI_USI:
10658    case V32QI_FTYPE_PCV32QI_V32QI_USI:
10659    case V16QI_FTYPE_PCV16QI_V16QI_UHI:
10660    case V16HI_FTYPE_PCV16HI_V16HI_UHI:
10661    case V8HI_FTYPE_PCV8HI_V8HI_UQI:
10662      switch (icode)
10663	{
10664	/* These builtins and instructions require the memory
10665	   to be properly aligned.  */
10666	case CODE_FOR_avx512f_loadv16sf_mask:
10667	case CODE_FOR_avx512f_loadv16si_mask:
10668	case CODE_FOR_avx512f_loadv8df_mask:
10669	case CODE_FOR_avx512f_loadv8di_mask:
10670	case CODE_FOR_avx512vl_loadv8sf_mask:
10671	case CODE_FOR_avx512vl_loadv8si_mask:
10672	case CODE_FOR_avx512vl_loadv4df_mask:
10673	case CODE_FOR_avx512vl_loadv4di_mask:
10674	case CODE_FOR_avx512vl_loadv4sf_mask:
10675	case CODE_FOR_avx512vl_loadv4si_mask:
10676	case CODE_FOR_avx512vl_loadv2df_mask:
10677	case CODE_FOR_avx512vl_loadv2di_mask:
10678	case CODE_FOR_avx512bw_loadv64qi_mask:
10679	case CODE_FOR_avx512vl_loadv32qi_mask:
10680	case CODE_FOR_avx512vl_loadv16qi_mask:
10681	case CODE_FOR_avx512bw_loadv32hi_mask:
10682	case CODE_FOR_avx512vl_loadv16hi_mask:
10683	case CODE_FOR_avx512vl_loadv8hi_mask:
10684	  aligned_mem = true;
10685	  break;
10686	default:
10687	  break;
10688	}
10689      /* FALLTHRU */
10690    case V64QI_FTYPE_PCCHAR_V64QI_UDI:
10691    case V32QI_FTYPE_PCCHAR_V32QI_USI:
10692    case V16QI_FTYPE_PCCHAR_V16QI_UHI:
10693    case V32HI_FTYPE_PCSHORT_V32HI_USI:
10694    case V16HI_FTYPE_PCSHORT_V16HI_UHI:
10695    case V8HI_FTYPE_PCSHORT_V8HI_UQI:
10696    case V16SI_FTYPE_PCINT_V16SI_UHI:
10697    case V8SI_FTYPE_PCINT_V8SI_UQI:
10698    case V4SI_FTYPE_PCINT_V4SI_UQI:
10699    case V8DI_FTYPE_PCINT64_V8DI_UQI:
10700    case V4DI_FTYPE_PCINT64_V4DI_UQI:
10701    case V2DI_FTYPE_PCINT64_V2DI_UQI:
10702    case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
10703    case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
10704    case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
10705    case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
10706    case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
10707    case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
10708      nargs = 3;
10709      klass = load;
10710      memory = 0;
10711      break;
10712    case VOID_FTYPE_UINT_UINT_UINT:
10713    case VOID_FTYPE_UINT64_UINT_UINT:
10714    case UCHAR_FTYPE_UINT_UINT_UINT:
10715    case UCHAR_FTYPE_UINT64_UINT_UINT:
10716      nargs = 3;
10717      klass = load;
10718      memory = ARRAY_SIZE (args);
10719      last_arg_constant = true;
10720      break;
10721    default:
10722      gcc_unreachable ();
10723    }
10724
10725  gcc_assert (nargs <= ARRAY_SIZE (args));
10726
10727  if (klass == store)
10728    {
10729      arg = CALL_EXPR_ARG (exp, 0);
10730      op = expand_normal (arg);
10731      gcc_assert (target == 0);
10732      if (memory)
10733	{
10734	  op = ix86_zero_extend_to_Pmode (op);
10735	  target = gen_rtx_MEM (tmode, op);
10736	  /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10737	     on it.  Try to improve it using get_pointer_alignment,
10738	     and if the special builtin is one that requires strict
	     mode alignment, also from its GET_MODE_ALIGNMENT.
10740	     Failure to do so could lead to ix86_legitimate_combined_insn
10741	     rejecting all changes to such insns.  */
10742	  unsigned int align = get_pointer_alignment (arg);
10743	  if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
10744	    align = GET_MODE_ALIGNMENT (tmode);
10745	  if (MEM_ALIGN (target) < align)
10746	    set_mem_align (target, align);
10747	}
10748      else
10749	target = force_reg (tmode, op);
10750      arg_adjust = 1;
10751    }
10752  else
10753    {
10754      arg_adjust = 0;
10755      if (optimize
10756	  || target == 0
10757	  || !register_operand (target, tmode)
10758	  || GET_MODE (target) != tmode)
10759	target = gen_reg_rtx (tmode);
10760    }
10761
10762  for (i = 0; i < nargs; i++)
10763    {
10764      machine_mode mode = insn_p->operand[i + 1].mode;
10765      bool match;
10766
10767      arg = CALL_EXPR_ARG (exp, i + arg_adjust);
10768      op = expand_normal (arg);
10769      match = insn_p->operand[i + 1].predicate (op, mode);
10770
10771      if (last_arg_constant && (i + 1) == nargs)
10772	{
10773	  if (!match)
10774	    {
10775	      if (icode == CODE_FOR_lwp_lwpvalsi3
10776		  || icode == CODE_FOR_lwp_lwpinssi3
10777		  || icode == CODE_FOR_lwp_lwpvaldi3
10778		  || icode == CODE_FOR_lwp_lwpinsdi3)
10779		error ("the last argument must be a 32-bit immediate");
10780	      else
10781		error ("the last argument must be an 8-bit immediate");
10782	      return const0_rtx;
10783	    }
10784	}
10785      else
10786	{
10787	  if (i == memory)
10788	    {
10789	      /* This must be the memory operand.  */
10790	      op = ix86_zero_extend_to_Pmode (op);
10791	      op = gen_rtx_MEM (mode, op);
10792	      /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10793		 on it.  Try to improve it using get_pointer_alignment,
10794		 and if the special builtin is one that requires strict
		 mode alignment, also from its GET_MODE_ALIGNMENT.
10796		 Failure to do so could lead to ix86_legitimate_combined_insn
10797		 rejecting all changes to such insns.  */
10798	      unsigned int align = get_pointer_alignment (arg);
10799	      if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
10800		align = GET_MODE_ALIGNMENT (mode);
10801	      if (MEM_ALIGN (op) < align)
10802		set_mem_align (op, align);
10803	    }
10804	  else
10805	    {
	      /* This must be a register.  */
10807	      if (VECTOR_MODE_P (mode))
10808		op = safe_vector_operand (op, mode);
10809
10810	      op = fixup_modeless_constant (op, mode);
10811
10812	      if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10813		op = copy_to_mode_reg (mode, op);
10814	      else
10815	        {
10816	          op = copy_to_reg (op);
10817	          op = lowpart_subreg (mode, op, GET_MODE (op));
10818	        }
10819	    }
10820	}
10821
10822      args[i].op = op;
10823      args[i].mode = mode;
10824    }
10825
10826  switch (nargs)
10827    {
10828    case 0:
10829      pat = GEN_FCN (icode) (target);
10830      break;
10831    case 1:
10832      pat = GEN_FCN (icode) (target, args[0].op);
10833      break;
10834    case 2:
10835      pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10836      break;
10837    case 3:
10838      pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
10839      break;
10840    default:
10841      gcc_unreachable ();
10842    }
10843
10844  if (! pat)
10845    return 0;
10846  emit_insn (pat);
10847  return klass == store ? 0 : target;
10848}
10849
10850/* Return the integer constant in ARG.  Constrain it to be in the range
10851   of the subparts of VEC_TYPE; issue an error if not.  */
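
/* Illustrative only: for a V4SF argument TYPE_VECTOR_SUBPARTS is 4, so a
   call such as __builtin_ia32_vec_ext_v4sf (x, 4) is diagnosed here;
   valid selectors are 0 through 3.  */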
10852
10853static int
10854get_element_number (tree vec_type, tree arg)
10855{
10856  unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
10857
10858  if (!tree_fits_uhwi_p (arg)
10859      || (elt = tree_to_uhwi (arg), elt > max))
10860    {
10861      error ("selector must be an integer constant in the range "
10862	     "[0, %wi]", max);
10863      return 0;
10864    }
10865
10866  return elt;
10867}
10868
10869/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
10870   ix86_expand_vector_init.  We DO have language-level syntax for this, in
10871   the form of  (type){ init-list }.  Except that since we can't place emms
10872   instructions from inside the compiler, we can't allow the use of MMX
10873   registers unless the user explicitly asks for it.  So we do *not* define
10874   vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
   we have builtins, invoked by mmintrin.h, that give us license to emit
10876   these sorts of instructions.  */
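
/* Illustrative only: mmintrin.h is expected to implement intrinsics such
   as _mm_set_pi32 on top of __builtin_ia32_vec_init_v2si, which is
   routed through this function.  */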
10877
10878static rtx
10879ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
10880{
10881  machine_mode tmode = TYPE_MODE (type);
10882  machine_mode inner_mode = GET_MODE_INNER (tmode);
10883  int i, n_elt = GET_MODE_NUNITS (tmode);
10884  rtvec v = rtvec_alloc (n_elt);
10885
10886  gcc_assert (VECTOR_MODE_P (tmode));
10887  gcc_assert (call_expr_nargs (exp) == n_elt);
10888
10889  for (i = 0; i < n_elt; ++i)
10890    {
10891      rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
10892      RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
10893    }
10894
10895  if (!target || !register_operand (target, tmode))
10896    target = gen_reg_rtx (tmode);
10897
10898  ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
10899  return target;
10900}
10901
10902/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
10903   ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
10904   had a language-level syntax for referencing vector elements.  */
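
/* Illustrative only: emmintrin.h is expected to implement intrinsics
   such as _mm_extract_epi16 on top of __builtin_ia32_vec_ext_v8hi,
   which ends up here.  */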
10905
10906static rtx
10907ix86_expand_vec_ext_builtin (tree exp, rtx target)
10908{
10909  machine_mode tmode, mode0;
10910  tree arg0, arg1;
10911  int elt;
10912  rtx op0;
10913
10914  arg0 = CALL_EXPR_ARG (exp, 0);
10915  arg1 = CALL_EXPR_ARG (exp, 1);
10916
10917  op0 = expand_normal (arg0);
10918  elt = get_element_number (TREE_TYPE (arg0), arg1);
10919
10920  tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10921  mode0 = TYPE_MODE (TREE_TYPE (arg0));
10922  gcc_assert (VECTOR_MODE_P (mode0));
10923
10924  op0 = force_reg (mode0, op0);
10925
10926  if (optimize || !target || !register_operand (target, tmode))
10927    target = gen_reg_rtx (tmode);
10928
10929  ix86_expand_vector_extract (true, target, op0, elt);
10930
10931  return target;
10932}
10933
10934/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
10935   ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
10936   a language-level syntax for referencing vector elements.  */
10937
10938static rtx
10939ix86_expand_vec_set_builtin (tree exp)
10940{
10941  machine_mode tmode, mode1;
10942  tree arg0, arg1, arg2;
10943  int elt;
10944  rtx op0, op1, target;
10945
10946  arg0 = CALL_EXPR_ARG (exp, 0);
10947  arg1 = CALL_EXPR_ARG (exp, 1);
10948  arg2 = CALL_EXPR_ARG (exp, 2);
10949
10950  tmode = TYPE_MODE (TREE_TYPE (arg0));
10951  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10952  gcc_assert (VECTOR_MODE_P (tmode));
10953
10954  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
10955  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
10956  elt = get_element_number (TREE_TYPE (arg0), arg2);
10957
10958  if (GET_MODE (op1) != mode1)
10959    op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
10960
10961  op0 = force_reg (tmode, op0);
10962  op1 = force_reg (mode1, op1);
10963
10964  /* OP0 is the source of these builtin functions and shouldn't be
10965     modified.  Create a copy, use it and return it as target.  */
10966  target = gen_reg_rtx (tmode);
10967  emit_move_insn (target, op0);
10968  ix86_expand_vector_set (true, target, op1, elt);
10969
10970  return target;
10971}
10972
10973/* Expand an expression EXP that calls a built-in function,
10974   with result going to TARGET if that's convenient
10975   (and in mode MODE if that's convenient).
10976   SUBTARGET may be used as the target for computing one of EXP's operands.
10977   IGNORE is nonzero if the value is to be ignored.  */
10978
10979rtx
10980ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
10981		     machine_mode mode, int ignore)
10982{
10983  size_t i;
10984  enum insn_code icode, icode2;
10985  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
10986  tree arg0, arg1, arg2, arg3, arg4;
10987  rtx op0, op1, op2, op3, op4, pat, pat2, insn;
10988  machine_mode mode0, mode1, mode2, mode3, mode4;
10989  unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
10990
10991  /* For CPU builtins that can be folded, fold first and expand the fold.  */
10992  switch (fcode)
10993    {
10994    case IX86_BUILTIN_CPU_INIT:
10995      {
10996	/* Make it call __cpu_indicator_init in libgcc. */
10997	tree call_expr, fndecl, type;
10998        type = build_function_type_list (integer_type_node, NULL_TREE);
10999	fndecl = build_fn_decl ("__cpu_indicator_init", type);
11000	call_expr = build_call_expr (fndecl, 0);
11001	return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
11002      }
11003    case IX86_BUILTIN_CPU_IS:
11004    case IX86_BUILTIN_CPU_SUPPORTS:
11005      {
11006	tree arg0 = CALL_EXPR_ARG (exp, 0);
11007	tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
11008	gcc_assert (fold_expr != NULL_TREE);
11009	return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
11010      }
11011    }
11012
11013  HOST_WIDE_INT isa = ix86_isa_flags;
11014  HOST_WIDE_INT isa2 = ix86_isa_flags2;
11015  HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
11016  HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
11017  /* The general case is we require all the ISAs specified in bisa{,2}
11018     to be enabled.
11019     The exceptions are:
11020     OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
11021     OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
11022     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
     where for each such pair it is sufficient if either of the ISAs is
     enabled; any other options ored in with such a pair must still all
     be enabled.
11025     OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */
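  /* Illustrative only: a builtin recorded with
     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 is accepted when just
     -mfma or just -mfma4 is enabled, whereas one recorded with, say,
     OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_AVX512VL still requires
     both ISAs.  */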
11026  if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11027       == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11028      && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
11029    isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
11030  if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11031       == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11032      && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
11033    isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
11034  if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11035       == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11036      && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
11037    isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
11038  if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE)
11039    {
11040      bisa &= ~OPTION_MASK_ISA_MMX;
11041      bisa |= OPTION_MASK_ISA_SSE2;
11042    }
11043  if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
11044    {
11045      bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
11046      if (TARGET_ABI_X32)
11047	bisa |= OPTION_MASK_ABI_X32;
11048      else
11049	bisa |= OPTION_MASK_ABI_64;
11050      char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
11051				       (enum fpmath_unit) 0,
11052				       (enum prefer_vector_width) 0,
11053				       false, add_abi_p);
11054      if (!opts)
11055	error ("%qE needs unknown isa option", fndecl);
11056      else
11057	{
11058	  gcc_assert (opts != NULL);
11059	  error ("%qE needs isa option %s", fndecl, opts);
11060	  free (opts);
11061	}
11062      return expand_call (exp, target, ignore);
11063    }
11064
11065  switch (fcode)
11066    {
11067    case IX86_BUILTIN_MASKMOVQ:
11068    case IX86_BUILTIN_MASKMOVDQU:
11069      icode = (fcode == IX86_BUILTIN_MASKMOVQ
11070	       ? CODE_FOR_mmx_maskmovq
11071	       : CODE_FOR_sse2_maskmovdqu);
11072      /* Note the arg order is different from the operand order.  */
11073      arg1 = CALL_EXPR_ARG (exp, 0);
11074      arg2 = CALL_EXPR_ARG (exp, 1);
11075      arg0 = CALL_EXPR_ARG (exp, 2);
11076      op0 = expand_normal (arg0);
11077      op1 = expand_normal (arg1);
11078      op2 = expand_normal (arg2);
11079      mode0 = insn_data[icode].operand[0].mode;
11080      mode1 = insn_data[icode].operand[1].mode;
11081      mode2 = insn_data[icode].operand[2].mode;
11082
11083      op0 = ix86_zero_extend_to_Pmode (op0);
11084      op0 = gen_rtx_MEM (mode1, op0);
11085
11086      if (!insn_data[icode].operand[0].predicate (op0, mode0))
11087	op0 = copy_to_mode_reg (mode0, op0);
11088      if (!insn_data[icode].operand[1].predicate (op1, mode1))
11089	op1 = copy_to_mode_reg (mode1, op1);
11090      if (!insn_data[icode].operand[2].predicate (op2, mode2))
11091	op2 = copy_to_mode_reg (mode2, op2);
11092      pat = GEN_FCN (icode) (op0, op1, op2);
11093      if (! pat)
11094	return 0;
11095      emit_insn (pat);
11096      return 0;
11097
11098    case IX86_BUILTIN_LDMXCSR:
11099      op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
11100      target = assign_386_stack_local (SImode, SLOT_TEMP);
11101      emit_move_insn (target, op0);
11102      emit_insn (gen_sse_ldmxcsr (target));
11103      return 0;
11104
11105    case IX86_BUILTIN_STMXCSR:
11106      target = assign_386_stack_local (SImode, SLOT_TEMP);
11107      emit_insn (gen_sse_stmxcsr (target));
11108      return copy_to_mode_reg (SImode, target);
11109
11110    case IX86_BUILTIN_CLFLUSH:
11111	arg0 = CALL_EXPR_ARG (exp, 0);
11112	op0 = expand_normal (arg0);
11113	icode = CODE_FOR_sse2_clflush;
11114	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11115	  op0 = ix86_zero_extend_to_Pmode (op0);
11116
11117	emit_insn (gen_sse2_clflush (op0));
11118	return 0;
11119
11120    case IX86_BUILTIN_CLWB:
11121	arg0 = CALL_EXPR_ARG (exp, 0);
11122	op0 = expand_normal (arg0);
11123	icode = CODE_FOR_clwb;
11124	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11125	  op0 = ix86_zero_extend_to_Pmode (op0);
11126
11127	emit_insn (gen_clwb (op0));
11128	return 0;
11129
11130    case IX86_BUILTIN_CLFLUSHOPT:
11131	arg0 = CALL_EXPR_ARG (exp, 0);
11132	op0 = expand_normal (arg0);
11133	icode = CODE_FOR_clflushopt;
11134	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11135	  op0 = ix86_zero_extend_to_Pmode (op0);
11136
11137	emit_insn (gen_clflushopt (op0));
11138	return 0;
11139
11140    case IX86_BUILTIN_MONITOR:
11141    case IX86_BUILTIN_MONITORX:
11142      arg0 = CALL_EXPR_ARG (exp, 0);
11143      arg1 = CALL_EXPR_ARG (exp, 1);
11144      arg2 = CALL_EXPR_ARG (exp, 2);
11145      op0 = expand_normal (arg0);
11146      op1 = expand_normal (arg1);
11147      op2 = expand_normal (arg2);
11148      if (!REG_P (op0))
11149	op0 = ix86_zero_extend_to_Pmode (op0);
11150      if (!REG_P (op1))
11151	op1 = copy_to_mode_reg (SImode, op1);
11152      if (!REG_P (op2))
11153	op2 = copy_to_mode_reg (SImode, op2);
11154
11155      emit_insn (fcode == IX86_BUILTIN_MONITOR
11156		 ? gen_sse3_monitor (Pmode, op0, op1, op2)
11157		 : gen_monitorx (Pmode, op0, op1, op2));
11158      return 0;
11159
11160    case IX86_BUILTIN_MWAIT:
11161      arg0 = CALL_EXPR_ARG (exp, 0);
11162      arg1 = CALL_EXPR_ARG (exp, 1);
11163      op0 = expand_normal (arg0);
11164      op1 = expand_normal (arg1);
11165      if (!REG_P (op0))
11166	op0 = copy_to_mode_reg (SImode, op0);
11167      if (!REG_P (op1))
11168	op1 = copy_to_mode_reg (SImode, op1);
11169      emit_insn (gen_sse3_mwait (op0, op1));
11170      return 0;
11171
11172    case IX86_BUILTIN_MWAITX:
11173      arg0 = CALL_EXPR_ARG (exp, 0);
11174      arg1 = CALL_EXPR_ARG (exp, 1);
11175      arg2 = CALL_EXPR_ARG (exp, 2);
11176      op0 = expand_normal (arg0);
11177      op1 = expand_normal (arg1);
11178      op2 = expand_normal (arg2);
11179      if (!REG_P (op0))
11180	op0 = copy_to_mode_reg (SImode, op0);
11181      if (!REG_P (op1))
11182	op1 = copy_to_mode_reg (SImode, op1);
11183      if (!REG_P (op2))
11184	op2 = copy_to_mode_reg (SImode, op2);
11185      emit_insn (gen_mwaitx (op0, op1, op2));
11186      return 0;
11187
11188    case IX86_BUILTIN_UMONITOR:
11189      arg0 = CALL_EXPR_ARG (exp, 0);
11190      op0 = expand_normal (arg0);
11191
11192      op0 = ix86_zero_extend_to_Pmode (op0);
11193      emit_insn (gen_umonitor (Pmode, op0));
11194      return 0;
11195
11196    case IX86_BUILTIN_UMWAIT:
11197    case IX86_BUILTIN_TPAUSE:
11198      arg0 = CALL_EXPR_ARG (exp, 0);
11199      arg1 = CALL_EXPR_ARG (exp, 1);
11200      op0 = expand_normal (arg0);
11201      op1 = expand_normal (arg1);
11202
11203      if (!REG_P (op0))
11204	op0 = copy_to_mode_reg (SImode, op0);
11205
11206      op1 = force_reg (DImode, op1);
11207
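      /* umwait and tpause take the 64-bit TSC deadline implicitly in
	 EDX:EAX, so on 64-bit targets the DImode operand is split into
	 two SImode halves here; the 32-bit patterns consume the DImode
	 register pair directly.  */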
11208      if (TARGET_64BIT)
11209	{
11210	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11211				     NULL, 1, OPTAB_DIRECT);
11212	  switch (fcode)
11213	    {
11214	    case IX86_BUILTIN_UMWAIT:
11215	      icode = CODE_FOR_umwait_rex64;
11216	      break;
11217	    case IX86_BUILTIN_TPAUSE:
11218	      icode = CODE_FOR_tpause_rex64;
11219	      break;
11220	    default:
11221	      gcc_unreachable ();
11222	    }
11223
11224	  op2 = gen_lowpart (SImode, op2);
11225	  op1 = gen_lowpart (SImode, op1);
11226	  pat = GEN_FCN (icode) (op0, op1, op2);
11227	}
11228      else
11229	{
11230	  switch (fcode)
11231	    {
11232	    case IX86_BUILTIN_UMWAIT:
11233	      icode = CODE_FOR_umwait;
11234	      break;
11235	    case IX86_BUILTIN_TPAUSE:
11236	      icode = CODE_FOR_tpause;
11237	      break;
11238	    default:
11239	      gcc_unreachable ();
11240	    }
11241	  pat = GEN_FCN (icode) (op0, op1);
11242	}
11243
11244      if (!pat)
11245	return 0;
11246
11247      emit_insn (pat);
11248
11249      if (target == 0
11250	  || !register_operand (target, QImode))
11251	target = gen_reg_rtx (QImode);
11252
11253      pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11254			const0_rtx);
11255      emit_insn (gen_rtx_SET (target, pat));
11256
11257      return target;
11258
11259    case IX86_BUILTIN_CLZERO:
11260      arg0 = CALL_EXPR_ARG (exp, 0);
11261      op0 = expand_normal (arg0);
11262      if (!REG_P (op0))
11263	op0 = ix86_zero_extend_to_Pmode (op0);
11264      emit_insn (gen_clzero (Pmode, op0));
11265      return 0;
11266
11267    case IX86_BUILTIN_CLDEMOTE:
11268      arg0 = CALL_EXPR_ARG (exp, 0);
11269      op0 = expand_normal (arg0);
11270      icode = CODE_FOR_cldemote;
11271      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11272	op0 = ix86_zero_extend_to_Pmode (op0);
11273
11274      emit_insn (gen_cldemote (op0));
11275      return 0;
11276
11277    case IX86_BUILTIN_VEC_INIT_V2SI:
11278    case IX86_BUILTIN_VEC_INIT_V4HI:
11279    case IX86_BUILTIN_VEC_INIT_V8QI:
11280      return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
11281
11282    case IX86_BUILTIN_VEC_EXT_V2DF:
11283    case IX86_BUILTIN_VEC_EXT_V2DI:
11284    case IX86_BUILTIN_VEC_EXT_V4SF:
11285    case IX86_BUILTIN_VEC_EXT_V4SI:
11286    case IX86_BUILTIN_VEC_EXT_V8HI:
11287    case IX86_BUILTIN_VEC_EXT_V2SI:
11288    case IX86_BUILTIN_VEC_EXT_V4HI:
11289    case IX86_BUILTIN_VEC_EXT_V16QI:
11290      return ix86_expand_vec_ext_builtin (exp, target);
11291
11292    case IX86_BUILTIN_VEC_SET_V2DI:
11293    case IX86_BUILTIN_VEC_SET_V4SF:
11294    case IX86_BUILTIN_VEC_SET_V4SI:
11295    case IX86_BUILTIN_VEC_SET_V8HI:
11296    case IX86_BUILTIN_VEC_SET_V4HI:
11297    case IX86_BUILTIN_VEC_SET_V16QI:
11298      return ix86_expand_vec_set_builtin (exp);
11299
11300    case IX86_BUILTIN_NANQ:
11301    case IX86_BUILTIN_NANSQ:
11302      return expand_call (exp, target, ignore);
11303
11304    case IX86_BUILTIN_RDPID:
11305
11306      op0 = gen_reg_rtx (word_mode);
11307
11308      if (TARGET_64BIT)
11309	{
11310	  insn = gen_rdpid_rex64 (op0);
11311	  op0 = convert_to_mode (SImode, op0, 1);
11312	}
11313      else
11314	insn = gen_rdpid (op0);
11315
11316      emit_insn (insn);
11317
11318      if (target == 0
11319	  || !register_operand (target, SImode))
11320	target = gen_reg_rtx (SImode);
11321
11322      emit_move_insn (target, op0);
11323      return target;
11324
11325    case IX86_BUILTIN_2INTERSECTD512:
11326    case IX86_BUILTIN_2INTERSECTQ512:
11327    case IX86_BUILTIN_2INTERSECTD256:
11328    case IX86_BUILTIN_2INTERSECTQ256:
11329    case IX86_BUILTIN_2INTERSECTD128:
11330    case IX86_BUILTIN_2INTERSECTQ128:
11331      arg0 = CALL_EXPR_ARG (exp, 0);
11332      arg1 = CALL_EXPR_ARG (exp, 1);
11333      arg2 = CALL_EXPR_ARG (exp, 2);
11334      arg3 = CALL_EXPR_ARG (exp, 3);
11335      op0 = expand_normal (arg0);
11336      op1 = expand_normal (arg1);
11337      op2 = expand_normal (arg2);
11338      op3 = expand_normal (arg3);
11339
11340      if (!address_operand (op0, VOIDmode))
11341	{
11342	  op0 = convert_memory_address (Pmode, op0);
11343	  op0 = copy_addr_to_reg (op0);
11344	}
11345      if (!address_operand (op1, VOIDmode))
11346	{
11347	  op1 = convert_memory_address (Pmode, op1);
11348	  op1 = copy_addr_to_reg (op1);
11349	}
11350
11351      switch (fcode)
11352	{
11353	case IX86_BUILTIN_2INTERSECTD512:
11354	  mode4 = P2HImode;
11355	  icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
11356	  break;
11357	case IX86_BUILTIN_2INTERSECTQ512:
11358	  mode4 = P2QImode;
11359	  icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
11360	  break;
11361	case IX86_BUILTIN_2INTERSECTD256:
11362	  mode4 = P2QImode;
11363	  icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
11364	  break;
11365	case IX86_BUILTIN_2INTERSECTQ256:
11366	  mode4 = P2QImode;
11367	  icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
11368	  break;
11369	case IX86_BUILTIN_2INTERSECTD128:
11370	  mode4 = P2QImode;
11371	  icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
11372	  break;
11373	case IX86_BUILTIN_2INTERSECTQ128:
11374	  mode4 = P2QImode;
11375	  icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
11376	  break;
11377	default:
11378	  gcc_unreachable ();
11379	}
11380
11381      mode2 = insn_data[icode].operand[1].mode;
11382      mode3 = insn_data[icode].operand[2].mode;
11383      if (!insn_data[icode].operand[1].predicate (op2, mode2))
11384	op2 = copy_to_mode_reg (mode2, op2);
11385      if (!insn_data[icode].operand[2].predicate (op3, mode3))
11386	op3 = copy_to_mode_reg (mode3, op3);
11387
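      /* vp2intersect writes a pair of mask registers; the partial-mode
	 value OP4 (P2HImode or P2QImode) stands for that register pair,
	 and its low and high parts are stored through the two result
	 pointers below.  */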
11388      op4 = gen_reg_rtx (mode4);
11389      emit_insn (GEN_FCN (icode) (op4, op2, op3));
11390      mode0 = mode4 == P2HImode ? HImode : QImode;
11391      emit_move_insn (gen_rtx_MEM (mode0, op0),
11392		      gen_lowpart (mode0, op4));
11393      emit_move_insn (gen_rtx_MEM (mode0, op1),
11394		      gen_highpart (mode0, op4));
11395
11396      return 0;
11397
11398    case IX86_BUILTIN_RDPMC:
11399    case IX86_BUILTIN_RDTSC:
11400    case IX86_BUILTIN_RDTSCP:
11401    case IX86_BUILTIN_XGETBV:
11402
11403      op0 = gen_reg_rtx (DImode);
11404      op1 = gen_reg_rtx (DImode);
11405
11406      if (fcode == IX86_BUILTIN_RDPMC)
11407	{
11408	  arg0 = CALL_EXPR_ARG (exp, 0);
11409	  op2 = expand_normal (arg0);
11410	  if (!register_operand (op2, SImode))
11411	    op2 = copy_to_mode_reg (SImode, op2);
11412
11413	  insn = (TARGET_64BIT
11414		  ? gen_rdpmc_rex64 (op0, op1, op2)
11415		  : gen_rdpmc (op0, op2));
11416	  emit_insn (insn);
11417	}
11418      else if (fcode == IX86_BUILTIN_XGETBV)
11419	{
11420	  arg0 = CALL_EXPR_ARG (exp, 0);
11421	  op2 = expand_normal (arg0);
11422	  if (!register_operand (op2, SImode))
11423	    op2 = copy_to_mode_reg (SImode, op2);
11424
11425	  insn = (TARGET_64BIT
11426		  ? gen_xgetbv_rex64 (op0, op1, op2)
11427		  : gen_xgetbv (op0, op2));
11428	  emit_insn (insn);
11429	}
11430      else if (fcode == IX86_BUILTIN_RDTSC)
11431	{
11432	  insn = (TARGET_64BIT
11433		  ? gen_rdtsc_rex64 (op0, op1)
11434		  : gen_rdtsc (op0));
11435	  emit_insn (insn);
11436	}
11437      else
11438	{
11439	  op2 = gen_reg_rtx (SImode);
11440
11441	  insn = (TARGET_64BIT
11442		  ? gen_rdtscp_rex64 (op0, op1, op2)
11443		  : gen_rdtscp (op0, op2));
11444	  emit_insn (insn);
11445
11446	  arg0 = CALL_EXPR_ARG (exp, 0);
11447	  op4 = expand_normal (arg0);
11448	  if (!address_operand (op4, VOIDmode))
11449	    {
11450	      op4 = convert_memory_address (Pmode, op4);
11451	      op4 = copy_addr_to_reg (op4);
11452	    }
11453	  emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
11454	}
11455
11456      if (target == 0
11457	  || !register_operand (target, DImode))
11458        target = gen_reg_rtx (DImode);
11459
11460      if (TARGET_64BIT)
11461	{
11462	  op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
11463				     op1, 1, OPTAB_DIRECT);
11464	  op0 = expand_simple_binop (DImode, IOR, op0, op1,
11465				     op0, 1, OPTAB_DIRECT);
11466	}
11467
11468      emit_move_insn (target, op0);
11469      return target;
11470
11471    case IX86_BUILTIN_ENQCMD:
11472    case IX86_BUILTIN_ENQCMDS:
11473    case IX86_BUILTIN_MOVDIR64B:
11474
11475      arg0 = CALL_EXPR_ARG (exp, 0);
11476      arg1 = CALL_EXPR_ARG (exp, 1);
11477      op0 = expand_normal (arg0);
11478      op1 = expand_normal (arg1);
11479
11480      op0 = ix86_zero_extend_to_Pmode (op0);
11481      if (!address_operand (op1, VOIDmode))
11482      {
11483	op1 = convert_memory_address (Pmode, op1);
11484	op1 = copy_addr_to_reg (op1);
11485      }
11486      op1 = gen_rtx_MEM (XImode, op1);
11487
11488      if (fcode == IX86_BUILTIN_MOVDIR64B)
11489	{
11490	  emit_insn (gen_movdir64b (Pmode, op0, op1));
11491	  return 0;
11492	}
11493      else
11494	{
11495	  rtx pat;
11496
11497	  target = gen_reg_rtx (SImode);
11498	  emit_move_insn (target, const0_rtx);
11499	  target = gen_rtx_SUBREG (QImode, target, 0);
11500
11501	  if (fcode == IX86_BUILTIN_ENQCMD)
11502	    pat = gen_enqcmd (UNSPECV_ENQCMD, Pmode, op0, op1);
11503	  else
11504	    pat = gen_enqcmd (UNSPECV_ENQCMDS, Pmode, op0, op1);
11505
11506	  emit_insn (pat);
11507
11508	  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11509				  gen_rtx_fmt_ee (EQ, QImode,
11510						  SET_DEST (pat),
11511						  const0_rtx)));
11512
11513	  return SUBREG_REG (target);
11514	}
11515
11516    case IX86_BUILTIN_FXSAVE:
11517    case IX86_BUILTIN_FXRSTOR:
11518    case IX86_BUILTIN_FXSAVE64:
11519    case IX86_BUILTIN_FXRSTOR64:
11520    case IX86_BUILTIN_FNSTENV:
11521    case IX86_BUILTIN_FLDENV:
11522      mode0 = BLKmode;
11523      switch (fcode)
11524	{
11525	case IX86_BUILTIN_FXSAVE:
11526	  icode = CODE_FOR_fxsave;
11527	  break;
11528	case IX86_BUILTIN_FXRSTOR:
11529	  icode = CODE_FOR_fxrstor;
11530	  break;
11531	case IX86_BUILTIN_FXSAVE64:
11532	  icode = CODE_FOR_fxsave64;
11533	  break;
11534	case IX86_BUILTIN_FXRSTOR64:
11535	  icode = CODE_FOR_fxrstor64;
11536	  break;
11537	case IX86_BUILTIN_FNSTENV:
11538	  icode = CODE_FOR_fnstenv;
11539	  break;
11540	case IX86_BUILTIN_FLDENV:
11541	  icode = CODE_FOR_fldenv;
11542	  break;
11543	default:
11544	  gcc_unreachable ();
11545	}
11546
11547      arg0 = CALL_EXPR_ARG (exp, 0);
11548      op0 = expand_normal (arg0);
11549
11550      if (!address_operand (op0, VOIDmode))
11551	{
11552	  op0 = convert_memory_address (Pmode, op0);
11553	  op0 = copy_addr_to_reg (op0);
11554	}
11555      op0 = gen_rtx_MEM (mode0, op0);
11556
11557      pat = GEN_FCN (icode) (op0);
11558      if (pat)
11559	emit_insn (pat);
11560      return 0;
11561
11562    case IX86_BUILTIN_XSETBV:
11563      arg0 = CALL_EXPR_ARG (exp, 0);
11564      arg1 = CALL_EXPR_ARG (exp, 1);
11565      op0 = expand_normal (arg0);
11566      op1 = expand_normal (arg1);
11567
11568      if (!REG_P (op0))
11569	op0 = copy_to_mode_reg (SImode, op0);
11570
11571      op1 = force_reg (DImode, op1);
11572
11573      if (TARGET_64BIT)
11574	{
11575	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11576				     NULL, 1, OPTAB_DIRECT);
11577
11578	  icode = CODE_FOR_xsetbv_rex64;
11579
11580	  op2 = gen_lowpart (SImode, op2);
11581	  op1 = gen_lowpart (SImode, op1);
11582	  pat = GEN_FCN (icode) (op0, op1, op2);
11583	}
11584      else
11585	{
11586	  icode = CODE_FOR_xsetbv;
11587
11588	  pat = GEN_FCN (icode) (op0, op1);
11589	}
11590      if (pat)
11591	emit_insn (pat);
11592      return 0;
11593
11594    case IX86_BUILTIN_XSAVE:
11595    case IX86_BUILTIN_XRSTOR:
11596    case IX86_BUILTIN_XSAVE64:
11597    case IX86_BUILTIN_XRSTOR64:
11598    case IX86_BUILTIN_XSAVEOPT:
11599    case IX86_BUILTIN_XSAVEOPT64:
11600    case IX86_BUILTIN_XSAVES:
11601    case IX86_BUILTIN_XRSTORS:
11602    case IX86_BUILTIN_XSAVES64:
11603    case IX86_BUILTIN_XRSTORS64:
11604    case IX86_BUILTIN_XSAVEC:
11605    case IX86_BUILTIN_XSAVEC64:
11606      arg0 = CALL_EXPR_ARG (exp, 0);
11607      arg1 = CALL_EXPR_ARG (exp, 1);
11608      op0 = expand_normal (arg0);
11609      op1 = expand_normal (arg1);
11610
11611      if (!address_operand (op0, VOIDmode))
11612	{
11613	  op0 = convert_memory_address (Pmode, op0);
11614	  op0 = copy_addr_to_reg (op0);
11615	}
11616      op0 = gen_rtx_MEM (BLKmode, op0);
11617
11618      op1 = force_reg (DImode, op1);
11619
11620      if (TARGET_64BIT)
11621	{
11622	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11623				     NULL, 1, OPTAB_DIRECT);
11624	  switch (fcode)
11625	    {
11626	    case IX86_BUILTIN_XSAVE:
11627	      icode = CODE_FOR_xsave_rex64;
11628	      break;
11629	    case IX86_BUILTIN_XRSTOR:
11630	      icode = CODE_FOR_xrstor_rex64;
11631	      break;
11632	    case IX86_BUILTIN_XSAVE64:
11633	      icode = CODE_FOR_xsave64;
11634	      break;
11635	    case IX86_BUILTIN_XRSTOR64:
11636	      icode = CODE_FOR_xrstor64;
11637	      break;
11638	    case IX86_BUILTIN_XSAVEOPT:
11639	      icode = CODE_FOR_xsaveopt_rex64;
11640	      break;
11641	    case IX86_BUILTIN_XSAVEOPT64:
11642	      icode = CODE_FOR_xsaveopt64;
11643	      break;
11644	    case IX86_BUILTIN_XSAVES:
11645	      icode = CODE_FOR_xsaves_rex64;
11646	      break;
11647	    case IX86_BUILTIN_XRSTORS:
11648	      icode = CODE_FOR_xrstors_rex64;
11649	      break;
11650	    case IX86_BUILTIN_XSAVES64:
11651	      icode = CODE_FOR_xsaves64;
11652	      break;
11653	    case IX86_BUILTIN_XRSTORS64:
11654	      icode = CODE_FOR_xrstors64;
11655	      break;
11656	    case IX86_BUILTIN_XSAVEC:
11657	      icode = CODE_FOR_xsavec_rex64;
11658	      break;
11659	    case IX86_BUILTIN_XSAVEC64:
11660	      icode = CODE_FOR_xsavec64;
11661	      break;
11662	    default:
11663	      gcc_unreachable ();
11664	    }
11665
11666	  op2 = gen_lowpart (SImode, op2);
11667	  op1 = gen_lowpart (SImode, op1);
11668	  pat = GEN_FCN (icode) (op0, op1, op2);
11669	}
11670      else
11671	{
11672	  switch (fcode)
11673	    {
11674	    case IX86_BUILTIN_XSAVE:
11675	      icode = CODE_FOR_xsave;
11676	      break;
11677	    case IX86_BUILTIN_XRSTOR:
11678	      icode = CODE_FOR_xrstor;
11679	      break;
11680	    case IX86_BUILTIN_XSAVEOPT:
11681	      icode = CODE_FOR_xsaveopt;
11682	      break;
11683	    case IX86_BUILTIN_XSAVES:
11684	      icode = CODE_FOR_xsaves;
11685	      break;
11686	    case IX86_BUILTIN_XRSTORS:
11687	      icode = CODE_FOR_xrstors;
11688	      break;
11689	    case IX86_BUILTIN_XSAVEC:
11690	      icode = CODE_FOR_xsavec;
11691	      break;
11692	    default:
11693	      gcc_unreachable ();
11694	    }
11695	  pat = GEN_FCN (icode) (op0, op1);
11696	}
11697
11698      if (pat)
11699	emit_insn (pat);
11700      return 0;
11701
11702    case IX86_BUILTIN_LLWPCB:
11703      arg0 = CALL_EXPR_ARG (exp, 0);
11704      op0 = expand_normal (arg0);
11705      icode = CODE_FOR_lwp_llwpcb;
11706      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11707	op0 = ix86_zero_extend_to_Pmode (op0);
11708      emit_insn (gen_lwp_llwpcb (op0));
11709      return 0;
11710
11711    case IX86_BUILTIN_SLWPCB:
11712      icode = CODE_FOR_lwp_slwpcb;
11713      if (!target
11714	  || !insn_data[icode].operand[0].predicate (target, Pmode))
11715	target = gen_reg_rtx (Pmode);
11716      emit_insn (gen_lwp_slwpcb (target));
11717      return target;
11718
11719    case IX86_BUILTIN_BEXTRI32:
11720    case IX86_BUILTIN_BEXTRI64:
11721      arg0 = CALL_EXPR_ARG (exp, 0);
11722      arg1 = CALL_EXPR_ARG (exp, 1);
11723      op0 = expand_normal (arg0);
11724      op1 = expand_normal (arg1);
11725      icode = (fcode == IX86_BUILTIN_BEXTRI32
11726	  ? CODE_FOR_tbm_bextri_si
11727	  : CODE_FOR_tbm_bextri_di);
11728      if (!CONST_INT_P (op1))
11729        {
11730          error ("last argument must be an immediate");
11731          return const0_rtx;
11732        }
11733      else
11734        {
11735          unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
11736          unsigned char lsb_index = INTVAL (op1) & 0xFF;
11737          op1 = GEN_INT (length);
11738          op2 = GEN_INT (lsb_index);
11739
11740	  mode1 = insn_data[icode].operand[1].mode;
11741	  if (!insn_data[icode].operand[1].predicate (op0, mode1))
11742	    op0 = copy_to_mode_reg (mode1, op0);
11743
11744	  mode0 = insn_data[icode].operand[0].mode;
11745	  if (target == 0
11746	      || !register_operand (target, mode0))
11747	    target = gen_reg_rtx (mode0);
11748
11749          pat = GEN_FCN (icode) (target, op0, op1, op2);
11750          if (pat)
11751            emit_insn (pat);
11752          return target;
11753        }
11754
11755    case IX86_BUILTIN_RDRAND16_STEP:
11756      icode = CODE_FOR_rdrandhi_1;
11757      mode0 = HImode;
11758      goto rdrand_step;
11759
11760    case IX86_BUILTIN_RDRAND32_STEP:
11761      icode = CODE_FOR_rdrandsi_1;
11762      mode0 = SImode;
11763      goto rdrand_step;
11764
11765    case IX86_BUILTIN_RDRAND64_STEP:
11766      icode = CODE_FOR_rdranddi_1;
11767      mode0 = DImode;
11768
11769rdrand_step:
11770      arg0 = CALL_EXPR_ARG (exp, 0);
11771      op1 = expand_normal (arg0);
11772      if (!address_operand (op1, VOIDmode))
11773	{
11774	  op1 = convert_memory_address (Pmode, op1);
11775	  op1 = copy_addr_to_reg (op1);
11776	}
11777
11778      op0 = gen_reg_rtx (mode0);
11779      emit_insn (GEN_FCN (icode) (op0));
11780
11781      emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11782
11783      op1 = gen_reg_rtx (SImode);
11784      emit_move_insn (op1, CONST1_RTX (SImode));
11785
11786      /* Emit SImode conditional move.  */
11787      if (mode0 == HImode)
11788	{
11789	  if (TARGET_ZERO_EXTEND_WITH_AND
11790	      && optimize_function_for_speed_p (cfun))
11791	    {
11792	      op2 = force_reg (SImode, const0_rtx);
11793
11794	      emit_insn (gen_movstricthi
11795			 (gen_lowpart (HImode, op2), op0));
11796	    }
11797	  else
11798	    {
11799	      op2 = gen_reg_rtx (SImode);
11800
11801	      emit_insn (gen_zero_extendhisi2 (op2, op0));
11802	    }
11803	}
11804      else if (mode0 == SImode)
11805	op2 = op0;
11806      else
11807	op2 = gen_rtx_SUBREG (SImode, op0, 0);
11808
11809      if (target == 0
11810	  || !register_operand (target, SImode))
11811	target = gen_reg_rtx (SImode);
11812
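      /* GEU on the CCCmode flags register is the carry-clear (failure)
	 case.  rdrand clears its destination when it fails, so selecting
	 the zero-extended result then and the constant 1 otherwise
	 yields the 0/1 status that the *_step intrinsics return.  */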
11813      pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
11814			 const0_rtx);
11815      emit_insn (gen_rtx_SET (target,
11816			      gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
11817      return target;
11818
11819    case IX86_BUILTIN_RDSEED16_STEP:
11820      icode = CODE_FOR_rdseedhi_1;
11821      mode0 = HImode;
11822      goto rdseed_step;
11823
11824    case IX86_BUILTIN_RDSEED32_STEP:
11825      icode = CODE_FOR_rdseedsi_1;
11826      mode0 = SImode;
11827      goto rdseed_step;
11828
11829    case IX86_BUILTIN_RDSEED64_STEP:
11830      icode = CODE_FOR_rdseeddi_1;
11831      mode0 = DImode;
11832
11833rdseed_step:
11834      arg0 = CALL_EXPR_ARG (exp, 0);
11835      op1 = expand_normal (arg0);
11836      if (!address_operand (op1, VOIDmode))
11837	{
11838	  op1 = convert_memory_address (Pmode, op1);
11839	  op1 = copy_addr_to_reg (op1);
11840	}
11841
11842      op0 = gen_reg_rtx (mode0);
11843      emit_insn (GEN_FCN (icode) (op0));
11844
11845      emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11846
11847      op2 = gen_reg_rtx (QImode);
11848
11849      pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11850                         const0_rtx);
11851      emit_insn (gen_rtx_SET (op2, pat));
11852
11853      if (target == 0
11854	  || !register_operand (target, SImode))
11855        target = gen_reg_rtx (SImode);
11856
11857      emit_insn (gen_zero_extendqisi2 (target, op2));
11858      return target;
11859
11860    case IX86_BUILTIN_SBB32:
11861      icode = CODE_FOR_subborrowsi;
11862      icode2 = CODE_FOR_subborrowsi_0;
11863      mode0 = SImode;
11864      mode1 = DImode;
11865      mode2 = CCmode;
11866      goto handlecarry;
11867
11868    case IX86_BUILTIN_SBB64:
11869      icode = CODE_FOR_subborrowdi;
11870      icode2 = CODE_FOR_subborrowdi_0;
11871      mode0 = DImode;
11872      mode1 = TImode;
11873      mode2 = CCmode;
11874      goto handlecarry;
11875
11876    case IX86_BUILTIN_ADDCARRYX32:
11877      icode = CODE_FOR_addcarrysi;
11878      icode2 = CODE_FOR_addcarrysi_0;
11879      mode0 = SImode;
11880      mode1 = DImode;
11881      mode2 = CCCmode;
11882      goto handlecarry;
11883
11884    case IX86_BUILTIN_ADDCARRYX64:
11885      icode = CODE_FOR_addcarrydi;
11886      icode2 = CODE_FOR_addcarrydi_0;
11887      mode0 = DImode;
11888      mode1 = TImode;
11889      mode2 = CCCmode;
11890
11891    handlecarry:
11892      arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
11893      arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
11894      arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
11895      arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */
11896
11897      op1 = expand_normal (arg0);
11898      if (!integer_zerop (arg0))
11899	op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
11900
11901      op2 = expand_normal (arg1);
11902      if (!register_operand (op2, mode0))
11903	op2 = copy_to_mode_reg (mode0, op2);
11904
11905      op3 = expand_normal (arg2);
11906      if (!register_operand (op3, mode0))
11907	op3 = copy_to_mode_reg (mode0, op3);
11908
11909      op4 = expand_normal (arg3);
11910      if (!address_operand (op4, VOIDmode))
11911	{
11912	  op4 = convert_memory_address (Pmode, op4);
11913	  op4 = copy_addr_to_reg (op4);
11914	}
11915
11916      op0 = gen_reg_rtx (mode0);
11917      if (integer_zerop (arg0))
11918	{
	  /* If arg0 is 0, optimize right away into an add or sub
	     instruction that sets CCCmode flags.  */
11921	  op1 = gen_rtx_REG (mode2, FLAGS_REG);
11922	  emit_insn (GEN_FCN (icode2) (op0, op2, op3));
11923	}
11924      else
11925	{
11926	  /* Generate CF from input operand.  */
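	  /* Adding 0xff to the QImode carry-in sets CF exactly when the
	     carry-in is nonzero, which is all the adc/sbb pattern below
	     needs.  */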
11927	  emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
11928
11929	  /* Generate instruction that consumes CF.  */
11930	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
11931	  pat = gen_rtx_LTU (mode1, op1, const0_rtx);
11932	  pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
11933	  emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
11934	}
11935
11936      /* Return current CF value.  */
11937      if (target == 0)
11938        target = gen_reg_rtx (QImode);
11939
11940      pat = gen_rtx_LTU (QImode, op1, const0_rtx);
11941      emit_insn (gen_rtx_SET (target, pat));
11942
11943      /* Store the result.  */
11944      emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
11945
11946      return target;
11947
11948    case IX86_BUILTIN_READ_FLAGS:
11949      if (ignore)
11950	return const0_rtx;
11951
11952      emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
11953
11954      if (optimize
11955	  || target == NULL_RTX
11956	  || !nonimmediate_operand (target, word_mode)
11957	  || GET_MODE (target) != word_mode)
11958	target = gen_reg_rtx (word_mode);
11959
11960      emit_insn (gen_pop (target));
11961      return target;
11962
11963    case IX86_BUILTIN_WRITE_FLAGS:
11964
11965      arg0 = CALL_EXPR_ARG (exp, 0);
11966      op0 = expand_normal (arg0);
11967      if (!general_no_elim_operand (op0, word_mode))
11968	op0 = copy_to_mode_reg (word_mode, op0);
11969
11970      emit_insn (gen_push (op0));
11971      emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
11972      return 0;
11973
11974    case IX86_BUILTIN_KTESTC8:
11975      icode = CODE_FOR_ktestqi;
11976      mode3 = CCCmode;
11977      goto kortest;
11978
11979    case IX86_BUILTIN_KTESTZ8:
11980      icode = CODE_FOR_ktestqi;
11981      mode3 = CCZmode;
11982      goto kortest;
11983
11984    case IX86_BUILTIN_KTESTC16:
11985      icode = CODE_FOR_ktesthi;
11986      mode3 = CCCmode;
11987      goto kortest;
11988
11989    case IX86_BUILTIN_KTESTZ16:
11990      icode = CODE_FOR_ktesthi;
11991      mode3 = CCZmode;
11992      goto kortest;
11993
11994    case IX86_BUILTIN_KTESTC32:
11995      icode = CODE_FOR_ktestsi;
11996      mode3 = CCCmode;
11997      goto kortest;
11998
11999    case IX86_BUILTIN_KTESTZ32:
12000      icode = CODE_FOR_ktestsi;
12001      mode3 = CCZmode;
12002      goto kortest;
12003
12004    case IX86_BUILTIN_KTESTC64:
12005      icode = CODE_FOR_ktestdi;
12006      mode3 = CCCmode;
12007      goto kortest;
12008
12009    case IX86_BUILTIN_KTESTZ64:
12010      icode = CODE_FOR_ktestdi;
12011      mode3 = CCZmode;
12012      goto kortest;
12013
12014    case IX86_BUILTIN_KORTESTC8:
12015      icode = CODE_FOR_kortestqi;
12016      mode3 = CCCmode;
12017      goto kortest;
12018
12019    case IX86_BUILTIN_KORTESTZ8:
12020      icode = CODE_FOR_kortestqi;
12021      mode3 = CCZmode;
12022      goto kortest;
12023
12024    case IX86_BUILTIN_KORTESTC16:
12025      icode = CODE_FOR_kortesthi;
12026      mode3 = CCCmode;
12027      goto kortest;
12028
12029    case IX86_BUILTIN_KORTESTZ16:
12030      icode = CODE_FOR_kortesthi;
12031      mode3 = CCZmode;
12032      goto kortest;
12033
12034    case IX86_BUILTIN_KORTESTC32:
12035      icode = CODE_FOR_kortestsi;
12036      mode3 = CCCmode;
12037      goto kortest;
12038
12039    case IX86_BUILTIN_KORTESTZ32:
12040      icode = CODE_FOR_kortestsi;
12041      mode3 = CCZmode;
12042      goto kortest;
12043
12044    case IX86_BUILTIN_KORTESTC64:
12045      icode = CODE_FOR_kortestdi;
12046      mode3 = CCCmode;
12047      goto kortest;
12048
12049    case IX86_BUILTIN_KORTESTZ64:
12050      icode = CODE_FOR_kortestdi;
12051      mode3 = CCZmode;
12052
12053    kortest:
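      /* All of these expand to a single ktest/kortest followed by a
	 setcc: the *C* variants read the carry flag (CCCmode) and the
	 *Z* variants the zero flag (CCZmode).  Illustrative only:
	 _mm512_kortestz is expected to arrive here as
	 IX86_BUILTIN_KORTESTZ16.  */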
12054      arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
12055      arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
12056      op0 = expand_normal (arg0);
12057      op1 = expand_normal (arg1);
12058
12059      mode0 = insn_data[icode].operand[0].mode;
12060      mode1 = insn_data[icode].operand[1].mode;
12061
12062      if (GET_MODE (op0) != VOIDmode)
12063	op0 = force_reg (GET_MODE (op0), op0);
12064
12065      op0 = gen_lowpart (mode0, op0);
12066
12067      if (!insn_data[icode].operand[0].predicate (op0, mode0))
12068	op0 = copy_to_mode_reg (mode0, op0);
12069
12070      if (GET_MODE (op1) != VOIDmode)
12071	op1 = force_reg (GET_MODE (op1), op1);
12072
12073      op1 = gen_lowpart (mode1, op1);
12074
12075      if (!insn_data[icode].operand[1].predicate (op1, mode1))
12076	op1 = copy_to_mode_reg (mode1, op1);
12077
12078      target = gen_reg_rtx (QImode);
12079
12080      /* Emit kortest.  */
12081      emit_insn (GEN_FCN (icode) (op0, op1));
12082      /* And use setcc to return result from flags.  */
12083      ix86_expand_setcc (target, EQ,
12084			 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
12085      return target;
12086
12087    case IX86_BUILTIN_GATHERSIV2DF:
12088      icode = CODE_FOR_avx2_gathersiv2df;
12089      goto gather_gen;
12090    case IX86_BUILTIN_GATHERSIV4DF:
12091      icode = CODE_FOR_avx2_gathersiv4df;
12092      goto gather_gen;
12093    case IX86_BUILTIN_GATHERDIV2DF:
12094      icode = CODE_FOR_avx2_gatherdiv2df;
12095      goto gather_gen;
12096    case IX86_BUILTIN_GATHERDIV4DF:
12097      icode = CODE_FOR_avx2_gatherdiv4df;
12098      goto gather_gen;
12099    case IX86_BUILTIN_GATHERSIV4SF:
12100      icode = CODE_FOR_avx2_gathersiv4sf;
12101      goto gather_gen;
12102    case IX86_BUILTIN_GATHERSIV8SF:
12103      icode = CODE_FOR_avx2_gathersiv8sf;
12104      goto gather_gen;
12105    case IX86_BUILTIN_GATHERDIV4SF:
12106      icode = CODE_FOR_avx2_gatherdiv4sf;
12107      goto gather_gen;
12108    case IX86_BUILTIN_GATHERDIV8SF:
12109      icode = CODE_FOR_avx2_gatherdiv8sf;
12110      goto gather_gen;
12111    case IX86_BUILTIN_GATHERSIV2DI:
12112      icode = CODE_FOR_avx2_gathersiv2di;
12113      goto gather_gen;
12114    case IX86_BUILTIN_GATHERSIV4DI:
12115      icode = CODE_FOR_avx2_gathersiv4di;
12116      goto gather_gen;
12117    case IX86_BUILTIN_GATHERDIV2DI:
12118      icode = CODE_FOR_avx2_gatherdiv2di;
12119      goto gather_gen;
12120    case IX86_BUILTIN_GATHERDIV4DI:
12121      icode = CODE_FOR_avx2_gatherdiv4di;
12122      goto gather_gen;
12123    case IX86_BUILTIN_GATHERSIV4SI:
12124      icode = CODE_FOR_avx2_gathersiv4si;
12125      goto gather_gen;
12126    case IX86_BUILTIN_GATHERSIV8SI:
12127      icode = CODE_FOR_avx2_gathersiv8si;
12128      goto gather_gen;
12129    case IX86_BUILTIN_GATHERDIV4SI:
12130      icode = CODE_FOR_avx2_gatherdiv4si;
12131      goto gather_gen;
12132    case IX86_BUILTIN_GATHERDIV8SI:
12133      icode = CODE_FOR_avx2_gatherdiv8si;
12134      goto gather_gen;
12135    case IX86_BUILTIN_GATHERALTSIV4DF:
12136      icode = CODE_FOR_avx2_gathersiv4df;
12137      goto gather_gen;
12138    case IX86_BUILTIN_GATHERALTDIV8SF:
12139      icode = CODE_FOR_avx2_gatherdiv8sf;
12140      goto gather_gen;
12141    case IX86_BUILTIN_GATHERALTSIV4DI:
12142      icode = CODE_FOR_avx2_gathersiv4di;
12143      goto gather_gen;
12144    case IX86_BUILTIN_GATHERALTDIV8SI:
12145      icode = CODE_FOR_avx2_gatherdiv8si;
12146      goto gather_gen;
12147    case IX86_BUILTIN_GATHER3SIV16SF:
12148      icode = CODE_FOR_avx512f_gathersiv16sf;
12149      goto gather_gen;
12150    case IX86_BUILTIN_GATHER3SIV8DF:
12151      icode = CODE_FOR_avx512f_gathersiv8df;
12152      goto gather_gen;
12153    case IX86_BUILTIN_GATHER3DIV16SF:
12154      icode = CODE_FOR_avx512f_gatherdiv16sf;
12155      goto gather_gen;
12156    case IX86_BUILTIN_GATHER3DIV8DF:
12157      icode = CODE_FOR_avx512f_gatherdiv8df;
12158      goto gather_gen;
12159    case IX86_BUILTIN_GATHER3SIV16SI:
12160      icode = CODE_FOR_avx512f_gathersiv16si;
12161      goto gather_gen;
12162    case IX86_BUILTIN_GATHER3SIV8DI:
12163      icode = CODE_FOR_avx512f_gathersiv8di;
12164      goto gather_gen;
12165    case IX86_BUILTIN_GATHER3DIV16SI:
12166      icode = CODE_FOR_avx512f_gatherdiv16si;
12167      goto gather_gen;
12168    case IX86_BUILTIN_GATHER3DIV8DI:
12169      icode = CODE_FOR_avx512f_gatherdiv8di;
12170      goto gather_gen;
12171    case IX86_BUILTIN_GATHER3ALTSIV8DF:
12172      icode = CODE_FOR_avx512f_gathersiv8df;
12173      goto gather_gen;
12174    case IX86_BUILTIN_GATHER3ALTDIV16SF:
12175      icode = CODE_FOR_avx512f_gatherdiv16sf;
12176      goto gather_gen;
12177    case IX86_BUILTIN_GATHER3ALTSIV8DI:
12178      icode = CODE_FOR_avx512f_gathersiv8di;
12179      goto gather_gen;
12180    case IX86_BUILTIN_GATHER3ALTDIV16SI:
12181      icode = CODE_FOR_avx512f_gatherdiv16si;
12182      goto gather_gen;
12183    case IX86_BUILTIN_GATHER3SIV2DF:
12184      icode = CODE_FOR_avx512vl_gathersiv2df;
12185      goto gather_gen;
12186    case IX86_BUILTIN_GATHER3SIV4DF:
12187      icode = CODE_FOR_avx512vl_gathersiv4df;
12188      goto gather_gen;
12189    case IX86_BUILTIN_GATHER3DIV2DF:
12190      icode = CODE_FOR_avx512vl_gatherdiv2df;
12191      goto gather_gen;
12192    case IX86_BUILTIN_GATHER3DIV4DF:
12193      icode = CODE_FOR_avx512vl_gatherdiv4df;
12194      goto gather_gen;
12195    case IX86_BUILTIN_GATHER3SIV4SF:
12196      icode = CODE_FOR_avx512vl_gathersiv4sf;
12197      goto gather_gen;
12198    case IX86_BUILTIN_GATHER3SIV8SF:
12199      icode = CODE_FOR_avx512vl_gathersiv8sf;
12200      goto gather_gen;
12201    case IX86_BUILTIN_GATHER3DIV4SF:
12202      icode = CODE_FOR_avx512vl_gatherdiv4sf;
12203      goto gather_gen;
12204    case IX86_BUILTIN_GATHER3DIV8SF:
12205      icode = CODE_FOR_avx512vl_gatherdiv8sf;
12206      goto gather_gen;
12207    case IX86_BUILTIN_GATHER3SIV2DI:
12208      icode = CODE_FOR_avx512vl_gathersiv2di;
12209      goto gather_gen;
12210    case IX86_BUILTIN_GATHER3SIV4DI:
12211      icode = CODE_FOR_avx512vl_gathersiv4di;
12212      goto gather_gen;
12213    case IX86_BUILTIN_GATHER3DIV2DI:
12214      icode = CODE_FOR_avx512vl_gatherdiv2di;
12215      goto gather_gen;
12216    case IX86_BUILTIN_GATHER3DIV4DI:
12217      icode = CODE_FOR_avx512vl_gatherdiv4di;
12218      goto gather_gen;
12219    case IX86_BUILTIN_GATHER3SIV4SI:
12220      icode = CODE_FOR_avx512vl_gathersiv4si;
12221      goto gather_gen;
12222    case IX86_BUILTIN_GATHER3SIV8SI:
12223      icode = CODE_FOR_avx512vl_gathersiv8si;
12224      goto gather_gen;
12225    case IX86_BUILTIN_GATHER3DIV4SI:
12226      icode = CODE_FOR_avx512vl_gatherdiv4si;
12227      goto gather_gen;
12228    case IX86_BUILTIN_GATHER3DIV8SI:
12229      icode = CODE_FOR_avx512vl_gatherdiv8si;
12230      goto gather_gen;
12231    case IX86_BUILTIN_GATHER3ALTSIV4DF:
12232      icode = CODE_FOR_avx512vl_gathersiv4df;
12233      goto gather_gen;
12234    case IX86_BUILTIN_GATHER3ALTDIV8SF:
12235      icode = CODE_FOR_avx512vl_gatherdiv8sf;
12236      goto gather_gen;
12237    case IX86_BUILTIN_GATHER3ALTSIV4DI:
12238      icode = CODE_FOR_avx512vl_gathersiv4di;
12239      goto gather_gen;
12240    case IX86_BUILTIN_GATHER3ALTDIV8SI:
12241      icode = CODE_FOR_avx512vl_gatherdiv8si;
12242      goto gather_gen;
12243    case IX86_BUILTIN_SCATTERSIV16SF:
12244      icode = CODE_FOR_avx512f_scattersiv16sf;
12245      goto scatter_gen;
12246    case IX86_BUILTIN_SCATTERSIV8DF:
12247      icode = CODE_FOR_avx512f_scattersiv8df;
12248      goto scatter_gen;
12249    case IX86_BUILTIN_SCATTERDIV16SF:
12250      icode = CODE_FOR_avx512f_scatterdiv16sf;
12251      goto scatter_gen;
12252    case IX86_BUILTIN_SCATTERDIV8DF:
12253      icode = CODE_FOR_avx512f_scatterdiv8df;
12254      goto scatter_gen;
12255    case IX86_BUILTIN_SCATTERSIV16SI:
12256      icode = CODE_FOR_avx512f_scattersiv16si;
12257      goto scatter_gen;
12258    case IX86_BUILTIN_SCATTERSIV8DI:
12259      icode = CODE_FOR_avx512f_scattersiv8di;
12260      goto scatter_gen;
12261    case IX86_BUILTIN_SCATTERDIV16SI:
12262      icode = CODE_FOR_avx512f_scatterdiv16si;
12263      goto scatter_gen;
12264    case IX86_BUILTIN_SCATTERDIV8DI:
12265      icode = CODE_FOR_avx512f_scatterdiv8di;
12266      goto scatter_gen;
12267    case IX86_BUILTIN_SCATTERSIV8SF:
12268      icode = CODE_FOR_avx512vl_scattersiv8sf;
12269      goto scatter_gen;
12270    case IX86_BUILTIN_SCATTERSIV4SF:
12271      icode = CODE_FOR_avx512vl_scattersiv4sf;
12272      goto scatter_gen;
12273    case IX86_BUILTIN_SCATTERSIV4DF:
12274      icode = CODE_FOR_avx512vl_scattersiv4df;
12275      goto scatter_gen;
12276    case IX86_BUILTIN_SCATTERSIV2DF:
12277      icode = CODE_FOR_avx512vl_scattersiv2df;
12278      goto scatter_gen;
12279    case IX86_BUILTIN_SCATTERDIV8SF:
12280      icode = CODE_FOR_avx512vl_scatterdiv8sf;
12281      goto scatter_gen;
12282    case IX86_BUILTIN_SCATTERDIV4SF:
12283      icode = CODE_FOR_avx512vl_scatterdiv4sf;
12284      goto scatter_gen;
12285    case IX86_BUILTIN_SCATTERDIV4DF:
12286      icode = CODE_FOR_avx512vl_scatterdiv4df;
12287      goto scatter_gen;
12288    case IX86_BUILTIN_SCATTERDIV2DF:
12289      icode = CODE_FOR_avx512vl_scatterdiv2df;
12290      goto scatter_gen;
12291    case IX86_BUILTIN_SCATTERSIV8SI:
12292      icode = CODE_FOR_avx512vl_scattersiv8si;
12293      goto scatter_gen;
12294    case IX86_BUILTIN_SCATTERSIV4SI:
12295      icode = CODE_FOR_avx512vl_scattersiv4si;
12296      goto scatter_gen;
12297    case IX86_BUILTIN_SCATTERSIV4DI:
12298      icode = CODE_FOR_avx512vl_scattersiv4di;
12299      goto scatter_gen;
12300    case IX86_BUILTIN_SCATTERSIV2DI:
12301      icode = CODE_FOR_avx512vl_scattersiv2di;
12302      goto scatter_gen;
12303    case IX86_BUILTIN_SCATTERDIV8SI:
12304      icode = CODE_FOR_avx512vl_scatterdiv8si;
12305      goto scatter_gen;
12306    case IX86_BUILTIN_SCATTERDIV4SI:
12307      icode = CODE_FOR_avx512vl_scatterdiv4si;
12308      goto scatter_gen;
12309    case IX86_BUILTIN_SCATTERDIV4DI:
12310      icode = CODE_FOR_avx512vl_scatterdiv4di;
12311      goto scatter_gen;
12312    case IX86_BUILTIN_SCATTERDIV2DI:
12313      icode = CODE_FOR_avx512vl_scatterdiv2di;
12314      goto scatter_gen;
12315    case IX86_BUILTIN_GATHERPFDPD:
12316      icode = CODE_FOR_avx512pf_gatherpfv8sidf;
12317      goto vec_prefetch_gen;
12318    case IX86_BUILTIN_SCATTERALTSIV8DF:
12319      icode = CODE_FOR_avx512f_scattersiv8df;
12320      goto scatter_gen;
12321    case IX86_BUILTIN_SCATTERALTDIV16SF:
12322      icode = CODE_FOR_avx512f_scatterdiv16sf;
12323      goto scatter_gen;
12324    case IX86_BUILTIN_SCATTERALTSIV8DI:
12325      icode = CODE_FOR_avx512f_scattersiv8di;
12326      goto scatter_gen;
12327    case IX86_BUILTIN_SCATTERALTDIV16SI:
12328      icode = CODE_FOR_avx512f_scatterdiv16si;
12329      goto scatter_gen;
12330    case IX86_BUILTIN_SCATTERALTSIV4DF:
12331      icode = CODE_FOR_avx512vl_scattersiv4df;
12332      goto scatter_gen;
12333    case IX86_BUILTIN_SCATTERALTDIV8SF:
12334      icode = CODE_FOR_avx512vl_scatterdiv8sf;
12335      goto scatter_gen;
12336    case IX86_BUILTIN_SCATTERALTSIV4DI:
12337      icode = CODE_FOR_avx512vl_scattersiv4di;
12338      goto scatter_gen;
12339    case IX86_BUILTIN_SCATTERALTDIV8SI:
12340      icode = CODE_FOR_avx512vl_scatterdiv8si;
12341      goto scatter_gen;
12342    case IX86_BUILTIN_SCATTERALTSIV2DF:
12343      icode = CODE_FOR_avx512vl_scattersiv2df;
12344      goto scatter_gen;
12345    case IX86_BUILTIN_SCATTERALTDIV4SF:
12346      icode = CODE_FOR_avx512vl_scatterdiv4sf;
12347      goto scatter_gen;
12348    case IX86_BUILTIN_SCATTERALTSIV2DI:
12349      icode = CODE_FOR_avx512vl_scattersiv2di;
12350      goto scatter_gen;
12351    case IX86_BUILTIN_SCATTERALTDIV4SI:
12352      icode = CODE_FOR_avx512vl_scatterdiv4si;
12353      goto scatter_gen;
12354    case IX86_BUILTIN_GATHERPFDPS:
12355      icode = CODE_FOR_avx512pf_gatherpfv16sisf;
12356      goto vec_prefetch_gen;
12357    case IX86_BUILTIN_GATHERPFQPD:
12358      icode = CODE_FOR_avx512pf_gatherpfv8didf;
12359      goto vec_prefetch_gen;
12360    case IX86_BUILTIN_GATHERPFQPS:
12361      icode = CODE_FOR_avx512pf_gatherpfv8disf;
12362      goto vec_prefetch_gen;
12363    case IX86_BUILTIN_SCATTERPFDPD:
12364      icode = CODE_FOR_avx512pf_scatterpfv8sidf;
12365      goto vec_prefetch_gen;
12366    case IX86_BUILTIN_SCATTERPFDPS:
12367      icode = CODE_FOR_avx512pf_scatterpfv16sisf;
12368      goto vec_prefetch_gen;
12369    case IX86_BUILTIN_SCATTERPFQPD:
12370      icode = CODE_FOR_avx512pf_scatterpfv8didf;
12371      goto vec_prefetch_gen;
12372    case IX86_BUILTIN_SCATTERPFQPS:
12373      icode = CODE_FOR_avx512pf_scatterpfv8disf;
12374      goto vec_prefetch_gen;
12375
12376    gather_gen:
12377      rtx half;
12378      rtx (*gen) (rtx, rtx);
12379
12380      arg0 = CALL_EXPR_ARG (exp, 0);
12381      arg1 = CALL_EXPR_ARG (exp, 1);
12382      arg2 = CALL_EXPR_ARG (exp, 2);
12383      arg3 = CALL_EXPR_ARG (exp, 3);
12384      arg4 = CALL_EXPR_ARG (exp, 4);
12385      op0 = expand_normal (arg0);
12386      op1 = expand_normal (arg1);
12387      op2 = expand_normal (arg2);
12388      op3 = expand_normal (arg3);
12389      op4 = expand_normal (arg4);
12390      /* Note the arg order is different from the operand order.  */
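      /* Roughly: arguments 0-4 are the source/merge vector, the base
	 pointer, the index vector, the mask and the scale, and they
	 become insn operands 1-5; operand 0 is the destination.  */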
12391      mode0 = insn_data[icode].operand[1].mode;
12392      mode2 = insn_data[icode].operand[3].mode;
12393      mode3 = insn_data[icode].operand[4].mode;
12394      mode4 = insn_data[icode].operand[5].mode;
12395
12396      if (target == NULL_RTX
12397	  || GET_MODE (target) != insn_data[icode].operand[0].mode
12398	  || !insn_data[icode].operand[0].predicate (target,
12399						     GET_MODE (target)))
12400	subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
12401      else
12402	subtarget = target;
12403
12404      switch (fcode)
12405	{
12406	case IX86_BUILTIN_GATHER3ALTSIV8DF:
12407	case IX86_BUILTIN_GATHER3ALTSIV8DI:
12408	  half = gen_reg_rtx (V8SImode);
12409	  if (!nonimmediate_operand (op2, V16SImode))
12410	    op2 = copy_to_mode_reg (V16SImode, op2);
12411	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
12412	  op2 = half;
12413	  break;
12414	case IX86_BUILTIN_GATHER3ALTSIV4DF:
12415	case IX86_BUILTIN_GATHER3ALTSIV4DI:
12416	case IX86_BUILTIN_GATHERALTSIV4DF:
12417	case IX86_BUILTIN_GATHERALTSIV4DI:
12418	  half = gen_reg_rtx (V4SImode);
12419	  if (!nonimmediate_operand (op2, V8SImode))
12420	    op2 = copy_to_mode_reg (V8SImode, op2);
12421	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
12422	  op2 = half;
12423	  break;
12424	case IX86_BUILTIN_GATHER3ALTDIV16SF:
12425	case IX86_BUILTIN_GATHER3ALTDIV16SI:
12426	  half = gen_reg_rtx (mode0);
12427	  if (mode0 == V8SFmode)
12428	    gen = gen_vec_extract_lo_v16sf;
12429	  else
12430	    gen = gen_vec_extract_lo_v16si;
12431	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
12432	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12433	  emit_insn (gen (half, op0));
12434	  op0 = half;
12435	  op3 = lowpart_subreg (QImode, op3, HImode);
12436	  break;
12437	case IX86_BUILTIN_GATHER3ALTDIV8SF:
12438	case IX86_BUILTIN_GATHER3ALTDIV8SI:
12439	case IX86_BUILTIN_GATHERALTDIV8SF:
12440	case IX86_BUILTIN_GATHERALTDIV8SI:
12441	  half = gen_reg_rtx (mode0);
12442	  if (mode0 == V4SFmode)
12443	    gen = gen_vec_extract_lo_v8sf;
12444	  else
12445	    gen = gen_vec_extract_lo_v8si;
12446	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
12447	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12448	  emit_insn (gen (half, op0));
12449	  op0 = half;
12450	  if (VECTOR_MODE_P (GET_MODE (op3)))
12451	    {
12452	      half = gen_reg_rtx (mode0);
12453	      if (!nonimmediate_operand (op3, GET_MODE (op3)))
12454		op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12455	      emit_insn (gen (half, op3));
12456	      op3 = half;
12457	    }
12458	  break;
12459	default:
12460	  break;
12461	}
12462
      /* Force the memory operand to be addressed through a base
	 register only.  We don't want to do this to the memory
	 operands of other builtin functions.  */
12466      op1 = ix86_zero_extend_to_Pmode (op1);
12467
12468      if (!insn_data[icode].operand[1].predicate (op0, mode0))
12469	op0 = copy_to_mode_reg (mode0, op0);
12470      if (!insn_data[icode].operand[2].predicate (op1, Pmode))
12471	op1 = copy_to_mode_reg (Pmode, op1);
12472      if (!insn_data[icode].operand[3].predicate (op2, mode2))
12473	op2 = copy_to_mode_reg (mode2, op2);
12474
12475      op3 = fixup_modeless_constant (op3, mode3);
12476
12477      if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
12478	{
12479	  if (!insn_data[icode].operand[4].predicate (op3, mode3))
12480	    op3 = copy_to_mode_reg (mode3, op3);
12481	}
12482      else
12483	{
12484	  op3 = copy_to_reg (op3);
12485	  op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
12486	}
12487      if (!insn_data[icode].operand[5].predicate (op4, mode4))
12488	{
12489          error ("the last argument must be scale 1, 2, 4, 8");
12490          return const0_rtx;
12491	}
12492
12493      /* Optimize.  If mask is known to have all high bits set,
12494	 replace op0 with pc_rtx to signal that the instruction
12495	 overwrites the whole destination and doesn't use its
12496	 previous contents.  */
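      /* For instance, a mask argument that folds to a VECTOR_CST whose
	 elements all have the sign bit set (e.g. _mm256_set1_epi32 (-1))
	 is handled by the VECTOR_CST arm below.  */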
12497      if (optimize)
12498	{
12499	  if (TREE_CODE (arg3) == INTEGER_CST)
12500	    {
12501	      if (integer_all_onesp (arg3))
12502		op0 = pc_rtx;
12503	    }
12504	  else if (TREE_CODE (arg3) == VECTOR_CST)
12505	    {
12506	      unsigned int negative = 0;
12507	      for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
12508		{
12509		  tree cst = VECTOR_CST_ELT (arg3, i);
12510		  if (TREE_CODE (cst) == INTEGER_CST
12511		      && tree_int_cst_sign_bit (cst))
12512		    negative++;
12513		  else if (TREE_CODE (cst) == REAL_CST
12514			   && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
12515		    negative++;
12516		}
12517	      if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
12518		op0 = pc_rtx;
12519	    }
12520	  else if (TREE_CODE (arg3) == SSA_NAME
12521		   && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
12522	    {
	      /* Also recognize when the mask is like:
12524		 __v2df src = _mm_setzero_pd ();
12525		 __v2df mask = _mm_cmpeq_pd (src, src);
12526		 or
12527		 __v8sf src = _mm256_setzero_ps ();
12528		 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12529		 as that is a cheaper way to load all ones into
12530		 a register than having to load a constant from
12531		 memory.  */
12532	      gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
12533	      if (is_gimple_call (def_stmt))
12534		{
12535		  tree fndecl = gimple_call_fndecl (def_stmt);
12536		  if (fndecl
12537		      && fndecl_built_in_p (fndecl, BUILT_IN_MD))
12538		    switch (DECL_MD_FUNCTION_CODE (fndecl))
12539		      {
12540		      case IX86_BUILTIN_CMPPD:
12541		      case IX86_BUILTIN_CMPPS:
12542		      case IX86_BUILTIN_CMPPD256:
12543		      case IX86_BUILTIN_CMPPS256:
12544			if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
12545			  break;
12546			/* FALLTHRU */
12547		      case IX86_BUILTIN_CMPEQPD:
12548		      case IX86_BUILTIN_CMPEQPS:
12549			if (initializer_zerop (gimple_call_arg (def_stmt, 0))
12550			    && initializer_zerop (gimple_call_arg (def_stmt,
12551								   1)))
12552			  op0 = pc_rtx;
12553			break;
12554		      default:
12555			break;
12556		      }
12557		}
12558	    }
12559	}
12560
12561      pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
12562      if (! pat)
12563	return const0_rtx;
12564      emit_insn (pat);
12565
12566      switch (fcode)
12567	{
12568	case IX86_BUILTIN_GATHER3DIV16SF:
12569	  if (target == NULL_RTX)
12570	    target = gen_reg_rtx (V8SFmode);
12571	  emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
12572	  break;
12573	case IX86_BUILTIN_GATHER3DIV16SI:
12574	  if (target == NULL_RTX)
12575	    target = gen_reg_rtx (V8SImode);
12576	  emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
12577	  break;
12578	case IX86_BUILTIN_GATHER3DIV8SF:
12579	case IX86_BUILTIN_GATHERDIV8SF:
12580	  if (target == NULL_RTX)
12581	    target = gen_reg_rtx (V4SFmode);
12582	  emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
12583	  break;
12584	case IX86_BUILTIN_GATHER3DIV8SI:
12585	case IX86_BUILTIN_GATHERDIV8SI:
12586	  if (target == NULL_RTX)
12587	    target = gen_reg_rtx (V4SImode);
12588	  emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
12589	  break;
12590	default:
12591	  target = subtarget;
12592	  break;
12593	}
12594      return target;
12595
12596    scatter_gen:
12597      arg0 = CALL_EXPR_ARG (exp, 0);
12598      arg1 = CALL_EXPR_ARG (exp, 1);
12599      arg2 = CALL_EXPR_ARG (exp, 2);
12600      arg3 = CALL_EXPR_ARG (exp, 3);
12601      arg4 = CALL_EXPR_ARG (exp, 4);
12602      op0 = expand_normal (arg0);
12603      op1 = expand_normal (arg1);
12604      op2 = expand_normal (arg2);
12605      op3 = expand_normal (arg3);
12606      op4 = expand_normal (arg4);
12607      mode1 = insn_data[icode].operand[1].mode;
12608      mode2 = insn_data[icode].operand[2].mode;
12609      mode3 = insn_data[icode].operand[3].mode;
12610      mode4 = insn_data[icode].operand[4].mode;
12611
      /* The scatter instruction stores operand op3 to memory with
	 indices from op2 and scale from op4 under writemask op1.
	 If index operand op2 has more elements than source operand
	 op3, only its low half is used, and vice versa.  */
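      /* For example, IX86_BUILTIN_SCATTERALTSIV8DF pairs a V16SI index
	 vector with V8DF data, so only the low V8SI half of the index
	 is extracted and used below.  */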
12616      switch (fcode)
12617	{
12618	case IX86_BUILTIN_SCATTERALTSIV8DF:
12619	case IX86_BUILTIN_SCATTERALTSIV8DI:
12620	  half = gen_reg_rtx (V8SImode);
12621	  if (!nonimmediate_operand (op2, V16SImode))
12622	    op2 = copy_to_mode_reg (V16SImode, op2);
12623	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
12624	  op2 = half;
12625	  break;
12626	case IX86_BUILTIN_SCATTERALTDIV16SF:
12627	case IX86_BUILTIN_SCATTERALTDIV16SI:
12628	  half = gen_reg_rtx (mode3);
12629	  if (mode3 == V8SFmode)
12630	    gen = gen_vec_extract_lo_v16sf;
12631	  else
12632	    gen = gen_vec_extract_lo_v16si;
12633	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
12634	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12635	  emit_insn (gen (half, op3));
12636	  op3 = half;
12637	  break;
12638	case IX86_BUILTIN_SCATTERALTSIV4DF:
12639	case IX86_BUILTIN_SCATTERALTSIV4DI:
12640	  half = gen_reg_rtx (V4SImode);
12641	  if (!nonimmediate_operand (op2, V8SImode))
12642	    op2 = copy_to_mode_reg (V8SImode, op2);
12643	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
12644	  op2 = half;
12645	  break;
12646	case IX86_BUILTIN_SCATTERALTDIV8SF:
12647	case IX86_BUILTIN_SCATTERALTDIV8SI:
12648	  half = gen_reg_rtx (mode3);
12649	  if (mode3 == V4SFmode)
12650	    gen = gen_vec_extract_lo_v8sf;
12651	  else
12652	    gen = gen_vec_extract_lo_v8si;
12653	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
12654	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12655	  emit_insn (gen (half, op3));
12656	  op3 = half;
12657	  break;
12658	case IX86_BUILTIN_SCATTERALTSIV2DF:
12659	case IX86_BUILTIN_SCATTERALTSIV2DI:
12660	  if (!nonimmediate_operand (op2, V4SImode))
12661	    op2 = copy_to_mode_reg (V4SImode, op2);
12662	  break;
12663	case IX86_BUILTIN_SCATTERALTDIV4SF:
12664	case IX86_BUILTIN_SCATTERALTDIV4SI:
12665	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
12666	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12667	  break;
12668	default:
12669	  break;
12670	}
12671
      /* Force the memory operand to be addressed through a base
	 register only.  We don't want to do this to the memory
	 operands of other builtin functions.  */
12675      op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
12676
12677      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12678	op0 = copy_to_mode_reg (Pmode, op0);
12679
12680      op1 = fixup_modeless_constant (op1, mode1);
12681
12682      if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
12683	{
12684	  if (!insn_data[icode].operand[1].predicate (op1, mode1))
12685	    op1 = copy_to_mode_reg (mode1, op1);
12686	}
12687      else
12688	{
12689	  op1 = copy_to_reg (op1);
12690	  op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
12691	}
12692
12693      if (!insn_data[icode].operand[2].predicate (op2, mode2))
12694	op2 = copy_to_mode_reg (mode2, op2);
12695
12696      if (!insn_data[icode].operand[3].predicate (op3, mode3))
12697	op3 = copy_to_mode_reg (mode3, op3);
12698
12699      if (!insn_data[icode].operand[4].predicate (op4, mode4))
12700	{
12701	  error ("the last argument must be scale 1, 2, 4, 8");
12702	  return const0_rtx;
12703	}
12704
12705      pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12706      if (! pat)
12707	return const0_rtx;
12708
12709      emit_insn (pat);
12710      return 0;
12711
12712    vec_prefetch_gen:
12713      arg0 = CALL_EXPR_ARG (exp, 0);
12714      arg1 = CALL_EXPR_ARG (exp, 1);
12715      arg2 = CALL_EXPR_ARG (exp, 2);
12716      arg3 = CALL_EXPR_ARG (exp, 3);
12717      arg4 = CALL_EXPR_ARG (exp, 4);
12718      op0 = expand_normal (arg0);
12719      op1 = expand_normal (arg1);
12720      op2 = expand_normal (arg2);
12721      op3 = expand_normal (arg3);
12722      op4 = expand_normal (arg4);
12723      mode0 = insn_data[icode].operand[0].mode;
12724      mode1 = insn_data[icode].operand[1].mode;
12725      mode3 = insn_data[icode].operand[3].mode;
12726      mode4 = insn_data[icode].operand[4].mode;
12727
12728      op0 = fixup_modeless_constant (op0, mode0);
12729
12730      if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
12731	{
12732	  if (!insn_data[icode].operand[0].predicate (op0, mode0))
12733	    op0 = copy_to_mode_reg (mode0, op0);
12734	}
12735      else
12736	{
12737	  op0 = copy_to_reg (op0);
12738	  op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
12739	}
12740
12741      if (!insn_data[icode].operand[1].predicate (op1, mode1))
12742	op1 = copy_to_mode_reg (mode1, op1);
12743
      /* Force the memory operand to be addressed through a base
	 register only.  We don't want to do this to the memory
	 operands of other builtin functions.  */
12747      op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
12748
12749      if (!insn_data[icode].operand[2].predicate (op2, Pmode))
12750	op2 = copy_to_mode_reg (Pmode, op2);
12751
12752      if (!insn_data[icode].operand[3].predicate (op3, mode3))
12753	{
12754	  error ("the forth argument must be scale 1, 2, 4, 8");
12755	  return const0_rtx;
12756	}
12757
12758      if (!insn_data[icode].operand[4].predicate (op4, mode4))
12759	{
12760	  error ("incorrect hint operand");
12761	  return const0_rtx;
12762	}
12763
12764      pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12765      if (! pat)
12766	return const0_rtx;
12767
12768      emit_insn (pat);
12769
12770      return 0;
12771
12772    case IX86_BUILTIN_XABORT:
12773      icode = CODE_FOR_xabort;
12774      arg0 = CALL_EXPR_ARG (exp, 0);
12775      op0 = expand_normal (arg0);
12776      mode0 = insn_data[icode].operand[0].mode;
12777      if (!insn_data[icode].operand[0].predicate (op0, mode0))
12778	{
12779	  error ("the argument to %<xabort%> intrinsic must "
12780		 "be an 8-bit immediate");
12781	  return const0_rtx;
12782	}
12783      emit_insn (gen_xabort (op0));
12784      return 0;
12785
12786    case IX86_BUILTIN_RSTORSSP:
12787    case IX86_BUILTIN_CLRSSBSY:
12788      arg0 = CALL_EXPR_ARG (exp, 0);
12789      op0 = expand_normal (arg0);
12790      icode = (fcode == IX86_BUILTIN_RSTORSSP
12791	  ? CODE_FOR_rstorssp
12792	  : CODE_FOR_clrssbsy);
12793      if (!address_operand (op0, VOIDmode))
12794	{
12795	  op1 = convert_memory_address (Pmode, op0);
12796	  op0 = copy_addr_to_reg (op1);
12797	}
12798      emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
12799      return 0;
12800
12801    case IX86_BUILTIN_WRSSD:
12802    case IX86_BUILTIN_WRSSQ:
12803    case IX86_BUILTIN_WRUSSD:
12804    case IX86_BUILTIN_WRUSSQ:
12805      arg0 = CALL_EXPR_ARG (exp, 0);
12806      op0 = expand_normal (arg0);
12807      arg1 = CALL_EXPR_ARG (exp, 1);
12808      op1 = expand_normal (arg1);
12809      switch (fcode)
12810	{
12811	case IX86_BUILTIN_WRSSD:
12812	  icode = CODE_FOR_wrsssi;
12813	  mode = SImode;
12814	  break;
12815	case IX86_BUILTIN_WRSSQ:
12816	  icode = CODE_FOR_wrssdi;
12817	  mode = DImode;
12818	  break;
12819	case IX86_BUILTIN_WRUSSD:
12820	  icode = CODE_FOR_wrusssi;
12821	  mode = SImode;
12822	  break;
12823	case IX86_BUILTIN_WRUSSQ:
12824	  icode = CODE_FOR_wrussdi;
12825	  mode = DImode;
12826	  break;
12827	}
12828      op0 = force_reg (mode, op0);
12829      if (!address_operand (op1, VOIDmode))
12830	{
12831	  op2 = convert_memory_address (Pmode, op1);
12832	  op1 = copy_addr_to_reg (op2);
12833	}
12834      emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
12835      return 0;
12836
12837    case IX86_BUILTIN_VZEROUPPER:
12838      cfun->machine->has_explicit_vzeroupper = true;
12839      break;
12840
12841    default:
12842      break;
12843    }
12844
12845  if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
12846      && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
12847    {
12848      i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
12849      return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
12850					       target);
12851    }
12852
12853  if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
12854      && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
12855    {
12856      i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
12857      rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
12858      rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
12859      rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
12860      int masked = 1;
12861      machine_mode mode, wide_mode, nar_mode;
12862
12863      nar_mode  = V4SFmode;
12864      mode      = V16SFmode;
12865      wide_mode = V64SFmode;
12866      fcn_mask  = gen_avx5124fmaddps_4fmaddps_mask;
12867      fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
12868
12869      switch (fcode)
12870	{
12871	case IX86_BUILTIN_4FMAPS:
12872	  fcn = gen_avx5124fmaddps_4fmaddps;
12873	  masked = 0;
12874	  goto v4fma_expand;
12875
12876	case IX86_BUILTIN_4DPWSSD:
12877	  nar_mode  = V4SImode;
12878	  mode      = V16SImode;
12879	  wide_mode = V64SImode;
12880	  fcn = gen_avx5124vnniw_vp4dpwssd;
12881	  masked = 0;
12882	  goto v4fma_expand;
12883
12884	case IX86_BUILTIN_4DPWSSDS:
12885	  nar_mode  = V4SImode;
12886	  mode      = V16SImode;
12887	  wide_mode = V64SImode;
12888	  fcn = gen_avx5124vnniw_vp4dpwssds;
12889	  masked = 0;
12890	  goto v4fma_expand;
12891
12892	case IX86_BUILTIN_4FNMAPS:
12893	  fcn = gen_avx5124fmaddps_4fnmaddps;
12894	  masked = 0;
12895	  goto v4fma_expand;
12896
12897	case IX86_BUILTIN_4FNMAPS_MASK:
12898	  fcn_mask  = gen_avx5124fmaddps_4fnmaddps_mask;
12899	  fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
12900	  goto v4fma_expand;
12901
12902	case IX86_BUILTIN_4DPWSSD_MASK:
12903	  nar_mode  = V4SImode;
12904	  mode      = V16SImode;
12905	  wide_mode = V64SImode;
12906	  fcn_mask  = gen_avx5124vnniw_vp4dpwssd_mask;
12907	  fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
12908	  goto v4fma_expand;
12909
12910	case IX86_BUILTIN_4DPWSSDS_MASK:
12911	  nar_mode  = V4SImode;
12912	  mode      = V16SImode;
12913	  wide_mode = V64SImode;
12914	  fcn_mask  = gen_avx5124vnniw_vp4dpwssds_mask;
12915	  fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
12916	  goto v4fma_expand;
12917
12918	case IX86_BUILTIN_4FMAPS_MASK:
12919	  {
12920	    tree args[4];
12921	    rtx ops[4];
12922	    rtx wide_reg;
12923	    rtx accum;
12924	    rtx addr;
12925	    rtx mem;
12926
12927v4fma_expand:
12928	    wide_reg = gen_reg_rtx (wide_mode);
12929	    for (i = 0; i < 4; i++)
12930	      {
12931		args[i] = CALL_EXPR_ARG (exp, i);
12932		ops[i] = expand_normal (args[i]);
12933
12934		emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
12935				ops[i]);
12936	      }
12937
12938	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
12939	    accum = force_reg (mode, accum);
12940
12941	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
12942	    addr = force_reg (Pmode, addr);
12943
12944	    mem = gen_rtx_MEM (nar_mode, addr);
12945
12946	    target = gen_reg_rtx (mode);
12947
12948	    emit_move_insn (target, accum);
12949
12950	    if (! masked)
12951	      emit_insn (fcn (target, accum, wide_reg, mem));
12952	    else
12953	      {
12954		rtx merge, mask;
12955		merge = expand_normal (CALL_EXPR_ARG (exp, 6));
12956
12957		mask = expand_normal (CALL_EXPR_ARG (exp, 7));
12958
12959		if (CONST_INT_P (mask))
12960		  mask = fixup_modeless_constant (mask, HImode);
12961
12962		mask = force_reg (HImode, mask);
12963
12964		if (GET_MODE (mask) != HImode)
12965		  mask = gen_rtx_SUBREG (HImode, mask, 0);
12966
		/* If merge is 0 then we're about to emit the z-masked
		   variant.  */
12968		if (const0_operand (merge, mode))
12969		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
		/* If merge is the same as accum then emit the merge-masked
		   variant.  */
12971		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
12972		  {
12973		    merge = force_reg (mode, merge);
12974		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
12975		  }
		/* Merging with something unknown can happen if we z-mask
		   w/ -O0.  */
12977		else
12978		  {
12979		    target = gen_reg_rtx (mode);
12980		    emit_move_insn (target, merge);
12981		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
12982		  }
12983	      }
12984	    return target;
12985	  }
12986
12987	case IX86_BUILTIN_4FNMASS:
12988	  fcn = gen_avx5124fmaddps_4fnmaddss;
12989	  masked = 0;
12990	  goto s4fma_expand;
12991
12992	case IX86_BUILTIN_4FMASS:
12993	  fcn = gen_avx5124fmaddps_4fmaddss;
12994	  masked = 0;
12995	  goto s4fma_expand;
12996
12997	case IX86_BUILTIN_4FNMASS_MASK:
12998	  fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
12999	  fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
13000	  goto s4fma_expand;
13001
13002	case IX86_BUILTIN_4FMASS_MASK:
13003	  {
13004	    tree args[4];
13005	    rtx ops[4];
13006	    rtx wide_reg;
13007	    rtx accum;
13008	    rtx addr;
13009	    rtx mem;
13010
13011	    fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
13012	    fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
13013
13014s4fma_expand:
13015	    mode = V4SFmode;
13016	    wide_reg = gen_reg_rtx (V64SFmode);
13017	    for (i = 0; i < 4; i++)
13018	      {
13019		rtx tmp;
13020		args[i] = CALL_EXPR_ARG (exp, i);
13021		ops[i] = expand_normal (args[i]);
13022
13023		tmp = gen_reg_rtx (SFmode);
13024		emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
13025
13026		emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
13027				gen_rtx_SUBREG (V16SFmode, tmp, 0));
13028	      }
13029
13030	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13031	    accum = force_reg (V4SFmode, accum);
13032
13033	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13034	    addr = force_reg (Pmode, addr);
13035
13036	    mem = gen_rtx_MEM (V4SFmode, addr);
13037
13038	    target = gen_reg_rtx (V4SFmode);
13039
13040	    emit_move_insn (target, accum);
13041
13042	    if (! masked)
13043	      emit_insn (fcn (target, accum, wide_reg, mem));
13044	    else
13045	      {
13046		rtx merge, mask;
13047		merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13048
13049		mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13050
13051		if (CONST_INT_P (mask))
13052		  mask = fixup_modeless_constant (mask, QImode);
13053
13054		mask = force_reg (QImode, mask);
13055
13056		if (GET_MODE (mask) != QImode)
13057		  mask = gen_rtx_SUBREG (QImode, mask, 0);
13058
		/* If merge is 0 then we're about to emit the z-masked
		   variant.  */
13060		if (const0_operand (merge, mode))
13061		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
		/* If merge is the same as accum then emit the merge-masked
		   variant.  */
13064		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13065		  {
13066		    merge = force_reg (mode, merge);
13067		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13068		  }
		/* Merging with something unknown can happen if we z-mask
		   w/ -O0.  */
13071		else
13072		  {
13073		    target = gen_reg_rtx (mode);
13074		    emit_move_insn (target, merge);
13075		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13076		  }
13077		}
13078	      return target;
13079	    }
13080	  case IX86_BUILTIN_RDPID:
13081	    return ix86_expand_special_args_builtin (bdesc_args + i, exp,
13082						     target);
13083	  case IX86_BUILTIN_FABSQ:
13084	  case IX86_BUILTIN_COPYSIGNQ:
13085	    if (!TARGET_SSE)
13086	      /* Emit a normal call if SSE isn't available.  */
13087	      return expand_call (exp, target, ignore);
13088	    /* FALLTHRU */
13089	  default:
13090	    return ix86_expand_args_builtin (bdesc_args + i, exp, target);
13091	  }
13092    }
13093
13094  if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
13095      && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
13096    {
13097      i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
13098      return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
13099    }
13100
13101  if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13102      && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
13103    {
13104      i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
13105      return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
13106    }
13107
13108  if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13109      && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
13110    {
13111      i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
13112      return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
13113    }
13114
13115  if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13116      && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
13117    {
13118      i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
13119      return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
13120    }
13121
13122  if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13123      && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
13124    {
13125      i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
13126      const struct builtin_description *d = bdesc_multi_arg + i;
13127      return ix86_expand_multi_arg_builtin (d->icode, exp, target,
13128					    (enum ix86_builtin_func_type)
13129					    d->flag, d->comparison);
13130    }
13131
13132  if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
13133      && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
13134    {
13135      i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
13136      return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
13137					       target);
13138    }
13139
13140  if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
13141      && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
13142    {
13143      i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
13144      return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
13145				       target);
13146    }
13147
13148  gcc_unreachable ();
13149}
13150
13151/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
13152   fill target with val via vec_duplicate.  */
13153
13154static bool
13155ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
13156{
13157  bool ok;
13158  rtx_insn *insn;
13159  rtx dup;
13160
13161  /* First attempt to recognize VAL as-is.  */
13162  dup = gen_vec_duplicate (mode, val);
13163  insn = emit_insn (gen_rtx_SET (target, dup));
13164  if (recog_memoized (insn) < 0)
13165    {
13166      rtx_insn *seq;
13167      machine_mode innermode = GET_MODE_INNER (mode);
13168      rtx reg;
13169
13170      /* If that fails, force VAL into a register.  */
13171
13172      start_sequence ();
13173      reg = force_reg (innermode, val);
13174      if (GET_MODE (reg) != innermode)
13175	reg = gen_lowpart (innermode, reg);
13176      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
13177      seq = get_insns ();
13178      end_sequence ();
13179      if (seq)
13180	emit_insn_before (seq, insn);
13181
13182      ok = recog_memoized (insn) >= 0;
13183      gcc_assert (ok);
13184    }
13185  return true;
13186}
13187
13188/* Get a vector mode of the same size as the original but with elements
13189   twice as wide.  This is only guaranteed to apply to integral vectors.  */
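/* For instance, V16QImode maps to V8HImode and V8SImode maps to
   V4DImode: the overall vector size stays the same while the number of
   elements is halved.  */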
13190
13191static machine_mode
13192get_mode_wider_vector (machine_mode o)
13193{
13194  /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
13195  machine_mode n = GET_MODE_WIDER_MODE (o).require ();
13196  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
13197  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
13198  return n;
13199}
13200
13201static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
13202static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
13203
13204/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
13205   with all elements equal to VAR.  Return true if successful.  */
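/* A rough sketch of the strategy below: modes with a native broadcast
   go through ix86_vector_duplicate_value; pre-AVX2 V8HImode and
   V16QImode fall back to a single-element insert plus a permute; the
   narrow MMX modes mostly widen the scalar and recurse; 256- and
   512-bit modes without the required ISA build a half-width vector and
   concatenate it with itself.  */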
13206
13207static bool
13208ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
13209				   rtx target, rtx val)
13210{
13211  bool ok;
13212
13213  switch (mode)
13214    {
13215    case E_V2SImode:
13216    case E_V2SFmode:
13217      if (!mmx_ok)
13218	return false;
13219      /* FALLTHRU */
13220
13221    case E_V4DFmode:
13222    case E_V4DImode:
13223    case E_V8SFmode:
13224    case E_V8SImode:
13225    case E_V2DFmode:
13226    case E_V2DImode:
13227    case E_V4SFmode:
13228    case E_V4SImode:
13229    case E_V16SImode:
13230    case E_V8DImode:
13231    case E_V16SFmode:
13232    case E_V8DFmode:
13233      return ix86_vector_duplicate_value (mode, target, val);
13234
13235    case E_V4HImode:
13236      if (!mmx_ok)
13237	return false;
13238      if (TARGET_SSE || TARGET_3DNOW_A)
13239	{
13240	  rtx x;
13241
13242	  val = gen_lowpart (SImode, val);
13243	  x = gen_rtx_TRUNCATE (HImode, val);
13244	  x = gen_rtx_VEC_DUPLICATE (mode, x);
13245	  emit_insn (gen_rtx_SET (target, x));
13246	  return true;
13247	}
13248      goto widen;
13249
13250    case E_V8QImode:
13251      if (!mmx_ok)
13252	return false;
13253      goto widen;
13254
13255    case E_V8HImode:
13256      if (TARGET_AVX2)
13257	return ix86_vector_duplicate_value (mode, target, val);
13258
13259      if (TARGET_SSE2)
13260	{
13261	  struct expand_vec_perm_d dperm;
13262	  rtx tmp1, tmp2;
13263
13264	permute:
13265	  memset (&dperm, 0, sizeof (dperm));
13266	  dperm.target = target;
13267	  dperm.vmode = mode;
13268	  dperm.nelt = GET_MODE_NUNITS (mode);
13269	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
13270	  dperm.one_operand_p = true;
13271
13272	  /* Extend to SImode using a paradoxical SUBREG.  */
13273	  tmp1 = gen_reg_rtx (SImode);
13274	  emit_move_insn (tmp1, gen_lowpart (SImode, val));
13275
	  /* Insert the SImode value as the low element of a V4SImode
	     vector.  */
13277	  tmp2 = gen_reg_rtx (V4SImode);
13278	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
13279	  emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
13280
13281	  ok = (expand_vec_perm_1 (&dperm)
13282		|| expand_vec_perm_broadcast_1 (&dperm));
13283	  gcc_assert (ok);
13284	  return ok;
13285	}
13286      goto widen;
13287
13288    case E_V16QImode:
13289      if (TARGET_AVX2)
13290	return ix86_vector_duplicate_value (mode, target, val);
13291
13292      if (TARGET_SSE2)
13293	goto permute;
13294      goto widen;
13295
13296    widen:
13297      /* Replicate the value once into the next wider mode and recurse.  */
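      /* E.g. for a QImode value V in V8QImode this computes
	 (V << 8) | V in HImode and recurses with V4HImode.  */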
13298      {
13299	machine_mode smode, wsmode, wvmode;
13300	rtx x;
13301
13302	smode = GET_MODE_INNER (mode);
13303	wvmode = get_mode_wider_vector (mode);
13304	wsmode = GET_MODE_INNER (wvmode);
13305
13306	val = convert_modes (wsmode, smode, val, true);
13307	x = expand_simple_binop (wsmode, ASHIFT, val,
13308				 GEN_INT (GET_MODE_BITSIZE (smode)),
13309				 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13310	val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
13311
13312	x = gen_reg_rtx (wvmode);
13313	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
13314	gcc_assert (ok);
13315	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
13316	return ok;
13317      }
13318
13319    case E_V16HImode:
13320    case E_V32QImode:
13321      if (TARGET_AVX2)
13322	return ix86_vector_duplicate_value (mode, target, val);
13323      else
13324	{
13325	  machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
13326	  rtx x = gen_reg_rtx (hvmode);
13327
13328	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13329	  gcc_assert (ok);
13330
13331	  x = gen_rtx_VEC_CONCAT (mode, x, x);
13332	  emit_insn (gen_rtx_SET (target, x));
13333	}
13334      return true;
13335
13336    case E_V64QImode:
13337    case E_V32HImode:
13338      if (TARGET_AVX512BW)
13339	return ix86_vector_duplicate_value (mode, target, val);
13340      else
13341	{
13342	  machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
13343	  rtx x = gen_reg_rtx (hvmode);
13344
13345	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13346	  gcc_assert (ok);
13347
13348	  x = gen_rtx_VEC_CONCAT (mode, x, x);
13349	  emit_insn (gen_rtx_SET (target, x));
13350	}
13351      return true;
13352
13353    default:
13354      return false;
13355    }
13356}
13357
13358/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
13359   whose ONE_VAR element is VAR, and other elements are zero.  Return true
13360   if successful.  */
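/* For example, building { x, 0, 0, 0 } in V4SFmode with SSE4.1 zeroes
   TARGET and inserts the single variable element via
   ix86_expand_vector_set; without a suitable vector-set insn the
   V4SF/V4SI path instead duplicates the value and merges it with
   zero.  */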
13361
13362static bool
13363ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
13364				     rtx target, rtx var, int one_var)
13365{
13366  machine_mode vsimode;
13367  rtx new_target;
13368  rtx x, tmp;
13369  bool use_vector_set = false;
13370  rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
13371
13372  switch (mode)
13373    {
13374    case E_V2DImode:
13375      /* For SSE4.1, we normally use vector set.  But if the second
13376	 element is zero and inter-unit moves are OK, we use movq
13377	 instead.  */
13378      use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
13379			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
13380			     && one_var == 0));
13381      break;
13382    case E_V16QImode:
13383    case E_V4SImode:
13384    case E_V4SFmode:
13385      use_vector_set = TARGET_SSE4_1;
13386      break;
13387    case E_V8HImode:
13388      use_vector_set = TARGET_SSE2;
13389      break;
13390    case E_V8QImode:
13391      use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
13392      break;
13393    case E_V4HImode:
13394      use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
13395      break;
13396    case E_V32QImode:
13397    case E_V16HImode:
13398      use_vector_set = TARGET_AVX;
13399      break;
13400    case E_V8SImode:
13401      use_vector_set = TARGET_AVX;
13402      gen_vec_set_0 = gen_vec_setv8si_0;
13403      break;
13404    case E_V8SFmode:
13405      use_vector_set = TARGET_AVX;
13406      gen_vec_set_0 = gen_vec_setv8sf_0;
13407      break;
13408    case E_V4DFmode:
13409      use_vector_set = TARGET_AVX;
13410      gen_vec_set_0 = gen_vec_setv4df_0;
13411      break;
13412    case E_V4DImode:
13413      /* Use ix86_expand_vector_set in 64bit mode only.  */
13414      use_vector_set = TARGET_AVX && TARGET_64BIT;
13415      gen_vec_set_0 = gen_vec_setv4di_0;
13416      break;
13417    case E_V16SImode:
13418      use_vector_set = TARGET_AVX512F && one_var == 0;
13419      gen_vec_set_0 = gen_vec_setv16si_0;
13420      break;
13421    case E_V16SFmode:
13422      use_vector_set = TARGET_AVX512F && one_var == 0;
13423      gen_vec_set_0 = gen_vec_setv16sf_0;
13424      break;
13425    case E_V8DFmode:
13426      use_vector_set = TARGET_AVX512F && one_var == 0;
13427      gen_vec_set_0 = gen_vec_setv8df_0;
13428      break;
13429    case E_V8DImode:
13430      /* Use ix86_expand_vector_set in 64bit mode only.  */
13431      use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
13432      gen_vec_set_0 = gen_vec_setv8di_0;
13433      break;
13434    default:
13435      break;
13436    }
13437
13438  if (use_vector_set)
13439    {
13440      if (gen_vec_set_0 && one_var == 0)
13441	{
13442	  var = force_reg (GET_MODE_INNER (mode), var);
13443	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
13444	  return true;
13445	}
13446      emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
13447      var = force_reg (GET_MODE_INNER (mode), var);
13448      ix86_expand_vector_set (mmx_ok, target, var, one_var);
13449      return true;
13450    }
13451
13452  switch (mode)
13453    {
13454    case E_V2SFmode:
13455    case E_V2SImode:
13456      if (!mmx_ok)
13457	return false;
13458      /* FALLTHRU */
13459
13460    case E_V2DFmode:
13461    case E_V2DImode:
13462      if (one_var != 0)
13463	return false;
13464      var = force_reg (GET_MODE_INNER (mode), var);
13465      x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
13466      emit_insn (gen_rtx_SET (target, x));
13467      return true;
13468
13469    case E_V4SFmode:
13470    case E_V4SImode:
13471      if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
13472	new_target = gen_reg_rtx (mode);
13473      else
13474	new_target = target;
13475      var = force_reg (GET_MODE_INNER (mode), var);
13476      x = gen_rtx_VEC_DUPLICATE (mode, var);
13477      x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
13478      emit_insn (gen_rtx_SET (new_target, x));
13479      if (one_var != 0)
13480	{
13481	  /* We need to shuffle the value to the correct position, so
13482	     create a new pseudo to store the intermediate result.  */
13483
13484	  /* With SSE2, we can use the integer shuffle insns.  */
13485	  if (mode != V4SFmode && TARGET_SSE2)
13486	    {
13487	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
13488					    const1_rtx,
13489					    GEN_INT (one_var == 1 ? 0 : 1),
13490					    GEN_INT (one_var == 2 ? 0 : 1),
13491					    GEN_INT (one_var == 3 ? 0 : 1)));
13492	      if (target != new_target)
13493		emit_move_insn (target, new_target);
13494	      return true;
13495	    }
13496
13497	  /* Otherwise convert the intermediate result to V4SFmode and
13498	     use the SSE1 shuffle instructions.  */
13499	  if (mode != V4SFmode)
13500	    {
13501	      tmp = gen_reg_rtx (V4SFmode);
13502	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
13503	    }
13504	  else
13505	    tmp = new_target;
13506
13507	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
13508				       const1_rtx,
13509				       GEN_INT (one_var == 1 ? 0 : 1),
13510				       GEN_INT (one_var == 2 ? 0+4 : 1+4),
13511				       GEN_INT (one_var == 3 ? 0+4 : 1+4)));
13512
13513	  if (mode != V4SFmode)
13514	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
13515	  else if (tmp != target)
13516	    emit_move_insn (target, tmp);
13517	}
13518      else if (target != new_target)
13519	emit_move_insn (target, new_target);
13520      return true;
13521
13522    case E_V8HImode:
13523    case E_V16QImode:
13524      vsimode = V4SImode;
13525      goto widen;
13526    case E_V4HImode:
13527    case E_V8QImode:
13528      if (!mmx_ok)
13529	return false;
13530      vsimode = V2SImode;
13531      goto widen;
13532    widen:
13533      if (one_var != 0)
13534	return false;
13535
13536      /* Zero extend the variable element to SImode and recurse.  */
13537      var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
13538
13539      x = gen_reg_rtx (vsimode);
13540      if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
13541						var, one_var))
13542	gcc_unreachable ();
13543
13544      emit_move_insn (target, gen_lowpart (mode, x));
13545      return true;
13546
13547    default:
13548      return false;
13549    }
13550}
13551
13552/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
13553   consisting of the values in VALS.  It is known that all elements
13554   except ONE_VAR are constants.  Return true if successful.  */
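/* For example, { 1, 2, X, 4 } in V4SImode is emitted as the constant
   vector { 1, 2, 0, 4 } followed by a vector set of X into element 2.  */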
13555
13556static bool
13557ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
13558				 rtx target, rtx vals, int one_var)
13559{
13560  rtx var = XVECEXP (vals, 0, one_var);
13561  machine_mode wmode;
13562  rtx const_vec, x;
13563
13564  const_vec = copy_rtx (vals);
13565  XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
13566  const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
13567
13568  switch (mode)
13569    {
13570    case E_V2DFmode:
13571    case E_V2DImode:
13572    case E_V2SFmode:
13573    case E_V2SImode:
13574      /* For the two element vectors, it's just as easy to use
13575	 the general case.  */
13576      return false;
13577
13578    case E_V4DImode:
13579      /* Use ix86_expand_vector_set in 64bit mode only.  */
13580      if (!TARGET_64BIT)
13581	return false;
13582      /* FALLTHRU */
13583    case E_V4DFmode:
13584    case E_V8SFmode:
13585    case E_V8SImode:
13586    case E_V16HImode:
13587    case E_V32QImode:
13588    case E_V4SFmode:
13589    case E_V4SImode:
13590    case E_V8HImode:
13591    case E_V4HImode:
13592      break;
13593
13594    case E_V16QImode:
13595      if (TARGET_SSE4_1)
13596	break;
13597      wmode = V8HImode;
13598      goto widen;
13599    case E_V8QImode:
13600      if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
13601	break;
13602      wmode = V4HImode;
13603      goto widen;
13604    widen:
13605      /* There's no way to set one QImode entry easily.  Combine
13606	 the variable value with its adjacent constant value, and
13607	 promote to an HImode set.  */
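      /* For example, a variable byte 5 of a V16QImode vector is shifted
	 into the high half of an HImode value, combined with the
	 constant byte 4 in the low half, and stored with a vector set
	 of HImode element 2.  */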
13608      x = XVECEXP (vals, 0, one_var ^ 1);
13609      if (one_var & 1)
13610	{
13611	  var = convert_modes (HImode, QImode, var, true);
13612	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
13613				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
13614	  x = GEN_INT (INTVAL (x) & 0xff);
13615	}
13616      else
13617	{
13618	  var = convert_modes (HImode, QImode, var, true);
13619	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
13620	}
13621      if (x != const0_rtx)
13622	var = expand_simple_binop (HImode, IOR, var, x, var,
13623				   1, OPTAB_LIB_WIDEN);
13624
13625      x = gen_reg_rtx (wmode);
13626      emit_move_insn (x, gen_lowpart (wmode, const_vec));
13627      ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
13628
13629      emit_move_insn (target, gen_lowpart (mode, x));
13630      return true;
13631
13632    default:
13633      return false;
13634    }
13635
13636  emit_move_insn (target, const_vec);
13637  ix86_expand_vector_set (mmx_ok, target, var, one_var);
13638  return true;
13639}
13640
13641/* A subroutine of ix86_expand_vector_init_general.  Use vector
13642   concatenate to handle the most general case: all values variable,
13643   and none identical.  */
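/* For instance, a V8SImode initializer with eight distinct variable
   elements is built as two V4SImode halves (each expanded recursively)
   that are then glued together with a single VEC_CONCAT.  */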
13644
13645static void
13646ix86_expand_vector_init_concat (machine_mode mode,
13647				rtx target, rtx *ops, int n)
13648{
13649  machine_mode half_mode = VOIDmode;
13650  rtx half[2];
13651  rtvec v;
13652  int i, j;
13653
13654  switch (n)
13655    {
13656    case 2:
13657      switch (mode)
13658	{
13659	case E_V16SImode:
13660	  half_mode = V8SImode;
13661	  break;
13662	case E_V16SFmode:
13663	  half_mode = V8SFmode;
13664	  break;
13665	case E_V8DImode:
13666	  half_mode = V4DImode;
13667	  break;
13668	case E_V8DFmode:
13669	  half_mode = V4DFmode;
13670	  break;
13671	case E_V8SImode:
13672	  half_mode = V4SImode;
13673	  break;
13674	case E_V8SFmode:
13675	  half_mode = V4SFmode;
13676	  break;
13677	case E_V4DImode:
13678	  half_mode = V2DImode;
13679	  break;
13680	case E_V4DFmode:
13681	  half_mode = V2DFmode;
13682	  break;
13683	case E_V4SImode:
13684	  half_mode = V2SImode;
13685	  break;
13686	case E_V4SFmode:
13687	  half_mode = V2SFmode;
13688	  break;
13689	case E_V2DImode:
13690	  half_mode = DImode;
13691	  break;
13692	case E_V2SImode:
13693	  half_mode = SImode;
13694	  break;
13695	case E_V2DFmode:
13696	  half_mode = DFmode;
13697	  break;
13698	case E_V2SFmode:
13699	  half_mode = SFmode;
13700	  break;
13701	default:
13702	  gcc_unreachable ();
13703	}
13704
13705      if (!register_operand (ops[1], half_mode))
13706	ops[1] = force_reg (half_mode, ops[1]);
13707      if (!register_operand (ops[0], half_mode))
13708	ops[0] = force_reg (half_mode, ops[0]);
13709      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
13710							  ops[1])));
13711      break;
13712
13713    case 4:
13714      switch (mode)
13715	{
13716	case E_V4DImode:
13717	  half_mode = V2DImode;
13718	  break;
13719	case E_V4DFmode:
13720	  half_mode = V2DFmode;
13721	  break;
13722	case E_V4SImode:
13723	  half_mode = V2SImode;
13724	  break;
13725	case E_V4SFmode:
13726	  half_mode = V2SFmode;
13727	  break;
13728	default:
13729	  gcc_unreachable ();
13730	}
13731      goto half;
13732
13733    case 8:
13734      switch (mode)
13735	{
13736	case E_V8DImode:
13737	  half_mode = V4DImode;
13738	  break;
13739	case E_V8DFmode:
13740	  half_mode = V4DFmode;
13741	  break;
13742	case E_V8SImode:
13743	  half_mode = V4SImode;
13744	  break;
13745	case E_V8SFmode:
13746	  half_mode = V4SFmode;
13747	  break;
13748	default:
13749	  gcc_unreachable ();
13750	}
13751      goto half;
13752
13753    case 16:
13754      switch (mode)
13755	{
13756	case E_V16SImode:
13757	  half_mode = V8SImode;
13758	  break;
13759	case E_V16SFmode:
13760	  half_mode = V8SFmode;
13761	  break;
13762	default:
13763	  gcc_unreachable ();
13764	}
13765      goto half;
13766
13767half:
13768      /* FIXME: We process inputs backward to help RA.  PR 36222.  */
13769      i = n - 1;
13770      for (j = 1; j != -1; j--)
13771	{
13772	  half[j] = gen_reg_rtx (half_mode);
13773	  switch (n >> 1)
13774	    {
13775	    case 2:
13776	      v = gen_rtvec (2, ops[i-1], ops[i]);
13777	      i -= 2;
13778	      break;
13779	    case 4:
13780	      v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
13781	      i -= 4;
13782	      break;
13783	    case 8:
13784	      v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
13785			     ops[i-3], ops[i-2], ops[i-1], ops[i]);
13786	      i -= 8;
13787	      break;
13788	    default:
13789	      gcc_unreachable ();
13790	    }
13791	  ix86_expand_vector_init (false, half[j],
13792				   gen_rtx_PARALLEL (half_mode, v));
13793	}
13794
13795      ix86_expand_vector_init_concat (mode, target, half, 2);
13796      break;
13797
13798    default:
13799      gcc_unreachable ();
13800    }
13801}
13802
13803/* A subroutine of ix86_expand_vector_init_general.  Use vector
13804   interleave to handle the most general case: all values variable,
13805   and none identical.  */
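/* A sketch of the idea for V16QImode: adjacent scalar pairs are packed
   into vectors first, and the results are then interleaved at V8HImode,
   V4SImode and finally V2DImode granularity to form the full vector.  */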
13806
13807static void
13808ix86_expand_vector_init_interleave (machine_mode mode,
13809				    rtx target, rtx *ops, int n)
13810{
13811  machine_mode first_imode, second_imode, third_imode, inner_mode;
13812  int i, j;
13813  rtx op0, op1;
13814  rtx (*gen_load_even) (rtx, rtx, rtx);
13815  rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
13816  rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
13817
13818  switch (mode)
13819    {
13820    case E_V8HImode:
13821      gen_load_even = gen_vec_setv8hi;
13822      gen_interleave_first_low = gen_vec_interleave_lowv4si;
13823      gen_interleave_second_low = gen_vec_interleave_lowv2di;
13824      inner_mode = HImode;
13825      first_imode = V4SImode;
13826      second_imode = V2DImode;
13827      third_imode = VOIDmode;
13828      break;
13829    case E_V16QImode:
13830      gen_load_even = gen_vec_setv16qi;
13831      gen_interleave_first_low = gen_vec_interleave_lowv8hi;
13832      gen_interleave_second_low = gen_vec_interleave_lowv4si;
13833      inner_mode = QImode;
13834      first_imode = V8HImode;
13835      second_imode = V4SImode;
13836      third_imode = V2DImode;
13837      break;
13838    default:
13839      gcc_unreachable ();
13840    }
13841
13842  for (i = 0; i < n; i++)
13843    {
      /* Extend the odd element to SImode using a paradoxical SUBREG.  */
13845      op0 = gen_reg_rtx (SImode);
13846      emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
13847
      /* Insert the SImode value as the low element of a V4SImode vector.  */
13849      op1 = gen_reg_rtx (V4SImode);
13850      op0 = gen_rtx_VEC_MERGE (V4SImode,
13851			       gen_rtx_VEC_DUPLICATE (V4SImode,
13852						      op0),
13853			       CONST0_RTX (V4SImode),
13854			       const1_rtx);
13855      emit_insn (gen_rtx_SET (op1, op0));
13856
      /* Cast the V4SImode vector back to a vector in the original mode.  */
13858      op0 = gen_reg_rtx (mode);
13859      emit_move_insn (op0, gen_lowpart (mode, op1));
13860
13861      /* Load even elements into the second position.  */
13862      emit_insn (gen_load_even (op0,
13863				force_reg (inner_mode,
13864					   ops [i + i + 1]),
13865				const1_rtx));
13866
13867      /* Cast vector to FIRST_IMODE vector.  */
13868      ops[i] = gen_reg_rtx (first_imode);
13869      emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
13870    }
13871
13872  /* Interleave low FIRST_IMODE vectors.  */
13873  for (i = j = 0; i < n; i += 2, j++)
13874    {
13875      op0 = gen_reg_rtx (first_imode);
13876      emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
13877
13878      /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
13879      ops[j] = gen_reg_rtx (second_imode);
13880      emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
13881    }
13882
13883  /* Interleave low SECOND_IMODE vectors.  */
13884  switch (second_imode)
13885    {
13886    case E_V4SImode:
13887      for (i = j = 0; i < n / 2; i += 2, j++)
13888	{
13889	  op0 = gen_reg_rtx (second_imode);
13890	  emit_insn (gen_interleave_second_low (op0, ops[i],
13891						ops[i + 1]));
13892
13893	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE
13894	     vector.  */
13895	  ops[j] = gen_reg_rtx (third_imode);
13896	  emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
13897	}
13898      second_imode = V2DImode;
13899      gen_interleave_second_low = gen_vec_interleave_lowv2di;
13900      /* FALLTHRU */
13901
13902    case E_V2DImode:
13903      op0 = gen_reg_rtx (second_imode);
13904      emit_insn (gen_interleave_second_low (op0, ops[0],
13905					    ops[1]));
13906
      /* Cast the SECOND_IMODE vector back to a vector in the original
	 mode.  */
13909      emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
13910      break;
13911
13912    default:
13913      gcc_unreachable ();
13914    }
13915}
13916
13917/* A subroutine of ix86_expand_vector_init.  Handle the most general case:
13918   all values variable, and none identical.  */
13919
13920static void
13921ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
13922				 rtx target, rtx vals)
13923{
13924  rtx ops[64], op0, op1, op2, op3, op4, op5;
13925  machine_mode half_mode = VOIDmode;
13926  machine_mode quarter_mode = VOIDmode;
13927  int n, i;
13928
13929  switch (mode)
13930    {
13931    case E_V2SFmode:
13932    case E_V2SImode:
13933      if (!mmx_ok && !TARGET_SSE)
13934	break;
13935      /* FALLTHRU */
13936
13937    case E_V16SImode:
13938    case E_V16SFmode:
13939    case E_V8DFmode:
13940    case E_V8DImode:
13941    case E_V8SFmode:
13942    case E_V8SImode:
13943    case E_V4DFmode:
13944    case E_V4DImode:
13945    case E_V4SFmode:
13946    case E_V4SImode:
13947    case E_V2DFmode:
13948    case E_V2DImode:
13949      n = GET_MODE_NUNITS (mode);
13950      for (i = 0; i < n; i++)
13951	ops[i] = XVECEXP (vals, 0, i);
13952      ix86_expand_vector_init_concat (mode, target, ops, n);
13953      return;
13954
13955    case E_V2TImode:
13956      for (i = 0; i < 2; i++)
13957	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13958      op0 = gen_reg_rtx (V4DImode);
13959      ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
13960      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13961      return;
13962
13963    case E_V4TImode:
13964      for (i = 0; i < 4; i++)
13965	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13966      ops[4] = gen_reg_rtx (V4DImode);
13967      ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
13968      ops[5] = gen_reg_rtx (V4DImode);
13969      ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
13970      op0 = gen_reg_rtx (V8DImode);
13971      ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
13972      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13973      return;
13974
13975    case E_V32QImode:
13976      half_mode = V16QImode;
13977      goto half;
13978
13979    case E_V16HImode:
13980      half_mode = V8HImode;
13981      goto half;
13982
13983half:
13984      n = GET_MODE_NUNITS (mode);
13985      for (i = 0; i < n; i++)
13986	ops[i] = XVECEXP (vals, 0, i);
13987      op0 = gen_reg_rtx (half_mode);
13988      op1 = gen_reg_rtx (half_mode);
13989      ix86_expand_vector_init_interleave (half_mode, op0, ops,
13990					  n >> 2);
13991      ix86_expand_vector_init_interleave (half_mode, op1,
13992					  &ops [n >> 1], n >> 2);
13993      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
13994      return;
13995
13996    case E_V64QImode:
13997      quarter_mode = V16QImode;
13998      half_mode = V32QImode;
13999      goto quarter;
14000
14001    case E_V32HImode:
14002      quarter_mode = V8HImode;
14003      half_mode = V16HImode;
14004      goto quarter;
14005
14006quarter:
14007      n = GET_MODE_NUNITS (mode);
14008      for (i = 0; i < n; i++)
14009	ops[i] = XVECEXP (vals, 0, i);
14010      op0 = gen_reg_rtx (quarter_mode);
14011      op1 = gen_reg_rtx (quarter_mode);
14012      op2 = gen_reg_rtx (quarter_mode);
14013      op3 = gen_reg_rtx (quarter_mode);
14014      op4 = gen_reg_rtx (half_mode);
14015      op5 = gen_reg_rtx (half_mode);
14016      ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
14017					  n >> 3);
14018      ix86_expand_vector_init_interleave (quarter_mode, op1,
14019					  &ops [n >> 2], n >> 3);
14020      ix86_expand_vector_init_interleave (quarter_mode, op2,
14021					  &ops [n >> 1], n >> 3);
14022      ix86_expand_vector_init_interleave (quarter_mode, op3,
14023					  &ops [(n >> 1) | (n >> 2)], n >> 3);
14024      emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
14025      emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
14026      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
14027      return;
14028
14029    case E_V16QImode:
14030      if (!TARGET_SSE4_1)
14031	break;
14032      /* FALLTHRU */
14033
14034    case E_V8HImode:
14035      if (!TARGET_SSE2)
14036	break;
14037
14038      /* Don't use ix86_expand_vector_init_interleave if we can't
14039	 move from GPR to SSE register directly.  */
14040      if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
14041	break;
14042
14043      n = GET_MODE_NUNITS (mode);
14044      for (i = 0; i < n; i++)
14045	ops[i] = XVECEXP (vals, 0, i);
14046      ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
14047      return;
14048
14049    case E_V4HImode:
14050    case E_V8QImode:
14051      break;
14052
14053    default:
14054      gcc_unreachable ();
14055    }
14056
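  /* Fall back to building each word-sized chunk of the vector in an
     integer register via shifts and ORs (element 0 of each chunk ends
     up in the least significant bits), then assembling the vector from
     those words.  For example, a V4HImode vector {a, b, c, d} on a
     32-bit target is built from the SImode words (b << 16) | a and
     (d << 16) | c.  */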
14057    {
14058      int i, j, n_elts, n_words, n_elt_per_word;
14059      machine_mode inner_mode;
14060      rtx words[4], shift;
14061
14062      inner_mode = GET_MODE_INNER (mode);
14063      n_elts = GET_MODE_NUNITS (mode);
14064      n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
14065      n_elt_per_word = n_elts / n_words;
14066      shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
14067
14068      for (i = 0; i < n_words; ++i)
14069	{
14070	  rtx word = NULL_RTX;
14071
14072	  for (j = 0; j < n_elt_per_word; ++j)
14073	    {
14074	      rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
14075	      elt = convert_modes (word_mode, inner_mode, elt, true);
14076
14077	      if (j == 0)
14078		word = elt;
14079	      else
14080		{
14081		  word = expand_simple_binop (word_mode, ASHIFT, word, shift,
14082					      NULL_RTX, 1, OPTAB_LIB_WIDEN);
14083		  word = expand_simple_binop (word_mode, IOR, word, elt,
14084					      NULL_RTX, 1, OPTAB_LIB_WIDEN);
14085		}
14086	    }
14087
14088	  words[i] = word;
14089	}
14090
14091      if (n_words == 1)
14092	emit_move_insn (target, gen_lowpart (mode, words[0]));
14093      else if (n_words == 2)
14094	{
14095	  rtx tmp = gen_reg_rtx (mode);
14096	  emit_clobber (tmp);
14097	  emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
14098	  emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
14099	  emit_move_insn (target, tmp);
14100	}
14101      else if (n_words == 4)
14102	{
14103	  rtx tmp = gen_reg_rtx (V4SImode);
14104	  gcc_assert (word_mode == SImode);
14105	  vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
14106	  ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
14107	  emit_move_insn (target, gen_lowpart (mode, tmp));
14108	}
14109      else
14110	gcc_unreachable ();
14111    }
14112}
14113
14114/* Initialize vector TARGET via VALS.  Suppress the use of MMX
14115   instructions unless MMX_OK is true.  */
14116
14117void
14118ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
14119{
14120  machine_mode mode = GET_MODE (target);
14121  machine_mode inner_mode = GET_MODE_INNER (mode);
14122  int n_elts = GET_MODE_NUNITS (mode);
14123  int n_var = 0, one_var = -1;
14124  bool all_same = true, all_const_zero = true;
14125  int i;
14126  rtx x;
14127
  /* First handle initialization where the elements of VALS are
     themselves vectors (XVECLEN is then smaller than the number of
     units in MODE).  */
14129  if (n_elts != XVECLEN (vals, 0))
14130    {
14131      rtx subtarget = target;
14132      x = XVECEXP (vals, 0, 0);
14133      gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
14134      if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
14135	{
14136	  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
14137	  if (inner_mode == QImode
14138	      || inner_mode == HImode
14139	      || inner_mode == TImode)
14140	    {
14141	      unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
14142	      scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
14143	      n_bits /= GET_MODE_SIZE (elt_mode);
14144	      mode = mode_for_vector (elt_mode, n_bits).require ();
14145	      inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
14146	      ops[0] = gen_lowpart (inner_mode, ops[0]);
14147	      ops[1] = gen_lowpart (inner_mode, ops[1]);
14148	      subtarget = gen_reg_rtx (mode);
14149	    }
14150	  ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
14151	  if (subtarget != target)
14152	    emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
14153	  return;
14154	}
14155      gcc_unreachable ();
14156    }
14157
14158  for (i = 0; i < n_elts; ++i)
14159    {
14160      x = XVECEXP (vals, 0, i);
14161      if (!(CONST_SCALAR_INT_P (x)
14162	    || CONST_DOUBLE_P (x)
14163	    || CONST_FIXED_P (x)))
14164	n_var++, one_var = i;
14165      else if (x != CONST0_RTX (inner_mode))
14166	all_const_zero = false;
14167      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
14168	all_same = false;
14169    }
14170
14171  /* Constants are best loaded from the constant pool.  */
14172  if (n_var == 0)
14173    {
14174      emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
14175      return;
14176    }
14177
14178  /* If all values are identical, broadcast the value.  */
14179  if (all_same
14180      && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
14181					    XVECEXP (vals, 0, 0)))
14182    return;
14183
14184  /* Values where only one field is non-constant are best loaded from
14185     the pool and overwritten via move later.  */
14186  if (n_var == 1)
14187    {
14188      if (all_const_zero
14189	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
14190						  XVECEXP (vals, 0, one_var),
14191						  one_var))
14192	return;
14193
14194      if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
14195	return;
14196    }
14197
14198  ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
14199}
14200
14201void
14202ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
14203{
14204  machine_mode mode = GET_MODE (target);
14205  machine_mode inner_mode = GET_MODE_INNER (mode);
14206  machine_mode half_mode;
14207  bool use_vec_merge = false;
14208  rtx tmp;
14209  static rtx (*gen_extract[6][2]) (rtx, rtx)
14210    = {
14211	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
14212	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
14213	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
14214	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
14215	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
14216	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
14217      };
14218  static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
14219    = {
14220	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
14221	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
14222	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
14223	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
14224	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
14225	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
14226      };
14227  int i, j, n;
14228  machine_mode mmode = VOIDmode;
14229  rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
14230
14231  switch (mode)
14232    {
14233    case E_V2SImode:
14234      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14235      if (use_vec_merge)
14236	break;
14237      /* FALLTHRU */
14238
14239    case E_V2SFmode:
14240      if (mmx_ok)
14241	{
14242	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14243	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
14244	  if (elt == 0)
14245	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14246	  else
14247	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14248	  emit_insn (gen_rtx_SET (target, tmp));
14249	  return;
14250	}
14251      break;
14252
14253    case E_V2DImode:
14254      use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
14255      if (use_vec_merge)
14256	break;
14257
14258      tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14259      ix86_expand_vector_extract (false, tmp, target, 1 - elt);
14260      if (elt == 0)
14261	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14262      else
14263	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14264      emit_insn (gen_rtx_SET (target, tmp));
14265      return;
14266
14267    case E_V2DFmode:
14268      /* NB: For ELT == 0, use standard scalar operation patterns which
14269	 preserve the rest of the vector for combiner:
14270
14271	 (vec_merge:V2DF
14272	   (vec_duplicate:V2DF (reg:DF))
14273	   (reg:V2DF)
14274	   (const_int 1))
14275       */
14276      if (elt == 0)
14277	goto do_vec_merge;
14278
14279      {
14280	rtx op0, op1;
14281
	/* For two-element vectors, we implement a VEC_CONCAT with
	   the extraction of the other element.  */
14284
14285	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
14286	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
14287
14288	if (elt == 0)
14289	  op0 = val, op1 = tmp;
14290	else
14291	  op0 = tmp, op1 = val;
14292
14293	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
14294	emit_insn (gen_rtx_SET (target, tmp));
14295      }
14296      return;
14297
14298    case E_V4SFmode:
14299      use_vec_merge = TARGET_SSE4_1;
14300      if (use_vec_merge)
14301	break;
14302
14303      switch (elt)
14304	{
14305	case 0:
14306	  use_vec_merge = true;
14307	  break;
14308
14309	case 1:
14310	  /* tmp = target = A B C D */
14311	  tmp = copy_to_reg (target);
14312	  /* target = A A B B */
14313	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
14314	  /* target = X A B B */
14315	  ix86_expand_vector_set (false, target, val, 0);
14316	  /* target = A X C D  */
14317	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14318					  const1_rtx, const0_rtx,
14319					  GEN_INT (2+4), GEN_INT (3+4)));
14320	  return;
14321
14322	case 2:
14323	  /* tmp = target = A B C D */
14324	  tmp = copy_to_reg (target);
14325	  /* tmp = X B C D */
14326	  ix86_expand_vector_set (false, tmp, val, 0);
14327	  /* target = A B X D */
14328	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14329					  const0_rtx, const1_rtx,
14330					  GEN_INT (0+4), GEN_INT (3+4)));
14331	  return;
14332
14333	case 3:
14334	  /* tmp = target = A B C D */
14335	  tmp = copy_to_reg (target);
14336	  /* tmp = X B C D */
14337	  ix86_expand_vector_set (false, tmp, val, 0);
	  /* target = A B C X */
14339	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14340					  const0_rtx, const1_rtx,
14341					  GEN_INT (2+4), GEN_INT (0+4)));
14342	  return;
14343
14344	default:
14345	  gcc_unreachable ();
14346	}
14347      break;
14348
14349    case E_V4SImode:
14350      use_vec_merge = TARGET_SSE4_1;
14351      if (use_vec_merge)
14352	break;
14353
14354      /* Element 0 handled by vec_merge below.  */
14355      if (elt == 0)
14356	{
14357	  use_vec_merge = true;
14358	  break;
14359	}
14360
14361      if (TARGET_SSE2)
14362	{
14363	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
14364	     store into element 0, then shuffle them back.  */
14365
14366	  rtx order[4];
14367
14368	  order[0] = GEN_INT (elt);
14369	  order[1] = const1_rtx;
14370	  order[2] = const2_rtx;
14371	  order[3] = GEN_INT (3);
14372	  order[elt] = const0_rtx;
14373
14374	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14375					order[1], order[2], order[3]));
14376
14377	  ix86_expand_vector_set (false, target, val, 0);
14378
14379	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14380					order[1], order[2], order[3]));
14381	}
14382      else
14383	{
14384	  /* For SSE1, we have to reuse the V4SF code.  */
14385	  rtx t = gen_reg_rtx (V4SFmode);
14386	  emit_move_insn (t, gen_lowpart (V4SFmode, target));
14387	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
14388	  emit_move_insn (target, gen_lowpart (mode, t));
14389	}
14390      return;
14391
14392    case E_V8HImode:
14393      use_vec_merge = TARGET_SSE2;
14394      break;
14395    case E_V4HImode:
14396      use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14397      break;
14398
14399    case E_V16QImode:
14400      use_vec_merge = TARGET_SSE4_1;
14401      break;
14402
14403    case E_V8QImode:
14404      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14405      break;
14406
14407    case E_V32QImode:
14408      half_mode = V16QImode;
14409      j = 0;
14410      n = 16;
14411      goto half;
14412
14413    case E_V16HImode:
14414      half_mode = V8HImode;
14415      j = 1;
14416      n = 8;
14417      goto half;
14418
14419    case E_V8SImode:
14420      half_mode = V4SImode;
14421      j = 2;
14422      n = 4;
14423      goto half;
14424
14425    case E_V4DImode:
14426      half_mode = V2DImode;
14427      j = 3;
14428      n = 2;
14429      goto half;
14430
14431    case E_V8SFmode:
14432      half_mode = V4SFmode;
14433      j = 4;
14434      n = 4;
14435      goto half;
14436
14437    case E_V4DFmode:
14438      half_mode = V2DFmode;
14439      j = 5;
14440      n = 2;
14441      goto half;
14442
14443half:
14444      /* Compute offset.  */
14445      i = elt / n;
14446      elt %= n;
14447
14448      gcc_assert (i <= 1);
14449
14450      /* Extract the half.  */
14451      tmp = gen_reg_rtx (half_mode);
14452      emit_insn (gen_extract[j][i] (tmp, target));
14453
14454      /* Put val in tmp at elt.  */
14455      ix86_expand_vector_set (false, tmp, val, elt);
14456
14457      /* Put it back.  */
14458      emit_insn (gen_insert[j][i] (target, target, tmp));
14459      return;
14460
14461    case E_V8DFmode:
14462      if (TARGET_AVX512F)
14463	{
14464	  mmode = QImode;
14465	  gen_blendm = gen_avx512f_blendmv8df;
14466	}
14467      break;
14468
14469    case E_V8DImode:
14470      if (TARGET_AVX512F)
14471	{
14472	  mmode = QImode;
14473	  gen_blendm = gen_avx512f_blendmv8di;
14474	}
14475      break;
14476
14477    case E_V16SFmode:
14478      if (TARGET_AVX512F)
14479	{
14480	  mmode = HImode;
14481	  gen_blendm = gen_avx512f_blendmv16sf;
14482	}
14483      break;
14484
14485    case E_V16SImode:
14486      if (TARGET_AVX512F)
14487	{
14488	  mmode = HImode;
14489	  gen_blendm = gen_avx512f_blendmv16si;
14490	}
14491      break;
14492
14493    case E_V32HImode:
14494      if (TARGET_AVX512BW)
14495	{
14496	  mmode = SImode;
14497	  gen_blendm = gen_avx512bw_blendmv32hi;
14498	}
14499      else if (TARGET_AVX512F)
14500	{
14501	  half_mode = E_V8HImode;
14502	  n = 8;
14503	  goto quarter;
14504	}
14505      break;
14506
14507    case E_V64QImode:
14508      if (TARGET_AVX512BW)
14509	{
14510	  mmode = DImode;
14511	  gen_blendm = gen_avx512bw_blendmv64qi;
14512	}
14513      else if (TARGET_AVX512F)
14514	{
14515	  half_mode = E_V16QImode;
14516	  n = 16;
14517	  goto quarter;
14518	}
14519      break;
14520
14521quarter:
14522      /* Compute offset.  */
14523      i = elt / n;
14524      elt %= n;
14525
14526      gcc_assert (i <= 3);
14527
14528      {
14529	/* Extract the quarter.  */
14530	tmp = gen_reg_rtx (V4SImode);
14531	rtx tmp2 = gen_lowpart (V16SImode, target);
14532	rtx mask = gen_reg_rtx (QImode);
14533
14534	emit_move_insn (mask, constm1_rtx);
14535	emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
14536						   tmp, mask));
14537
14538	tmp2 = gen_reg_rtx (half_mode);
14539	emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
14540	tmp = tmp2;
14541
14542	/* Put val in tmp at elt.  */
14543	ix86_expand_vector_set (false, tmp, val, elt);
14544
14545	/* Put it back.  */
14546	tmp2 = gen_reg_rtx (V16SImode);
14547	rtx tmp3 = gen_lowpart (V16SImode, target);
14548	mask = gen_reg_rtx (HImode);
14549	emit_move_insn (mask, constm1_rtx);
14550	tmp = gen_lowpart (V4SImode, tmp);
14551	emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
14552						  tmp3, mask));
14553	emit_move_insn (target, gen_lowpart (mode, tmp2));
14554      }
14555      return;
14556
14557    default:
14558      break;
14559    }
14560
14561  if (mmode != VOIDmode)
14562    {
14563      tmp = gen_reg_rtx (mode);
14564      emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
14565      /* The avx512*_blendm<mode> expanders have different operand order
14566	 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
14567	 elements where the mask is set and second input operand otherwise,
14568	 in {sse,avx}*_*blend* the first input operand is used for elements
14569	 where the mask is clear and second input operand otherwise.  */
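      /* For example, with ELT == 2 the mask immediate is 1 << 2 == 0x4,
	 so only element 2 of TARGET is replaced by the broadcast VAL.  */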
14570      emit_insn (gen_blendm (target, target, tmp,
14571			     force_reg (mmode,
14572					gen_int_mode (HOST_WIDE_INT_1U << elt,
14573						      mmode))));
14574    }
14575  else if (use_vec_merge)
14576    {
14577do_vec_merge:
14578      tmp = gen_rtx_VEC_DUPLICATE (mode, val);
14579      tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
14580			       GEN_INT (HOST_WIDE_INT_1U << elt));
14581      emit_insn (gen_rtx_SET (target, tmp));
14582    }
14583  else
14584    {
14585      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14586
14587      emit_move_insn (mem, target);
14588
14589      tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
14590      emit_move_insn (tmp, val);
14591
14592      emit_move_insn (target, mem);
14593    }
14594}
14595
14596void
14597ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
14598{
14599  machine_mode mode = GET_MODE (vec);
14600  machine_mode inner_mode = GET_MODE_INNER (mode);
14601  bool use_vec_extr = false;
14602  rtx tmp;
14603
14604  switch (mode)
14605    {
14606    case E_V2SImode:
14607      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14608      if (use_vec_extr)
14609	break;
14610      /* FALLTHRU */
14611
14612    case E_V2SFmode:
14613      if (!mmx_ok)
14614	break;
14615      /* FALLTHRU */
14616
14617    case E_V2DFmode:
14618    case E_V2DImode:
14619    case E_V2TImode:
14620    case E_V4TImode:
14621      use_vec_extr = true;
14622      break;
14623
14624    case E_V4SFmode:
14625      use_vec_extr = TARGET_SSE4_1;
14626      if (use_vec_extr)
14627	break;
14628
14629      switch (elt)
14630	{
14631	case 0:
14632	  tmp = vec;
14633	  break;
14634
14635	case 1:
14636	case 3:
14637	  tmp = gen_reg_rtx (mode);
14638	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
14639				       GEN_INT (elt), GEN_INT (elt),
14640				       GEN_INT (elt+4), GEN_INT (elt+4)));
14641	  break;
14642
14643	case 2:
14644	  tmp = gen_reg_rtx (mode);
14645	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
14646	  break;
14647
14648	default:
14649	  gcc_unreachable ();
14650	}
14651      vec = tmp;
14652      use_vec_extr = true;
14653      elt = 0;
14654      break;
14655
14656    case E_V4SImode:
14657      use_vec_extr = TARGET_SSE4_1;
14658      if (use_vec_extr)
14659	break;
14660
14661      if (TARGET_SSE2)
14662	{
14663	  switch (elt)
14664	    {
14665	    case 0:
14666	      tmp = vec;
14667	      break;
14668
14669	    case 1:
14670	    case 3:
14671	      tmp = gen_reg_rtx (mode);
14672	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
14673					    GEN_INT (elt), GEN_INT (elt),
14674					    GEN_INT (elt), GEN_INT (elt)));
14675	      break;
14676
14677	    case 2:
14678	      tmp = gen_reg_rtx (mode);
14679	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
14680	      break;
14681
14682	    default:
14683	      gcc_unreachable ();
14684	    }
14685	  vec = tmp;
14686	  use_vec_extr = true;
14687	  elt = 0;
14688	}
14689      else
14690	{
14691	  /* For SSE1, we have to reuse the V4SF code.  */
14692	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
14693				      gen_lowpart (V4SFmode, vec), elt);
14694	  return;
14695	}
14696      break;
14697
14698    case E_V8HImode:
14699      use_vec_extr = TARGET_SSE2;
14700      break;
14701    case E_V4HImode:
14702      use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14703      break;
14704
14705    case E_V16QImode:
14706      use_vec_extr = TARGET_SSE4_1;
14707      if (!use_vec_extr
14708	  && TARGET_SSE2
14709	  && elt == 0
14710	  && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
14711	{
14712	  tmp = gen_reg_rtx (SImode);
14713	  ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
14714				      0);
14715	  emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
14716	  return;
14717	}
14718      break;
14719
14720    case E_V8SFmode:
14721      if (TARGET_AVX)
14722	{
14723	  tmp = gen_reg_rtx (V4SFmode);
14724	  if (elt < 4)
14725	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
14726	  else
14727	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
14728	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
14729	  return;
14730	}
14731      break;
14732
14733    case E_V4DFmode:
14734      if (TARGET_AVX)
14735	{
14736	  tmp = gen_reg_rtx (V2DFmode);
14737	  if (elt < 2)
14738	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
14739	  else
14740	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
14741	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
14742	  return;
14743	}
14744      break;
14745
14746    case E_V32QImode:
14747      if (TARGET_AVX)
14748	{
14749	  tmp = gen_reg_rtx (V16QImode);
14750	  if (elt < 16)
14751	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
14752	  else
14753	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
14754	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
14755	  return;
14756	}
14757      break;
14758
14759    case E_V16HImode:
14760      if (TARGET_AVX)
14761	{
14762	  tmp = gen_reg_rtx (V8HImode);
14763	  if (elt < 8)
14764	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
14765	  else
14766	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
14767	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
14768	  return;
14769	}
14770      break;
14771
14772    case E_V8SImode:
14773      if (TARGET_AVX)
14774	{
14775	  tmp = gen_reg_rtx (V4SImode);
14776	  if (elt < 4)
14777	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
14778	  else
14779	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
14780	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
14781	  return;
14782	}
14783      break;
14784
14785    case E_V4DImode:
14786      if (TARGET_AVX)
14787	{
14788	  tmp = gen_reg_rtx (V2DImode);
14789	  if (elt < 2)
14790	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
14791	  else
14792	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
14793	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
14794	  return;
14795	}
14796      break;
14797
14798    case E_V32HImode:
14799      if (TARGET_AVX512BW)
14800	{
14801	  tmp = gen_reg_rtx (V16HImode);
14802	  if (elt < 16)
14803	    emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
14804	  else
14805	    emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
14806	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
14807	  return;
14808	}
14809      break;
14810
14811    case E_V64QImode:
14812      if (TARGET_AVX512BW)
14813	{
14814	  tmp = gen_reg_rtx (V32QImode);
14815	  if (elt < 32)
14816	    emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
14817	  else
14818	    emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
14819	  ix86_expand_vector_extract (false, target, tmp, elt & 31);
14820	  return;
14821	}
14822      break;
14823
14824    case E_V16SFmode:
14825      tmp = gen_reg_rtx (V8SFmode);
14826      if (elt < 8)
14827	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
14828      else
14829	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
14830      ix86_expand_vector_extract (false, target, tmp, elt & 7);
14831      return;
14832
14833    case E_V8DFmode:
14834      tmp = gen_reg_rtx (V4DFmode);
14835      if (elt < 4)
14836	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
14837      else
14838	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
14839      ix86_expand_vector_extract (false, target, tmp, elt & 3);
14840      return;
14841
14842    case E_V16SImode:
14843      tmp = gen_reg_rtx (V8SImode);
14844      if (elt < 8)
14845	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
14846      else
14847	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
14848      ix86_expand_vector_extract (false, target, tmp, elt & 7);
14849      return;
14850
14851    case E_V8DImode:
14852      tmp = gen_reg_rtx (V4DImode);
14853      if (elt < 4)
14854	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
14855      else
14856	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
14857      ix86_expand_vector_extract (false, target, tmp, elt & 3);
14858      return;
14859
14860    case E_V8QImode:
14861      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14862      /* ??? Could extract the appropriate HImode element and shift.  */
14863      break;
14864
14865    default:
14866      break;
14867    }
14868
14869  if (use_vec_extr)
14870    {
14871      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
14872      tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
14873
14874      /* Let the rtl optimizers know about the zero extension performed.  */
14875      if (inner_mode == QImode || inner_mode == HImode)
14876	{
14877	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
14878	  target = gen_lowpart (SImode, target);
14879	}
14880
14881      emit_insn (gen_rtx_SET (target, tmp));
14882    }
14883  else
14884    {
14885      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14886
14887      emit_move_insn (mem, vec);
14888
14889      tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
14890      emit_move_insn (target, tmp);
14891    }
14892}
14893
14894/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
14895   to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
14896   The upper bits of DEST are undefined, though they shouldn't cause
14897   exceptions (some bits from src or all zeros are ok).  */
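/* For example, with a V8SImode SRC = { a, b, c, d, e, f, g, h },
   I == 256 leaves { e, f, g, h } in the low half of DEST, and
   I == 128 leaves { c, d } in its low quarter.  */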
14898
14899static void
14900emit_reduc_half (rtx dest, rtx src, int i)
14901{
14902  rtx tem, d = dest;
14903  switch (GET_MODE (src))
14904    {
14905    case E_V4SFmode:
14906      if (i == 128)
14907	tem = gen_sse_movhlps (dest, src, src);
14908      else
14909	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
14910				   GEN_INT (1 + 4), GEN_INT (1 + 4));
14911      break;
14912    case E_V2DFmode:
14913      tem = gen_vec_interleave_highv2df (dest, src, src);
14914      break;
14915    case E_V16QImode:
14916    case E_V8HImode:
14917    case E_V4SImode:
14918    case E_V2DImode:
14919      d = gen_reg_rtx (V1TImode);
14920      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
14921				GEN_INT (i / 2));
14922      break;
14923    case E_V8SFmode:
14924      if (i == 256)
14925	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
14926      else
14927	tem = gen_avx_shufps256 (dest, src, src,
14928				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
14929      break;
14930    case E_V4DFmode:
14931      if (i == 256)
14932	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
14933      else
14934	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
14935      break;
14936    case E_V32QImode:
14937    case E_V16HImode:
14938    case E_V8SImode:
14939    case E_V4DImode:
14940      if (i == 256)
14941	{
14942	  if (GET_MODE (dest) != V4DImode)
14943	    d = gen_reg_rtx (V4DImode);
14944	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
14945				   gen_lowpart (V4DImode, src),
14946				   const1_rtx);
14947	}
14948      else
14949	{
14950	  d = gen_reg_rtx (V2TImode);
14951	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
14952				    GEN_INT (i / 2));
14953	}
14954      break;
14955    case E_V64QImode:
14956    case E_V32HImode:
14957      if (i < 64)
14958	{
14959	  d = gen_reg_rtx (V4TImode);
14960	  tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
14961					GEN_INT (i / 2));
14962	  break;
14963	}
14964      /* FALLTHRU */
14965    case E_V16SImode:
14966    case E_V16SFmode:
14967    case E_V8DImode:
14968    case E_V8DFmode:
14969      if (i > 128)
14970	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
14971					gen_lowpart (V16SImode, src),
14972					gen_lowpart (V16SImode, src),
14973					GEN_INT (0x4 + (i == 512 ? 4 : 0)),
14974					GEN_INT (0x5 + (i == 512 ? 4 : 0)),
14975					GEN_INT (0x6 + (i == 512 ? 4 : 0)),
14976					GEN_INT (0x7 + (i == 512 ? 4 : 0)),
14977					GEN_INT (0xC), GEN_INT (0xD),
14978					GEN_INT (0xE), GEN_INT (0xF),
14979					GEN_INT (0x10), GEN_INT (0x11),
14980					GEN_INT (0x12), GEN_INT (0x13),
14981					GEN_INT (0x14), GEN_INT (0x15),
14982					GEN_INT (0x16), GEN_INT (0x17));
14983      else
14984	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
14985				    gen_lowpart (V16SImode, src),
14986				    GEN_INT (i == 128 ? 0x2 : 0x1),
14987				    GEN_INT (0x3),
14988				    GEN_INT (0x3),
14989				    GEN_INT (0x3),
14990				    GEN_INT (i == 128 ? 0x6 : 0x5),
14991				    GEN_INT (0x7),
14992				    GEN_INT (0x7),
14993				    GEN_INT (0x7),
14994				    GEN_INT (i == 128 ? 0xA : 0x9),
14995				    GEN_INT (0xB),
14996				    GEN_INT (0xB),
14997				    GEN_INT (0xB),
14998				    GEN_INT (i == 128 ? 0xE : 0xD),
14999				    GEN_INT (0xF),
15000				    GEN_INT (0xF),
15001				    GEN_INT (0xF));
15002      break;
15003    default:
15004      gcc_unreachable ();
15005    }
15006  emit_insn (tem);
15007  if (d != dest)
15008    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
15009}
15010
15011/* Expand a vector reduction.  FN is the binary pattern to reduce;
15012   DEST is the destination; IN is the input vector.  */
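/* For example, a V8SImode reduction takes three steps: the high 128
   bits are combined element-wise with the low 128 bits, then the high
   64 bits of the surviving low half with its low 64 bits, and finally
   element 1 with element 0, leaving the result in element 0 of DEST.  */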
15013
15014void
15015ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
15016{
15017  rtx half, dst, vec = in;
15018  machine_mode mode = GET_MODE (in);
15019  int i;
15020
15021  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
15022  if (TARGET_SSE4_1
15023      && mode == V8HImode
15024      && fn == gen_uminv8hi3)
15025    {
15026      emit_insn (gen_sse4_1_phminposuw (dest, in));
15027      return;
15028    }
15029
15030  for (i = GET_MODE_BITSIZE (mode);
15031       i > GET_MODE_UNIT_BITSIZE (mode);
15032       i >>= 1)
15033    {
15034      half = gen_reg_rtx (mode);
15035      emit_reduc_half (half, vec, i);
15036      if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
15037	dst = dest;
15038      else
15039	dst = gen_reg_rtx (mode);
15040      emit_insn (fn (dst, half, vec));
15041      vec = dst;
15042    }
15043}
15044
/* Output code to perform a conditional jump to LABEL if the C2 flag in
   the FP status register is set.  */
15047
15048void
15049ix86_emit_fp_unordered_jump (rtx label)
15050{
15051  rtx reg = gen_reg_rtx (HImode);
15052  rtx_insn *insn;
15053  rtx temp;
15054
15055  emit_insn (gen_x86_fnstsw_1 (reg));
15056
15057  if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
15058    {
15059      emit_insn (gen_x86_sahf_1 (reg));
15060
15061      temp = gen_rtx_REG (CCmode, FLAGS_REG);
15062      temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
15063    }
15064  else
15065    {
15066      emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
15067
15068      temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
15069      temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
15070    }
15071
15072  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
15073			      gen_rtx_LABEL_REF (VOIDmode, label),
15074			      pc_rtx);
15075  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
15076  predict_jump (REG_BR_PROB_BASE * 10 / 100);
15077  JUMP_LABEL (insn) = label;
15078}
15079
/* Output code to perform a sinh XFmode calculation.  */
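/* The expansion below relies on the identity
     sinh(x) = sign(x) * 0.5 * (e1 / (e1 + 1.0) + e1), e1 = expm1(|x|),
   which stays accurate for small |x| where exp(x) - exp(-x) would
   cancel.  */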
15081
15082void ix86_emit_i387_sinh (rtx op0, rtx op1)
15083{
15084  rtx e1 = gen_reg_rtx (XFmode);
15085  rtx e2 = gen_reg_rtx (XFmode);
15086  rtx scratch = gen_reg_rtx (HImode);
15087  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15088  rtx half = const_double_from_real_value (dconsthalf, XFmode);
15089  rtx cst1, tmp;
15090  rtx_code_label *jump_label = gen_label_rtx ();
15091  rtx_insn *insn;
15092
15093  /* scratch = fxam (op1) */
15094  emit_insn (gen_fxamxf2_i387 (scratch, op1));
15095
15096  /* e1 = expm1 (|op1|) */
15097  emit_insn (gen_absxf2 (e2, op1));
15098  emit_insn (gen_expm1xf2 (e1, e2));
15099
15100  /* e2 = e1 / (e1 + 1.0) + e1 */
15101  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15102  emit_insn (gen_addxf3 (e2, e1, cst1));
15103  emit_insn (gen_divxf3 (e2, e1, e2));
15104  emit_insn (gen_addxf3 (e2, e2, e1));
15105
15106  /* flags = signbit (op1) */
15107  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15108
15109  /* if (flags) then e2 = -e2 */
15110  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15111			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15112			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
15113			      pc_rtx);
15114  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15115  predict_jump (REG_BR_PROB_BASE * 50 / 100);
15116  JUMP_LABEL (insn) = jump_label;
15117
15118  emit_insn (gen_negxf2 (e2, e2));
15119
15120  emit_label (jump_label);
15121  LABEL_NUSES (jump_label) = 1;
15122
15123  /* op0 = 0.5 * e2 */
15124  half = force_reg (XFmode, half);
15125  emit_insn (gen_mulxf3 (op0, e2, half));
15126}
15127
/* Output code to perform a cosh XFmode calculation.  */
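/* The expansion below uses cosh(x) = 0.5 * (e1 + 1.0 / e1) with
   e1 = exp(x); the formula is already symmetric in the sign of x,
   so no sign handling is needed.  */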
15129
15130void ix86_emit_i387_cosh (rtx op0, rtx op1)
15131{
15132  rtx e1 = gen_reg_rtx (XFmode);
15133  rtx e2 = gen_reg_rtx (XFmode);
15134  rtx half = const_double_from_real_value (dconsthalf, XFmode);
15135  rtx cst1;
15136
15137  /* e1 = exp (op1) */
15138  emit_insn (gen_expxf2 (e1, op1));
15139
15140  /* e2 = e1 + 1.0 / e1 */
15141  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15142  emit_insn (gen_divxf3 (e2, cst1, e1));
15143  emit_insn (gen_addxf3 (e2, e1, e2));
15144
15145  /* op0 = 0.5 * e2 */
15146  half = force_reg (XFmode, half);
15147  emit_insn (gen_mulxf3 (op0, e2, half));
15148}
15149
/* Output code to perform a tanh XFmode calculation.  */
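/* The expansion below relies on the identity
     tanh(x) = -sign(x) * e1 / (e1 + 2.0), e1 = expm1(-2 * |x|),
   with the sign applied through the conditional negation below.  */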
15151
15152void ix86_emit_i387_tanh (rtx op0, rtx op1)
15153{
15154  rtx e1 = gen_reg_rtx (XFmode);
15155  rtx e2 = gen_reg_rtx (XFmode);
15156  rtx scratch = gen_reg_rtx (HImode);
15157  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15158  rtx cst2, tmp;
15159  rtx_code_label *jump_label = gen_label_rtx ();
15160  rtx_insn *insn;
15161
15162  /* scratch = fxam (op1) */
15163  emit_insn (gen_fxamxf2_i387 (scratch, op1));
15164
15165  /* e1 = expm1 (-|2 * op1|) */
15166  emit_insn (gen_addxf3 (e2, op1, op1));
15167  emit_insn (gen_absxf2 (e2, e2));
15168  emit_insn (gen_negxf2 (e2, e2));
15169  emit_insn (gen_expm1xf2 (e1, e2));
15170
15171  /* e2 = e1 / (e1 + 2.0) */
15172  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
15173  emit_insn (gen_addxf3 (e2, e1, cst2));
15174  emit_insn (gen_divxf3 (e2, e1, e2));
15175
15176  /* flags = signbit (op1) */
15177  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15178
15179  /* if (!flags) then e2 = -e2 */
15180  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15181			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
15182			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
15183			      pc_rtx);
15184  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15185  predict_jump (REG_BR_PROB_BASE * 50 / 100);
15186  JUMP_LABEL (insn) = jump_label;
15187
15188  emit_insn (gen_negxf2 (e2, e2));
15189
15190  emit_label (jump_label);
15191  LABEL_NUSES (jump_label) = 1;
15192
15193  emit_move_insn (op0, e2);
15194}
15195
15196/* Output code to perform an asinh XFmode calculation.  */
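/* The expansion below rewrites
     asinh(x) = sign(x) * log(|x| + sqrt(x*x + 1.0))
   as sign(x) * log1p(|x| + x*x / (sqrt(x*x + 1.0) + 1.0)) so the
   result stays accurate for small |x|.  */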
15197
15198void ix86_emit_i387_asinh (rtx op0, rtx op1)
15199{
15200  rtx e1 = gen_reg_rtx (XFmode);
15201  rtx e2 = gen_reg_rtx (XFmode);
15202  rtx scratch = gen_reg_rtx (HImode);
15203  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15204  rtx cst1, tmp;
15205  rtx_code_label *jump_label = gen_label_rtx ();
15206  rtx_insn *insn;
15207
15208  /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15209  emit_insn (gen_mulxf3 (e1, op1, op1));
15210  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15211  emit_insn (gen_addxf3 (e2, e1, cst1));
15212  emit_insn (gen_sqrtxf2 (e2, e2));
15213  emit_insn (gen_addxf3 (e2, e2, cst1));
15214
15215  /* e1 = e1 / e2 */
15216  emit_insn (gen_divxf3 (e1, e1, e2));
15217
15218  /* scratch = fxam (op1) */
15219  emit_insn (gen_fxamxf2_i387 (scratch, op1));
15220
15221  /* e1 = e1 + |op1| */
15222  emit_insn (gen_absxf2 (e2, op1));
15223  emit_insn (gen_addxf3 (e1, e1, e2));
15224
15225  /* e2 = log1p (e1) */
15226  ix86_emit_i387_log1p (e2, e1);
15227
15228  /* flags = signbit (op1) */
15229  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15230
15231  /* if (flags) then e2 = -e2 */
15232  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15233			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15234			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
15235			      pc_rtx);
15236  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15237  predict_jump (REG_BR_PROB_BASE * 50 / 100);
15238  JUMP_LABEL (insn) = jump_label;
15239
15240  emit_insn (gen_negxf2 (e2, e2));
15241
15242  emit_label (jump_label);
15243  LABEL_NUSES (jump_label) = 1;
15244
15245  emit_move_insn (op0, e2);
15246}
15247
15248/* Output code to perform an acosh XFmode calculation.  */
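/* The expansion below uses
     acosh(x) = log(x + sqrt(x - 1.0) * sqrt(x + 1.0)),
   which is valid on the whole domain x >= 1.0.  */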
15249
15250void ix86_emit_i387_acosh (rtx op0, rtx op1)
15251{
15252  rtx e1 = gen_reg_rtx (XFmode);
15253  rtx e2 = gen_reg_rtx (XFmode);
15254  rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15255
15256  /* e2 = sqrt (op1 + 1.0) */
15257  emit_insn (gen_addxf3 (e2, op1, cst1));
15258  emit_insn (gen_sqrtxf2 (e2, e2));
15259
15260  /* e1 = sqrt (op1 - 1.0) */
15261  emit_insn (gen_subxf3 (e1, op1, cst1));
15262  emit_insn (gen_sqrtxf2 (e1, e1));
15263
15264  /* e1 = e1 * e2 */
15265  emit_insn (gen_mulxf3 (e1, e1, e2));
15266
15267  /* e1 = e1 + op1 */
15268  emit_insn (gen_addxf3 (e1, e1, op1));
15269
15270  /* op0 = log (e1) */
15271  emit_insn (gen_logxf2 (op0, e1));
15272}
15273
15274/* Output code to perform an atanh XFmode calculation.  */
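/* The expansion below relies on the identity
     atanh(x) = -sign(x) * 0.5 * log1p(-2.0 * |x| / (|x| + 1.0)),
   with the sign applied through the conditional negation below.  */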
15275
15276void ix86_emit_i387_atanh (rtx op0, rtx op1)
15277{
15278  rtx e1 = gen_reg_rtx (XFmode);
15279  rtx e2 = gen_reg_rtx (XFmode);
15280  rtx scratch = gen_reg_rtx (HImode);
15281  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15282  rtx half = const_double_from_real_value (dconsthalf, XFmode);
15283  rtx cst1, tmp;
15284  rtx_code_label *jump_label = gen_label_rtx ();
15285  rtx_insn *insn;
15286
15287  /* scratch = fxam (op1) */
15288  emit_insn (gen_fxamxf2_i387 (scratch, op1));
15289
15290  /* e2 = |op1| */
15291  emit_insn (gen_absxf2 (e2, op1));
15292
15293  /* e1 = -(e2 + e2) / (e2 + 1.0) */
15294  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15295  emit_insn (gen_addxf3 (e1, e2, cst1));
15296  emit_insn (gen_addxf3 (e2, e2, e2));
15297  emit_insn (gen_negxf2 (e2, e2));
15298  emit_insn (gen_divxf3 (e1, e2, e1));
15299
15300  /* e2 = log1p (e1) */
15301  ix86_emit_i387_log1p (e2, e1);
15302
15303  /* flags = signbit (op1) */
15304  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15305
15306  /* if (!flags) then e2 = -e2 */
15307  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15308			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
15309			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
15310			      pc_rtx);
15311  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15312  predict_jump (REG_BR_PROB_BASE * 50 / 100);
15313  JUMP_LABEL (insn) = jump_label;
15314
15315  emit_insn (gen_negxf2 (e2, e2));
15316
15317  emit_label (jump_label);
15318  LABEL_NUSES (jump_label) = 1;
15319
15320  /* op0 = 0.5 * e2 */
15321  half = force_reg (XFmode, half);
15322  emit_insn (gen_mulxf3 (op0, e2, half));
15323}
15324
15325/* Output code to perform a log1p XFmode calculation.  */
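/* The cutoff constant below is 1.0 - sqrt(2.0)/2.0; fyl2xp1 is only
   specified for arguments smaller than that in magnitude, so larger
   inputs fall back to fyl2x on 1.0 + op1.  Both paths multiply by the
   fldln2 constant to turn the base-2 logarithm into a natural one.  */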
15326
15327void ix86_emit_i387_log1p (rtx op0, rtx op1)
15328{
15329  rtx_code_label *label1 = gen_label_rtx ();
15330  rtx_code_label *label2 = gen_label_rtx ();
15331
15332  rtx tmp = gen_reg_rtx (XFmode);
15333  rtx res = gen_reg_rtx (XFmode);
15334  rtx cst, cstln2, cst1;
15335  rtx_insn *insn;
15336
  /* The emit_jump call emits any pending stack adjustment; make sure it
     is emitted before the conditional jump, otherwise the stack
     adjustment would only be conditional.  */
15340  do_pending_stack_adjust ();
15341
15342  cst = const_double_from_real_value
15343    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
15344  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
15345
15346  emit_insn (gen_absxf2 (tmp, op1));
15347
15348  cst = force_reg (XFmode, cst);
15349  ix86_expand_branch (GE, tmp, cst, label1);
15350  predict_jump (REG_BR_PROB_BASE * 10 / 100);
15351  insn = get_last_insn ();
15352  JUMP_LABEL (insn) = label1;
15353
15354  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
15355  emit_jump (label2);
15356
15357  emit_label (label1);
15358  LABEL_NUSES (label1) = 1;
15359
15360  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15361  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
15362  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
15363
15364  emit_label (label2);
15365  LABEL_NUSES (label2) = 1;
15366
15367  emit_move_insn (op0, res);
15368}
15369
/* Output code to perform a round calculation.  */
15371void ix86_emit_i387_round (rtx op0, rtx op1)
15372{
15373  machine_mode inmode = GET_MODE (op1);
15374  machine_mode outmode = GET_MODE (op0);
15375  rtx e1 = gen_reg_rtx (XFmode);
15376  rtx e2 = gen_reg_rtx (XFmode);
15377  rtx scratch = gen_reg_rtx (HImode);
15378  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15379  rtx half = const_double_from_real_value (dconsthalf, XFmode);
15380  rtx res = gen_reg_rtx (outmode);
15381  rtx_code_label *jump_label = gen_label_rtx ();
15382  rtx (*floor_insn) (rtx, rtx);
15383  rtx (*neg_insn) (rtx, rtx);
15384  rtx_insn *insn;
15385  rtx tmp;
15386
15387  switch (inmode)
15388    {
15389    case E_SFmode:
15390    case E_DFmode:
15391      tmp = gen_reg_rtx (XFmode);
15392
15393      emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
15394      op1 = tmp;
15395      break;
15396    case E_XFmode:
15397      break;
15398    default:
15399      gcc_unreachable ();
15400    }
15401
15402  switch (outmode)
15403    {
15404    case E_SFmode:
15405      floor_insn = gen_frndintxf2_floor;
15406      neg_insn = gen_negsf2;
15407      break;
15408    case E_DFmode:
15409      floor_insn = gen_frndintxf2_floor;
15410      neg_insn = gen_negdf2;
15411      break;
15412    case E_XFmode:
15413      floor_insn = gen_frndintxf2_floor;
15414      neg_insn = gen_negxf2;
15415      break;
15416    case E_HImode:
15417      floor_insn = gen_lfloorxfhi2;
15418      neg_insn = gen_neghi2;
15419      break;
15420    case E_SImode:
15421      floor_insn = gen_lfloorxfsi2;
15422      neg_insn = gen_negsi2;
15423      break;
15424    case E_DImode:
15425      floor_insn = gen_lfloorxfdi2;
15426      neg_insn = gen_negdi2;
15427      break;
15428    default:
15429      gcc_unreachable ();
15430    }
15431
15432  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15433
15434  /* scratch = fxam(op1) */
15435  emit_insn (gen_fxamxf2_i387 (scratch, op1));
15436
15437  /* e1 = fabs(op1) */
15438  emit_insn (gen_absxf2 (e1, op1));
15439
15440  /* e2 = e1 + 0.5 */
15441  half = force_reg (XFmode, half);
15442  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
15443
15444  /* res = floor(e2) */
15445  switch (outmode)
15446    {
15447    case E_SFmode:
15448    case E_DFmode:
15449      {
15450	tmp = gen_reg_rtx (XFmode);
15451
15452	emit_insn (floor_insn (tmp, e2));
15453	emit_insn (gen_rtx_SET (res,
15454				gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
15455						UNSPEC_TRUNC_NOOP)));
15456      }
15457      break;
15458    default:
15459      emit_insn (floor_insn (res, e2));
15460    }
15461
15462  /* flags = signbit(a) */
15463  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15464
15465  /* if (flags) then res = -res */
15466  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15467			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15468			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
15469			      pc_rtx);
15470  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15471  predict_jump (REG_BR_PROB_BASE * 50 / 100);
15472  JUMP_LABEL (insn) = jump_label;
15473
15474  emit_insn (neg_insn (res, res));
15475
15476  emit_label (jump_label);
15477  LABEL_NUSES (jump_label) = 1;
15478
15479  emit_move_insn (op0, res);
15480}
15481
/* Output code to perform a Newton-Raphson approximation of a single precision
15483   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
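/* The expansion below performs one Newton-Raphson step for 1/b,
     x1 = x0 * (2.0 - b * x0), computed as (x0 + x0) - (b * x0 * x0),
   which roughly doubles the accuracy of the hardware reciprocal
   estimate x0; that is adequate for a single precision result under
   relaxed math semantics.  */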
15484
15485void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
15486{
15487  rtx x0, x1, e0, e1;
15488
15489  x0 = gen_reg_rtx (mode);
15490  e0 = gen_reg_rtx (mode);
15491  e1 = gen_reg_rtx (mode);
15492  x1 = gen_reg_rtx (mode);
15493
15494  /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
15495
15496  b = force_reg (mode, b);
15497
15498  /* x0 = rcp(b) estimate */
15499  if (mode == V16SFmode || mode == V8DFmode)
15500    {
15501      if (TARGET_AVX512ER)
15502	{
15503	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15504						      UNSPEC_RCP28)));
15505	  /* res = a * x0 */
15506	  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
15507	  return;
15508	}
15509      else
15510	emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15511						    UNSPEC_RCP14)));
15512    }
15513  else
15514    emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15515						UNSPEC_RCP)));
15516
15517  /* e0 = x0 * b */
15518  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
15519
15520  /* e0 = x0 * e0 */
15521  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
15522
15523  /* e1 = x0 + x0 */
15524  emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
15525
15526  /* x1 = e1 - e0 */
15527  emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
15528
15529  /* res = a * x1 */
15530  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
15531}
15532
/* Output code to perform a Newton-Raphson approximation of a
15534   single precision floating point [reciprocal] square root.  */
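/* The expansion below performs one Newton-Raphson step for 1/sqrt(a),
     x1 = -0.5 * x0 * (a * x0 * x0 - 3.0),
   refining the hardware rsqrt estimate x0; for the non-reciprocal
   case the extra factor of a in e3 turns the refined estimate into
   an approximation of sqrt(a).  */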
15535
15536void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
15537{
15538  rtx x0, e0, e1, e2, e3, mthree, mhalf;
15539  REAL_VALUE_TYPE r;
15540  int unspec;
15541
15542  x0 = gen_reg_rtx (mode);
15543  e0 = gen_reg_rtx (mode);
15544  e1 = gen_reg_rtx (mode);
15545  e2 = gen_reg_rtx (mode);
15546  e3 = gen_reg_rtx (mode);
15547
15548  if (TARGET_AVX512ER && mode == V16SFmode)
15549    {
15550      if (recip)
15551	/* res = rsqrt28(a) estimate */
15552	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15553						     UNSPEC_RSQRT28)));
15554      else
15555	{
15556	  /* x0 = rsqrt28(a) estimate */
15557	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15558						      UNSPEC_RSQRT28)));
15559	  /* res = rcp28(x0) estimate */
15560	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
15561						       UNSPEC_RCP28)));
15562	}
15563      return;
15564    }
15565
15566  real_from_integer (&r, VOIDmode, -3, SIGNED);
15567  mthree = const_double_from_real_value (r, SFmode);
15568
15569  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
15570  mhalf = const_double_from_real_value (r, SFmode);
15571  unspec = UNSPEC_RSQRT;
15572
15573  if (VECTOR_MODE_P (mode))
15574    {
15575      mthree = ix86_build_const_vector (mode, true, mthree);
15576      mhalf = ix86_build_const_vector (mode, true, mhalf);
15577      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
15578      if (GET_MODE_SIZE (mode) == 64)
15579	unspec = UNSPEC_RSQRT14;
15580    }
15581
15582  /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
15583     rsqrt(a) = -0.5     * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
15584
15585  a = force_reg (mode, a);
15586
15587  /* x0 = rsqrt(a) estimate */
15588  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15589					      unspec)));
15590
  /* If a == 0.0, filter out the infinite rsqrt estimate to avoid
     producing a NaN for sqrt(0.0).  */
15592  if (!recip)
15593    {
15594      rtx zero = force_reg (mode, CONST0_RTX(mode));
15595      rtx mask;
15596
15597      /* Handle masked compare.  */
15598      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
15599	{
15600	  mask = gen_reg_rtx (HImode);
15601	  /* Imm value 0x4 corresponds to not-equal comparison.  */
15602	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
15603	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
15604	}
15605      else
15606	{
15607	  mask = gen_reg_rtx (mode);
15608	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
15609	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
15610	}
15611    }
15612
15613  /* e0 = x0 * a */
15614  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
15615  /* e1 = e0 * x0 */
15616  emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
15617
15618  /* e2 = e1 - 3. */
15619  mthree = force_reg (mode, mthree);
15620  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
15621
15622  mhalf = force_reg (mode, mhalf);
15623  if (recip)
15624    /* e3 = -.5 * x0 */
15625    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
15626  else
15627    /* e3 = -.5 * e0 */
15628    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
15629  /* ret = e2 * e3 */
15630  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
15631}
15632
15633/* Expand fabs (OP0) and return a new rtx that holds the result.  The
15634   mask for masking out the sign-bit is stored in *SMASK, if that is
15635   non-null.  */
15636
15637static rtx
15638ix86_expand_sse_fabs (rtx op0, rtx *smask)
15639{
15640  machine_mode vmode, mode = GET_MODE (op0);
15641  rtx xa, mask;
15642
15643  xa = gen_reg_rtx (mode);
15644  if (mode == SFmode)
15645    vmode = V4SFmode;
15646  else if (mode == DFmode)
15647    vmode = V2DFmode;
15648  else
15649    vmode = mode;
15650  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
15651  if (!VECTOR_MODE_P (mode))
15652    {
15653      /* We need to generate a scalar mode mask in this case.  */
15654      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15655      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15656      mask = gen_reg_rtx (mode);
15657      emit_insn (gen_rtx_SET (mask, tmp));
15658    }
15659  emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
15660
15661  if (smask)
15662    *smask = mask;
15663
15664  return xa;
15665}
15666
15667/* Expands a comparison of OP0 with OP1 using comparison code CODE,
15668   swapping the operands if SWAP_OPERANDS is true.  The expanded
15669   code is a forward jump to a newly created label in case the
15670   comparison is true.  The generated label rtx is returned.  */
15671static rtx_code_label *
15672ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
15673                                  bool swap_operands)
15674{
15675  bool unordered_compare = ix86_unordered_fp_compare (code);
15676  rtx_code_label *label;
15677  rtx tmp, reg;
15678
15679  if (swap_operands)
15680    std::swap (op0, op1);
15681
15682  label = gen_label_rtx ();
15683  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
15684  if (unordered_compare)
15685    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
15686  reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
15687  emit_insn (gen_rtx_SET (reg, tmp));
15688  tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
15689  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
15690			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
15691  tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15692  JUMP_LABEL (tmp) = label;
15693
15694  return label;
15695}
15696
15697/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
15698   using comparison code CODE.  Operands are swapped for the comparison if
15699   SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */
15700static rtx
15701ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
15702			      bool swap_operands)
15703{
15704  rtx (*insn)(rtx, rtx, rtx, rtx);
15705  machine_mode mode = GET_MODE (op0);
15706  rtx mask = gen_reg_rtx (mode);
15707
15708  if (swap_operands)
15709    std::swap (op0, op1);
15710
15711  insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
15712
15713  emit_insn (insn (mask, op0, op1,
15714		   gen_rtx_fmt_ee (code, mode, op0, op1)));
15715  return mask;
15716}
15717
15718/* Expand copysign from SIGN to the positive value ABS_VALUE
15719   storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
15720   the sign-bit.  */
15721
15722static void
15723ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
15724{
15725  machine_mode mode = GET_MODE (sign);
15726  rtx sgn = gen_reg_rtx (mode);
15727  if (mask == NULL_RTX)
15728    {
15729      machine_mode vmode;
15730
15731      if (mode == SFmode)
15732	vmode = V4SFmode;
15733      else if (mode == DFmode)
15734	vmode = V2DFmode;
15735      else
15736	vmode = mode;
15737
15738      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
15739      if (!VECTOR_MODE_P (mode))
15740	{
15741	  /* We need to generate a scalar mode mask in this case.  */
15742	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15743	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15744	  mask = gen_reg_rtx (mode);
15745	  emit_insn (gen_rtx_SET (mask, tmp));
15746	}
15747    }
15748  else
15749    mask = gen_rtx_NOT (mode, mask);
15750  emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
15751  emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
15752}
15753
15754/* Expand SSE sequence for computing lround from OP1 storing
15755   into OP0.  */
15756
15757void
15758ix86_expand_lround (rtx op0, rtx op1)
15759{
15760  /* C code for the stuff we're doing below:
15761       tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
15762       return (long)tmp;
15763   */
15764  machine_mode mode = GET_MODE (op1);
15765  const struct real_format *fmt;
15766  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
15767  rtx adj;
15768
15769  /* load nextafter (0.5, 0.0) */
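  /* Using the largest representable value below 0.5, rather than 0.5
     itself, keeps inputs just below 0.5 (e.g. nextafter (0.5, 0.0)) from
     being rounded up to 1.0 by the addition below.  */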
15770  fmt = REAL_MODE_FORMAT (mode);
15771  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
15772  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
15773
  /* adj = copysign (nextafter (0.5, 0.0), op1) */
15775  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
15776  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
15777
15778  /* adj = op1 + adj */
15779  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
15780
15781  /* op0 = (imode)adj */
15782  expand_fix (op0, adj, 0);
15783}
15784
/* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
   into OP0.  */
15787
15788void
15789ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
15790{
15791  /* C code for the stuff we're doing below (for do_floor):
15792	xi = (long)op1;
15793        xi -= (double)xi > op1 ? 1 : 0;
15794        return xi;
15795   */
15796  machine_mode fmode = GET_MODE (op1);
15797  machine_mode imode = GET_MODE (op0);
15798  rtx ireg, freg, tmp;
15799  rtx_code_label *label;
15800
  /* ireg = (long)op1 */
15802  ireg = gen_reg_rtx (imode);
15803  expand_fix (ireg, op1, 0);
15804
  /* freg = (double)ireg */
15806  freg = gen_reg_rtx (fmode);
15807  expand_float (freg, ireg, 0);
15808
15809  /* ireg = (freg > op1) ? ireg - 1 : ireg */
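  /* For ceil the comparison operands are swapped and 1 is added instead:
     ireg = (freg < op1) ? ireg + 1 : ireg.  */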
15810  label = ix86_expand_sse_compare_and_jump (UNLE,
15811					    freg, op1, !do_floor);
15812  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
15813			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
15814  emit_move_insn (ireg, tmp);
15815
15816  emit_label (label);
15817  LABEL_NUSES (label) = 1;
15818
15819  emit_move_insn (op0, ireg);
15820}
15821
/* Generate and return an rtx of mode MODE for 2**n where n is the number
15823   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
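/* At that magnitude the spacing between adjacent representable values is
   at least 1.0, so any value whose absolute value is not below it is
   already an integer; the rounding expanders below use TWO52 both as an
   early-out test and, via x + TWO52 - TWO52, to round to an integer in
   the current rounding mode.  */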
15824
15825static rtx
15826ix86_gen_TWO52 (machine_mode mode)
15827{
15828  REAL_VALUE_TYPE TWO52r;
15829  rtx TWO52;
15830
15831  real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
15832  TWO52 = const_double_from_real_value (TWO52r, mode);
15833  TWO52 = force_reg (mode, TWO52);
15834
15835  return TWO52;
15836}
15837
15838/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */
15839
15840void
15841ix86_expand_rint (rtx operand0, rtx operand1)
15842{
15843  /* C code for the stuff we're doing below:
15844	xa = fabs (operand1);
15845        if (!isless (xa, 2**52))
15846	  return operand1;
15847        two52 = 2**52;
15848        if (flag_rounding_math)
15849	  {
15850	    two52 = copysign (two52, operand1);
15851	    xa = operand1;
15852	  }
15853        xa = xa + two52 - two52;
15854        return copysign (xa, operand1);
15855   */
15856  machine_mode mode = GET_MODE (operand0);
15857  rtx res, xa, TWO52, mask;
15858  rtx_code_label *label;
15859
15860  res = gen_reg_rtx (mode);
15861  emit_move_insn (res, operand1);
15862
15863  /* xa = abs (operand1) */
15864  xa = ix86_expand_sse_fabs (res, &mask);
15865
15866  /* if (!isless (xa, TWO52)) goto label; */
15867  TWO52 = ix86_gen_TWO52 (mode);
15868  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15869
15870  if (flag_rounding_math)
15871    {
15872      ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
15873      xa = res;
15874    }
15875
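  /* Adding and then subtracting TWO52 rounds xa to an integer in the
     current rounding mode; we know |xa| < 2**52 here, so the subtraction
     recovers that integer exactly.  */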
15876  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
15877  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
15878
15879  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
15880  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
15881    xa = ix86_expand_sse_fabs (xa, NULL);
15882
15883  ix86_sse_copysign_to_positive (res, xa, res, mask);
15884
15885  emit_label (label);
15886  LABEL_NUSES (label) = 1;
15887
15888  emit_move_insn (operand0, res);
15889}
15890
15891/* Expand SSE2 sequence for computing floor or ceil
15892   from OPERAND1 storing into OPERAND0.  */
15893void
15894ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
15895{
15896  /* C code for the stuff we expand below.
15897	double xa = fabs (x), x2;
15898        if (!isless (xa, TWO52))
15899          return x;
15900	x2 = (double)(long)x;
15901
15902     Compensate.  Floor:
15903	if (x2 > x)
15904	  x2 -= 1;
15905     Compensate.  Ceil:
15906	if (x2 < x)
15907	  x2 += 1;
15908
15909	if (HONOR_SIGNED_ZEROS (mode))
15910	  return copysign (x2, x);
15911	return x2;
15912   */
15913  machine_mode mode = GET_MODE (operand0);
15914  rtx xa, xi, TWO52, tmp, one, res, mask;
15915  rtx_code_label *label;
15916
15917  TWO52 = ix86_gen_TWO52 (mode);
15918
15919  /* Temporary for holding the result, initialized to the input
15920     operand to ease control flow.  */
15921  res = gen_reg_rtx (mode);
15922  emit_move_insn (res, operand1);
15923
15924  /* xa = abs (operand1) */
15925  xa = ix86_expand_sse_fabs (res, &mask);
15926
15927  /* if (!isless (xa, TWO52)) goto label; */
15928  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15929
15930  /* xa = (double)(long)x */
15931  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
15932  expand_fix (xi, res, 0);
15933  expand_float (xa, xi, 0);
15934
15935  /* generate 1.0 */
15936  one = force_reg (mode, const_double_from_real_value (dconst1, mode));
15937
15938  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
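  /* (for ceil the compare is swapped, giving xa = xa + (xa < operand1 ? 1 : 0)) */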
15939  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
15940  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
15941  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
15942			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
15943  if (HONOR_SIGNED_ZEROS (mode))
15944    {
15945      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
15946      if (do_floor && flag_rounding_math)
15947	tmp = ix86_expand_sse_fabs (tmp, NULL);
15948
15949      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
15950    }
15951  emit_move_insn (res, tmp);
15952
15953  emit_label (label);
15954  LABEL_NUSES (label) = 1;
15955
15956  emit_move_insn (operand0, res);
15957}
15958
15959/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq,
   which is only available on 64-bit targets.  */
15962void
15963ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
15964{
15965  /* C code for the stuff we expand below.
15966        double xa = fabs (x), x2;
15967        if (!isless (xa, TWO52))
15968          return x;
15969        xa = xa + TWO52 - TWO52;
15970        x2 = copysign (xa, x);
15971
15972     Compensate.  Floor:
15973        if (x2 > x)
15974          x2 -= 1;
15975     Compensate.  Ceil:
15976        if (x2 < x)
15977          x2 += 1;
15978
15979	if (HONOR_SIGNED_ZEROS (mode))
15980	  x2 = copysign (x2, x);
15981	return x2;
15982   */
15983  machine_mode mode = GET_MODE (operand0);
15984  rtx xa, TWO52, tmp, one, res, mask;
15985  rtx_code_label *label;
15986
15987  TWO52 = ix86_gen_TWO52 (mode);
15988
15989  /* Temporary for holding the result, initialized to the input
15990     operand to ease control flow.  */
15991  res = gen_reg_rtx (mode);
15992  emit_move_insn (res, operand1);
15993
15994  /* xa = abs (operand1) */
15995  xa = ix86_expand_sse_fabs (res, &mask);
15996
15997  /* if (!isless (xa, TWO52)) goto label; */
15998  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15999
16000  /* xa = xa + TWO52 - TWO52; */
16001  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16002  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
16003
16004  /* xa = copysign (xa, operand1) */
16005  ix86_sse_copysign_to_positive (xa, xa, res, mask);
16006
16007  /* generate 1.0 */
16008  one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16009
16010  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16011  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
16012  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16013  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
16014			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16015  if (HONOR_SIGNED_ZEROS (mode))
16016    {
16017      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
16018      if (do_floor && flag_rounding_math)
16019	tmp = ix86_expand_sse_fabs (tmp, NULL);
16020
16021      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
16022    }
16023  emit_move_insn (res, tmp);
16024
16025  emit_label (label);
16026  LABEL_NUSES (label) = 1;
16027
16028  emit_move_insn (operand0, res);
16029}
16030
16031/* Expand SSE sequence for computing trunc
16032   from OPERAND1 storing into OPERAND0.  */
16033void
16034ix86_expand_trunc (rtx operand0, rtx operand1)
16035{
16036  /* C code for SSE variant we expand below.
16037        double xa = fabs (x), x2;
16038        if (!isless (xa, TWO52))
16039          return x;
16040        x2 = (double)(long)x;
16041	if (HONOR_SIGNED_ZEROS (mode))
16042	  return copysign (x2, x);
16043	return x2;
16044   */
16045  machine_mode mode = GET_MODE (operand0);
16046  rtx xa, xi, TWO52, res, mask;
16047  rtx_code_label *label;
16048
16049  TWO52 = ix86_gen_TWO52 (mode);
16050
16051  /* Temporary for holding the result, initialized to the input
16052     operand to ease control flow.  */
16053  res = gen_reg_rtx (mode);
16054  emit_move_insn (res, operand1);
16055
16056  /* xa = abs (operand1) */
16057  xa = ix86_expand_sse_fabs (res, &mask);
16058
16059  /* if (!isless (xa, TWO52)) goto label; */
16060  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16061
16062  /* x = (double)(long)x */
16063  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16064  expand_fix (xi, res, 0);
16065  expand_float (res, xi, 0);
16066
16067  if (HONOR_SIGNED_ZEROS (mode))
16068    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
16069
16070  emit_label (label);
16071  LABEL_NUSES (label) = 1;
16072
16073  emit_move_insn (operand0, res);
16074}
16075
16076/* Expand SSE sequence for computing trunc from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq,
   which is only available on 64-bit targets.  */
16079void
16080ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
16081{
16082  machine_mode mode = GET_MODE (operand0);
16083  rtx xa, xa2, TWO52, tmp, one, res, mask;
16084  rtx_code_label *label;
16085
16086  /* C code for SSE variant we expand below.
16087        double xa = fabs (x), x2;
16088        if (!isless (xa, TWO52))
16089          return x;
16090        xa2 = xa + TWO52 - TWO52;
16091     Compensate:
16092        if (xa2 > xa)
16093          xa2 -= 1.0;
16094        x2 = copysign (xa2, x);
16095        return x2;
16096   */
16097
16098  TWO52 = ix86_gen_TWO52 (mode);
16099
16100  /* Temporary for holding the result, initialized to the input
16101     operand to ease control flow.  */
16102  res = gen_reg_rtx (mode);
16103  emit_move_insn (res, operand1);
16104
16105  /* xa = abs (operand1) */
16106  xa = ix86_expand_sse_fabs (res, &mask);
16107
16108  /* if (!isless (xa, TWO52)) goto label; */
16109  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16110
16111  /* xa2 = xa + TWO52 - TWO52; */
16112  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16113  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16114
16115  /* generate 1.0 */
16116  one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16117
16118  /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0)  */
16119  tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
16120  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16121  tmp = expand_simple_binop (mode, MINUS,
16122			     xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16123  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
16124  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
16125    tmp = ix86_expand_sse_fabs (tmp, NULL);
16126
16127  /* res = copysign (xa2, operand1) */
16128  ix86_sse_copysign_to_positive (res, tmp, res, mask);
16129
16130  emit_label (label);
16131  LABEL_NUSES (label) = 1;
16132
16133  emit_move_insn (operand0, res);
16134}
16135
16136/* Expand SSE sequence for computing round
16137   from OPERAND1 storing into OPERAND0.  */
16138void
16139ix86_expand_round (rtx operand0, rtx operand1)
16140{
16141  /* C code for the stuff we're doing below:
16142        double xa = fabs (x);
16143        if (!isless (xa, TWO52))
16144          return x;
16145        xa = (double)(long)(xa + nextafter (0.5, 0.0));
16146        return copysign (xa, x);
16147   */
16148  machine_mode mode = GET_MODE (operand0);
16149  rtx res, TWO52, xa, xi, half, mask;
16150  rtx_code_label *label;
16151  const struct real_format *fmt;
16152  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16153
16154  /* Temporary for holding the result, initialized to the input
16155     operand to ease control flow.  */
16156  res = gen_reg_rtx (mode);
16157  emit_move_insn (res, operand1);
16158
16159  TWO52 = ix86_gen_TWO52 (mode);
16160  xa = ix86_expand_sse_fabs (res, &mask);
16161  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16162
16163  /* load nextafter (0.5, 0.0) */
16164  fmt = REAL_MODE_FORMAT (mode);
16165  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16166  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16167
  /* xa = xa + nextafter (0.5, 0.0) */
16169  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
16170  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
16171
16172  /* xa = (double)(int64_t)xa */
16173  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16174  expand_fix (xi, xa, 0);
16175  expand_float (xa, xi, 0);
16176
16177  /* res = copysign (xa, operand1) */
16178  ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
16179
16180  emit_label (label);
16181  LABEL_NUSES (label) = 1;
16182
16183  emit_move_insn (operand0, res);
16184}
16185
16186/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq,
   which is only available on 64-bit targets.  */
16189void
16190ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
16191{
16192  /* C code for the stuff we expand below.
16193        double xa = fabs (x), xa2, x2;
16194        if (!isless (xa, TWO52))
16195          return x;
16196     Using the absolute value and copying back sign makes
16197     -0.0 -> -0.0 correct.
16198        xa2 = xa + TWO52 - TWO52;
16199     Compensate.
16200	dxa = xa2 - xa;
16201        if (dxa <= -0.5)
16202          xa2 += 1;
16203        else if (dxa > 0.5)
16204          xa2 -= 1;
16205        x2 = copysign (xa2, x);
16206        return x2;
16207   */
16208  machine_mode mode = GET_MODE (operand0);
16209  rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
16210  rtx_code_label *label;
16211
16212  TWO52 = ix86_gen_TWO52 (mode);
16213
16214  /* Temporary for holding the result, initialized to the input
16215     operand to ease control flow.  */
16216  res = gen_reg_rtx (mode);
16217  emit_move_insn (res, operand1);
16218
16219  /* xa = abs (operand1) */
16220  xa = ix86_expand_sse_fabs (res, &mask);
16221
16222  /* if (!isless (xa, TWO52)) goto label; */
16223  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16224
16225  /* xa2 = xa + TWO52 - TWO52; */
16226  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16227  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16228
16229  /* dxa = xa2 - xa; */
16230  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
16231
16232  /* generate 0.5, 1.0 and -0.5 */
16233  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
16234  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
16235  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
16236			       0, OPTAB_DIRECT);
16237
16238  /* Compensate.  */
16239  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
16240  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
16241  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16242  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16243  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
16244  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
16245  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16246  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16247
16248  /* res = copysign (xa2, operand1) */
16249  ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
16250
16251  emit_label (label);
16252  LABEL_NUSES (label) = 1;
16253
16254  emit_move_insn (operand0, res);
16255}
16256
16257/* Expand SSE sequence for computing round
16258   from OP1 storing into OP0 using sse4 round insn.  */
16259void
16260ix86_expand_round_sse4 (rtx op0, rtx op1)
16261{
16262  machine_mode mode = GET_MODE (op0);
16263  rtx e1, e2, res, half;
16264  const struct real_format *fmt;
16265  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16266  rtx (*gen_copysign) (rtx, rtx, rtx);
16267  rtx (*gen_round) (rtx, rtx, rtx);
16268
16269  switch (mode)
16270    {
16271    case E_SFmode:
16272      gen_copysign = gen_copysignsf3;
16273      gen_round = gen_sse4_1_roundsf2;
16274      break;
16275    case E_DFmode:
16276      gen_copysign = gen_copysigndf3;
16277      gen_round = gen_sse4_1_rounddf2;
16278      break;
16279    default:
16280      gcc_unreachable ();
16281    }
16282
16283  /* round (a) = trunc (a + copysign (0.5, a)) */
16284
16285  /* load nextafter (0.5, 0.0) */
16286  fmt = REAL_MODE_FORMAT (mode);
16287  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16288  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16289  half = const_double_from_real_value (pred_half, mode);
16290
  /* e1 = copysign (nextafter (0.5, 0.0), op1) */
16292  e1 = gen_reg_rtx (mode);
16293  emit_insn (gen_copysign (e1, half, op1));
16294
16295  /* e2 = op1 + e1 */
16296  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
16297
16298  /* res = trunc (e2) */
16299  res = gen_reg_rtx (mode);
16300  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
16301
16302  emit_move_insn (op0, res);
16303}
16304
16305/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16306   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16307   insn every time.  */
16308
16309static GTY(()) rtx_insn *vselect_insn;
16310
16311/* Initialize vselect_insn.  */
16312
16313static void
16314init_vselect_insn (void)
16315{
16316  unsigned i;
16317  rtx x;
16318
16319  x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
16320  for (i = 0; i < MAX_VECT_LEN; ++i)
16321    XVECEXP (x, 0, i) = const0_rtx;
16322  x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
16323							const0_rtx), x);
16324  x = gen_rtx_SET (const0_rtx, x);
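  /* Emit inside a throwaway sequence so the placeholder insn does not
     end up in the current function's insn chain; only the rtx itself is
     kept, in vselect_insn.  */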
16325  start_sequence ();
16326  vselect_insn = emit_insn (x);
16327  end_sequence ();
16328}
16329
16330/* Construct (set target (vec_select op0 (parallel perm))) and
16331   return true if that's a valid instruction in the active ISA.  */
16332
16333static bool
16334expand_vselect (rtx target, rtx op0, const unsigned char *perm,
16335		unsigned nelt, bool testing_p)
16336{
16337  unsigned int i;
16338  rtx x, save_vconcat;
16339  int icode;
16340
16341  if (vselect_insn == NULL_RTX)
16342    init_vselect_insn ();
16343
16344  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
16345  PUT_NUM_ELEM (XVEC (x, 0), nelt);
16346  for (i = 0; i < nelt; ++i)
16347    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
16348  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16349  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
16350  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
16351  SET_DEST (PATTERN (vselect_insn)) = target;
16352  icode = recog_memoized (vselect_insn);
16353
16354  if (icode >= 0 && !testing_p)
16355    emit_insn (copy_rtx (PATTERN (vselect_insn)));
16356
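  /* Put the scratch insn back into its neutral state so the next call
     starts from a clean pattern.  */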
16357  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
16358  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
16359  INSN_CODE (vselect_insn) = -1;
16360
16361  return icode >= 0;
16362}
16363
16364/* Similar, but generate a vec_concat from op0 and op1 as well.  */
16365
16366static bool
16367expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
16368			const unsigned char *perm, unsigned nelt,
16369			bool testing_p)
16370{
16371  machine_mode v2mode;
16372  rtx x;
16373  bool ok;
16374
16375  if (vselect_insn == NULL_RTX)
16376    init_vselect_insn ();
16377
16378  if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
16379    return false;
16380  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16381  PUT_MODE (x, v2mode);
16382  XEXP (x, 0) = op0;
16383  XEXP (x, 1) = op1;
16384  ok = expand_vselect (target, x, perm, nelt, testing_p);
16385  XEXP (x, 0) = const0_rtx;
16386  XEXP (x, 1) = const0_rtx;
16387  return ok;
16388}
16389
16390/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
16391   using movss or movsd.  */
16392static bool
16393expand_vec_perm_movs (struct expand_vec_perm_d *d)
16394{
16395  machine_mode vmode = d->vmode;
16396  unsigned i, nelt = d->nelt;
16397  rtx x;
16398
16399  if (d->one_operand_p)
16400    return false;
16401
16402  if (!(TARGET_SSE && vmode == V4SFmode)
16403      && !(TARGET_SSE2 && vmode == V2DFmode))
16404    return false;
16405
16406  /* Only the first element is changed.  */
16407  if (d->perm[0] != nelt && d->perm[0] != 0)
16408    return false;
16409  for (i = 1; i < nelt; ++i)
16410    if (d->perm[i] != i + nelt - d->perm[0])
16411      return false;
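  /* I.e. element 0 is taken from one operand and all remaining elements
     come from the other operand, in order, which is exactly what
     movss/movsd provide.  */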
16412
16413  if (d->testing_p)
16414    return true;
16415
16416  if (d->perm[0] == nelt)
16417    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
16418  else
16419    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
16420
16421  emit_insn (gen_rtx_SET (d->target, x));
16422
16423  return true;
16424}
16425
16426/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
16427   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
16428
16429static bool
16430expand_vec_perm_blend (struct expand_vec_perm_d *d)
16431{
16432  machine_mode mmode, vmode = d->vmode;
16433  unsigned i, nelt = d->nelt;
16434  unsigned HOST_WIDE_INT mask;
16435  rtx target, op0, op1, maskop, x;
16436  rtx rperm[32], vperm;
16437
16438  if (d->one_operand_p)
16439    return false;
16440  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
16441      && (TARGET_AVX512BW
16442	  || GET_MODE_UNIT_SIZE (vmode) >= 4))
16443    ;
16444  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
16445    ;
16446  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
16447    ;
16448  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
16449    ;
16450  else
16451    return false;
16452
16453  /* This is a blend, not a permute.  Elements must stay in their
16454     respective lanes.  */
16455  for (i = 0; i < nelt; ++i)
16456    {
16457      unsigned e = d->perm[i];
16458      if (!(e == i || e == i + nelt))
16459	return false;
16460    }
16461
16462  if (d->testing_p)
16463    return true;
16464
16465  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
16466     decision should be extracted elsewhere, so that we only try that
16467     sequence once all budget==3 options have been tried.  */
16468  target = d->target;
16469  op0 = d->op0;
16470  op1 = d->op1;
16471  mask = 0;
16472
16473  switch (vmode)
16474    {
16475    case E_V8DFmode:
16476    case E_V16SFmode:
16477    case E_V4DFmode:
16478    case E_V8SFmode:
16479    case E_V2DFmode:
16480    case E_V4SFmode:
16481    case E_V8HImode:
16482    case E_V8SImode:
16483    case E_V32HImode:
16484    case E_V64QImode:
16485    case E_V16SImode:
16486    case E_V8DImode:
16487      for (i = 0; i < nelt; ++i)
16488	mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
16489      break;
16490
16491    case E_V2DImode:
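      /* Blend on the V8HImode view: each V2DImode element spans four
	 HImode words, hence the 0xf mask per selected element.  */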
16492      for (i = 0; i < 2; ++i)
16493	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
16494      vmode = V8HImode;
16495      goto do_subreg;
16496
16497    case E_V4SImode:
16498      for (i = 0; i < 4; ++i)
16499	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16500      vmode = V8HImode;
16501      goto do_subreg;
16502
16503    case E_V16QImode:
16504      /* See if bytes move in pairs so we can use pblendw with
16505	 an immediate argument, rather than pblendvb with a vector
16506	 argument.  */
16507      for (i = 0; i < 16; i += 2)
16508	if (d->perm[i] + 1 != d->perm[i + 1])
16509	  {
16510	  use_pblendvb:
16511	    for (i = 0; i < nelt; ++i)
16512	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
16513
16514	  finish_pblendvb:
16515	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16516	    vperm = force_reg (vmode, vperm);
16517
16518	    if (GET_MODE_SIZE (vmode) == 16)
16519	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
16520	    else
16521	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
16522	    if (target != d->target)
16523	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16524	    return true;
16525	  }
16526
16527      for (i = 0; i < 8; ++i)
16528	mask |= (d->perm[i * 2] >= 16) << i;
16529      vmode = V8HImode;
16530      /* FALLTHRU */
16531
16532    do_subreg:
16533      target = gen_reg_rtx (vmode);
16534      op0 = gen_lowpart (vmode, op0);
16535      op1 = gen_lowpart (vmode, op1);
16536      break;
16537
16538    case E_V32QImode:
16539      /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
16540      for (i = 0; i < 32; i += 2)
16541	if (d->perm[i] + 1 != d->perm[i + 1])
16542	  goto use_pblendvb;
16543      /* See if bytes move in quadruplets.  If yes, vpblendd
16544	 with immediate can be used.  */
16545      for (i = 0; i < 32; i += 4)
16546	if (d->perm[i] + 2 != d->perm[i + 2])
16547	  break;
16548      if (i < 32)
16549	{
16550	  /* See if bytes move the same in both lanes.  If yes,
16551	     vpblendw with immediate can be used.  */
16552	  for (i = 0; i < 16; i += 2)
16553	    if (d->perm[i] + 16 != d->perm[i + 16])
16554	      goto use_pblendvb;
16555
16556	  /* Use vpblendw.  */
16557	  for (i = 0; i < 16; ++i)
16558	    mask |= (d->perm[i * 2] >= 32) << i;
16559	  vmode = V16HImode;
16560	  goto do_subreg;
16561	}
16562
16563      /* Use vpblendd.  */
16564      for (i = 0; i < 8; ++i)
16565	mask |= (d->perm[i * 4] >= 32) << i;
16566      vmode = V8SImode;
16567      goto do_subreg;
16568
16569    case E_V16HImode:
16570      /* See if words move in pairs.  If yes, vpblendd can be used.  */
16571      for (i = 0; i < 16; i += 2)
16572	if (d->perm[i] + 1 != d->perm[i + 1])
16573	  break;
16574      if (i < 16)
16575	{
16576	  /* See if words move the same in both lanes.  If not,
16577	     vpblendvb must be used.  */
16578	  for (i = 0; i < 8; i++)
16579	    if (d->perm[i] + 8 != d->perm[i + 8])
16580	      {
16581		/* Use vpblendvb.  */
16582		for (i = 0; i < 32; ++i)
16583		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
16584
16585		vmode = V32QImode;
16586		nelt = 32;
16587		target = gen_reg_rtx (vmode);
16588		op0 = gen_lowpart (vmode, op0);
16589		op1 = gen_lowpart (vmode, op1);
16590		goto finish_pblendvb;
16591	      }
16592
16593	  /* Use vpblendw.  */
16594	  for (i = 0; i < 16; ++i)
16595	    mask |= (d->perm[i] >= 16) << i;
16596	  break;
16597	}
16598
16599      /* Use vpblendd.  */
16600      for (i = 0; i < 8; ++i)
16601	mask |= (d->perm[i * 2] >= 16) << i;
16602      vmode = V8SImode;
16603      goto do_subreg;
16604
16605    case E_V4DImode:
16606      /* Use vpblendd.  */
16607      for (i = 0; i < 4; ++i)
16608	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16609      vmode = V8SImode;
16610      goto do_subreg;
16611
16612    default:
16613      gcc_unreachable ();
16614    }
16615
16616  switch (vmode)
16617    {
16618    case E_V8DFmode:
16619    case E_V8DImode:
16620      mmode = QImode;
16621      break;
16622    case E_V16SFmode:
16623    case E_V16SImode:
16624      mmode = HImode;
16625      break;
16626    case E_V32HImode:
16627      mmode = SImode;
16628      break;
16629    case E_V64QImode:
16630      mmode = DImode;
16631      break;
16632    default:
16633      mmode = VOIDmode;
16634    }
16635
16636  if (mmode != VOIDmode)
16637    maskop = force_reg (mmode, gen_int_mode (mask, mmode));
16638  else
16639    maskop = GEN_INT (mask);
16640
16641  /* This matches five different patterns with the different modes.  */
16642  x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
16643  x = gen_rtx_SET (target, x);
16644  emit_insn (x);
16645  if (target != d->target)
16646    emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16647
16648  return true;
16649}
16650
16651/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
16652   in terms of the variable form of vpermilps.
16653
16654   Note that we will have already failed the immediate input vpermilps,
16655   which requires that the high and low part shuffle be identical; the
16656   variable form doesn't require that.  */
16657
16658static bool
16659expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
16660{
16661  rtx rperm[8], vperm;
16662  unsigned i;
16663
16664  if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
16665    return false;
16666
  /* We can only permute within each 128-bit lane.  */
16668  for (i = 0; i < 8; ++i)
16669    {
16670      unsigned e = d->perm[i];
16671      if (i < 4 ? e >= 4 : e < 4)
16672	return false;
16673    }
16674
16675  if (d->testing_p)
16676    return true;
16677
16678  for (i = 0; i < 8; ++i)
16679    {
16680      unsigned e = d->perm[i];
16681
16682      /* Within each 128-bit lane, the elements of op0 are numbered
16683	 from 0 and the elements of op1 are numbered from 4.  */
16684      if (e >= 8 + 4)
16685	e -= 8;
16686      else if (e >= 4)
16687	e -= 4;
16688
16689      rperm[i] = GEN_INT (e);
16690    }
16691
16692  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
16693  vperm = force_reg (V8SImode, vperm);
16694  emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
16695
16696  return true;
16697}
16698
16699/* Return true if permutation D can be performed as VMODE permutation
16700   instead.  */
16701
16702static bool
16703valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
16704{
16705  unsigned int i, j, chunk;
16706
16707  if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
16708      || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
16709      || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
16710    return false;
16711
16712  if (GET_MODE_NUNITS (vmode) >= d->nelt)
16713    return true;
16714
16715  chunk = d->nelt / GET_MODE_NUNITS (vmode);
16716  for (i = 0; i < d->nelt; i += chunk)
16717    if (d->perm[i] & (chunk - 1))
16718      return false;
16719    else
16720      for (j = 1; j < chunk; ++j)
16721	if (d->perm[i] + j != d->perm[i + j])
16722	  return false;
16723
16724  return true;
16725}
16726
16727/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
16728   in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */
16729
16730static bool
16731expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
16732{
16733  unsigned i, nelt, eltsz, mask;
16734  unsigned char perm[64];
16735  machine_mode vmode = V16QImode;
16736  rtx rperm[64], vperm, target, op0, op1;
16737
16738  nelt = d->nelt;
16739
16740  if (!d->one_operand_p)
16741    {
16742      if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
16743	{
16744	  if (TARGET_AVX2
16745	      && valid_perm_using_mode_p (V2TImode, d))
16746	    {
16747	      if (d->testing_p)
16748		return true;
16749
16750	      /* Use vperm2i128 insn.  The pattern uses
16751		 V4DImode instead of V2TImode.  */
16752	      target = d->target;
16753	      if (d->vmode != V4DImode)
16754		target = gen_reg_rtx (V4DImode);
16755	      op0 = gen_lowpart (V4DImode, d->op0);
16756	      op1 = gen_lowpart (V4DImode, d->op1);
16757	      rperm[0]
16758		= GEN_INT ((d->perm[0] / (nelt / 2))
16759			   | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
16760	      emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
16761	      if (target != d->target)
16762		emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16763	      return true;
16764	    }
16765	  return false;
16766	}
16767    }
16768  else
16769    {
16770      if (GET_MODE_SIZE (d->vmode) == 16)
16771	{
16772	  if (!TARGET_SSSE3)
16773	    return false;
16774	}
16775      else if (GET_MODE_SIZE (d->vmode) == 32)
16776	{
16777	  if (!TARGET_AVX2)
16778	    return false;
16779
16780	  /* V4DImode should be already handled through
16781	     expand_vselect by vpermq instruction.  */
16782	  gcc_assert (d->vmode != V4DImode);
16783
16784	  vmode = V32QImode;
16785	  if (d->vmode == V8SImode
16786	      || d->vmode == V16HImode
16787	      || d->vmode == V32QImode)
16788	    {
16789	      /* First see if vpermq can be used for
16790		 V8SImode/V16HImode/V32QImode.  */
16791	      if (valid_perm_using_mode_p (V4DImode, d))
16792		{
16793		  for (i = 0; i < 4; i++)
16794		    perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
16795		  if (d->testing_p)
16796		    return true;
16797		  target = gen_reg_rtx (V4DImode);
16798		  if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
16799				      perm, 4, false))
16800		    {
16801		      emit_move_insn (d->target,
16802				      gen_lowpart (d->vmode, target));
16803		      return true;
16804		    }
16805		  return false;
16806		}
16807
16808	      /* Next see if vpermd can be used.  */
16809	      if (valid_perm_using_mode_p (V8SImode, d))
16810		vmode = V8SImode;
16811	    }
16812	  /* Or if vpermps can be used.  */
16813	  else if (d->vmode == V8SFmode)
16814	    vmode = V8SImode;
16815
16816	  if (vmode == V32QImode)
16817	    {
	      /* vpshufb only works within 128-bit lanes; it cannot
		 shuffle bytes across lanes.  */
16820	      for (i = 0; i < nelt; ++i)
16821		if ((d->perm[i] ^ i) & (nelt / 2))
16822		  return false;
16823	    }
16824	}
16825      else if (GET_MODE_SIZE (d->vmode) == 64)
16826	{
16827	  if (!TARGET_AVX512BW)
16828	    return false;
16829
16830	  /* If vpermq didn't work, vpshufb won't work either.  */
16831	  if (d->vmode == V8DFmode || d->vmode == V8DImode)
16832	    return false;
16833
16834	  vmode = V64QImode;
16835	  if (d->vmode == V16SImode
16836	      || d->vmode == V32HImode
16837	      || d->vmode == V64QImode)
16838	    {
16839	      /* First see if vpermq can be used for
16840		 V16SImode/V32HImode/V64QImode.  */
16841	      if (valid_perm_using_mode_p (V8DImode, d))
16842		{
16843		  for (i = 0; i < 8; i++)
16844		    perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
16845		  if (d->testing_p)
16846		    return true;
16847		  target = gen_reg_rtx (V8DImode);
16848		  if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
16849				      perm, 8, false))
16850		    {
16851		      emit_move_insn (d->target,
16852				      gen_lowpart (d->vmode, target));
16853		      return true;
16854		    }
16855		  return false;
16856		}
16857
16858	      /* Next see if vpermd can be used.  */
16859	      if (valid_perm_using_mode_p (V16SImode, d))
16860		vmode = V16SImode;
16861	    }
16862	  /* Or if vpermps can be used.  */
16863	  else if (d->vmode == V16SFmode)
16864	    vmode = V16SImode;
16865	  if (vmode == V64QImode)
16866	    {
	      /* vpshufb only works within 128-bit lanes; it cannot
		 shuffle bytes across lanes.  */
16869	      for (i = 0; i < nelt; ++i)
16870		if ((d->perm[i] ^ i) & (3 * nelt / 4))
16871		  return false;
16872	    }
16873	}
16874      else
16875	return false;
16876    }
16877
16878  if (d->testing_p)
16879    return true;
16880
16881  if (vmode == V8SImode)
16882    for (i = 0; i < 8; ++i)
16883      rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
16884  else if (vmode == V16SImode)
16885    for (i = 0; i < 16; ++i)
16886      rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
16887  else
16888    {
16889      eltsz = GET_MODE_UNIT_SIZE (d->vmode);
16890      if (!d->one_operand_p)
16891	mask = 2 * nelt - 1;
16892      else if (vmode == V16QImode)
16893	mask = nelt - 1;
16894      else if (vmode == V64QImode)
16895	mask = nelt / 4 - 1;
16896      else
16897	mask = nelt / 2 - 1;
16898
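      /* Expand the element-level permutation into a byte-level control
	 vector for pshufb/vpperm: each selected element index e becomes
	 eltsz consecutive byte indices e*eltsz .. e*eltsz + eltsz - 1.  */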
16899      for (i = 0; i < nelt; ++i)
16900	{
16901	  unsigned j, e = d->perm[i] & mask;
16902	  for (j = 0; j < eltsz; ++j)
16903	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
16904	}
16905    }
16906
16907  vperm = gen_rtx_CONST_VECTOR (vmode,
16908				gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
16909  vperm = force_reg (vmode, vperm);
16910
16911  target = d->target;
16912  if (d->vmode != vmode)
16913    target = gen_reg_rtx (vmode);
16914  op0 = gen_lowpart (vmode, d->op0);
16915  if (d->one_operand_p)
16916    {
16917      if (vmode == V16QImode)
16918	emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
16919      else if (vmode == V32QImode)
16920	emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
16921      else if (vmode == V64QImode)
16922	emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
16923      else if (vmode == V8SFmode)
16924	emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
16925      else if (vmode == V8SImode)
16926	emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
16927      else if (vmode == V16SFmode)
16928	emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
16929      else if (vmode == V16SImode)
16930	emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
16931      else
16932	gcc_unreachable ();
16933    }
16934  else
16935    {
16936      op1 = gen_lowpart (vmode, d->op1);
16937      emit_insn (gen_xop_pperm (target, op0, op1, vperm));
16938    }
16939  if (target != d->target)
16940    emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16941
16942  return true;
16943}
16944
/* For V*[QHS]Imode permutations, check whether the same permutation
   can be performed in a 2x, 4x or 8x wider inner mode, and if so
   canonicalize D into ND for that wider mode.  */
16947
16948static bool
16949canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
16950			      struct expand_vec_perm_d *nd)
16951{
16952  int i;
16953  machine_mode mode = VOIDmode;
16954
16955  switch (d->vmode)
16956    {
16957    case E_V16QImode: mode = V8HImode; break;
16958    case E_V32QImode: mode = V16HImode; break;
16959    case E_V64QImode: mode = V32HImode; break;
16960    case E_V8HImode: mode = V4SImode; break;
16961    case E_V16HImode: mode = V8SImode; break;
16962    case E_V32HImode: mode = V16SImode; break;
16963    case E_V4SImode: mode = V2DImode; break;
16964    case E_V8SImode: mode = V4DImode; break;
16965    case E_V16SImode: mode = V8DImode; break;
16966    default: return false;
16967    }
16968  for (i = 0; i < d->nelt; i += 2)
16969    if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
16970      return false;
16971  nd->vmode = mode;
16972  nd->nelt = d->nelt / 2;
16973  for (i = 0; i < nd->nelt; i++)
16974    nd->perm[i] = d->perm[2 * i] / 2;
16975  if (GET_MODE_INNER (mode) != DImode)
16976    canonicalize_vector_int_perm (nd, nd);
16977  if (nd != d)
16978    {
16979      nd->one_operand_p = d->one_operand_p;
16980      nd->testing_p = d->testing_p;
16981      if (d->op0 == d->op1)
16982	nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
16983      else
16984	{
16985	  nd->op0 = gen_lowpart (nd->vmode, d->op0);
16986	  nd->op1 = gen_lowpart (nd->vmode, d->op1);
16987	}
16988      if (d->testing_p)
16989	nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
16990      else
16991	nd->target = gen_reg_rtx (nd->vmode);
16992    }
16993  return true;
16994}
16995
16996/* Try to expand one-operand permutation with constant mask.  */
16997
16998static bool
16999ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
17000{
17001  machine_mode mode = GET_MODE (d->op0);
17002  machine_mode maskmode = mode;
17003  rtx (*gen) (rtx, rtx, rtx) = NULL;
17004  rtx target, op0, mask;
17005  rtx vec[64];
17006
17007  if (!rtx_equal_p (d->op0, d->op1))
17008    return false;
17009
17010  if (!TARGET_AVX512F)
17011    return false;
17012
17013  switch (mode)
17014    {
17015    case E_V16SImode:
17016      gen = gen_avx512f_permvarv16si;
17017      break;
17018    case E_V16SFmode:
17019      gen = gen_avx512f_permvarv16sf;
17020      maskmode = V16SImode;
17021      break;
17022    case E_V8DImode:
17023      gen = gen_avx512f_permvarv8di;
17024      break;
17025    case E_V8DFmode:
17026      gen = gen_avx512f_permvarv8df;
17027      maskmode = V8DImode;
17028      break;
17029    default:
17030      return false;
17031    }
17032
17033  target = d->target;
17034  op0 = d->op0;
17035  for (int i = 0; i < d->nelt; ++i)
17036    vec[i] = GEN_INT (d->perm[i]);
17037  mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
17038  emit_insn (gen (target, op0, force_reg (maskmode, mask)));
17039  return true;
17040}
17041
17042static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
17043
17044/* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
17045   in a single instruction.  */
17046
17047static bool
17048expand_vec_perm_1 (struct expand_vec_perm_d *d)
17049{
17050  unsigned i, nelt = d->nelt;
17051  struct expand_vec_perm_d nd;
17052
17053  /* Check plain VEC_SELECT first, because AVX has instructions that could
17054     match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
17055     input where SEL+CONCAT may not.  */
17056  if (d->one_operand_p)
17057    {
17058      int mask = nelt - 1;
17059      bool identity_perm = true;
17060      bool broadcast_perm = true;
17061
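      /* With a single operand, indices of nelt and above refer to the
	 same vector, so fold them down with the mask.  */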
17062      for (i = 0; i < nelt; i++)
17063	{
17064	  nd.perm[i] = d->perm[i] & mask;
17065	  if (nd.perm[i] != i)
17066	    identity_perm = false;
17067	  if (nd.perm[i])
17068	    broadcast_perm = false;
17069	}
17070
17071      if (identity_perm)
17072	{
17073	  if (!d->testing_p)
17074	    emit_move_insn (d->target, d->op0);
17075	  return true;
17076	}
17077      else if (broadcast_perm && TARGET_AVX2)
17078	{
17079	  /* Use vpbroadcast{b,w,d}.  */
17080	  rtx (*gen) (rtx, rtx) = NULL;
17081	  switch (d->vmode)
17082	    {
17083	    case E_V64QImode:
17084	      if (TARGET_AVX512BW)
17085		gen = gen_avx512bw_vec_dupv64qi_1;
17086	      break;
17087	    case E_V32QImode:
17088	      gen = gen_avx2_pbroadcastv32qi_1;
17089	      break;
17090	    case E_V32HImode:
17091	      if (TARGET_AVX512BW)
17092		gen = gen_avx512bw_vec_dupv32hi_1;
17093	      break;
17094	    case E_V16HImode:
17095	      gen = gen_avx2_pbroadcastv16hi_1;
17096	      break;
17097	    case E_V16SImode:
17098	      if (TARGET_AVX512F)
17099		gen = gen_avx512f_vec_dupv16si_1;
17100	      break;
17101	    case E_V8SImode:
17102	      gen = gen_avx2_pbroadcastv8si_1;
17103	      break;
17104	    case E_V16QImode:
17105	      gen = gen_avx2_pbroadcastv16qi;
17106	      break;
17107	    case E_V8HImode:
17108	      gen = gen_avx2_pbroadcastv8hi;
17109	      break;
17110	    case E_V16SFmode:
17111	      if (TARGET_AVX512F)
17112		gen = gen_avx512f_vec_dupv16sf_1;
17113	      break;
17114	    case E_V8SFmode:
17115	      gen = gen_avx2_vec_dupv8sf_1;
17116	      break;
17117	    case E_V8DFmode:
17118	      if (TARGET_AVX512F)
17119		gen = gen_avx512f_vec_dupv8df_1;
17120	      break;
17121	    case E_V8DImode:
17122	      if (TARGET_AVX512F)
17123		gen = gen_avx512f_vec_dupv8di_1;
17124	      break;
17125	    /* For other modes prefer other shuffles this function creates.  */
17126	    default: break;
17127	    }
17128	  if (gen != NULL)
17129	    {
17130	      if (!d->testing_p)
17131		emit_insn (gen (d->target, d->op0));
17132	      return true;
17133	    }
17134	}
17135
17136      if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
17137	return true;
17138
17139      /* There are plenty of patterns in sse.md that are written for
17140	 SEL+CONCAT and are not replicated for a single op.  Perhaps
17141	 that should be changed, to avoid the nastiness here.  */
17142
      /* Recognize interleave style patterns, which means adding nelt to
	 every other permutation index.  */
17145      for (i = 0; i < nelt; i += 2)
17146	{
17147	  nd.perm[i] = d->perm[i] & mask;
17148	  nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
17149	}
17150      if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17151				  d->testing_p))
17152	return true;
17153
17154      /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
17155      if (nelt >= 4)
17156	{
17157	  for (i = 0; i < nelt; i += 4)
17158	    {
17159	      nd.perm[i + 0] = d->perm[i + 0] & mask;
17160	      nd.perm[i + 1] = d->perm[i + 1] & mask;
17161	      nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
17162	      nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
17163	    }
17164
17165	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17166				      d->testing_p))
17167	    return true;
17168	}
17169    }
17170
17171  /* Try movss/movsd instructions.  */
17172  if (expand_vec_perm_movs (d))
17173    return true;
17174
17175  /* Finally, try the fully general two operand permute.  */
17176  if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
17177			      d->testing_p))
17178    return true;
17179
17180  /* Recognize interleave style patterns with reversed operands.  */
17181  if (!d->one_operand_p)
17182    {
17183      for (i = 0; i < nelt; ++i)
17184	{
17185	  unsigned e = d->perm[i];
17186	  if (e >= nelt)
17187	    e -= nelt;
17188	  else
17189	    e += nelt;
17190	  nd.perm[i] = e;
17191	}
17192
17193      if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
17194				  d->testing_p))
17195	return true;
17196    }
17197
17198  /* Try the SSE4.1 blend variable merge instructions.  */
17199  if (expand_vec_perm_blend (d))
17200    return true;
17201
17202  /* Try one of the AVX vpermil variable permutations.  */
17203  if (expand_vec_perm_vpermil (d))
17204    return true;
17205
17206  /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17207     vpshufb, vpermd, vpermps or vpermq variable permutation.  */
17208  if (expand_vec_perm_pshufb (d))
17209    return true;
17210
17211  /* Try the AVX2 vpalignr instruction.  */
17212  if (expand_vec_perm_palignr (d, true))
17213    return true;
17214
17215  /* Try the AVX512F vperm{s,d} instructions.  */
17216  if (ix86_expand_vec_one_operand_perm_avx512 (d))
17217    return true;
17218
17219  /* Try the AVX512F vpermt2/vpermi2 instructions.  */
17220  if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
17221    return true;
17222
17223  /* See if we can get the same permutation in different vector integer
17224     mode.  */
17225  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
17226    {
17227      if (!d->testing_p)
17228	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
17229      return true;
17230    }
17231  return false;
17232}
17233
17234/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
17235   in terms of a pair of pshuflw + pshufhw instructions.  */
17236
17237static bool
17238expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
17239{
17240  unsigned char perm2[MAX_VECT_LEN];
17241  unsigned i;
17242  bool ok;
17243
17244  if (d->vmode != V8HImode || !d->one_operand_p)
17245    return false;
17246
17247  /* The two permutations only operate in 64-bit lanes.  */
17248  for (i = 0; i < 4; ++i)
17249    if (d->perm[i] >= 4)
17250      return false;
17251  for (i = 4; i < 8; ++i)
17252    if (d->perm[i] < 4)
17253      return false;
17254
17255  if (d->testing_p)
17256    return true;
17257
17258  /* Emit the pshuflw.  */
17259  memcpy (perm2, d->perm, 4);
17260  for (i = 4; i < 8; ++i)
17261    perm2[i] = i;
17262  ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
17263  gcc_assert (ok);
17264
17265  /* Emit the pshufhw.  */
17266  memcpy (perm2 + 4, d->perm + 4, 4);
17267  for (i = 0; i < 4; ++i)
17268    perm2[i] = i;
17269  ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
17270  gcc_assert (ok);
17271
17272  return true;
17273}
17274
17275/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
17276   the permutation using the SSSE3 palignr instruction.  This succeeds
17277   when all of the elements in PERM fit within one vector and we merely
17278   need to shift them down so that a single vector permutation has a
   chance to succeed.  If SINGLE_INSN_ONLY_P, succeed only if
17280   the vpalignr instruction itself can perform the requested permutation.  */
17281
17282static bool
17283expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
17284{
17285  unsigned i, nelt = d->nelt;
17286  unsigned min, max, minswap, maxswap;
17287  bool in_order, ok, swap = false;
17288  rtx shift, target;
17289  struct expand_vec_perm_d dcopy;
17290
  /* Even with AVX, palignr only operates on 128-bit vectors;
     with AVX2, palignr operates on each of the two 128-bit lanes.  */
17293  if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
17294      && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
17295    return false;
17296
17297  min = 2 * nelt;
17298  max = 0;
17299  minswap = 2 * nelt;
17300  maxswap = 0;
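  /* Find the range of source indices used, and likewise with the two
     operands swapped; palignr can only help when all of them fit into a
     single vector-sized window that one byte shift can bring into
     position.  */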
17301  for (i = 0; i < nelt; ++i)
17302    {
17303      unsigned e = d->perm[i];
17304      unsigned eswap = d->perm[i] ^ nelt;
17305      if (GET_MODE_SIZE (d->vmode) == 32)
17306	{
17307	  e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
17308	  eswap = e ^ (nelt / 2);
17309	}
17310      if (e < min)
17311	min = e;
17312      if (e > max)
17313	max = e;
17314      if (eswap < minswap)
17315	minswap = eswap;
17316      if (eswap > maxswap)
17317	maxswap = eswap;
17318    }
17319  if (min == 0
17320      || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
17321    {
17322      if (d->one_operand_p
17323	  || minswap == 0
17324	  || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
17325				   ? nelt / 2 : nelt))
17326	return false;
17327      swap = true;
17328      min = minswap;
17329      max = maxswap;
17330    }
17331
17332  /* Given that we have SSSE3, we know we'll be able to implement the
17333     single operand permutation after the palignr with pshufb for
17334     128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
17335     first.  */
17336  if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
17337    return true;
17338
17339  dcopy = *d;
17340  if (swap)
17341    {
17342      dcopy.op0 = d->op1;
17343      dcopy.op1 = d->op0;
17344      for (i = 0; i < nelt; ++i)
17345	dcopy.perm[i] ^= nelt;
17346    }
17347
17348  in_order = true;
17349  for (i = 0; i < nelt; ++i)
17350    {
17351      unsigned e = dcopy.perm[i];
17352      if (GET_MODE_SIZE (d->vmode) == 32
17353	  && e >= nelt
17354	  && (e & (nelt / 2 - 1)) < min)
17355	e = e - min - (nelt / 2);
17356      else
17357	e = e - min;
17358      if (e != i)
17359	in_order = false;
17360      dcopy.perm[i] = e;
17361    }
17362  dcopy.one_operand_p = true;
17363
17364  if (single_insn_only_p && !in_order)
17365    return false;
17366
17367  /* For AVX2, test whether we can permute the result in one instruction.  */
17368  if (d->testing_p)
17369    {
17370      if (in_order)
17371	return true;
17372      dcopy.op1 = dcopy.op0;
17373      return expand_vec_perm_1 (&dcopy);
17374    }
17375
17376  shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
17377  if (GET_MODE_SIZE (d->vmode) == 16)
17378    {
17379      target = gen_reg_rtx (TImode);
17380      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
17381				      gen_lowpart (TImode, dcopy.op0), shift));
17382    }
17383  else
17384    {
17385      target = gen_reg_rtx (V2TImode);
17386      emit_insn (gen_avx2_palignrv2ti (target,
17387				       gen_lowpart (V2TImode, dcopy.op1),
17388				       gen_lowpart (V2TImode, dcopy.op0),
17389				       shift));
17390    }
17391
17392  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
17393
17394  /* Test for the degenerate case where the alignment by itself
17395     produces the desired permutation.  */
17396  if (in_order)
17397    {
17398      emit_move_insn (d->target, dcopy.op0);
17399      return true;
17400    }
17401
17402  ok = expand_vec_perm_1 (&dcopy);
17403  gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
17404
17405  return ok;
17406}
17407
17408/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSE4_1 pblendv instruction.  Potentially
   reduces the permutation from 2 pshufb insns and an ior to 1 pshufb and
   1 pblendv.  */
17411
17412static bool
17413expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
17414{
17415  unsigned i, which, nelt = d->nelt;
17416  struct expand_vec_perm_d dcopy, dcopy1;
17417  machine_mode vmode = d->vmode;
17418  bool ok;
17419
17420  /* Use the same checks as in expand_vec_perm_blend.  */
17421  if (d->one_operand_p)
17422    return false;
17423  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
17424    ;
17425  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
17426    ;
17427  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
17428    ;
17429  else
17430    return false;
17431
  /* Figure out which elements do not stay in their respective
     positions, and which operand those elements come from.  */
17434  for (i = 0, which = 0; i < nelt; ++i)
17435    {
17436      unsigned e = d->perm[i];
17437      if (e != i)
17438	which |= (e < nelt ? 1 : 2);
17439    }
  /* We can pblend the part whose elements do not stay in their
     respective positions only when all of those elements come from
     the same operand:
     {0 1 8 3 4 5 9 7} is ok, as the displaced elements 8 and 9 are
     both >= 8;
     {0 1 8 3 4 5 2 7} is not ok, as the displaced elements are 2 and
     8, and 8 >= 8 but 2 is not.  */
17447  if (which != 1 && which != 2)
17448    return false;
17449  if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
17450    return true;
17451
  /* First apply a one-operand permutation to the part whose
     elements do not stay in their respective positions.  */
17454  dcopy = *d;
17455  if (which == 2)
17456    dcopy.op0 = dcopy.op1 = d->op1;
17457  else
17458    dcopy.op0 = dcopy.op1 = d->op0;
17459  if (!d->testing_p)
17460    dcopy.target = gen_reg_rtx (vmode);
17461  dcopy.one_operand_p = true;
17462
17463  for (i = 0; i < nelt; ++i)
17464    dcopy.perm[i] = d->perm[i] & (nelt - 1);
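  /* The masking with NELT - 1 above dropped the operand selection bit,
     so DCOPY is now a genuine one-operand permutation.  */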
17465
17466  ok = expand_vec_perm_1 (&dcopy);
17467  if (GET_MODE_SIZE (vmode) != 16 && !ok)
17468    return false;
17469  else
17470    gcc_assert (ok);
17471  if (d->testing_p)
17472    return true;
17473
17474  /* Next we put permuted elements into their positions.  */
17475  dcopy1 = *d;
17476  if (which == 2)
17477    dcopy1.op1 = dcopy.target;
17478  else
17479    dcopy1.op0 = dcopy.target;
17480
17481  for (i = 0; i < nelt; ++i)
17482    dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
17483
17484  ok = expand_vec_perm_blend (&dcopy1);
17485  gcc_assert (ok);
17486
17487  return true;
17488}
17489
17490static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
17491
17492/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
17493   a two vector permutation into a single vector permutation by using
17494   an interleave operation to merge the vectors.  */
17495
17496static bool
17497expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
17498{
17499  struct expand_vec_perm_d dremap, dfinal;
17500  unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
17501  unsigned HOST_WIDE_INT contents;
17502  unsigned char remap[2 * MAX_VECT_LEN];
17503  rtx_insn *seq;
17504  bool ok, same_halves = false;
17505
17506  if (GET_MODE_SIZE (d->vmode) == 16)
17507    {
17508      if (d->one_operand_p)
17509	return false;
17510    }
17511  else if (GET_MODE_SIZE (d->vmode) == 32)
17512    {
17513      if (!TARGET_AVX)
17514	return false;
17515      /* For 32-byte modes allow even d->one_operand_p.
17516	 The lack of cross-lane shuffling in some instructions
17517	 might prevent a single insn shuffle.  */
17518      dfinal = *d;
17519      dfinal.testing_p = true;
      /* If expand_vec_perm_interleave3 can expand this into
	 a 3 insn sequence, give up and let it be expanded that
	 way instead.  While that is one insn longer, it doesn't
	 need a memory operand, and in the common case where both
	 the interleave low and interleave high permutations with
	 the same operands are adjacent, only 4 insns are needed
	 for both after CSE.  */
17527      if (expand_vec_perm_interleave3 (&dfinal))
17528	return false;
17529    }
17530  else
17531    return false;
17532
17533  /* Examine from whence the elements come.  */
17534  contents = 0;
17535  for (i = 0; i < nelt; ++i)
17536    contents |= HOST_WIDE_INT_1U << d->perm[i];
17537
17538  memset (remap, 0xff, sizeof (remap));
17539  dremap = *d;
17540
17541  if (GET_MODE_SIZE (d->vmode) == 16)
17542    {
17543      unsigned HOST_WIDE_INT h1, h2, h3, h4;
17544
17545      /* Split the two input vectors into 4 halves.  */
17546      h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
17547      h2 = h1 << nelt2;
17548      h3 = h2 << nelt2;
17549      h4 = h3 << nelt2;
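      /* E.g. for V8HImode (nelt == 8, nelt2 == 4) this gives
	 h1 == 0x000f, h2 == 0x00f0, h3 == 0x0f00 and h4 == 0xf000.  */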
17550
      /* If the elements come from the low halves, use interleave low;
	 similarly for interleave high.  If the elements come from
	 mis-matched halves, we can use shufps for V4SF/V4SI or do a
	 DImode shuffle.  */
17554      if ((contents & (h1 | h3)) == contents)
17555	{
17556	  /* punpckl* */
17557	  for (i = 0; i < nelt2; ++i)
17558	    {
17559	      remap[i] = i * 2;
17560	      remap[i + nelt] = i * 2 + 1;
17561	      dremap.perm[i * 2] = i;
17562	      dremap.perm[i * 2 + 1] = i + nelt;
17563	    }
17564	  if (!TARGET_SSE2 && d->vmode == V4SImode)
17565	    dremap.vmode = V4SFmode;
17566	}
17567      else if ((contents & (h2 | h4)) == contents)
17568	{
17569	  /* punpckh* */
17570	  for (i = 0; i < nelt2; ++i)
17571	    {
17572	      remap[i + nelt2] = i * 2;
17573	      remap[i + nelt + nelt2] = i * 2 + 1;
17574	      dremap.perm[i * 2] = i + nelt2;
17575	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
17576	    }
17577	  if (!TARGET_SSE2 && d->vmode == V4SImode)
17578	    dremap.vmode = V4SFmode;
17579	}
17580      else if ((contents & (h1 | h4)) == contents)
17581	{
17582	  /* shufps */
17583	  for (i = 0; i < nelt2; ++i)
17584	    {
17585	      remap[i] = i;
17586	      remap[i + nelt + nelt2] = i + nelt2;
17587	      dremap.perm[i] = i;
17588	      dremap.perm[i + nelt2] = i + nelt + nelt2;
17589	    }
17590	  if (nelt != 4)
17591	    {
17592	      /* shufpd */
17593	      dremap.vmode = V2DImode;
17594	      dremap.nelt = 2;
17595	      dremap.perm[0] = 0;
17596	      dremap.perm[1] = 3;
17597	    }
17598	}
17599      else if ((contents & (h2 | h3)) == contents)
17600	{
17601	  /* shufps */
17602	  for (i = 0; i < nelt2; ++i)
17603	    {
17604	      remap[i + nelt2] = i;
17605	      remap[i + nelt] = i + nelt2;
17606	      dremap.perm[i] = i + nelt2;
17607	      dremap.perm[i + nelt2] = i + nelt;
17608	    }
17609	  if (nelt != 4)
17610	    {
17611	      /* shufpd */
17612	      dremap.vmode = V2DImode;
17613	      dremap.nelt = 2;
17614	      dremap.perm[0] = 1;
17615	      dremap.perm[1] = 2;
17616	    }
17617	}
17618      else
17619	return false;
17620    }
17621  else
17622    {
17623      unsigned int nelt4 = nelt / 4, nzcnt = 0;
17624      unsigned HOST_WIDE_INT q[8];
17625      unsigned int nonzero_halves[4];
17626
17627      /* Split the two input vectors into 8 quarters.  */
17628      q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
17629      for (i = 1; i < 8; ++i)
17630	q[i] = q[0] << (nelt4 * i);
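      /* E.g. for V8SImode (nelt == 8, nelt4 == 2) this gives
	 q[0] == 0x03, q[1] == 0x0c, q[2] == 0x30, ..., q[7] == 0xc000.  */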
17631      for (i = 0; i < 4; ++i)
17632	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
17633	  {
17634	    nonzero_halves[nzcnt] = i;
17635	    ++nzcnt;
17636	  }
17637
17638      if (nzcnt == 1)
17639	{
17640	  gcc_assert (d->one_operand_p);
17641	  nonzero_halves[1] = nonzero_halves[0];
17642	  same_halves = true;
17643	}
17644      else if (d->one_operand_p)
17645	{
17646	  gcc_assert (nonzero_halves[0] == 0);
17647	  gcc_assert (nonzero_halves[1] == 1);
17648	}
17649
17650      if (nzcnt <= 2)
17651	{
17652	  if (d->perm[0] / nelt2 == nonzero_halves[1])
17653	    {
17654	      /* Attempt to increase the likelihood that dfinal
17655		 shuffle will be intra-lane.  */
17656	      std::swap (nonzero_halves[0], nonzero_halves[1]);
17657	    }
17658
17659	  /* vperm2f128 or vperm2i128.  */
17660	  for (i = 0; i < nelt2; ++i)
17661	    {
17662	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
17663	      remap[i + nonzero_halves[0] * nelt2] = i;
17664	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
17665	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
17666	    }
17667
17668	  if (d->vmode != V8SFmode
17669	      && d->vmode != V4DFmode
17670	      && d->vmode != V8SImode)
17671	    {
17672	      dremap.vmode = V8SImode;
17673	      dremap.nelt = 8;
17674	      for (i = 0; i < 4; ++i)
17675		{
17676		  dremap.perm[i] = i + nonzero_halves[0] * 4;
17677		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
17678		}
17679	    }
17680	}
17681      else if (d->one_operand_p)
17682	return false;
17683      else if (TARGET_AVX2
17684	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
17685	{
17686	  /* vpunpckl* */
17687	  for (i = 0; i < nelt4; ++i)
17688	    {
17689	      remap[i] = i * 2;
17690	      remap[i + nelt] = i * 2 + 1;
17691	      remap[i + nelt2] = i * 2 + nelt2;
17692	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
17693	      dremap.perm[i * 2] = i;
17694	      dremap.perm[i * 2 + 1] = i + nelt;
17695	      dremap.perm[i * 2 + nelt2] = i + nelt2;
17696	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
17697	    }
17698	}
17699      else if (TARGET_AVX2
17700	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
17701	{
17702	  /* vpunpckh* */
17703	  for (i = 0; i < nelt4; ++i)
17704	    {
17705	      remap[i + nelt4] = i * 2;
17706	      remap[i + nelt + nelt4] = i * 2 + 1;
17707	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
17708	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
17709	      dremap.perm[i * 2] = i + nelt4;
17710	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
17711	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
17712	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
17713	    }
17714	}
17715      else
17716	return false;
17717    }
17718
17719  /* Use the remapping array set up above to move the elements from their
17720     swizzled locations into their final destinations.  */
17721  dfinal = *d;
17722  for (i = 0; i < nelt; ++i)
17723    {
17724      unsigned e = remap[d->perm[i]];
17725      gcc_assert (e < nelt);
17726      /* If same_halves is true, both halves of the remapped vector are the
17727	 same.  Avoid cross-lane accesses if possible.  */
17728      if (same_halves && i >= nelt2)
17729	{
17730	  gcc_assert (e < nelt2);
17731	  dfinal.perm[i] = e + nelt2;
17732	}
17733      else
17734	dfinal.perm[i] = e;
17735    }
17736  if (!d->testing_p)
17737    {
17738      dremap.target = gen_reg_rtx (dremap.vmode);
17739      dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17740    }
17741  dfinal.op1 = dfinal.op0;
17742  dfinal.one_operand_p = true;
17743
17744  /* Test if the final remap can be done with a single insn.  For V4SFmode or
17745     V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
17746  start_sequence ();
17747  ok = expand_vec_perm_1 (&dfinal);
17748  seq = get_insns ();
17749  end_sequence ();
17750
17751  if (!ok)
17752    return false;
17753
17754  if (d->testing_p)
17755    return true;
17756
17757  if (dremap.vmode != dfinal.vmode)
17758    {
17759      dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
17760      dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
17761    }
17762
17763  ok = expand_vec_perm_1 (&dremap);
17764  gcc_assert (ok);
17765
17766  emit_insn (seq);
17767  return true;
17768}
17769
17770/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
17771   a single vector cross-lane permutation into vpermq followed
17772   by any of the single insn permutations.  */
17773
17774static bool
17775expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
17776{
17777  struct expand_vec_perm_d dremap, dfinal;
17778  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
17779  unsigned contents[2];
17780  bool ok;
17781
17782  if (!(TARGET_AVX2
17783	&& (d->vmode == V32QImode || d->vmode == V16HImode)
17784	&& d->one_operand_p))
17785    return false;
17786
17787  contents[0] = 0;
17788  contents[1] = 0;
17789  for (i = 0; i < nelt2; ++i)
17790    {
17791      contents[0] |= 1u << (d->perm[i] / nelt4);
17792      contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
17793    }
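  /* CONTENTS[0] and CONTENTS[1] record which 64-bit quarters of the
     input feed the low and high halves of the result, respectively.  */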
17794
17795  for (i = 0; i < 2; ++i)
17796    {
17797      unsigned int cnt = 0;
17798      for (j = 0; j < 4; ++j)
17799	if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
17800	  return false;
17801    }
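  /* Each half of the result may draw from at most two quarters, since
     the vpermq below can place only two source quarters into each
     128-bit half.  */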
17802
17803  if (d->testing_p)
17804    return true;
17805
17806  dremap = *d;
17807  dremap.vmode = V4DImode;
17808  dremap.nelt = 4;
17809  dremap.target = gen_reg_rtx (V4DImode);
17810  dremap.op0 = gen_lowpart (V4DImode, d->op0);
17811  dremap.op1 = dremap.op0;
17812  dremap.one_operand_p = true;
17813  for (i = 0; i < 2; ++i)
17814    {
17815      unsigned int cnt = 0;
17816      for (j = 0; j < 4; ++j)
17817	if ((contents[i] & (1u << j)) != 0)
17818	  dremap.perm[2 * i + cnt++] = j;
17819      for (; cnt < 2; ++cnt)
17820	dremap.perm[2 * i + cnt] = 0;
17821    }
17822
17823  dfinal = *d;
17824  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17825  dfinal.op1 = dfinal.op0;
17826  dfinal.one_operand_p = true;
17827  for (i = 0, j = 0; i < nelt; ++i)
17828    {
17829      if (i == nelt2)
17830	j = 2;
17831      dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
17832      if ((d->perm[i] / nelt4) == dremap.perm[j])
17833	;
17834      else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
17835	dfinal.perm[i] |= nelt4;
17836      else
17837	gcc_unreachable ();
17838    }
17839
17840  ok = expand_vec_perm_1 (&dremap);
17841  gcc_assert (ok);
17842
17843  ok = expand_vec_perm_1 (&dfinal);
17844  gcc_assert (ok);
17845
17846  return true;
17847}
17848
17849static bool canonicalize_perm (struct expand_vec_perm_d *d);
17850
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
   a vector permutation using two instructions, vperm2f128 (or
   vperm2i128) followed by any single in-lane permutation.  */
17854
17855static bool
17856expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
17857{
17858  struct expand_vec_perm_d dfirst, dsecond;
17859  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
17860  bool ok;
17861
17862  if (!TARGET_AVX
17863      || GET_MODE_SIZE (d->vmode) != 32
17864      || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
17865    return false;
17866
17867  dsecond = *d;
17868  dsecond.one_operand_p = false;
17869  dsecond.testing_p = true;
17870
17871  /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
17872     immediate.  For perm < 16 the second permutation uses
17873     d->op0 as first operand, for perm >= 16 it uses d->op1
17874     as first operand.  The second operand is the result of
17875     vperm2[fi]128.  */
17876  for (perm = 0; perm < 32; perm++)
17877    {
17878      /* Ignore permutations which do not move anything cross-lane.  */
17879      if (perm < 16)
17880	{
17881	  /* The second shuffle for e.g. V4DFmode has
17882	     0123 and ABCD operands.
17883	     Ignore AB23, as 23 is already in the second lane
17884	     of the first operand.  */
17885	  if ((perm & 0xc) == (1 << 2)) continue;
17886	  /* And 01CD, as 01 is in the first lane of the first
17887	     operand.  */
17888	  if ((perm & 3) == 0) continue;
17889	  /* And 4567, as then the vperm2[fi]128 doesn't change
17890	     anything on the original 4567 second operand.  */
17891	  if ((perm & 0xf) == ((3 << 2) | 2)) continue;
17892	}
17893      else
17894	{
17895	  /* The second shuffle for e.g. V4DFmode has
17896	     4567 and ABCD operands.
17897	     Ignore AB67, as 67 is already in the second lane
17898	     of the first operand.  */
17899	  if ((perm & 0xc) == (3 << 2)) continue;
17900	  /* And 45CD, as 45 is in the first lane of the first
17901	     operand.  */
17902	  if ((perm & 3) == 2) continue;
17903	  /* And 0123, as then the vperm2[fi]128 doesn't change
17904	     anything on the original 0123 first operand.  */
17905	  if ((perm & 0xf) == (1 << 2)) continue;
17906	}
17907
17908      for (i = 0; i < nelt; i++)
17909	{
17910	  j = d->perm[i] / nelt2;
17911	  if (j == ((perm >> (2 * (i >= nelt2))) & 3))
17912	    dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
17913	  else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
17914	    dsecond.perm[i] = d->perm[i] & (nelt - 1);
17915	  else
17916	    break;
17917	}
17918
17919      if (i == nelt)
17920	{
17921	  start_sequence ();
17922	  ok = expand_vec_perm_1 (&dsecond);
17923	  end_sequence ();
17924	}
17925      else
17926	ok = false;
17927
17928      if (ok)
17929	{
17930	  if (d->testing_p)
17931	    return true;
17932
17933	  /* Found a usable second shuffle.  dfirst will be
17934	     vperm2f128 on d->op0 and d->op1.  */
17935	  dsecond.testing_p = false;
17936	  dfirst = *d;
17937	  dfirst.target = gen_reg_rtx (d->vmode);
17938	  for (i = 0; i < nelt; i++)
17939	    dfirst.perm[i] = (i & (nelt2 - 1))
17940			     + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
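	  /* DFIRST is the vperm2f128 itself: each 128-bit half of its
	     result is the source lane selected by the corresponding two
	     bits of PERM.  */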
17941
17942	  canonicalize_perm (&dfirst);
17943	  ok = expand_vec_perm_1 (&dfirst);
17944	  gcc_assert (ok);
17945
17946	  /* And dsecond is some single insn shuffle, taking
17947	     d->op0 and result of vperm2f128 (if perm < 16) or
17948	     d->op1 and result of vperm2f128 (otherwise).  */
17949	  if (perm >= 16)
17950	    dsecond.op0 = dsecond.op1;
17951	  dsecond.op1 = dfirst.target;
17952
17953	  ok = expand_vec_perm_1 (&dsecond);
17954	  gcc_assert (ok);
17955
17956	  return true;
17957	}
17958
      /* For one operand, the only useful vperm2f128 permutation is 0x01,
	 i.e. a swap of the two lanes.  */
17961      if (d->one_operand_p)
17962	return false;
17963    }
17964
17965  return false;
17966}
17967
17968/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
17969   a two vector permutation using 2 intra-lane interleave insns
17970   and cross-lane shuffle for 32-byte vectors.  */
17971
17972static bool
17973expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
17974{
17975  unsigned i, nelt;
17976  rtx (*gen) (rtx, rtx, rtx);
17977
17978  if (d->one_operand_p)
17979    return false;
17980  if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
17981    ;
17982  else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
17983    ;
17984  else
17985    return false;
17986
17987  nelt = d->nelt;
17988  if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
17989    return false;
17990  for (i = 0; i < nelt; i += 2)
17991    if (d->perm[i] != d->perm[0] + i / 2
17992	|| d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
17993      return false;
17994
17995  if (d->testing_p)
17996    return true;
17997
17998  switch (d->vmode)
17999    {
18000    case E_V32QImode:
18001      if (d->perm[0])
18002	gen = gen_vec_interleave_highv32qi;
18003      else
18004	gen = gen_vec_interleave_lowv32qi;
18005      break;
18006    case E_V16HImode:
18007      if (d->perm[0])
18008	gen = gen_vec_interleave_highv16hi;
18009      else
18010	gen = gen_vec_interleave_lowv16hi;
18011      break;
18012    case E_V8SImode:
18013      if (d->perm[0])
18014	gen = gen_vec_interleave_highv8si;
18015      else
18016	gen = gen_vec_interleave_lowv8si;
18017      break;
18018    case E_V4DImode:
18019      if (d->perm[0])
18020	gen = gen_vec_interleave_highv4di;
18021      else
18022	gen = gen_vec_interleave_lowv4di;
18023      break;
18024    case E_V8SFmode:
18025      if (d->perm[0])
18026	gen = gen_vec_interleave_highv8sf;
18027      else
18028	gen = gen_vec_interleave_lowv8sf;
18029      break;
18030    case E_V4DFmode:
18031      if (d->perm[0])
18032	gen = gen_vec_interleave_highv4df;
18033      else
18034	gen = gen_vec_interleave_lowv4df;
18035      break;
18036    default:
18037      gcc_unreachable ();
18038    }
18039
18040  emit_insn (gen (d->target, d->op0, d->op1));
18041  return true;
18042}
18043
18044/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
18045   a single vector permutation using a single intra-lane vector
18046   permutation, vperm2f128 swapping the lanes and vblend* insn blending
18047   the non-swapped and swapped vectors together.  */
18048
18049static bool
18050expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
18051{
18052  struct expand_vec_perm_d dfirst, dsecond;
18053  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
18054  rtx_insn *seq;
18055  bool ok;
18056  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18057
18058  if (!TARGET_AVX
18059      || TARGET_AVX2
18060      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18061      || !d->one_operand_p)
18062    return false;
18063
18064  dfirst = *d;
18065  for (i = 0; i < nelt; i++)
18066    dfirst.perm[i] = 0xff;
18067  for (i = 0, msk = 0; i < nelt; i++)
18068    {
18069      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18070      if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
18071	return false;
18072      dfirst.perm[j] = d->perm[i];
18073      if (j != i)
18074	msk |= (1 << i);
18075    }
18076  for (i = 0; i < nelt; i++)
18077    if (dfirst.perm[i] == 0xff)
18078      dfirst.perm[i] = i;
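  /* MSK has a bit set for each element whose source lane differs from
     its destination lane; those positions will be taken from the
     lane-swapped copy by the final blend.  */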
18079
18080  if (!d->testing_p)
18081    dfirst.target = gen_reg_rtx (dfirst.vmode);
18082
18083  start_sequence ();
18084  ok = expand_vec_perm_1 (&dfirst);
18085  seq = get_insns ();
18086  end_sequence ();
18087
18088  if (!ok)
18089    return false;
18090
18091  if (d->testing_p)
18092    return true;
18093
18094  emit_insn (seq);
18095
18096  dsecond = *d;
18097  dsecond.op0 = dfirst.target;
18098  dsecond.op1 = dfirst.target;
18099  dsecond.one_operand_p = true;
18100  dsecond.target = gen_reg_rtx (dsecond.vmode);
18101  for (i = 0; i < nelt; i++)
18102    dsecond.perm[i] = i ^ nelt2;
18103
18104  ok = expand_vec_perm_1 (&dsecond);
18105  gcc_assert (ok);
18106
18107  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18108  emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
18109  return true;
18110}
18111
18112/* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
18113   permutation using two vperm2f128, followed by a vshufpd insn blending
18114   the two vectors together.  */
18115
18116static bool
18117expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
18118{
18119  struct expand_vec_perm_d dfirst, dsecond, dthird;
18120  bool ok;
18121
18122  if (!TARGET_AVX || (d->vmode != V4DFmode))
18123    return false;
18124
18125  if (d->testing_p)
18126    return true;
18127
18128  dfirst = *d;
18129  dsecond = *d;
18130  dthird = *d;
18131
18132  dfirst.perm[0] = (d->perm[0] & ~1);
18133  dfirst.perm[1] = (d->perm[0] & ~1) + 1;
18134  dfirst.perm[2] = (d->perm[2] & ~1);
18135  dfirst.perm[3] = (d->perm[2] & ~1) + 1;
18136  dsecond.perm[0] = (d->perm[1] & ~1);
18137  dsecond.perm[1] = (d->perm[1] & ~1) + 1;
18138  dsecond.perm[2] = (d->perm[3] & ~1);
18139  dsecond.perm[3] = (d->perm[3] & ~1) + 1;
18140  dthird.perm[0] = (d->perm[0] % 2);
18141  dthird.perm[1] = (d->perm[1] % 2) + 4;
18142  dthird.perm[2] = (d->perm[2] % 2) + 2;
18143  dthird.perm[3] = (d->perm[3] % 2) + 6;
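  /* DFIRST gathers the 128-bit lanes holding result elements 0 and 2,
     DSECOND those holding elements 1 and 3; DTHIRD then selects the
     required double within each lane, which matches the vshufpd
     pattern { 0/1, 4/5, 2/3, 6/7 }.  */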
18144
18145  dfirst.target = gen_reg_rtx (dfirst.vmode);
18146  dsecond.target = gen_reg_rtx (dsecond.vmode);
18147  dthird.op0 = dfirst.target;
18148  dthird.op1 = dsecond.target;
18149  dthird.one_operand_p = false;
18150
18151  canonicalize_perm (&dfirst);
18152  canonicalize_perm (&dsecond);
18153
18154  ok = expand_vec_perm_1 (&dfirst)
18155       && expand_vec_perm_1 (&dsecond)
18156       && expand_vec_perm_1 (&dthird);
18157
18158  gcc_assert (ok);
18159
18160  return true;
18161}
18162
18163static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
18164
18165/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
18166   a two vector permutation using two intra-lane vector
18167   permutations, vperm2f128 swapping the lanes and vblend* insn blending
18168   the non-swapped and swapped vectors together.  */
18169
18170static bool
18171expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
18172{
18173  struct expand_vec_perm_d dfirst, dsecond, dthird;
18174  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
18175  rtx_insn *seq1, *seq2;
18176  bool ok;
18177  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18178
18179  if (!TARGET_AVX
18180      || TARGET_AVX2
18181      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18182      || d->one_operand_p)
18183    return false;
18184
18185  dfirst = *d;
18186  dsecond = *d;
18187  for (i = 0; i < nelt; i++)
18188    {
18189      dfirst.perm[i] = 0xff;
18190      dsecond.perm[i] = 0xff;
18191    }
18192  for (i = 0, msk = 0; i < nelt; i++)
18193    {
18194      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18195      if (j == i)
18196	{
18197	  dfirst.perm[j] = d->perm[i];
18198	  which1 |= (d->perm[i] < nelt ? 1 : 2);
18199	}
18200      else
18201	{
18202	  dsecond.perm[j] = d->perm[i];
18203	  which2 |= (d->perm[i] < nelt ? 1 : 2);
18204	  msk |= (1U << i);
18205	}
18206    }
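  /* DFIRST keeps the elements that already land in their own lane,
     DSECOND those that have to cross lanes; MSK marks the latter
     positions for the final blend, and WHICH1/WHICH2 record which
     operand each group uses.  */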
18207  if (msk == 0 || msk == (1U << nelt) - 1)
18208    return false;
18209
18210  if (!d->testing_p)
18211    {
18212      dfirst.target = gen_reg_rtx (dfirst.vmode);
18213      dsecond.target = gen_reg_rtx (dsecond.vmode);
18214    }
18215
18216  for (i = 0; i < nelt; i++)
18217    {
18218      if (dfirst.perm[i] == 0xff)
18219	dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
18220      if (dsecond.perm[i] == 0xff)
18221	dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
18222    }
18223  canonicalize_perm (&dfirst);
18224  start_sequence ();
18225  ok = ix86_expand_vec_perm_const_1 (&dfirst);
18226  seq1 = get_insns ();
18227  end_sequence ();
18228
18229  if (!ok)
18230    return false;
18231
18232  canonicalize_perm (&dsecond);
18233  start_sequence ();
18234  ok = ix86_expand_vec_perm_const_1 (&dsecond);
18235  seq2 = get_insns ();
18236  end_sequence ();
18237
18238  if (!ok)
18239    return false;
18240
18241  if (d->testing_p)
18242    return true;
18243
18244  emit_insn (seq1);
18245  emit_insn (seq2);
18246
18247  dthird = *d;
18248  dthird.op0 = dsecond.target;
18249  dthird.op1 = dsecond.target;
18250  dthird.one_operand_p = true;
18251  dthird.target = gen_reg_rtx (dthird.vmode);
18252  for (i = 0; i < nelt; i++)
18253    dthird.perm[i] = i ^ nelt2;
18254
18255  ok = expand_vec_perm_1 (&dthird);
18256  gcc_assert (ok);
18257
18258  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18259  emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
18260  return true;
18261}
18262
18263/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
18264   permutation with two pshufb insns and an ior.  We should have already
18265   failed all two instruction sequences.  */
18266
18267static bool
18268expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
18269{
18270  rtx rperm[2][16], vperm, l, h, op, m128;
18271  unsigned int i, nelt, eltsz;
18272
18273  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
18274    return false;
18275  gcc_assert (!d->one_operand_p);
18276
18277  if (d->testing_p)
18278    return true;
18279
18280  nelt = d->nelt;
18281  eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18282
18283  /* Generate two permutation masks.  If the required element is within
18284     the given vector it is shuffled into the proper lane.  If the required
18285     element is in the other vector, force a zero into the lane by setting
18286     bit 7 in the permutation mask.  */
18287  m128 = GEN_INT (-128);
18288  for (i = 0; i < nelt; ++i)
18289    {
18290      unsigned j, e = d->perm[i];
18291      unsigned which = (e >= nelt);
18292      if (e >= nelt)
18293	e -= nelt;
18294
18295      for (j = 0; j < eltsz; ++j)
18296	{
18297	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
18298	  rperm[1-which][i*eltsz + j] = m128;
18299	}
18300    }
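  /* For example, with V16QImode and d->perm[0] == 18, byte 2 of op1 is
     wanted in position 0: rperm[1][0] becomes 2 while rperm[0][0]
     becomes -128, so the first pshufb zeroes that byte and the second
     one supplies it.  */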
18301
18302  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
18303  vperm = force_reg (V16QImode, vperm);
18304
18305  l = gen_reg_rtx (V16QImode);
18306  op = gen_lowpart (V16QImode, d->op0);
18307  emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
18308
18309  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
18310  vperm = force_reg (V16QImode, vperm);
18311
18312  h = gen_reg_rtx (V16QImode);
18313  op = gen_lowpart (V16QImode, d->op1);
18314  emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
18315
18316  op = d->target;
18317  if (d->vmode != V16QImode)
18318    op = gen_reg_rtx (V16QImode);
18319  emit_insn (gen_iorv16qi3 (op, l, h));
18320  if (op != d->target)
18321    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18322
18323  return true;
18324}
18325
/* Implement an arbitrary permutation of one V32QImode or V16HImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.  */
18329
18330static bool
18331expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
18332{
18333  rtx rperm[2][32], vperm, l, h, hp, op, m128;
18334  unsigned int i, nelt, eltsz;
18335
18336  if (!TARGET_AVX2
18337      || !d->one_operand_p
18338      || (d->vmode != V32QImode && d->vmode != V16HImode))
18339    return false;
18340
18341  if (d->testing_p)
18342    return true;
18343
18344  nelt = d->nelt;
18345  eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18346
  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from
     the other lane, force a zero by setting bit 7 in the permutation
     mask.  The other mask has non-negative elements only where the
     element is requested from the other lane; such elements are also
     moved to the other lane, so that the result of vpshufb can have
     its two V2TImode halves swapped.  */
18354  m128 = GEN_INT (-128);
18355  for (i = 0; i < nelt; ++i)
18356    {
18357      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18358      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18359
18360      for (j = 0; j < eltsz; ++j)
18361	{
18362	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
18363	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
18364	}
18365    }
18366
18367  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18368  vperm = force_reg (V32QImode, vperm);
18369
18370  h = gen_reg_rtx (V32QImode);
18371  op = gen_lowpart (V32QImode, d->op0);
18372  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18373
  /* Swap the 128-bit lanes of h into hp.  */
18375  hp = gen_reg_rtx (V4DImode);
18376  op = gen_lowpart (V4DImode, h);
18377  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
18378				  const1_rtx));
18379
18380  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18381  vperm = force_reg (V32QImode, vperm);
18382
18383  l = gen_reg_rtx (V32QImode);
18384  op = gen_lowpart (V32QImode, d->op0);
18385  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18386
18387  op = d->target;
18388  if (d->vmode != V32QImode)
18389    op = gen_reg_rtx (V32QImode);
18390  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
18391  if (op != d->target)
18392    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18393
18394  return true;
18395}
18396
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode or V16HImode operands
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.  */
18401
18402static bool
18403expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
18404{
18405  rtx rperm[2][32], vperm, l, h, ior, op, m128;
18406  unsigned int i, nelt, eltsz;
18407
18408  if (!TARGET_AVX2
18409      || d->one_operand_p
18410      || (d->vmode != V32QImode && d->vmode != V16HImode))
18411    return false;
18412
18413  for (i = 0; i < d->nelt; ++i)
18414    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
18415      return false;
18416
18417  if (d->testing_p)
18418    return true;
18419
18420  nelt = d->nelt;
18421  eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18422
18423  /* Generate two permutation masks.  In the first permutation mask
18424     the first quarter will contain indexes for the first half
18425     of the op0, the second quarter will contain bit 7 set, third quarter
18426     will contain indexes for the second half of the op0 and the
18427     last quarter bit 7 set.  In the second permutation mask
18428     the first quarter will contain bit 7 set, the second quarter
18429     indexes for the first half of the op1, the third quarter bit 7 set
18430     and last quarter indexes for the second half of the op1.
18431     I.e. the first mask e.g. for V32QImode extract even will be:
18432     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18433     (all values masked with 0xf except for -128) and second mask
18434     for extract even will be
18435     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
18436  m128 = GEN_INT (-128);
18437  for (i = 0; i < nelt; ++i)
18438    {
18439      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18440      unsigned which = d->perm[i] >= nelt;
18441      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
18442
18443      for (j = 0; j < eltsz; ++j)
18444	{
18445	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
18446	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
18447	}
18448    }
18449
18450  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18451  vperm = force_reg (V32QImode, vperm);
18452
18453  l = gen_reg_rtx (V32QImode);
18454  op = gen_lowpart (V32QImode, d->op0);
18455  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18456
18457  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18458  vperm = force_reg (V32QImode, vperm);
18459
18460  h = gen_reg_rtx (V32QImode);
18461  op = gen_lowpart (V32QImode, d->op1);
18462  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18463
18464  ior = gen_reg_rtx (V32QImode);
18465  emit_insn (gen_iorv32qi3 (ior, l, h));
18466
18467  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
18468  op = gen_reg_rtx (V4DImode);
18469  ior = gen_lowpart (V4DImode, ior);
18470  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
18471				  const1_rtx, GEN_INT (3)));
18472  emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18473
18474  return true;
18475}
18476
18477/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
18478   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18479   with two "and" and "pack" or two "shift" and "pack" insns.  We should
18480   have already failed all two instruction sequences.  */
18481
18482static bool
18483expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
18484{
18485  rtx op, dop0, dop1, t;
18486  unsigned i, odd, c, s, nelt = d->nelt;
18487  bool end_perm = false;
18488  machine_mode half_mode;
18489  rtx (*gen_and) (rtx, rtx, rtx);
18490  rtx (*gen_pack) (rtx, rtx, rtx);
18491  rtx (*gen_shift) (rtx, rtx, rtx);
18492
18493  if (d->one_operand_p)
18494    return false;
18495
18496  switch (d->vmode)
18497    {
18498    case E_V8HImode:
18499      /* Required for "pack".  */
18500      if (!TARGET_SSE4_1)
18501        return false;
18502      c = 0xffff;
18503      s = 16;
18504      half_mode = V4SImode;
18505      gen_and = gen_andv4si3;
18506      gen_pack = gen_sse4_1_packusdw;
18507      gen_shift = gen_lshrv4si3;
18508      break;
18509    case E_V16QImode:
18510      /* No check as all instructions are SSE2.  */
18511      c = 0xff;
18512      s = 8;
18513      half_mode = V8HImode;
18514      gen_and = gen_andv8hi3;
18515      gen_pack = gen_sse2_packuswb;
18516      gen_shift = gen_lshrv8hi3;
18517      break;
18518    case E_V16HImode:
18519      if (!TARGET_AVX2)
18520        return false;
18521      c = 0xffff;
18522      s = 16;
18523      half_mode = V8SImode;
18524      gen_and = gen_andv8si3;
18525      gen_pack = gen_avx2_packusdw;
18526      gen_shift = gen_lshrv8si3;
18527      end_perm = true;
18528      break;
18529    case E_V32QImode:
18530      if (!TARGET_AVX2)
18531        return false;
18532      c = 0xff;
18533      s = 8;
18534      half_mode = V16HImode;
18535      gen_and = gen_andv16hi3;
18536      gen_pack = gen_avx2_packuswb;
18537      gen_shift = gen_lshrv16hi3;
18538      end_perm = true;
18539      break;
18540    default:
18541      /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
18542	 general shuffles.  */
18543      return false;
18544    }
18545
18546  /* Check that permutation is even or odd.  */
18547  odd = d->perm[0];
18548  if (odd > 1)
18549    return false;
18550
18551  for (i = 1; i < nelt; ++i)
18552    if (d->perm[i] != 2 * i + odd)
18553      return false;
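  /* I.e. the permutation must be { 0, 2, 4, ... } or { 1, 3, 5, ... }.  */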
18554
18555  if (d->testing_p)
18556    return true;
18557
18558  dop0 = gen_reg_rtx (half_mode);
18559  dop1 = gen_reg_rtx (half_mode);
18560  if (odd == 0)
18561    {
18562      t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
18563      t = force_reg (half_mode, t);
18564      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
18565      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
18566    }
18567  else
18568    {
18569      emit_insn (gen_shift (dop0,
18570			    gen_lowpart (half_mode, d->op0),
18571			    GEN_INT (s)));
18572      emit_insn (gen_shift (dop1,
18573			    gen_lowpart (half_mode, d->op1),
18574			    GEN_INT (s)));
18575    }
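  /* Either way each HALF_MODE element of DOP0/DOP1 now holds the wanted
     narrow element zero-extended, which is what the pack insn below
     expects.  */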
  /* For the AVX2 256-bit case we need to permute the pack result.  */
18577  if (TARGET_AVX2 && end_perm)
18578    {
18579      op = gen_reg_rtx (d->vmode);
18580      t = gen_reg_rtx (V4DImode);
18581      emit_insn (gen_pack (op, dop0, dop1));
18582      emit_insn (gen_avx2_permv4di_1 (t,
18583				      gen_lowpart (V4DImode, op),
18584				      const0_rtx,
18585				      const2_rtx,
18586				      const1_rtx,
18587				      GEN_INT (3)));
18588      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
18589    }
18590  else
18591    emit_insn (gen_pack (d->target, dop0, dop1));
18592
18593  return true;
18594}
18595
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V64QImode operands with two
   "shift", two "trunc" and one "concat" insn for "odd" and with two
   "trunc" and one "concat" insn for "even".  We should have already
   failed all two instruction sequences.  */
18601
18602static bool
18603expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
18604{
18605  rtx t1, t2, t3, t4;
18606  unsigned i, odd, nelt = d->nelt;
18607
18608  if (!TARGET_AVX512BW
18609      || d->one_operand_p
18610      || d->vmode != V64QImode)
18611    return false;
18612
18613  /* Check that permutation is even or odd.  */
18614  odd = d->perm[0];
18615  if (odd > 1)
18616    return false;
18617
18618  for (i = 1; i < nelt; ++i)
18619    if (d->perm[i] != 2 * i + odd)
18620      return false;
18621
18622  if (d->testing_p)
18623    return true;
18624
18625
18626  if (odd)
18627    {
18628      t1 = gen_reg_rtx (V32HImode);
18629      t2 = gen_reg_rtx (V32HImode);
18630      emit_insn (gen_lshrv32hi3 (t1,
18631				 gen_lowpart (V32HImode, d->op0),
18632				 GEN_INT (8)));
18633      emit_insn (gen_lshrv32hi3 (t2,
18634				 gen_lowpart (V32HImode, d->op1),
18635				 GEN_INT (8)));
18636    }
18637  else
18638    {
18639      t1 = gen_lowpart (V32HImode, d->op0);
18640      t2 = gen_lowpart (V32HImode, d->op1);
18641    }
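  /* T1/T2 now hold the wanted bytes in the low half of every 16-bit
     element, so truncating each one to V32QImode and concatenating the
     two results yields the final vector.  */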
18642
18643  t3 = gen_reg_rtx (V32QImode);
18644  t4 = gen_reg_rtx (V32QImode);
18645  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
18646  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
18647  emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
18648
18649  return true;
18650}
18651
18652/* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
18653   and extract-odd permutations.  */
18654
18655static bool
18656expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
18657{
18658  rtx t1, t2, t3, t4, t5;
18659
18660  switch (d->vmode)
18661    {
18662    case E_V4DFmode:
18663      if (d->testing_p)
18664	break;
18665      t1 = gen_reg_rtx (V4DFmode);
18666      t2 = gen_reg_rtx (V4DFmode);
18667
18668      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
18669      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
18670      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
18671
18672      /* Now an unpck[lh]pd will produce the result required.  */
18673      if (odd)
18674	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
18675      else
18676	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
18677      emit_insn (t3);
18678      break;
18679
18680    case E_V8SFmode:
18681      {
18682	int mask = odd ? 0xdd : 0x88;
18683
18684	if (d->testing_p)
18685	  break;
18686	t1 = gen_reg_rtx (V8SFmode);
18687	t2 = gen_reg_rtx (V8SFmode);
18688	t3 = gen_reg_rtx (V8SFmode);
18689
18690	/* Shuffle within the 128-bit lanes to produce:
18691	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
18692	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
18693				      GEN_INT (mask)));
18694
18695	/* Shuffle the lanes around to produce:
18696	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
18697	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
18698					    GEN_INT (0x3)));
18699
18700	/* Shuffle within the 128-bit lanes to produce:
18701	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
18702	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
18703
18704	/* Shuffle within the 128-bit lanes to produce:
18705	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
18706	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
18707
18708	/* Shuffle the lanes around to produce:
18709	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
18710	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
18711					    GEN_INT (0x20)));
18712      }
18713      break;
18714
18715    case E_V2DFmode:
18716    case E_V4SFmode:
18717    case E_V2DImode:
18718    case E_V4SImode:
18719      /* These are always directly implementable by expand_vec_perm_1.  */
18720      gcc_unreachable ();
18721
18722    case E_V8HImode:
18723      if (TARGET_SSE4_1)
18724	return expand_vec_perm_even_odd_pack (d);
18725      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
18726	return expand_vec_perm_pshufb2 (d);
18727      else
18728	{
18729	  if (d->testing_p)
18730	    break;
18731	  /* We need 2*log2(N)-1 operations to achieve odd/even
18732	     with interleave. */
18733	  t1 = gen_reg_rtx (V8HImode);
18734	  t2 = gen_reg_rtx (V8HImode);
18735	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
18736	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
18737	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
18738	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
18739	  if (odd)
18740	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
18741	  else
18742	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
18743	  emit_insn (t3);
18744	}
18745      break;
18746
18747    case E_V16QImode:
18748      return expand_vec_perm_even_odd_pack (d);
18749
18750    case E_V16HImode:
18751    case E_V32QImode:
18752      return expand_vec_perm_even_odd_pack (d);
18753
18754    case E_V64QImode:
18755      return expand_vec_perm_even_odd_trunc (d);
18756
18757    case E_V4DImode:
18758      if (!TARGET_AVX2)
18759	{
18760	  struct expand_vec_perm_d d_copy = *d;
18761	  d_copy.vmode = V4DFmode;
18762	  if (d->testing_p)
18763	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
18764	  else
18765	    d_copy.target = gen_reg_rtx (V4DFmode);
18766	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
18767	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
18768	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18769	    {
18770	      if (!d->testing_p)
18771		emit_move_insn (d->target,
18772				gen_lowpart (V4DImode, d_copy.target));
18773	      return true;
18774	    }
18775	  return false;
18776	}
18777
18778      if (d->testing_p)
18779	break;
18780
18781      t1 = gen_reg_rtx (V4DImode);
18782      t2 = gen_reg_rtx (V4DImode);
18783
18784      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
18785      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
18786      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
18787
      /* Now a vpunpck[lh]qdq will produce the result required.  */
18789      if (odd)
18790	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
18791      else
18792	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
18793      emit_insn (t3);
18794      break;
18795
18796    case E_V8SImode:
18797      if (!TARGET_AVX2)
18798	{
18799	  struct expand_vec_perm_d d_copy = *d;
18800	  d_copy.vmode = V8SFmode;
18801	  if (d->testing_p)
18802	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
18803	  else
18804	    d_copy.target = gen_reg_rtx (V8SFmode);
18805	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
18806	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
18807	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18808	    {
18809	      if (!d->testing_p)
18810		emit_move_insn (d->target,
18811				gen_lowpart (V8SImode, d_copy.target));
18812	      return true;
18813	    }
18814	  return false;
18815	}
18816
18817      if (d->testing_p)
18818	break;
18819
18820      t1 = gen_reg_rtx (V8SImode);
18821      t2 = gen_reg_rtx (V8SImode);
18822      t3 = gen_reg_rtx (V4DImode);
18823      t4 = gen_reg_rtx (V4DImode);
18824      t5 = gen_reg_rtx (V4DImode);
18825
18826      /* Shuffle the lanes around into
18827	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
18828      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
18829				    gen_lowpart (V4DImode, d->op1),
18830				    GEN_INT (0x20)));
18831      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
18832				    gen_lowpart (V4DImode, d->op1),
18833				    GEN_INT (0x31)));
18834
18835      /* Swap the 2nd and 3rd position in each lane into
18836	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
18837      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
18838				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18839      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
18840				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18841
      /* Now a vpunpck[lh]qdq will produce
	 { 0 2 4 6 8 a c e } or { 1 3 5 7 9 b d f } respectively.  */
18844      if (odd)
18845	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
18846					   gen_lowpart (V4DImode, t2));
18847      else
18848	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
18849					  gen_lowpart (V4DImode, t2));
18850      emit_insn (t3);
18851      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
18852      break;
18853
18854    default:
18855      gcc_unreachable ();
18856    }
18857
18858  return true;
18859}
18860
18861/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
18862   extract-even and extract-odd permutations.  */
18863
18864static bool
18865expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
18866{
18867  unsigned i, odd, nelt = d->nelt;
18868
18869  odd = d->perm[0];
18870  if (odd != 0 && odd != 1)
18871    return false;
18872
18873  for (i = 1; i < nelt; ++i)
18874    if (d->perm[i] != 2 * i + odd)
18875      return false;
18876
18877  return expand_vec_perm_even_odd_1 (d, odd);
18878}
18879
18880/* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
18881   permutations.  We assume that expand_vec_perm_1 has already failed.  */
18882
18883static bool
18884expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
18885{
18886  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
18887  machine_mode vmode = d->vmode;
18888  unsigned char perm2[4];
18889  rtx op0 = d->op0, dest;
18890  bool ok;
18891
18892  switch (vmode)
18893    {
18894    case E_V4DFmode:
18895    case E_V8SFmode:
18896      /* These are special-cased in sse.md so that we can optionally
18897	 use the vbroadcast instruction.  They expand to two insns
18898	 if the input happens to be in a register.  */
18899      gcc_unreachable ();
18900
18901    case E_V2DFmode:
18902    case E_V2DImode:
18903    case E_V4SFmode:
18904    case E_V4SImode:
18905      /* These are always implementable using standard shuffle patterns.  */
18906      gcc_unreachable ();
18907
18908    case E_V8HImode:
18909    case E_V16QImode:
      /* These can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V4SImode and then using pshufd.  */
18912      if (d->testing_p)
18913	return true;
18914      do
18915	{
18916	  rtx dest;
18917	  rtx (*gen) (rtx, rtx, rtx)
18918	    = vmode == V16QImode ? gen_vec_interleave_lowv16qi
18919				 : gen_vec_interleave_lowv8hi;
18920
18921	  if (elt >= nelt2)
18922	    {
18923	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
18924				       : gen_vec_interleave_highv8hi;
18925	      elt -= nelt2;
18926	    }
18927	  nelt2 /= 2;
18928
18929	  dest = gen_reg_rtx (vmode);
18930	  emit_insn (gen (dest, op0, op0));
18931	  vmode = get_mode_wider_vector (vmode);
18932	  op0 = gen_lowpart (vmode, dest);
18933	}
18934      while (vmode != V4SImode);
18935
18936      memset (perm2, elt, 4);
18937      dest = gen_reg_rtx (V4SImode);
18938      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
18939      gcc_assert (ok);
18940      if (!d->testing_p)
18941	emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
18942      return true;
18943
18944    case E_V64QImode:
18945    case E_V32QImode:
18946    case E_V16HImode:
18947    case E_V8SImode:
18948    case E_V4DImode:
      /* For AVX2, broadcasts of the first element should already have
	 been handled by expand_vec_perm_1 via vpbroadcast* or vpermq.  */
18951      gcc_assert (!TARGET_AVX2 || d->perm[0]);
18952      return false;
18953
18954    default:
18955      gcc_unreachable ();
18956    }
18957}
18958
18959/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
18960   broadcast permutations.  */
18961
18962static bool
18963expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
18964{
18965  unsigned i, elt, nelt = d->nelt;
18966
18967  if (!d->one_operand_p)
18968    return false;
18969
18970  elt = d->perm[0];
18971  for (i = 1; i < nelt; ++i)
18972    if (d->perm[i] != elt)
18973      return false;
18974
18975  return expand_vec_perm_broadcast_1 (d);
18976}
18977
18978/* Implement arbitrary permutations of two V64QImode operands
18979   with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  */
18980static bool
18981expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
18982{
18983  if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
18984    return false;
18985
18986  if (d->testing_p)
18987    return true;
18988
18989  struct expand_vec_perm_d ds[2];
18990  rtx rperm[128], vperm, target0, target1;
18991  unsigned int i, nelt;
18992  machine_mode vmode;
18993
18994  nelt = d->nelt;
18995  vmode = V64QImode;
18996
18997  for (i = 0; i < 2; i++)
18998    {
18999      ds[i] = *d;
19000      ds[i].vmode = V32HImode;
19001      ds[i].nelt = 32;
19002      ds[i].target = gen_reg_rtx (V32HImode);
19003      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
19004      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
19005    }
19006
  /* Prepare the permutations such that the first one (ds[0]) puts
     the even bytes into the right positions or one position higher,
     and the second one (ds[1]) puts the odd bytes into the right
     positions or one position lower.  */
19012
19013  for (i = 0; i < nelt; i++)
19014    {
19015      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
19016      if (i & 1)
19017	{
19018	  rperm[i] = constm1_rtx;
19019	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19020	}
19021      else
19022	{
19023	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19024	  rperm[i + 64] = constm1_rtx;
19025	}
19026    }
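  /* DS[0] and DS[1] are word permutations that bring the word holding
     each wanted byte to its final word position; RPERM then picks the
     low or high byte of that word, while the -1 entries (bit 7 set)
     zero the corresponding bytes of the other vpshufb result so the
     final vpor can merge the two.  */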
19027
19028  bool ok = expand_vec_perm_1 (&ds[0]);
19029  gcc_assert (ok);
19030  ds[0].target = gen_lowpart (V64QImode, ds[0].target);
19031
19032  ok = expand_vec_perm_1 (&ds[1]);
19033  gcc_assert (ok);
19034  ds[1].target = gen_lowpart (V64QImode, ds[1].target);
19035
19036  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
19037  vperm = force_reg (vmode, vperm);
19038  target0 = gen_reg_rtx (V64QImode);
19039  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
19040
19041  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
19042  vperm = force_reg (vmode, vperm);
19043  target1 = gen_reg_rtx (V64QImode);
19044  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
19045
19046  emit_insn (gen_iorv64qi3 (d->target, target0, target1));
19047  return true;
19048}
19049
/* Implement an arbitrary permutation of two V32QImode or V16HImode operands
   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
   all the shorter instruction sequences.  */
19053
19054static bool
19055expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
19056{
19057  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
19058  unsigned int i, nelt, eltsz;
19059  bool used[4];
19060
19061  if (!TARGET_AVX2
19062      || d->one_operand_p
19063      || (d->vmode != V32QImode && d->vmode != V16HImode))
19064    return false;
19065
19066  if (d->testing_p)
19067    return true;
19068
19069  nelt = d->nelt;
19070  eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19071
  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from
     the other lane, force a zero by setting bit 7 in the permutation
     mask.  The cross-lane masks have non-negative elements only where
     the element is requested from the other lane; such elements are
     also moved to the other lane, so that the result of vpshufb can
     have its two V2TImode halves swapped.  */
19079  m128 = GEN_INT (-128);
19080  for (i = 0; i < 32; ++i)
19081    {
19082      rperm[0][i] = m128;
19083      rperm[1][i] = m128;
19084      rperm[2][i] = m128;
19085      rperm[3][i] = m128;
19086    }
19087  used[0] = false;
19088  used[1] = false;
19089  used[2] = false;
19090  used[3] = false;
19091  for (i = 0; i < nelt; ++i)
19092    {
19093      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
19094      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
19095      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
19096
19097      for (j = 0; j < eltsz; ++j)
19098	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
19099      used[which] = true;
19100    }
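  /* WHICH selects one of the four masks: bit 1 is set for elements
     taken from op1, bit 0 for elements that must cross a 128-bit lane;
     USED records which of the four vpshufb results are needed at all.  */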
19101
19102  for (i = 0; i < 2; ++i)
19103    {
19104      if (!used[2 * i + 1])
19105	{
19106	  h[i] = NULL_RTX;
19107	  continue;
19108	}
19109      vperm = gen_rtx_CONST_VECTOR (V32QImode,
19110				    gen_rtvec_v (32, rperm[2 * i + 1]));
19111      vperm = force_reg (V32QImode, vperm);
19112      h[i] = gen_reg_rtx (V32QImode);
19113      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19114      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
19115    }
19116
  /* Swap the 128-bit lanes of h[X].  */
19118  for (i = 0; i < 2; ++i)
19119   {
19120     if (h[i] == NULL_RTX)
19121       continue;
19122     op = gen_reg_rtx (V4DImode);
19123     emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
19124				     const2_rtx, GEN_INT (3), const0_rtx,
19125				     const1_rtx));
19126     h[i] = gen_lowpart (V32QImode, op);
19127   }
19128
19129  for (i = 0; i < 2; ++i)
19130    {
19131      if (!used[2 * i])
19132	{
19133	  l[i] = NULL_RTX;
19134	  continue;
19135	}
19136      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
19137      vperm = force_reg (V32QImode, vperm);
19138      l[i] = gen_reg_rtx (V32QImode);
19139      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19140      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
19141    }
19142
19143  for (i = 0; i < 2; ++i)
19144    {
19145      if (h[i] && l[i])
19146	{
19147	  op = gen_reg_rtx (V32QImode);
19148	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
19149	  l[i] = op;
19150	}
19151      else if (h[i])
19152	l[i] = h[i];
19153    }
19154
19155  gcc_assert (l[0] && l[1]);
19156  op = d->target;
19157  if (d->vmode != V32QImode)
19158    op = gen_reg_rtx (V32QImode);
19159  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
19160  if (op != d->target)
19161    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
19162  return true;
19163}
19164
19165/* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
19166   taken care of, perform the expansion in D and return true on success.  */
19167
19168static bool
19169ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19170{
19171  /* Try a single instruction expansion.  */
19172  if (expand_vec_perm_1 (d))
19173    return true;
19174
19175  /* Try sequences of two instructions.  */
19176
19177  if (expand_vec_perm_pshuflw_pshufhw (d))
19178    return true;
19179
19180  if (expand_vec_perm_palignr (d, false))
19181    return true;
19182
19183  if (expand_vec_perm_interleave2 (d))
19184    return true;
19185
19186  if (expand_vec_perm_broadcast (d))
19187    return true;
19188
19189  if (expand_vec_perm_vpermq_perm_1 (d))
19190    return true;
19191
19192  if (expand_vec_perm_vperm2f128 (d))
19193    return true;
19194
19195  if (expand_vec_perm_pblendv (d))
19196    return true;
19197
19198  /* Try sequences of three instructions.  */
19199
19200  if (expand_vec_perm_even_odd_pack (d))
19201    return true;
19202
19203  if (expand_vec_perm_2vperm2f128_vshuf (d))
19204    return true;
19205
19206  if (expand_vec_perm_pshufb2 (d))
19207    return true;
19208
19209  if (expand_vec_perm_interleave3 (d))
19210    return true;
19211
19212  if (expand_vec_perm_vperm2f128_vblend (d))
19213    return true;
19214
19215  /* Try sequences of four instructions.  */
19216
19217  if (expand_vec_perm_even_odd_trunc (d))
19218    return true;
19219  if (expand_vec_perm_vpshufb2_vpermq (d))
19220    return true;
19221
19222  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
19223    return true;
19224
19225  if (expand_vec_perm_vpermt2_vpshub2 (d))
19226    return true;
19227
19228  /* ??? Look for narrow permutations whose element orderings would
19229     allow the promotion to a wider mode.  */
19230
19231  /* ??? Look for sequences of interleave or a wider permute that place
19232     the data into the correct lanes for a half-vector shuffle like
19233     pshuf[lh]w or vpermilps.  */
19234
19235  /* ??? Look for sequences of interleave that produce the desired results.
19236     The combinatorics of punpck[lh] get pretty ugly... */
19237
19238  if (expand_vec_perm_even_odd (d))
19239    return true;
19240
19241  /* Even longer sequences.  */
19242  if (expand_vec_perm_vpshufb4_vpermq2 (d))
19243    return true;
19244
  /* See if we can get the same permutation in a different vector integer
     mode.  */
19247  struct expand_vec_perm_d nd;
19248  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19249    {
19250      if (!d->testing_p)
19251	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19252      return true;
19253    }
19254
19255  /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
19256  if (expand_vec_perm2_vperm2f128_vblend (d))
19257    return true;
19258
19259  return false;
19260}
19261
/* If a permutation only uses one operand, make that explicit by
   canonicalizing D so that both operand fields refer to the same value.
   Return true if the permutation references both operands.  */
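/* For example, a V4SImode selector { 4, 5, 6, 7 } references only the
   second operand; it is folded to { 0, 1, 2, 3 } with op1 standing in
   for op0.  */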
19264
19265static bool
19266canonicalize_perm (struct expand_vec_perm_d *d)
19267{
19268  int i, which, nelt = d->nelt;
19269
19270  for (i = which = 0; i < nelt; ++i)
19271    which |= (d->perm[i] < nelt ? 1 : 2);
19272
19273  d->one_operand_p = true;
19274  switch (which)
19275    {
19276    default:
      gcc_unreachable ();
19278
19279    case 3:
19280      if (!rtx_equal_p (d->op0, d->op1))
19281        {
19282	  d->one_operand_p = false;
19283	  break;
19284        }
19285      /* The elements of PERM do not suggest that only the first operand
19286	 is used, but both operands are identical.  Allow easier matching
19287	 of the permutation by folding the permutation into the single
19288	 input vector.  */
19289      /* FALLTHRU */
19290
19291    case 2:
19292      for (i = 0; i < nelt; ++i)
19293        d->perm[i] &= nelt - 1;
19294      d->op0 = d->op1;
19295      break;
19296
19297    case 1:
19298      d->op1 = d->op0;
19299      break;
19300    }
19301
19302  return (which == 3);
19303}
19304
19305/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
19306
19307bool
19308ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19309			       rtx op1, const vec_perm_indices &sel)
19310{
19311  struct expand_vec_perm_d d;
19312  unsigned char perm[MAX_VECT_LEN];
19313  unsigned int i, nelt, which;
19314  bool two_args;
19315
19316  d.target = target;
19317  d.op0 = op0;
19318  d.op1 = op1;
19319
19320  d.vmode = vmode;
19321  gcc_assert (VECTOR_MODE_P (d.vmode));
19322  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19323  d.testing_p = !target;
19324
19325  gcc_assert (sel.length () == nelt);
19326  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
19327
19328  /* Given sufficient ISA support we can just return true here
19329     for selected vector modes.  */
19330  switch (d.vmode)
19331    {
19332    case E_V16SFmode:
19333    case E_V16SImode:
19334    case E_V8DImode:
19335    case E_V8DFmode:
19336      if (!TARGET_AVX512F)
19337	return false;
19338      /* All implementable with a single vperm[it]2 insn.  */
19339      if (d.testing_p)
19340	return true;
19341      break;
19342    case E_V32HImode:
19343      if (!TARGET_AVX512BW)
19344	return false;
19345      if (d.testing_p)
19346	/* All implementable with a single vperm[it]2 insn.  */
19347	return true;
19348      break;
19349    case E_V64QImode:
19350      if (!TARGET_AVX512BW)
19351	return false;
19352      if (d.testing_p)
19353	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
19354	return true;
19355      break;
19356    case E_V8SImode:
19357    case E_V8SFmode:
19358    case E_V4DFmode:
19359    case E_V4DImode:
19360      if (!TARGET_AVX)
19361	return false;
19362      if (d.testing_p && TARGET_AVX512VL)
19363	/* All implementable with a single vperm[it]2 insn.  */
19364	return true;
19365      break;
19366    case E_V16HImode:
19367      if (!TARGET_SSE2)
19368	return false;
19369      if (d.testing_p && TARGET_AVX2)
19370	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
19371	return true;
19372      break;
19373    case E_V32QImode:
19374      if (!TARGET_SSE2)
19375	return false;
19376      if (d.testing_p && TARGET_AVX2)
19377	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
19378	return true;
19379      break;
19380    case E_V8HImode:
19381    case E_V16QImode:
19382      if (!TARGET_SSE2)
19383	return false;
19384      /* Fall through.  */
19385    case E_V4SImode:
19386    case E_V4SFmode:
19387      if (!TARGET_SSE)
19388	return false;
19389      /* All implementable with a single vpperm insn.  */
19390      if (d.testing_p && TARGET_XOP)
19391	return true;
19392      /* All implementable with 2 pshufb + 1 ior.  */
19393      if (d.testing_p && TARGET_SSSE3)
19394	return true;
19395      break;
19396    case E_V2DImode:
19397    case E_V2DFmode:
19398      if (!TARGET_SSE)
19399	return false;
19400      /* All implementable with shufpd or unpck[lh]pd.  */
19401      if (d.testing_p)
19402	return true;
19403      break;
19404    default:
19405      return false;
19406    }
19407
19408  for (i = which = 0; i < nelt; ++i)
19409    {
19410      unsigned char e = sel[i];
19411      gcc_assert (e < 2 * nelt);
19412      d.perm[i] = e;
19413      perm[i] = e;
19414      which |= (e < nelt ? 1 : 2);
19415    }
19416
19417  if (d.testing_p)
19418    {
      /* If all elements come from the second vector, fold them to first.  */
19420      if (which == 2)
19421	for (i = 0; i < nelt; ++i)
19422	  d.perm[i] -= nelt;
19423
19424      /* Check whether the mask can be applied to the vector type.  */
19425      d.one_operand_p = (which != 3);
19426
19427      /* Implementable with shufps or pshufd.  */
19428      if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
19429	return true;
19430
19431      /* Otherwise we have to go through the motions and see if we can
19432	 figure out how to generate the requested permutation.  */
19433      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
19434      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
19435      if (!d.one_operand_p)
19436	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
19437
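      /* Expand into a throw-away sequence: only whether the expansion
	 succeeds matters here, not the insns it produces.  */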
19438      start_sequence ();
19439      bool ret = ix86_expand_vec_perm_const_1 (&d);
19440      end_sequence ();
19441
19442      return ret;
19443    }
19444
19445  two_args = canonicalize_perm (&d);
19446
19447  if (ix86_expand_vec_perm_const_1 (&d))
19448    return true;
19449
  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and a flattened
     selector.  If that didn't work, retry without one_operand_p; that is the
     form that succeeded during testing.  */
19454  if (two_args && d.one_operand_p)
19455    {
19456      d.one_operand_p = false;
19457      memcpy (d.perm, perm, sizeof (perm));
19458      return ix86_expand_vec_perm_const_1 (&d);
19459    }
19460
19461  return false;
19462}
19463
19464void
19465ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
19466{
19467  struct expand_vec_perm_d d;
19468  unsigned i, nelt;
19469
19470  d.target = targ;
19471  d.op0 = op0;
19472  d.op1 = op1;
19473  d.vmode = GET_MODE (targ);
19474  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19475  d.one_operand_p = false;
19476  d.testing_p = false;
19477
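  /* Select every other element of the concatenation of OP0 and OP1,
     e.g. { 1, 3, 5, 7 } for V4SImode when ODD is set.  */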
19478  for (i = 0; i < nelt; ++i)
19479    d.perm[i] = i * 2 + odd;
19480
19481  /* We'll either be able to implement the permutation directly...  */
19482  if (expand_vec_perm_1 (&d))
19483    return;
19484
19485  /* ... or we use the special-case patterns.  */
19486  expand_vec_perm_even_odd_1 (&d, odd);
19487}
19488
19489static void
19490ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
19491{
19492  struct expand_vec_perm_d d;
19493  unsigned i, nelt, base;
19494  bool ok;
19495
19496  d.target = targ;
19497  d.op0 = op0;
19498  d.op1 = op1;
19499  d.vmode = GET_MODE (targ);
19500  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19501  d.one_operand_p = false;
19502  d.testing_p = false;
19503
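  /* Interleave the high or low halves of the two inputs, e.g.
     { 2, 6, 3, 7 } for V4SImode when HIGH_P is true.  */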
19504  base = high_p ? nelt / 2 : 0;
19505  for (i = 0; i < nelt / 2; ++i)
19506    {
19507      d.perm[i * 2] = i + base;
19508      d.perm[i * 2 + 1] = i + base + nelt;
19509    }
19510
19511  /* Note that for AVX this isn't one instruction.  */
19512  ok = ix86_expand_vec_perm_const_1 (&d);
19513  gcc_assert (ok);
19514}
19515
19516
19517/* Expand a vector operation CODE for a V*QImode in terms of the
19518   same operation on V*HImode.  */
19519
19520void
19521ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
19522{
19523  machine_mode qimode = GET_MODE (dest);
19524  machine_mode himode;
19525  rtx (*gen_il) (rtx, rtx, rtx);
19526  rtx (*gen_ih) (rtx, rtx, rtx);
19527  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
19528  struct expand_vec_perm_d d;
19529  bool ok, full_interleave;
19530  bool uns_p = false;
19531  int i;
19532
19533  switch (qimode)
19534    {
19535    case E_V16QImode:
19536      himode = V8HImode;
19537      gen_il = gen_vec_interleave_lowv16qi;
19538      gen_ih = gen_vec_interleave_highv16qi;
19539      break;
19540    case E_V32QImode:
19541      himode = V16HImode;
19542      gen_il = gen_avx2_interleave_lowv32qi;
19543      gen_ih = gen_avx2_interleave_highv32qi;
19544      break;
19545    case E_V64QImode:
19546      himode = V32HImode;
19547      gen_il = gen_avx512bw_interleave_lowv64qi;
19548      gen_ih = gen_avx512bw_interleave_highv64qi;
19549      break;
19550    default:
19551      gcc_unreachable ();
19552    }
19553
19554  op2_l = op2_h = op2;
19555  switch (code)
19556    {
19557    case MULT:
19558      /* Unpack data such that we've got a source byte in each low byte of
19559	 each word.  We don't care what goes into the high byte of each word.
19560	 Rather than trying to get zero in there, most convenient is to let
19561	 it be a copy of the low byte.  */
19562      op2_l = gen_reg_rtx (qimode);
19563      op2_h = gen_reg_rtx (qimode);
19564      emit_insn (gen_il (op2_l, op2, op2));
19565      emit_insn (gen_ih (op2_h, op2, op2));
19566
19567      op1_l = gen_reg_rtx (qimode);
19568      op1_h = gen_reg_rtx (qimode);
19569      emit_insn (gen_il (op1_l, op1, op1));
19570      emit_insn (gen_ih (op1_h, op1, op1));
19571      full_interleave = qimode == V16QImode;
19572      break;
19573
19574    case ASHIFT:
19575    case LSHIFTRT:
19576      uns_p = true;
19577      /* FALLTHRU */
19578    case ASHIFTRT:
19579      op1_l = gen_reg_rtx (himode);
19580      op1_h = gen_reg_rtx (himode);
19581      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
19582      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
19583      full_interleave = true;
19584      break;
19585    default:
19586      gcc_unreachable ();
19587    }
19588
19589  /* Perform the operation.  */
19590  res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
19591			       1, OPTAB_DIRECT);
19592  res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
19593			       1, OPTAB_DIRECT);
19594  gcc_assert (res_l && res_h);
19595
19596  /* Merge the data back into the right place.  */
19597  d.target = dest;
19598  d.op0 = gen_lowpart (qimode, res_l);
19599  d.op1 = gen_lowpart (qimode, res_h);
19600  d.vmode = qimode;
19601  d.nelt = GET_MODE_NUNITS (qimode);
19602  d.one_operand_p = false;
19603  d.testing_p = false;
19604
19605  if (full_interleave)
19606    {
      /* For SSE2, we used a full interleave, so the desired
	 results are in the even elements.  */
19609      for (i = 0; i < d.nelt; ++i)
19610	d.perm[i] = i * 2;
19611    }
19612  else
19613    {
      /* For AVX, the interleave used above was not cross-lane.  So the
	 extraction is evens, but with the second and third quarter swapped.
	 Happily, that is even one insn shorter than even extraction.
	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
	 always first from the first and then from the second source operand;
	 the index bits above the low 4 bits remain the same.
19620	 Thus, for d.nelt == 32 we want permutation
19621	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
19622	 and for d.nelt == 64 we want permutation
19623	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
19624	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
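      /* E.g. for d.nelt == 32, i == 8 yields (16 & 14) + 32 + 0 == 32 and
	 i == 16 yields (32 & 14) + 0 + 16 == 16, matching the permutation
	 above.  */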
19625      for (i = 0; i < d.nelt; ++i)
19626	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
19627    }
19628
19629  ok = ix86_expand_vec_perm_const_1 (&d);
19630  gcc_assert (ok);
19631
19632  set_unique_reg_note (get_last_insn (), REG_EQUAL,
19633		       gen_rtx_fmt_ee (code, qimode, op1, op2));
19634}
19635
19636/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
19637   if op is CONST_VECTOR with all odd elements equal to their
19638   preceding element.  */
19639
19640static bool
19641const_vector_equal_evenodd_p (rtx op)
19642{
19643  machine_mode mode = GET_MODE (op);
19644  int i, nunits = GET_MODE_NUNITS (mode);
19645  if (GET_CODE (op) != CONST_VECTOR
19646      || nunits != CONST_VECTOR_NUNITS (op))
19647    return false;
19648  for (i = 0; i < nunits; i += 2)
19649    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
19650      return false;
19651  return true;
19652}
19653
19654void
19655ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
19656			       bool uns_p, bool odd_p)
19657{
19658  machine_mode mode = GET_MODE (op1);
19659  machine_mode wmode = GET_MODE (dest);
19660  rtx x;
19661  rtx orig_op1 = op1, orig_op2 = op2;
19662
19663  if (!nonimmediate_operand (op1, mode))
19664    op1 = force_reg (mode, op1);
19665  if (!nonimmediate_operand (op2, mode))
19666    op2 = force_reg (mode, op2);
19667
19668  /* We only play even/odd games with vectors of SImode.  */
19669  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
19670
  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some CPUs this is faster than a PSHUFD.  */
19673  if (odd_p)
19674    {
19675      /* For XOP use vpmacsdqh, but only for smult, as it is only
19676	 signed.  */
19677      if (TARGET_XOP && mode == V4SImode && !uns_p)
19678	{
19679	  x = force_reg (wmode, CONST0_RTX (wmode));
19680	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
19681	  return;
19682	}
19683
19684      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
19685      if (!const_vector_equal_evenodd_p (orig_op1))
19686	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
19687			    x, NULL, 1, OPTAB_DIRECT);
19688      if (!const_vector_equal_evenodd_p (orig_op2))
19689	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
19690			    x, NULL, 1, OPTAB_DIRECT);
19691      op1 = gen_lowpart (mode, op1);
19692      op2 = gen_lowpart (mode, op2);
19693    }
19694
19695  if (mode == V16SImode)
19696    {
19697      if (uns_p)
19698	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
19699      else
19700	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
19701    }
19702  else if (mode == V8SImode)
19703    {
19704      if (uns_p)
19705	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
19706      else
19707	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
19708    }
19709  else if (uns_p)
19710    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
19711  else if (TARGET_SSE4_1)
19712    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
19713  else
19714    {
19715      rtx s1, s2, t0, t1, t2;
19716
      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply, except
	 that we need to do less shuffling of the elements.  */
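      /* For 32-bit signed A and B, A * B modulo 2^64 equals
	 (unsigned) A * (unsigned) B
	 - (((A < 0 ? (unsigned) B : 0) + (B < 0 ? (unsigned) A : 0)) << 32).
	 S1 and S2 below are all-ones exactly when op1 resp. op2 is
	 negative, so adding the widening products S1*op2 and S2*op1,
	 shifted left by 32 bits, is equivalent to that subtraction
	 modulo 2^64.  */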
19720
19721      /* Compute the sign-extension, aka highparts, of the two operands.  */
19722      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19723				op1, pc_rtx, pc_rtx);
19724      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19725				op2, pc_rtx, pc_rtx);
19726
19727      /* Multiply LO(A) * HI(B), and vice-versa.  */
19728      t1 = gen_reg_rtx (wmode);
19729      t2 = gen_reg_rtx (wmode);
19730      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
19731      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
19732
19733      /* Multiply LO(A) * LO(B).  */
19734      t0 = gen_reg_rtx (wmode);
19735      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
19736
19737      /* Combine and shift the highparts into place.  */
19738      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
19739      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
19740			 1, OPTAB_DIRECT);
19741
19742      /* Combine high and low parts.  */
19743      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
19744      return;
19745    }
19746  emit_insn (x);
19747}
19748
19749void
19750ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
19751			    bool uns_p, bool high_p)
19752{
19753  machine_mode wmode = GET_MODE (dest);
19754  machine_mode mode = GET_MODE (op1);
19755  rtx t1, t2, t3, t4, mask;
19756
19757  switch (mode)
19758    {
19759    case E_V4SImode:
19760      t1 = gen_reg_rtx (mode);
19761      t2 = gen_reg_rtx (mode);
19762      if (TARGET_XOP && !uns_p)
19763	{
19764	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
19765	     shuffle the elements once so that all elements are in the right
19766	     place for immediate use: { A C B D }.  */
19767	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
19768					const1_rtx, GEN_INT (3)));
19769	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
19770					const1_rtx, GEN_INT (3)));
19771	}
19772      else
19773	{
19774	  /* Put the elements into place for the multiply.  */
19775	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
19776	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
19777	  high_p = false;
19778	}
19779      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
19780      break;
19781
19782    case E_V8SImode:
19783      /* Shuffle the elements between the lanes.  After this we
19784	 have { A B E F | C D G H } for each operand.  */
19785      t1 = gen_reg_rtx (V4DImode);
19786      t2 = gen_reg_rtx (V4DImode);
19787      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
19788				      const0_rtx, const2_rtx,
19789				      const1_rtx, GEN_INT (3)));
19790      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
19791				      const0_rtx, const2_rtx,
19792				      const1_rtx, GEN_INT (3)));
19793
19794      /* Shuffle the elements within the lanes.  After this we
19795	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
19796      t3 = gen_reg_rtx (V8SImode);
19797      t4 = gen_reg_rtx (V8SImode);
19798      mask = GEN_INT (high_p
19799		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
19800		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
19801      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
19802      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
19803
19804      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
19805      break;
19806
19807    case E_V8HImode:
19808    case E_V16HImode:
19809      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
19810			 uns_p, OPTAB_DIRECT);
19811      t2 = expand_binop (mode,
19812			 uns_p ? umul_highpart_optab : smul_highpart_optab,
19813			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
19814      gcc_assert (t1 && t2);
19815
19816      t3 = gen_reg_rtx (mode);
19817      ix86_expand_vec_interleave (t3, t1, t2, high_p);
19818      emit_move_insn (dest, gen_lowpart (wmode, t3));
19819      break;
19820
19821    case E_V16QImode:
19822    case E_V32QImode:
19823    case E_V32HImode:
19824    case E_V16SImode:
19825    case E_V64QImode:
19826      t1 = gen_reg_rtx (wmode);
19827      t2 = gen_reg_rtx (wmode);
19828      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
19829      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
19830
19831      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
19832      break;
19833
19834    default:
19835      gcc_unreachable ();
19836    }
19837}
19838
19839void
19840ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
19841{
19842  rtx res_1, res_2, res_3, res_4;
19843
19844  res_1 = gen_reg_rtx (V4SImode);
19845  res_2 = gen_reg_rtx (V4SImode);
19846  res_3 = gen_reg_rtx (V2DImode);
19847  res_4 = gen_reg_rtx (V2DImode);
19848  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
19849  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
19850
19851  /* Move the results in element 2 down to element 1; we don't care
19852     what goes in elements 2 and 3.  Then we can merge the parts
19853     back together with an interleave.
19854
19855     Note that two other sequences were tried:
19856     (1) Use interleaves at the start instead of psrldq, which allows
19857     us to use a single shufps to merge things back at the end.
19858     (2) Use shufps here to combine the two vectors, then pshufd to
19859     put the elements in the correct order.
19860     In both cases the cost of the reformatting stall was too high
19861     and the overall sequence slower.  */
19862
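  /* The (0, 2, 0, 0) selectors pick V4SImode elements 0 and 2, i.e. the
     low 32-bit halves of the two V2DImode products; the upper two result
     elements are don't-cares.  */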
19863  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
19864				const0_rtx, const2_rtx,
19865				const0_rtx, const0_rtx));
19866  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
19867				const0_rtx, const2_rtx,
19868				const0_rtx, const0_rtx));
19869  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
19870
19871  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
19872}
19873
19874void
19875ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
19876{
19877  machine_mode mode = GET_MODE (op0);
19878  rtx t1, t2, t3, t4, t5, t6;
19879
19880  if (TARGET_AVX512DQ && mode == V8DImode)
19881    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
19882  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
19883    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
19884  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
19885    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
19886  else if (TARGET_XOP && mode == V2DImode)
19887    {
19888      /* op1: A,B,C,D, op2: E,F,G,H */
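      /* Viewing each 64-bit element of op1 as B:A (high:low) and of op2
	 as F:E, the low 64 bits of the product are A*E + ((A*F + B*E) << 32);
	 the steps below compute exactly that for both elements.  */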
19889      op1 = gen_lowpart (V4SImode, op1);
19890      op2 = gen_lowpart (V4SImode, op2);
19891
19892      t1 = gen_reg_rtx (V4SImode);
19893      t2 = gen_reg_rtx (V4SImode);
19894      t3 = gen_reg_rtx (V2DImode);
19895      t4 = gen_reg_rtx (V2DImode);
19896
19897      /* t1: B,A,D,C */
19898      emit_insn (gen_sse2_pshufd_1 (t1, op1,
19899				    GEN_INT (1),
19900				    GEN_INT (0),
19901				    GEN_INT (3),
19902				    GEN_INT (2)));
19903
19904      /* t2: (B*E),(A*F),(D*G),(C*H) */
19905      emit_insn (gen_mulv4si3 (t2, t1, op2));
19906
19907      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
19908      emit_insn (gen_xop_phadddq (t3, t2));
19909
19910      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
19911      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
19912
      /* Multiply lower parts and add all.  */
19914      t5 = gen_reg_rtx (V2DImode);
19915      emit_insn (gen_vec_widen_umult_even_v4si (t5,
19916					gen_lowpart (V4SImode, op1),
19917					gen_lowpart (V4SImode, op2)));
19918      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
19919    }
19920  else
19921    {
19922      machine_mode nmode;
19923      rtx (*umul) (rtx, rtx, rtx);
19924
19925      if (mode == V2DImode)
19926	{
19927	  umul = gen_vec_widen_umult_even_v4si;
19928	  nmode = V4SImode;
19929	}
19930      else if (mode == V4DImode)
19931	{
19932	  umul = gen_vec_widen_umult_even_v8si;
19933	  nmode = V8SImode;
19934	}
19935      else if (mode == V8DImode)
19936	{
19937	  umul = gen_vec_widen_umult_even_v16si;
19938	  nmode = V16SImode;
19939	}
19940      else
19941	gcc_unreachable ();
19942
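      /* For each 64-bit element a = a_hi:a_lo and b = b_hi:b_lo, compute
	 a * b modulo 2^64 as a_lo*b_lo + ((a_lo*b_hi + a_hi*b_lo) << 32)
	 using unsigned widening 32x32->64 multiplies.  */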
19943
19944      /* Multiply low parts.  */
19945      t1 = gen_reg_rtx (mode);
19946      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
19947
19948      /* Shift input vectors right 32 bits so we can multiply high parts.  */
19949      t6 = GEN_INT (32);
19950      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
19951      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
19952
19953      /* Multiply high parts by low parts.  */
19954      t4 = gen_reg_rtx (mode);
19955      t5 = gen_reg_rtx (mode);
19956      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
19957      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
19958
19959      /* Combine and shift the highparts back.  */
19960      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
19961      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
19962
19963      /* Combine high and low parts.  */
19964      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
19965    }
19966
19967  set_unique_reg_note (get_last_insn (), REG_EQUAL,
19968		       gen_rtx_MULT (mode, op1, op2));
19969}
19970
/* Return true if control transfer instruction INSN
   should be encoded with the notrack prefix.  */
19973
19974bool
19975ix86_notrack_prefixed_insn_p (rtx_insn *insn)
19976{
19977  if (!insn || !((flag_cf_protection & CF_BRANCH)))
19978    return false;
19979
19980  if (CALL_P (insn))
19981    {
19982      rtx call = get_call_rtx_from (insn);
19983      gcc_assert (call != NULL_RTX);
19984      rtx addr = XEXP (call, 0);
19985
19986      /* Do not emit 'notrack' if it's not an indirect call.  */
19987      if (MEM_P (addr)
19988	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
19989	return false;
19990      else
19991	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
19992    }
19993
19994  if (JUMP_P (insn) && !flag_cet_switch)
19995    {
19996      rtx target = JUMP_LABEL (insn);
19997      if (target == NULL_RTX || ANY_RETURN_P (target))
19998	return false;
19999
      /* Check that the jump is to a switch table.  */
20001      rtx_insn *label = as_a<rtx_insn *> (target);
20002      rtx_insn *table = next_insn (label);
20003      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
20004	return false;
20005      else
20006	return true;
20007    }
20008  return false;
20009}
20010
20011/* Calculate integer abs() using only SSE2 instructions.  */
20012
20013void
20014ix86_expand_sse2_abs (rtx target, rtx input)
20015{
20016  machine_mode mode = GET_MODE (target);
20017  rtx tmp0, tmp1, x;
20018
20019  switch (mode)
20020    {
20021    case E_V2DImode:
20022    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
	 Otherwise handle it similarly to V4SImode, except use 64 as W instead
	 of 32 and build the sign mask with a logical right shift followed by
	 a negation, since a 64-bit arithmetic right shift is not available.  */
20028      if (TARGET_SSE4_2)
20029	{
20030	  tmp0 = gen_reg_rtx (mode);
20031	  tmp1 = gen_reg_rtx (mode);
20032	  emit_move_insn (tmp1, CONST0_RTX (mode));
20033	  if (mode == E_V2DImode)
20034	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
20035	  else
20036	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
20037	}
20038      else
20039	{
20040	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
20041				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
20042					       - 1), NULL, 0, OPTAB_DIRECT);
20043	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
20044	}
20045
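      /* TMP0 now holds the 0 / -1 sign mask; (X ^ mask) - mask below
	 yields the absolute value.  */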
20046      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20047				  NULL, 0, OPTAB_DIRECT);
20048      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20049			       target, 0, OPTAB_DIRECT);
20050      break;
20051
20052    case E_V4SImode:
20053      /* For 32-bit signed integer X, the best way to calculate the absolute
20054	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
20055      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
20056				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
20057				  NULL, 0, OPTAB_DIRECT);
20058      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20059				  NULL, 0, OPTAB_DIRECT);
20060      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20061			       target, 0, OPTAB_DIRECT);
20062      break;
20063
20064    case E_V8HImode:
20065      /* For 16-bit signed integer X, the best way to calculate the absolute
20066	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
20067      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20068
20069      x = expand_simple_binop (mode, SMAX, tmp0, input,
20070			       target, 0, OPTAB_DIRECT);
20071      break;
20072
20073    case E_V16QImode:
20074      /* For 8-bit signed integer X, the best way to calculate the absolute
20075	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
20076	 as SSE2 provides the PMINUB insn.  */
20077      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20078
20079      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
20080			       target, 0, OPTAB_DIRECT);
20081      break;
20082
20083    default:
20084      gcc_unreachable ();
20085    }
20086
20087  if (x != target)
20088    emit_move_insn (target, x);
20089}
20090
20091/* Expand an extract from a vector register through pextr insn.
20092   Return true if successful.  */
20093
20094bool
20095ix86_expand_pextr (rtx *operands)
20096{
20097  rtx dst = operands[0];
20098  rtx src = operands[1];
20099
20100  unsigned int size = INTVAL (operands[2]);
20101  unsigned int pos = INTVAL (operands[3]);
20102
20103  if (SUBREG_P (dst))
20104    {
20105      /* Reject non-lowpart subregs.  */
20106      if (SUBREG_BYTE (dst) > 0)
20107	return false;
20108      dst = SUBREG_REG (dst);
20109    }
20110
20111  if (SUBREG_P (src))
20112    {
20113      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
20114      src = SUBREG_REG (src);
20115    }
20116
20117  switch (GET_MODE (src))
20118    {
20119    case E_V16QImode:
20120    case E_V8HImode:
20121    case E_V4SImode:
20122    case E_V2DImode:
20123    case E_V1TImode:
20124      {
20125	machine_mode srcmode, dstmode;
20126	rtx d, pat;
20127
20128	if (!int_mode_for_size (size, 0).exists (&dstmode))
20129	  return false;
20130
20131	switch (dstmode)
20132	  {
20133	  case E_QImode:
20134	    if (!TARGET_SSE4_1)
20135	      return false;
20136	    srcmode = V16QImode;
20137	    break;
20138
20139	  case E_HImode:
20140	    if (!TARGET_SSE2)
20141	      return false;
20142	    srcmode = V8HImode;
20143	    break;
20144
20145	  case E_SImode:
20146	    if (!TARGET_SSE4_1)
20147	      return false;
20148	    srcmode = V4SImode;
20149	    break;
20150
20151	  case E_DImode:
20152	    gcc_assert (TARGET_64BIT);
20153	    if (!TARGET_SSE4_1)
20154	      return false;
20155	    srcmode = V2DImode;
20156	    break;
20157
20158	  default:
20159	    return false;
20160	  }
20161
20162	/* Reject extractions from misaligned positions.  */
20163	if (pos & (size-1))
20164	  return false;
20165
20166	if (GET_MODE (dst) == dstmode)
20167	  d = dst;
20168	else
20169	  d = gen_reg_rtx (dstmode);
20170
20171	/* Construct insn pattern.  */
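	/* E.g. extracting the HImode element at bit position 48 of a
	   V8HImode source uses vec_select index 48 / 16 == 3.  */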
20172	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
20173	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
20174
20175	/* Let the rtl optimizers know about the zero extension performed.  */
20176	if (dstmode == QImode || dstmode == HImode)
20177	  {
20178	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
20179	    d = gen_lowpart (SImode, d);
20180	  }
20181
20182	emit_insn (gen_rtx_SET (d, pat));
20183
20184	if (d != dst)
20185	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20186	return true;
20187      }
20188
20189    default:
20190      return false;
20191    }
20192}
20193
20194/* Expand an insert into a vector register through pinsr insn.
20195   Return true if successful.  */
20196
20197bool
20198ix86_expand_pinsr (rtx *operands)
20199{
20200  rtx dst = operands[0];
20201  rtx src = operands[3];
20202
20203  unsigned int size = INTVAL (operands[1]);
20204  unsigned int pos = INTVAL (operands[2]);
20205
20206  if (SUBREG_P (dst))
20207    {
20208      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
20209      dst = SUBREG_REG (dst);
20210    }
20211
20212  switch (GET_MODE (dst))
20213    {
20214    case E_V16QImode:
20215    case E_V8HImode:
20216    case E_V4SImode:
20217    case E_V2DImode:
20218    case E_V1TImode:
20219      {
20220	machine_mode srcmode, dstmode;
20221	rtx (*pinsr)(rtx, rtx, rtx, rtx);
20222	rtx d;
20223
20224	if (!int_mode_for_size (size, 0).exists (&srcmode))
20225	  return false;
20226
20227	switch (srcmode)
20228	  {
20229	  case E_QImode:
20230	    if (!TARGET_SSE4_1)
20231	      return false;
20232	    dstmode = V16QImode;
20233	    pinsr = gen_sse4_1_pinsrb;
20234	    break;
20235
20236	  case E_HImode:
20237	    if (!TARGET_SSE2)
20238	      return false;
20239	    dstmode = V8HImode;
20240	    pinsr = gen_sse2_pinsrw;
20241	    break;
20242
20243	  case E_SImode:
20244	    if (!TARGET_SSE4_1)
20245	      return false;
20246	    dstmode = V4SImode;
20247	    pinsr = gen_sse4_1_pinsrd;
20248	    break;
20249
20250	  case E_DImode:
20251	    gcc_assert (TARGET_64BIT);
20252	    if (!TARGET_SSE4_1)
20253	      return false;
20254	    dstmode = V2DImode;
20255	    pinsr = gen_sse4_1_pinsrq;
20256	    break;
20257
20258	  default:
20259	    return false;
20260	  }
20261
20262	/* Reject insertions to misaligned positions.  */
20263	if (pos & (size-1))
20264	  return false;
20265
20266	if (SUBREG_P (src))
20267	  {
20268	    unsigned int srcpos = SUBREG_BYTE (src);
20269
20270	    if (srcpos > 0)
20271	      {
20272		rtx extr_ops[4];
20273
20274		extr_ops[0] = gen_reg_rtx (srcmode);
20275		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
20276		extr_ops[2] = GEN_INT (size);
20277		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
20278
20279		if (!ix86_expand_pextr (extr_ops))
20280		  return false;
20281
20282		src = extr_ops[0];
20283	      }
20284	    else
20285	      src = gen_lowpart (srcmode, SUBREG_REG (src));
20286	  }
20287
20288	if (GET_MODE (dst) == dstmode)
20289	  d = dst;
20290	else
20291	  d = gen_reg_rtx (dstmode);
20292
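	/* The pinsr patterns take a single-bit immediate mask selecting
	   the element to replace, hence 1 << (pos / size).  */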
20293	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
20294			  gen_lowpart (srcmode, src),
20295			  GEN_INT (1 << (pos / size))));
20296	if (d != dst)
20297	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20298	return true;
20299      }
20300
20301    default:
20302      return false;
20303    }
20304}
20305
/* All CPUs prefer to avoid cross-lane operations, so perform reductions
   by combining the upper and lower halves until we reach SSE register
   size.  */
20308
20309machine_mode
20310ix86_split_reduction (machine_mode mode)
20311{
20312  /* Reduce lowpart against highpart until we reach SSE reg width to
20313     avoid cross-lane operations.  */
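  /* E.g. a V8SImode reduction is first narrowed to V4SImode so the two
     128-bit halves can be combined without cross-lane operations.  */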
20314  switch (mode)
20315    {
20316    case E_V8DImode:
20317    case E_V4DImode:
20318      return V2DImode;
20319    case E_V16SImode:
20320    case E_V8SImode:
20321      return V4SImode;
20322    case E_V32HImode:
20323    case E_V16HImode:
20324      return V8HImode;
20325    case E_V64QImode:
20326    case E_V32QImode:
20327      return V16QImode;
20328    case E_V16SFmode:
20329    case E_V8SFmode:
20330      return V4SFmode;
20331    case E_V8DFmode:
20332    case E_V4DFmode:
20333      return V2DFmode;
20334    default:
20335      return mode;
20336    }
20337}
20338
20339/* Generate call to __divmoddi4.  */
20340
20341void
20342ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
20343			    rtx op0, rtx op1,
20344			    rtx *quot_p, rtx *rem_p)
20345{
20346  rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
20347
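  /* The libfunc returns the quotient by value and stores the remainder
     through the address passed as its final argument.  */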
20348  rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
20349				      mode, op0, mode, op1, mode,
20350				      XEXP (rem, 0), Pmode);
20351  *quot_p = quot;
20352  *rem_p = rem;
20353}
20354
20355#include "gt-i386-expand.h"
20356