X86ISelLowering.h revision 363496
1//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
15#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
16
17#include "llvm/CodeGen/CallingConvLower.h"
18#include "llvm/CodeGen/SelectionDAG.h"
19#include "llvm/CodeGen/TargetLowering.h"
20
21namespace llvm {
22  class X86Subtarget;
23  class X86TargetMachine;
24
25  namespace X86ISD {
26    // X86 Specific DAG Nodes
27    enum NodeType : unsigned {
28      // Start the numbering where the builtin ops leave off.
29      FIRST_NUMBER = ISD::BUILTIN_OP_END,
30
31      /// Bit scan forward.
32      BSF,
33      /// Bit scan reverse.
34      BSR,
35
36      /// Double shift instructions. These correspond to
37      /// X86::SHLDxx and X86::SHRDxx instructions.
38      SHLD,
39      SHRD,
40
41      /// Bitwise logical AND of floating point values. This corresponds
42      /// to X86::ANDPS or X86::ANDPD.
43      FAND,
44
45      /// Bitwise logical OR of floating point values. This corresponds
46      /// to X86::ORPS or X86::ORPD.
47      FOR,
48
49      /// Bitwise logical XOR of floating point values. This corresponds
50      /// to X86::XORPS or X86::XORPD.
51      FXOR,
52
53      /// Bitwise logical ANDNOT of floating point values. This
54      /// corresponds to X86::ANDNPS or X86::ANDNPD.
55      FANDN,
56
57      /// These operations represent an abstract X86 call
58      /// instruction, which includes a bunch of information.  In particular the
59      /// operands of these node are:
60      ///
61      ///     #0 - The incoming token chain
62      ///     #1 - The callee
63      ///     #2 - The number of arg bytes the caller pushes on the stack.
64      ///     #3 - The number of arg bytes the callee pops off the stack.
65      ///     #4 - The value to pass in AL/AX/EAX (optional)
66      ///     #5 - The value to pass in DL/DX/EDX (optional)
67      ///
68      /// The result values of these nodes are:
69      ///
70      ///     #0 - The outgoing token chain
71      ///     #1 - The first register result value (optional)
72      ///     #2 - The second register result value (optional)
73      ///
74      CALL,
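      // Illustrative note (not part of the original comment): for a call that
      // returns a single value in a register and passes nothing in AL/AX/EAX
      // or DL/DX/EDX, only operands #0-#3 and results #0-#1 above would be
      // present; the optional operands and results are simply omitted.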
75
76      /// Same as call except it adds the NoTrack prefix.
77      NT_CALL,
78
79      /// X86 compare and logical compare instructions.
80      CMP, COMI, UCOMI,
81
82      /// X86 bit-test instructions.
83      BT,
84
85      /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
86      /// operand, usually produced by a CMP instruction.
87      SETCC,
88
89      /// X86 Select
90      SELECTS,
91
92      // Same as SETCC except it's materialized with an SBB and the value is all
93      // ones or all zeros.
94      SETCC_CARRY,  // R = carry_bit ? ~0 : 0
95
96      /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
97      /// Operands are two FP values to compare; result is a mask of
98      /// 0s or 1s.  Generally DTRT for C/C++ with NaNs.
99      FSETCC,
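      // For illustration (not part of the original comment): since FSETCC
      // yields an all-ones or all-zeros mask, a branchless scalar select such
      // as
      //   r = (a < b) ? x : y
      // can be built from it as roughly
      //   m = FSETCC(a, b, lt);  r = (m & x) | (~m & y)
      // using the FAND/FANDN/FOR nodes above.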
100
101      /// X86 FP SETCC, similar to above, but with output as an i1 mask,
102      /// and a version with SAE.
103      FSETCCM, FSETCCM_SAE,
104
105      /// X86 conditional moves. Operand 0 and operand 1 are the two values
106      /// to select from. Operand 2 is the condition code, and operand 3 is the
107      /// flag operand produced by a CMP or TEST instruction.
108      CMOV,
109
110      /// X86 conditional branches. Operand 0 is the chain operand, operand 1
111      /// is the block to branch if condition is true, operand 2 is the
112      /// condition code, and operand 3 is the flag operand produced by a CMP
113      /// or TEST instruction.
114      BRCOND,
115
116      /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
117      /// operand 1 is the target address.
118      NT_BRIND,
119
120      /// Return with a flag operand. Operand 0 is the chain operand, operand
121      /// 1 is the number of bytes of stack to pop.
122      RET_FLAG,
123
124      /// Return from interrupt. Operand 0 is the number of bytes to pop.
125      IRET,
126
127      /// Repeat fill, corresponds to X86::REP_STOSx.
128      REP_STOS,
129
130      /// Repeat move, corresponds to X86::REP_MOVSx.
131      REP_MOVS,
132
133      /// On Darwin, this node represents the result of the popl
134      /// at function entry, used for PIC code.
135      GlobalBaseReg,
136
137      /// A wrapper node for TargetConstantPool, TargetJumpTable,
138      /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
139      /// MCSymbol and TargetBlockAddress.
140      Wrapper,
141
142      /// Special wrapper used under X86-64 PIC mode for RIP
143      /// relative displacements.
144      WrapperRIP,
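      // For example, in x86-64 PIC code a global address is typically emitted
      // as WrapperRIP(TargetGlobalAddress) and selected to a RIP-relative LEA,
      // whereas the plain Wrapper form is used when absolute addressing is
      // available. (Illustrative summary, not part of the original comment.)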
145
146      /// Copies a 64-bit value from an MMX vector to the low word
147      /// of an XMM vector, with the high word zero filled.
148      MOVQ2DQ,
149
150      /// Copies a 64-bit value from the low word of an XMM vector
151      /// to an MMX vector.
152      MOVDQ2Q,
153
154      /// Copies a 32-bit value from the low word of an MMX
155      /// vector to a GPR.
156      MMX_MOVD2W,
157
158      /// Copies a GPR into the low 32-bit word of an MMX vector
159      /// and zeroes out the high word.
160      MMX_MOVW2D,
161
162      /// Extract an 8-bit value from a vector and zero extend it to
163      /// i32, corresponds to X86::PEXTRB.
164      PEXTRB,
165
166      /// Extract a 16-bit value from a vector and zero extend it to
167      /// i32, corresponds to X86::PEXTRW.
168      PEXTRW,
169
170      /// Insert any element of a 4 x float vector into any element
171      /// of a destination 4 x float vector.
172      INSERTPS,
173
174      /// Insert the lower 8 bits of a 32-bit value into a vector,
175      /// corresponds to X86::PINSRB.
176      PINSRB,
177
178      /// Insert the lower 16 bits of a 32-bit value into a vector,
179      /// corresponds to X86::PINSRW.
180      PINSRW,
181
182      /// Shuffle 16 8-bit values within a vector.
183      PSHUFB,
184
185      /// Compute Sum of Absolute Differences.
186      PSADBW,
187      /// Compute Double Block Packed Sum-Absolute-Differences
188      DBPSADBW,
189
190      /// Bitwise Logical AND NOT of Packed FP values.
191      ANDNP,
192
193      /// Blend where the selector is an immediate.
194      BLENDI,
195
196      /// Dynamic (non-constant condition) vector blend where only the sign bits
197      /// of the condition elements are used. This is used to enforce that the
198      /// condition mask is not valid for generic VSELECT optimizations. This
199      /// is also used to implement the intrinsics.
200      /// Operands are in VSELECT order: MASK, TRUE, FALSE
201      BLENDV,
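      // Rough semantics for illustration (not part of the original comment):
      // for v4f32 operands this behaves like BLENDVPS, i.e. per lane
      //   result[i] = signbit(MASK[i]) ? TRUE[i] : FALSE[i]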
202
203      /// Combined add and sub on an FP vector.
204      ADDSUB,
205
206      //  FP vector ops with rounding mode.
207      FADD_RND, FADDS, FADDS_RND,
208      FSUB_RND, FSUBS, FSUBS_RND,
209      FMUL_RND, FMULS, FMULS_RND,
210      FDIV_RND, FDIVS, FDIVS_RND,
211      FMAX_SAE, FMAXS_SAE,
212      FMIN_SAE, FMINS_SAE,
213      FSQRT_RND, FSQRTS, FSQRTS_RND,
214
215      // FP vector get exponent.
216      FGETEXP, FGETEXP_SAE, FGETEXPS, FGETEXPS_SAE,
217      // Extract Normalized Mantissas.
218      VGETMANT, VGETMANT_SAE, VGETMANTS, VGETMANTS_SAE,
219      // FP Scale.
220      SCALEF, SCALEF_RND,
221      SCALEFS, SCALEFS_RND,
222
223      // Unsigned Integer average.
224      AVG,
225
226      /// Integer horizontal add/sub.
227      HADD,
228      HSUB,
229
230      /// Floating point horizontal add/sub.
231      FHADD,
232      FHSUB,
233
234      // Detect Conflicts Within a Vector
235      CONFLICT,
236
237      /// Floating point max and min.
238      FMAX, FMIN,
239
240      /// Commutative FMIN and FMAX.
241      FMAXC, FMINC,
242
243      /// Scalar intrinsic floating point max and min.
244      FMAXS, FMINS,
245
246      /// Floating point reciprocal-sqrt and reciprocal approximation.
247      /// Note that these typically require refinement
248      /// in order to obtain suitable precision.
249      FRSQRT, FRCP,
250
251      // AVX-512 reciprocal approximations with a little more precision.
252      RSQRT14, RSQRT14S, RCP14, RCP14S,
253
254      // Thread Local Storage.
255      TLSADDR,
256
257      // Thread Local Storage. A call to get the start address
258      // of the TLS block for the current module.
259      TLSBASEADDR,
260
261      // Thread Local Storage.  When calling to an OS provided
262      // thunk at the address from an earlier relocation.
263      TLSCALL,
264
265      // Exception Handling helpers.
266      EH_RETURN,
267
268      // SjLj exception handling setjmp.
269      EH_SJLJ_SETJMP,
270
271      // SjLj exception handling longjmp.
272      EH_SJLJ_LONGJMP,
273
274      // SjLj exception handling dispatch.
275      EH_SJLJ_SETUP_DISPATCH,
276
277      /// Tail call return. See X86TargetLowering::LowerCall for
278      /// the list of operands.
279      TC_RETURN,
280
281      // Vector move to low scalar and zero higher vector elements.
282      VZEXT_MOVL,
283
284      // Vector integer truncate.
285      VTRUNC,
286      // Vector integer truncate with unsigned/signed saturation.
287      VTRUNCUS, VTRUNCS,
288
289      // Masked version of the above. Used when less than a 128-bit result is
290      // produced since the mask only applies to the lower elements and can't
291      // be represented by a select.
292      // SRC, PASSTHRU, MASK
293      VMTRUNC, VMTRUNCUS, VMTRUNCS,
294
295      // Vector FP extend.
296      VFPEXT, VFPEXT_SAE, VFPEXTS, VFPEXTS_SAE,
297
298      // Vector FP round.
299      VFPROUND, VFPROUND_RND, VFPROUNDS, VFPROUNDS_RND,
300
301      // Masked version of above. Used for v2f64->v4f32.
302      // SRC, PASSTHRU, MASK
303      VMFPROUND,
304
305      // 128-bit vector logical left / right shift
306      VSHLDQ, VSRLDQ,
307
308      // Vector shift elements
309      VSHL, VSRL, VSRA,
310
311      // Vector variable shift
312      VSHLV, VSRLV, VSRAV,
313
314      // Vector shift elements by immediate
315      VSHLI, VSRLI, VSRAI,
316
317      // Shifts of mask registers.
318      KSHIFTL, KSHIFTR,
319
320      // Bit rotate by immediate
321      VROTLI, VROTRI,
322
323      // Vector packed double/float comparison.
324      CMPP,
325
326      // Vector integer comparisons.
327      PCMPEQ, PCMPGT,
328
329      // v8i16 Horizontal minimum and position.
330      PHMINPOS,
331
332      MULTISHIFT,
333
334      /// Vector comparison generating mask bits for fp and
335      /// integer signed and unsigned data types.
336      CMPM,
337      // Vector comparison with SAE for FP values
338      CMPM_SAE,
339
340      // Arithmetic operations with FLAGS results.
341      ADD, SUB, ADC, SBB, SMUL, UMUL,
342      OR, XOR, AND,
343
344      // Bit field extract.
345      BEXTR,
346
347      // Zero High Bits Starting with Specified Bit Position.
348      BZHI,
349
350      // X86-specific multiply by immediate.
351      MUL_IMM,
352
353      // Vector sign bit extraction.
354      MOVMSK,
355
356      // Vector bitwise comparisons.
357      PTEST,
358
359      // Vector packed fp sign bitwise comparisons.
360      TESTP,
361
362      // OR/AND test for masks.
363      KORTEST,
364      KTEST,
365
366      // ADD for masks.
367      KADD,
368
369      // Several flavors of instructions with vector shuffle behaviors.
370      // Saturated signed/unsigned packing.
371      PACKSS,
372      PACKUS,
373      // Intra-lane alignr.
374      PALIGNR,
375      // AVX512 inter-lane alignr.
376      VALIGN,
377      PSHUFD,
378      PSHUFHW,
379      PSHUFLW,
380      SHUFP,
381      // VBMI2 Concat & Shift.
382      VSHLD,
383      VSHRD,
384      VSHLDV,
385      VSHRDV,
386      // Shuffle Packed Values at 128-bit granularity.
387      SHUF128,
388      MOVDDUP,
389      MOVSHDUP,
390      MOVSLDUP,
391      MOVLHPS,
392      MOVHLPS,
393      MOVSD,
394      MOVSS,
395      UNPCKL,
396      UNPCKH,
397      VPERMILPV,
398      VPERMILPI,
399      VPERMI,
400      VPERM2X128,
401
402      // Variable Permute (VPERM).
403      // Res = VPERMV MaskV, V0
404      VPERMV,
405
406      // 3-op Variable Permute (VPERMT2).
407      // Res = VPERMV3 V0, MaskV, V1
408      VPERMV3,
409
410      // Bitwise ternary logic.
411      VPTERNLOG,
412      // Fix Up Special Packed Float32/64 values.
413      VFIXUPIMM, VFIXUPIMM_SAE,
414      VFIXUPIMMS, VFIXUPIMMS_SAE,
415      // Range Restriction Calculation For Packed Pairs of Float32/64 values.
416      VRANGE, VRANGE_SAE, VRANGES, VRANGES_SAE,
417      // Reduce - Perform Reduction Transformation on scalar/packed FP.
418      VREDUCE, VREDUCE_SAE, VREDUCES, VREDUCES_SAE,
419      // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
420      // Also used by the legacy (V)ROUND intrinsics where we mask out the
421      // scaling part of the immediate.
422      VRNDSCALE, VRNDSCALE_SAE, VRNDSCALES, VRNDSCALES_SAE,
423      // Tests the class of packed FP values.
424      VFPCLASS,
425      // Tests the class of scalar FP values.
426      VFPCLASSS,
427
428      // Broadcast (splat) scalar or element 0 of a vector. If the operand is
429      // a vector, this node may change the vector length as part of the splat.
430      VBROADCAST,
431      // Broadcast mask to vector.
432      VBROADCASTM,
433      // Broadcast subvector to vector.
434      SUBV_BROADCAST,
435
436      /// SSE4A Extraction and Insertion.
437      EXTRQI, INSERTQI,
438
439      // XOP arithmetic/logical shifts.
440      VPSHA, VPSHL,
441      // XOP signed/unsigned integer comparisons.
442      VPCOM, VPCOMU,
443      // XOP packed permute bytes.
444      VPPERM,
445      // XOP two source permutation.
446      VPERMIL2,
447
448      // Vector multiply packed unsigned doubleword integers.
449      PMULUDQ,
450      // Vector multiply packed signed doubleword integers.
451      PMULDQ,
452      // Vector Multiply Packed Unsigned Integers with Round and Scale.
453      MULHRS,
454
455      // Multiply and Add Packed Integers.
456      VPMADDUBSW, VPMADDWD,
457
458      // AVX512IFMA multiply and add.
459      // NOTE: These are different from the instruction in that they perform
460      // op0 x op1 + op2.
461      VPMADD52L, VPMADD52H,
462
463      // VNNI
464      VPDPBUSD,
465      VPDPBUSDS,
466      VPDPWSSD,
467      VPDPWSSDS,
468
469      // FMA nodes.
470      // We use the target independent ISD::FMA for the non-inverted case.
471      FNMADD,
472      FMSUB,
473      FNMSUB,
474      FMADDSUB,
475      FMSUBADD,
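      // Rough semantics of the inverted forms, for illustration (not part of
      // the original comment):
      //   FNMADD(a, b, c)  ~= -(a * b) + c
      //   FMSUB(a, b, c)   ~=  (a * b) - c
      //   FNMSUB(a, b, c)  ~= -(a * b) - c
      //   FMADDSUB / FMSUBADD alternate between add and subtract across lanes.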
476
477      // FMA with rounding mode.
478      FMADD_RND,
479      FNMADD_RND,
480      FMSUB_RND,
481      FNMSUB_RND,
482      FMADDSUB_RND,
483      FMSUBADD_RND,
484
485      // Compress and expand.
486      COMPRESS,
487      EXPAND,
488
489      // Bits shuffle
490      VPSHUFBITQMB,
491
492      // Convert Unsigned/Integer to Floating-Point Value with rounding mode.
493      SINT_TO_FP_RND, UINT_TO_FP_RND,
494      SCALAR_SINT_TO_FP, SCALAR_UINT_TO_FP,
495      SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND,
496
497      // Vector float/double to signed/unsigned integer.
498      CVTP2SI, CVTP2UI, CVTP2SI_RND, CVTP2UI_RND,
499      // Scalar float/double to signed/unsigned integer.
500      CVTS2SI, CVTS2UI, CVTS2SI_RND, CVTS2UI_RND,
501
502      // Vector float/double to signed/unsigned integer with truncation.
503      CVTTP2SI, CVTTP2UI, CVTTP2SI_SAE, CVTTP2UI_SAE,
504      // Scalar float/double to signed/unsigned integer with truncation.
505      CVTTS2SI, CVTTS2UI, CVTTS2SI_SAE, CVTTS2UI_SAE,
506
507      // Vector signed/unsigned integer to float/double.
508      CVTSI2P, CVTUI2P,
509
510      // Masked versions of above. Used for v2f64->v4f32.
511      // SRC, PASSTHRU, MASK
512      MCVTP2SI, MCVTP2UI, MCVTTP2SI, MCVTTP2UI,
513      MCVTSI2P, MCVTUI2P,
514
515      // Vector float to bfloat16.
516      // Convert TWO packed single data to one packed BF16 data
517      CVTNE2PS2BF16,
518      // Convert packed single data to packed BF16 data
519      CVTNEPS2BF16,
520      // Masked version of above.
521      // SRC, PASSTHRU, MASK
522      MCVTNEPS2BF16,
523
524      // Dot product of BF16 pairs, accumulated into
525      // packed single precision.
526      DPBF16PS,
527
528      // Save xmm argument registers to the stack, according to %al. An operator
529      // is needed so that this can be expanded with control flow.
530      VASTART_SAVE_XMM_REGS,
531
532      // Windows's _chkstk call to do stack probing.
533      WIN_ALLOCA,
534
535      // For allocating variable amounts of stack space when using
536      // segmented stacks. Check if the current stacklet has enough space, and
537      // falls back to heap allocation if not.
538      SEG_ALLOCA,
539
540      // Memory barriers.
541      MEMBARRIER,
542      MFENCE,
543
544      // Store FP status word into i16 register.
545      FNSTSW16r,
546
547      // Store contents of %ah into %eflags.
548      SAHF,
549
550      // Get a random integer and indicate whether it is valid in CF.
551      RDRAND,
552
553      // Get a NIST SP800-90B & C compliant random integer and
554      // indicate whether it is valid in CF.
555      RDSEED,
556
557      // Protection keys
558      // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
559      // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
560      // value for ECX.
561      RDPKRU, WRPKRU,
562
563      // SSE42 string comparisons.
564      // These nodes produce 3 results: index, mask, and flags. X86ISelDAGToDAG
565      // will emit one or two instructions based on which results are used. If
566      // both flags and index/mask are used, this lets us use a single instruction
567      // since we won't have to pick an opcode for flags. Instead we can rely on
568      // the DAG to CSE everything and decide at isel.
569      PCMPISTR,
570      PCMPESTR,
571
572      // Test if in transactional execution.
573      XTEST,
574
575      // ERI instructions.
576      RSQRT28, RSQRT28_SAE, RSQRT28S, RSQRT28S_SAE,
577      RCP28, RCP28_SAE, RCP28S, RCP28S_SAE, EXP2, EXP2_SAE,
578
579      // Conversions between float and half-float.
580      CVTPS2PH, CVTPH2PS, CVTPH2PS_SAE,
581
582      // Masked version of above.
583      // SRC, RND, PASSTHRU, MASK
584      MCVTPS2PH,
585
586      // Galois Field Arithmetic Instructions
587      GF2P8AFFINEINVQB, GF2P8AFFINEQB, GF2P8MULB,
588
589      // LWP insert record.
590      LWPINS,
591
592      // User level wait
593      UMWAIT, TPAUSE,
594
595      // Enqueue Stores Instructions
596      ENQCMD, ENQCMDS,
597
598      // For avx512-vp2intersect
599      VP2INTERSECT,
600
601      /// X86 strict FP compare instructions.
602      STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
603      STRICT_FCMPS,
604
605      // Vector packed double/float comparison.
606      STRICT_CMPP,
607
608      /// Vector comparison generating mask bits for fp and
609      /// integer signed and unsigned data types.
610      STRICT_CMPM,
611
612      // Vector float/double to signed/unsigned integer with truncation.
613      STRICT_CVTTP2SI, STRICT_CVTTP2UI,
614
615      // Vector FP extend.
616      STRICT_VFPEXT,
617
618      // Vector FP round.
619      STRICT_VFPROUND,
620
621      // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
622      // Also used by the legacy (V)ROUND intrinsics where we mask out the
623      // scaling part of the immediate.
624      STRICT_VRNDSCALE,
625
626      // Vector signed/unsigned integer to float/double.
627      STRICT_CVTSI2P, STRICT_CVTUI2P,
628
629      // Compare and swap.
630      LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
631      LCMPXCHG8_DAG,
632      LCMPXCHG16_DAG,
633      LCMPXCHG8_SAVE_EBX_DAG,
634      LCMPXCHG16_SAVE_RBX_DAG,
635
636      /// LOCK-prefixed arithmetic read-modify-write instructions.
637      /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
638      LADD, LSUB, LOR, LXOR, LAND,
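      // For illustration (not part of the original comment): these are used
      // when only the flags of an atomic read-modify-write are needed, e.g. an
      // atomic increment whose result is only compared against zero can be
      // selected as a single locked ADD followed by a branch on EFLAGS,
      // instead of LOCK XADD plus a separate compare.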
639
640      // Load, scalar_to_vector, and zero extend.
641      VZEXT_LOAD,
642
643      // extract_vector_elt, store.
644      VEXTRACT_STORE,
645
646      // scalar broadcast from memory
647      VBROADCAST_LOAD,
648
649      // Store FP control word into i16 memory.
650      FNSTCW16m,
651
652      /// This instruction implements FP_TO_SINT with the
653      /// integer destination in memory and a FP reg source.  This corresponds
654      /// to the X86::FIST*m instructions and the rounding mode change stuff. It
655      /// has two inputs (token chain and address) and two outputs (int value
656      /// and token chain). Memory VT specifies the type to store to.
657      FP_TO_INT_IN_MEM,
658
659      /// This instruction implements SINT_TO_FP with the
660      /// integer source in memory and FP reg result.  This corresponds to the
661      /// X86::FILD*m instructions. It has two inputs (token chain and address)
662      /// and two outputs (FP value and token chain). FILD_FLAG also produces a
663      /// flag. The integer source type is specified by the memory VT.
664      FILD,
665      FILD_FLAG,
666
667      /// This instruction implements an fp->int store from FP stack
668      /// slots. This corresponds to the fist instruction. It takes a
669      /// chain operand, value to store, address, and glue. The memory VT
670      /// specifies the type to store as.
671      FIST,
672
673      /// This instruction implements an extending load to FP stack slots.
674      /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
675      /// operand, and ptr to load from. The memory VT specifies the type to
676      /// load from.
677      FLD,
678
679      /// This instruction implements a truncating store from FP stack
680      /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
681      /// chain operand, value to store, address, and glue. The memory VT
682      /// specifies the type to store as.
683      FST,
684
685      /// This instruction grabs the address of the next argument
686      /// from a va_list. (reads and modifies the va_list in memory)
687      VAARG_64,
688
689      // Vector truncating store with unsigned/signed saturation
690      VTRUNCSTOREUS, VTRUNCSTORES,
691      // Vector truncating masked store with unsigned/signed saturation
692      VMTRUNCSTOREUS, VMTRUNCSTORES,
693
694      // X86 specific gather and scatter
695      MGATHER, MSCATTER,
696
697      // WARNING: Do not add anything at the end unless you want the node to
698      // have a memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
699      // opcodes will be treated as target memory ops!
700    };
701  } // end namespace X86ISD
702
703  /// Define some predicates that are used for node matching.
704  namespace X86 {
705    /// Returns true if Elt is a constant zero or floating point constant +0.0.
706    bool isZeroNode(SDValue Elt);
707
708    /// Returns true if the given offset can
709    /// fit into the displacement field of the instruction.
710    bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
711                                      bool hasSymbolicDisplacement = true);
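    /// For example, x86 memory operands encode at most a signed 32-bit
    /// displacement, so under the small code model an offset that does not fit
    /// in 32 bits (say 1LL << 40) would be expected to be rejected here.
    /// (Illustrative note, not part of the original comment.)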
712
713    /// Determines whether the callee is required to pop its
714    /// own arguments. Callee pop is necessary to support tail calls.
715    bool isCalleePop(CallingConv::ID CallingConv,
716                     bool is64Bit, bool IsVarArg, bool GuaranteeTCO);
717
718    /// If Op is a constant whose elements are all the same constant or
719    /// undefined, return true and return the constant value in \p SplatVal.
720    bool isConstantSplat(SDValue Op, APInt &SplatVal);
721  } // end namespace X86
722
723  //===--------------------------------------------------------------------===//
724  //  X86 Implementation of the TargetLowering interface
725  class X86TargetLowering final : public TargetLowering {
726  public:
727    explicit X86TargetLowering(const X86TargetMachine &TM,
728                               const X86Subtarget &STI);
729
730    unsigned getJumpTableEncoding() const override;
731    bool useSoftFloat() const override;
732
733    void markLibCallAttributes(MachineFunction *MF, unsigned CC,
734                               ArgListTy &Args) const override;
735
736    MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override {
737      return MVT::i8;
738    }
739
740    const MCExpr *
741    LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
742                              const MachineBasicBlock *MBB, unsigned uid,
743                              MCContext &Ctx) const override;
744
745    /// Returns relocation base for the given PIC jumptable.
746    SDValue getPICJumpTableRelocBase(SDValue Table,
747                                     SelectionDAG &DAG) const override;
748    const MCExpr *
749    getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
750                                 unsigned JTI, MCContext &Ctx) const override;
751
752    /// Return the desired alignment for ByVal aggregate
753    /// function arguments in the caller parameter area. For X86, aggregates
754    /// that contain SSE vectors are placed at 16-byte boundaries while the rest
755    /// are at 4-byte boundaries.
756    unsigned getByValTypeAlignment(Type *Ty,
757                                   const DataLayout &DL) const override;
758
759    /// Returns the target specific optimal type for load
760    /// and store operations as a result of memset, memcpy, and memmove
761    /// lowering. If DstAlign is zero, that means the destination's
762    /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero it
763    /// means there is no need to check it against an alignment requirement,
764    /// probably because the source does not need to be loaded. If 'IsMemset' is
765    /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
766    /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
767    /// source is constant so it does not need to be loaded.
768    /// It returns EVT::Other if the type should be determined using generic
769    /// target-independent logic.
770    EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
771                            bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
772                            const AttributeList &FuncAttributes) const override;
773
774    /// Returns true if it's safe to use load / store of the
775    /// specified type to expand memcpy / memset inline. This is mostly true
776    /// for all types except for some special cases. For example, on X86
777    /// targets without SSE2 f64 load / store are done with fldl / fstpl which
778    /// also does type conversion. Note the specified type doesn't have to be
779    /// legal as the hook is used before type legalization.
780    bool isSafeMemOpType(MVT VT) const override;
781
782    /// Returns true if the target allows unaligned memory accesses of the
783    /// specified type. Returns whether it is "fast" in the last argument.
784    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
785                                        MachineMemOperand::Flags Flags,
786                                        bool *Fast) const override;
787
788    /// Provide custom lowering hooks for some operations.
789    ///
790    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
791
792    /// Places new result values for the node in Results (their number
793    /// and types must exactly match those of the original return values of
794    /// the node), or leaves Results empty, which indicates that the node is not
795    /// to be custom lowered after all.
796    void LowerOperationWrapper(SDNode *N,
797                               SmallVectorImpl<SDValue> &Results,
798                               SelectionDAG &DAG) const override;
799
800    /// Replace the results of node with an illegal result
801    /// type with new values built out of custom code.
802    ///
803    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
804                            SelectionDAG &DAG) const override;
805
806    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
807
808    // Return true if it is profitable to combine a BUILD_VECTOR with a
809    // stride-pattern to a shuffle and a truncate.
810    // Example of such a combine:
811    // v4i32 build_vector((extract_elt V, 1),
812    //                    (extract_elt V, 3),
813    //                    (extract_elt V, 5),
814    //                    (extract_elt V, 7))
815    //  -->
816    // v4i32 truncate (bitcast (shuffle<1,u,3,u,4,u,5,u,6,u,7,u> V, u) to
817    // v4i64)
818    bool isDesirableToCombineBuildVectorToShuffleTruncate(
819        ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const override;
820
821    /// Return true if the target has native support for
822    /// the specified value type and it is 'desirable' to use the type for the
823    /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
824    /// instruction encodings are longer and some i16 instructions are slow.
825    bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
826
827    /// Return true if the target has native support for the
828    /// specified value type and it is 'desirable' to use the type. e.g. On x86
829    /// i16 is legal, but undesirable since i16 instruction encodings are longer
830    /// and some i16 instructions are slow.
831    bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
832
833    /// Return 1 if we can compute the negated form of the specified expression
834    /// for the same cost as the expression itself, or 2 if we can compute the
835    /// negated form more cheaply than the expression itself. Else return 0.
836    char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, bool LegalOperations,
837                            bool ForCodeSize, unsigned Depth) const override;
838
839    /// If isNegatibleForFree returns true, return the newly negated expression.
840    SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
841                                 bool LegalOperations, bool ForCodeSize,
842                                 unsigned Depth) const override;
843
844    MachineBasicBlock *
845    EmitInstrWithCustomInserter(MachineInstr &MI,
846                                MachineBasicBlock *MBB) const override;
847
848    /// This method returns the name of a target specific DAG node.
849    const char *getTargetNodeName(unsigned Opcode) const override;
850
851    /// Do not merge vector stores after legalization because that may conflict
852    /// with x86-specific store splitting optimizations.
853    bool mergeStoresAfterLegalization(EVT MemVT) const override {
854      return !MemVT.isVector();
855    }
856
857    bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
858                          const SelectionDAG &DAG) const override;
859
860    bool isCheapToSpeculateCttz() const override;
861
862    bool isCheapToSpeculateCtlz() const override;
863
864    bool isCtlzFast() const override;
865
866    bool hasBitPreservingFPLogic(EVT VT) const override {
867      return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
868    }
869
870    bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
871      // If the pair to store is a mixture of float and int values, we will
872      // save two bitwise instructions and one float-to-int instruction and
873      // add one store instruction. There is potentially a more
874      // significant benefit because it avoids the float->int domain switch
875      // for the input value, so it is more likely a win.
876      if ((LTy.isFloatingPoint() && HTy.isInteger()) ||
877          (LTy.isInteger() && HTy.isFloatingPoint()))
878        return true;
879      // If the pair only contains int values, we would save two bitwise
880      // instructions and add one store instruction (costing one more
881      // store buffer). Since the benefit is less clear, we leave
882      // such pairs out until we have a test case proving it is a win.
883      return false;
884    }
885
886    bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
887
888    bool hasAndNotCompare(SDValue Y) const override;
889
890    bool hasAndNot(SDValue Y) const override;
891
892    bool hasBitTest(SDValue X, SDValue Y) const override;
893
894    bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
895        SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
896        unsigned OldShiftOpcode, unsigned NewShiftOpcode,
897        SelectionDAG &DAG) const override;
898
899    bool shouldFoldConstantShiftPairToMask(const SDNode *N,
900                                           CombineLevel Level) const override;
901
902    bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override;
903
904    bool
905    shouldTransformSignedTruncationCheck(EVT XVT,
906                                         unsigned KeptBits) const override {
907      // For vectors, we don't have a preference.
908      if (XVT.isVector())
909        return false;
910
911      auto VTIsOk = [](EVT VT) -> bool {
912        return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
913               VT == MVT::i64;
914      };
915
916      // We are ok with KeptBitsVT being byte/word/dword, which MOVS supports.
917      // XVT will be larger than KeptBitsVT.
918      MVT KeptBitsVT = MVT::getIntegerVT(KeptBits);
919      return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
920    }
921
922    bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
923
924    bool shouldSplatInsEltVarIndex(EVT VT) const override;
925
926    bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
927      return VT.isScalarInteger();
928    }
929
930    /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
931    MVT hasFastEqualityCompare(unsigned NumBits) const override;
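    /// For illustration (not part of the original comment): this is what lets
    /// a 16-byte equality test such as memcmp(a, b, 16) == 0 be expanded to a
    /// single PCMPEQB plus PMOVMSKB and one scalar compare of the resulting
    /// mask, rather than a chain of 8-byte integer compares.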
932
933    /// Return the value type to use for ISD::SETCC.
934    EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
935                           EVT VT) const override;
936
937    bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
938                                      TargetLoweringOpt &TLO) const override;
939
940    /// Determine which of the bits specified in Mask are known to be either
941    /// zero or one and return them in the KnownZero/KnownOne bitsets.
942    void computeKnownBitsForTargetNode(const SDValue Op,
943                                       KnownBits &Known,
944                                       const APInt &DemandedElts,
945                                       const SelectionDAG &DAG,
946                                       unsigned Depth = 0) const override;
947
948    /// Determine the number of bits in the operation that are sign bits.
949    unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
950                                             const APInt &DemandedElts,
951                                             const SelectionDAG &DAG,
952                                             unsigned Depth) const override;
953
954    bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op,
955                                                 const APInt &DemandedElts,
956                                                 APInt &KnownUndef,
957                                                 APInt &KnownZero,
958                                                 TargetLoweringOpt &TLO,
959                                                 unsigned Depth) const override;
960
961    bool SimplifyDemandedBitsForTargetNode(SDValue Op,
962                                           const APInt &DemandedBits,
963                                           const APInt &DemandedElts,
964                                           KnownBits &Known,
965                                           TargetLoweringOpt &TLO,
966                                           unsigned Depth) const override;
967
968    SDValue SimplifyMultipleUseDemandedBitsForTargetNode(
969        SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
970        SelectionDAG &DAG, unsigned Depth) const override;
971
972    const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
973
974    SDValue unwrapAddress(SDValue N) const override;
975
976    SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
977
978    bool ExpandInlineAsm(CallInst *CI) const override;
979
980    ConstraintType getConstraintType(StringRef Constraint) const override;
981
982    /// Examine constraint string and operand type and determine a weight value.
983    /// The operand object must already have been set up with the operand type.
984    ConstraintWeight
985      getSingleConstraintMatchWeight(AsmOperandInfo &info,
986                                     const char *constraint) const override;
987
988    const char *LowerXConstraint(EVT ConstraintVT) const override;
989
990    /// Lower the specified operand into the Ops vector. If it is invalid, don't
991    /// add anything to Ops. If hasMemory is true it means one of the asm
992    /// constraint of the inline asm instruction being processed is 'm'.
993    void LowerAsmOperandForConstraint(SDValue Op,
994                                      std::string &Constraint,
995                                      std::vector<SDValue> &Ops,
996                                      SelectionDAG &DAG) const override;
997
998    unsigned
999    getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
1000      if (ConstraintCode == "o")
1001        return InlineAsm::Constraint_o;
1002      else if (ConstraintCode == "v")
1003        return InlineAsm::Constraint_v;
1004      else if (ConstraintCode == "X")
1005        return InlineAsm::Constraint_X;
1006      return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
1007    }
1008
1009    /// Handle Lowering flag assembly outputs.
1010    SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, SDLoc DL,
1011                                        const AsmOperandInfo &Constraint,
1012                                        SelectionDAG &DAG) const override;
1013
1014    /// Given a physical register constraint
1015    /// (e.g. {edx}), return the register number and the register class for the
1016    /// register.  This should only be used for C_Register constraints.  On
1017    /// error, this returns a register number of 0.
1018    std::pair<unsigned, const TargetRegisterClass *>
1019    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
1020                                 StringRef Constraint, MVT VT) const override;
1021
1022    /// Return true if the addressing mode represented
1023    /// by AM is legal for this target, for a load/store of the specified type.
1024    bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
1025                               Type *Ty, unsigned AS,
1026                               Instruction *I = nullptr) const override;
1027
1028    /// Return true if the specified immediate is a legal
1029    /// icmp immediate, that is, the target has icmp instructions which can
1030    /// compare a register against the immediate without having to materialize
1031    /// the immediate into a register.
1032    bool isLegalICmpImmediate(int64_t Imm) const override;
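    /// For example, on x86-64 CMP against a register accepts a sign-extended
    /// 32-bit immediate, so values in the i32 range would be legal here, while
    /// a 64-bit constant such as 1LL << 32 would first have to be moved into a
    /// register. (Illustrative note, not part of the original comment.)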
1033
1034    /// Return true if the specified immediate is a legal
1035    /// add immediate, that is, the target has add instructions which can
1036    /// add a register and the immediate without having to materialize
1037    /// the immediate into a register.
1038    bool isLegalAddImmediate(int64_t Imm) const override;
1039
1040    bool isLegalStoreImmediate(int64_t Imm) const override;
1041
1042    /// Return the cost of the scaling factor used in the addressing
1043    /// mode represented by AM for this target, for a load/store
1044    /// of the specified type.
1045    /// If the AM is supported, the return value must be >= 0.
1046    /// If the AM is not supported, it returns a negative value.
1047    int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
1048                             unsigned AS) const override;
1049
1050    bool isVectorShiftByScalarCheap(Type *Ty) const override;
1051
1052    /// Add x86-specific opcodes to the default list.
1053    bool isBinOp(unsigned Opcode) const override;
1054
1055    /// Returns true if the opcode is a commutative binary operation.
1056    bool isCommutativeBinOp(unsigned Opcode) const override;
1057
1058    /// Return true if it's free to truncate a value of
1059    /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate an i32 value in
1060    /// register EAX to i16 by referencing its sub-register AX.
1061    bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
1062    bool isTruncateFree(EVT VT1, EVT VT2) const override;
1063
1064    bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
1065
1066    /// Return true if any actual instruction that defines a
1067    /// value of type Ty1 implicit zero-extends the value to Ty2 in the result
1068    /// register. This does not necessarily include registers defined in
1069    /// unknown ways, such as incoming arguments, or copies from unknown
1070    /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
1071    /// does not necessarily apply to truncate instructions. e.g. on x86-64,
1072    /// all instructions that define 32-bit values implicit zero-extend the
1073    /// result out to 64 bits.
1074    bool isZExtFree(Type *Ty1, Type *Ty2) const override;
1075    bool isZExtFree(EVT VT1, EVT VT2) const override;
1076    bool isZExtFree(SDValue Val, EVT VT2) const override;
1077
1078    /// Return true if folding a vector load into ExtVal (a sign, zero, or any
1079    /// extend node) is profitable.
1080    bool isVectorLoadExtDesirable(SDValue) const override;
1081
1082    /// Return true if an FMA operation is faster than a pair of fmul and fadd
1083    /// instructions. fmuladd intrinsics will be expanded to FMAs when this
1084    /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
1085    bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
1086                                    EVT VT) const override;
1087
1088    /// Return true if it's profitable to narrow
1089    /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
1090    /// from i32 to i8 but not from i32 to i16.
1091    bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
1092
1093    /// Given an intrinsic, checks whether on this target the intrinsic needs to map
1094    /// to a MemIntrinsicNode (touches memory). If this is the case, it returns
1095    /// true and stores the intrinsic information into the IntrinsicInfo that was
1096    /// passed to the function.
1097    bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
1098                            MachineFunction &MF,
1099                            unsigned Intrinsic) const override;
1100
1101    /// Returns true if the target can instruction select the
1102    /// specified FP immediate natively. If false, the legalizer will
1103    /// materialize the FP immediate as a load from a constant pool.
1104    bool isFPImmLegal(const APFloat &Imm, EVT VT,
1105                      bool ForCodeSize) const override;
1106
1107    /// Targets can use this to indicate that they only support *some*
1108    /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
1109    /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
1110    /// be legal.
1111    bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
1112
1113    /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
1114    /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
1115    /// constant pool entry.
1116    bool isVectorClearMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
1117
1118    /// Returns true if lowering to a jump table is allowed.
1119    bool areJTsAllowed(const Function *Fn) const override;
1120
1121    /// If true, then instruction selection should
1122    /// seek to shrink the FP constant of the specified type to a smaller type
1123    /// in order to save space and/or reduce runtime.
1124    bool ShouldShrinkFPConstant(EVT VT) const override {
1125      // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
1126      // expensive than a straight movsd. On the other hand, it's important to
1127      // shrink long double fp constant since fldt is very slow.
1128      return !X86ScalarSSEf64 || VT == MVT::f80;
1129    }
1130
1131    /// Return true if we believe it is correct and profitable to reduce the
1132    /// load node to a smaller type.
1133    bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
1134                               EVT NewVT) const override;
1135
1136    /// Return true if the specified scalar FP type is computed in an SSE
1137    /// register, not on the X87 floating point stack.
1138    bool isScalarFPTypeInSSEReg(EVT VT) const {
1139      return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
1140             (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
1141    }
1142
1143    /// Returns true if it is beneficial to convert a load of a constant
1144    /// to just the constant itself.
1145    bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
1146                                           Type *Ty) const override;
1147
1148    bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override;
1149
1150    bool convertSelectOfConstantsToMath(EVT VT) const override;
1151
1152    bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
1153                                SDValue C) const override;
1154
1155    /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
1156    /// with this index.
1157    bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
1158                                 unsigned Index) const override;
1159
1160    /// Scalar ops always have equal or better analysis/performance/power than
1161    /// the vector equivalent, so this always makes sense if the scalar op is
1162    /// supported.
1163    bool shouldScalarizeBinop(SDValue) const override;
1164
1165    /// Extract of a scalar FP value from index 0 of a vector is free.
1166    bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
1167      EVT EltVT = VT.getScalarType();
1168      return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
1169    }
1170
1171    /// Overflow nodes should get combined/lowered to optimal instructions
1172    /// (they should allow eliminating explicit compares by getting flags from
1173    /// math ops).
1174    bool shouldFormOverflowOp(unsigned Opcode, EVT VT) const override;
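    /// For illustration (not part of the original comment): an unsigned add
    /// whose overflow is checked with a following `sum < a` compare can
    /// instead be formed as ISD::UADDO, letting the carry flag produced by the
    /// ADD feed a SETB/JB directly and eliminating the explicit compare.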
1175
1176    bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
1177                                      unsigned AddrSpace) const override {
1178      // If we can replace more than 2 scalar stores, there will be a reduction
1179      // in instructions even after we add a vector constant load.
1180      return NumElem > 2;
1181    }
1182
1183    bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
1184                                 const SelectionDAG &DAG,
1185                                 const MachineMemOperand &MMO) const override;
1186
1187    /// Intel processors have a unified instruction and data cache.
1188    const char * getClearCacheBuiltinName() const override {
1189      return nullptr; // nothing to do, move along.
1190    }
1191
1192    Register getRegisterByName(const char* RegName, LLT VT,
1193                               const MachineFunction &MF) const override;
1194
1195    /// If a physical register, this returns the register that receives the
1196    /// exception address on entry to an EH pad.
1197    unsigned
1198    getExceptionPointerRegister(const Constant *PersonalityFn) const override;
1199
1200    /// If a physical register, this returns the register that receives the
1201    /// exception typeid on entry to a landing pad.
1202    unsigned
1203    getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
1204
1205    virtual bool needsFixedCatchObjects() const override;
1206
1207    /// This method returns a target specific FastISel object,
1208    /// or null if the target does not support "fast" ISel.
1209    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
1210                             const TargetLibraryInfo *libInfo) const override;
1211
1212    /// If the target has a standard location for the stack protector cookie,
1213    /// returns the address of that location. Otherwise, returns nullptr.
1214    Value *getIRStackGuard(IRBuilder<> &IRB) const override;
1215
1216    bool useLoadStackGuardNode() const override;
1217    bool useStackGuardXorFP() const override;
1218    void insertSSPDeclarations(Module &M) const override;
1219    Value *getSDagStackGuard(const Module &M) const override;
1220    Function *getSSPStackGuardCheck(const Module &M) const override;
1221    SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
1222                                const SDLoc &DL) const override;
1223
1224
1225    /// Return true if the target stores SafeStack pointer at a fixed offset in
1226    /// some non-standard address space, and populates the address space and
1227    /// offset as appropriate.
1228    Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
1229
1230    std::pair<SDValue, SDValue> BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
1231                                          SDValue StackSlot,
1232                                          SelectionDAG &DAG) const;
1233
1234    bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
1235
1236    /// Customize the preferred legalization strategy for certain types.
1237    LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
1238
1239    MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
1240                                      EVT VT) const override;
1241
1242    unsigned getNumRegistersForCallingConv(LLVMContext &Context,
1243                                           CallingConv::ID CC,
1244                                           EVT VT) const override;
1245
1246    unsigned getVectorTypeBreakdownForCallingConv(
1247        LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1248        unsigned &NumIntermediates, MVT &RegisterVT) const override;
1249
1250    bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
1251
1252    bool supportSwiftError() const override;
1253
1254    StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
1255
1256    unsigned getStackProbeSize(MachineFunction &MF) const;
1257
1258    bool hasVectorBlend() const override { return true; }
1259
1260    unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
1261
1262    /// Lower interleaved load(s) into target specific
1263    /// instructions/intrinsics.
1264    bool lowerInterleavedLoad(LoadInst *LI,
1265                              ArrayRef<ShuffleVectorInst *> Shuffles,
1266                              ArrayRef<unsigned> Indices,
1267                              unsigned Factor) const override;
1268
1269    /// Lower interleaved store(s) into target specific
1270    /// instructions/intrinsics.
1271    bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
1272                               unsigned Factor) const override;
1273
1274    SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value,
1275                                   SDValue Addr, SelectionDAG &DAG)
1276                                   const override;
1277
1278  protected:
1279    std::pair<const TargetRegisterClass *, uint8_t>
1280    findRepresentativeClass(const TargetRegisterInfo *TRI,
1281                            MVT VT) const override;
1282
1283  private:
1284    /// Keep a reference to the X86Subtarget around so that we can
1285    /// make the right decision when generating code for different targets.
1286    const X86Subtarget &Subtarget;
1287
1288    /// Select between SSE or x87 floating point ops.
1289    /// When SSE is available, use it for f32 operations.
1290    /// When SSE2 is available, use it for f64 operations.
1291    bool X86ScalarSSEf32;
1292    bool X86ScalarSSEf64;
1293
1294    /// A list of legal FP immediates.
1295    std::vector<APFloat> LegalFPImmediates;
1296
1297    /// Indicate that this x86 target can instruction
1298    /// select the specified FP immediate natively.
1299    void addLegalFPImmediate(const APFloat& Imm) {
1300      LegalFPImmediates.push_back(Imm);
1301    }
1302
1303    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
1304                            CallingConv::ID CallConv, bool isVarArg,
1305                            const SmallVectorImpl<ISD::InputArg> &Ins,
1306                            const SDLoc &dl, SelectionDAG &DAG,
1307                            SmallVectorImpl<SDValue> &InVals,
1308                            uint32_t *RegMask) const;
1309    SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
1310                             const SmallVectorImpl<ISD::InputArg> &ArgInfo,
1311                             const SDLoc &dl, SelectionDAG &DAG,
1312                             const CCValAssign &VA, MachineFrameInfo &MFI,
1313                             unsigned i) const;
1314    SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
1315                             const SDLoc &dl, SelectionDAG &DAG,
1316                             const CCValAssign &VA,
1317                             ISD::ArgFlagsTy Flags) const;
1318
1319    // Call lowering helpers.
1320
1321    /// Check whether the call is eligible for tail call optimization. Targets
1322    /// that want to do tail call optimization should implement this function.
1323    bool IsEligibleForTailCallOptimization(SDValue Callee,
1324                                           CallingConv::ID CalleeCC,
1325                                           bool isVarArg,
1326                                           bool isCalleeStructRet,
1327                                           bool isCallerStructRet,
1328                                           Type *RetTy,
1329                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
1330                                    const SmallVectorImpl<SDValue> &OutVals,
1331                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1332                                           SelectionDAG& DAG) const;
1333    SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
1334                                    SDValue Chain, bool IsTailCall,
1335                                    bool Is64Bit, int FPDiff,
1336                                    const SDLoc &dl) const;
1337
1338    unsigned GetAlignedArgumentStackSize(unsigned StackSize,
1339                                         SelectionDAG &DAG) const;
1340
1341    unsigned getAddressSpace(void) const;
1342
1343    SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool isSigned,
1344                            SDValue &Chain) const;
1345
1346    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
1347    SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
1348    SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
1349    SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
1350
1351    unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr,
1352                                  const unsigned char OpFlags = 0) const;
1353    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
1354    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
1355    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
1356    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
1357    SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
1358
1359    /// Creates target global address or external symbol nodes for calls or
1360    /// other uses.
1361    SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
1362                                  bool ForCall) const;
1363
1364    SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
1365    SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
1366    SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
1367    SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
1368    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
1369    SDValue LowerSTRICT_FSETCC(SDValue Op, SelectionDAG &DAG) const;
1370    SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
1371    SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
1372    SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
1373    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
1374    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
1375    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
1376    SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
1377    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
1378    SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
1379    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
1380    SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
1381    SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
1382    SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
1383    SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
1384    SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
1385    SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
1386    SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
1387    SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
1388    SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
1389    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
1390    SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;
1391    SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
1392    SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
1393
1394    SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
1395                          RTLIB::Libcall Call) const;
1396
1397    SDValue
1398    LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1399                         const SmallVectorImpl<ISD::InputArg> &Ins,
1400                         const SDLoc &dl, SelectionDAG &DAG,
1401                         SmallVectorImpl<SDValue> &InVals) const override;
1402    SDValue LowerCall(CallLoweringInfo &CLI,
1403                      SmallVectorImpl<SDValue> &InVals) const override;
1404
1405    SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1406                        const SmallVectorImpl<ISD::OutputArg> &Outs,
1407                        const SmallVectorImpl<SDValue> &OutVals,
1408                        const SDLoc &dl, SelectionDAG &DAG) const override;
1409
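    // Split callee-saved-register handling is only supported for the
    // CXX_FAST_TLS calling convention on functions that cannot unwind.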
1410    bool supportSplitCSR(MachineFunction *MF) const override {
1411      return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
1412          MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
1413    }
1414    void initializeSplitCSR(MachineBasicBlock *Entry) const override;
1415    void insertCopiesSplitCSR(
1416      MachineBasicBlock *Entry,
1417      const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
1418
1419    bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
1420
1421    bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
1422
1423    EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
1424                            ISD::NodeType ExtendKind) const override;
1425
1426    bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
1427                        bool isVarArg,
1428                        const SmallVectorImpl<ISD::OutputArg> &Outs,
1429                        LLVMContext &Context) const override;
1430
1431    const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
1432
1433    TargetLoweringBase::AtomicExpansionKind
1434    shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
1435    bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
1436    TargetLoweringBase::AtomicExpansionKind
1437    shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
1438
1439    LoadInst *
1440    lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
1441
1442    bool lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const override;
1443    bool lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const override;
1444
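    // Whether an atomic operation on MemType has to be lowered with the
    // double-width CMPXCHG8B/CMPXCHG16B instruction; see the implementation
    // for the exact subtarget checks.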
1445    bool needsCmpXchgNb(Type *MemType) const;
1446
1447    void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
1448                                MachineBasicBlock *DispatchBB, int FI) const;
1449
1450    /// Utility function to emit the low-level va_arg code for X86-64.
1451    MachineBasicBlock *
1452    EmitVAARG64WithCustomInserter(MachineInstr &MI,
1453                                  MachineBasicBlock *MBB) const;
1454
1455    /// Utility function to emit the xmm reg save portion of va_start.
1456    MachineBasicBlock *
1457    EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr,
1458                                             MachineBasicBlock *BB) const;
1459
1460    MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1,
1461                                                 MachineInstr &MI2,
1462                                                 MachineBasicBlock *BB) const;
1463
1464    MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
1465                                         MachineBasicBlock *BB) const;
1466
1467    MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr &I,
1468                                           MachineBasicBlock *BB) const;
1469
1470    MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
1471                                           MachineBasicBlock *BB) const;
1472
1473    MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI,
1474                                           MachineBasicBlock *BB) const;
1475
1476    MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
1477                                            MachineBasicBlock *BB) const;
1478
1479    MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
1480                                          MachineBasicBlock *BB) const;
1481
1482    MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
1483                                          MachineBasicBlock *BB) const;
1484
1485    MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI,
1486                                                MachineBasicBlock *BB) const;
1487
1488    MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
1489                                        MachineBasicBlock *MBB) const;
1490
1491    void emitSetJmpShadowStackFix(MachineInstr &MI,
1492                                  MachineBasicBlock *MBB) const;
1493
1494    MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
1495                                         MachineBasicBlock *MBB) const;
1496
1497    MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
1498                                                 MachineBasicBlock *MBB) const;
1499
1500    MachineBasicBlock *emitFMA3Instr(MachineInstr &MI,
1501                                     MachineBasicBlock *MBB) const;
1502
1503    MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
1504                                             MachineBasicBlock *MBB) const;
1505
1506    /// Convert a comparison if required by the subtarget.
1507    SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
1508
1509    /// Emit flags for the given setcc condition and operands. Also returns the
1510    /// corresponding X86 condition code constant in X86CC.
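    /// (For example, an integer ISD::SETEQ typically yields an X86ISD::CMP whose
    /// EFLAGS result is then consumed with X86::COND_E.)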
1511    SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC,
1512                              const SDLoc &dl, SelectionDAG &DAG,
1513                              SDValue &X86CC, SDValue &Chain,
1514                              bool IsSignaling) const;
1515
1516    /// Check if replacement of SQRT with RSQRT should be disabled.
1517    bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
1518
1519    /// Use rsqrt* to speed up sqrt calculations.
1520    SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
1521                            int &RefinementSteps, bool &UseOneConstNR,
1522                            bool Reciprocal) const override;
1523
1524    /// Use rcp* to speed up fdiv calculations.
1525    SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
1526                             int &RefinementSteps) const override;
1527
1528    /// Reassociate repeated FP divisions into multiplies by the reciprocal.
1529    unsigned combineRepeatedFPDivisors() const override;
1530
1531    SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
1532                          SmallVectorImpl<SDNode *> &Created) const override;
1533  };
1534
1535  namespace X86 {
1536    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
1537                             const TargetLibraryInfo *libInfo);
1538  } // end namespace X86
1539
1540  // Base class for all X86 non-masked store operations.
1541  class X86StoreSDNode : public MemSDNode {
1542  public:
1543    X86StoreSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl,
1544                   SDVTList VTs, EVT MemVT,
1545                   MachineMemOperand *MMO)
1546      : MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
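    // Operands follow a plain store: 0 = chain, 1 = value, 2 = base pointer.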
1547    const SDValue &getValue() const { return getOperand(1); }
1548    const SDValue &getBasePtr() const { return getOperand(2); }
1549
1550    static bool classof(const SDNode *N) {
1551      return N->getOpcode() == X86ISD::VTRUNCSTORES ||
1552             N->getOpcode() == X86ISD::VTRUNCSTOREUS;
1553    }
1554  };
1555
1556  // Base class for all X86 masked store operations.
1557  // The class has the same order of operands as MaskedStoreSDNode for
1558  // convenience.
1559  class X86MaskedStoreSDNode : public MemSDNode {
1560  public:
1561    X86MaskedStoreSDNode(unsigned Opcode, unsigned Order,
1562                         const DebugLoc &dl, SDVTList VTs, EVT MemVT,
1563                         MachineMemOperand *MMO)
1564      : MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
1565
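    // Operand layout: 0 = chain, 1 = value, 2 = base pointer, 3 = mask.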
1566    const SDValue &getValue()   const { return getOperand(1); }
1567    const SDValue &getBasePtr() const { return getOperand(2); }
1568    const SDValue &getMask()    const { return getOperand(3); }
1569
1570    static bool classof(const SDNode *N) {
1571      return N->getOpcode() == X86ISD::VMTRUNCSTORES ||
1572             N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
1573    }
1574  };
1575
1576  // X86 Truncating Store with Signed saturation.
1577  class TruncSStoreSDNode : public X86StoreSDNode {
1578  public:
1579    TruncSStoreSDNode(unsigned Order, const DebugLoc &dl,
1580                        SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
1581      : X86StoreSDNode(X86ISD::VTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
1582
1583    static bool classof(const SDNode *N) {
1584      return N->getOpcode() == X86ISD::VTRUNCSTORES;
1585    }
1586  };
1587
1588  // X86 Truncating Store with Unsigned saturation.
1589  class TruncUSStoreSDNode : public X86StoreSDNode {
1590  public:
1591    TruncUSStoreSDNode(unsigned Order, const DebugLoc &dl,
1592                      SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
1593      : X86StoreSDNode(X86ISD::VTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
1594
1595    static bool classof(const SDNode *N) {
1596      return N->getOpcode() == X86ISD::VTRUNCSTOREUS;
1597    }
1598  };
1599
1600  // X86 Truncating Masked Store with Signed saturation.
1601  class MaskedTruncSStoreSDNode : public X86MaskedStoreSDNode {
1602  public:
1603    MaskedTruncSStoreSDNode(unsigned Order,
1604                         const DebugLoc &dl, SDVTList VTs, EVT MemVT,
1605                         MachineMemOperand *MMO)
1606      : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
1607
1608    static bool classof(const SDNode *N) {
1609      return N->getOpcode() == X86ISD::VMTRUNCSTORES;
1610    }
1611  };
1612
1613  // X86 Truncating Masked Store with Unsigned saturation.
1614  class MaskedTruncUSStoreSDNode : public X86MaskedStoreSDNode {
1615  public:
1616    MaskedTruncUSStoreSDNode(unsigned Order,
1617                            const DebugLoc &dl, SDVTList VTs, EVT MemVT,
1618                            MachineMemOperand *MMO)
1619      : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
1620
1621    static bool classof(const SDNode *N) {
1622      return N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
1623    }
1624  };
1625
1626  // X86 specific Gather/Scatter nodes.
1627  // The class has the same order of operands as MaskedGatherScatterSDNode for
1628  // convenience.
1629  class X86MaskedGatherScatterSDNode : public MemSDNode {
1630  public:
1631    X86MaskedGatherScatterSDNode(unsigned Opc, unsigned Order,
1632                                 const DebugLoc &dl, SDVTList VTs, EVT MemVT,
1633                                 MachineMemOperand *MMO)
1634        : MemSDNode(Opc, Order, dl, VTs, MemVT, MMO) {}
1635
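    // Operand layout: 0 = chain, 1 = pass-through value (gather) or value to
    // scatter, 2 = mask, 3 = base pointer, 4 = index, 5 = scale.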
1636    const SDValue &getBasePtr() const { return getOperand(3); }
1637    const SDValue &getIndex()   const { return getOperand(4); }
1638    const SDValue &getMask()    const { return getOperand(2); }
1639    const SDValue &getScale()   const { return getOperand(5); }
1640
1641    static bool classof(const SDNode *N) {
1642      return N->getOpcode() == X86ISD::MGATHER ||
1643             N->getOpcode() == X86ISD::MSCATTER;
1644    }
1645  };
1646
1647  class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
1648  public:
1649    X86MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
1650                          EVT MemVT, MachineMemOperand *MMO)
1651        : X86MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT,
1652                                       MMO) {}
1653
1654    const SDValue &getPassThru() const { return getOperand(1); }
1655
1656    static bool classof(const SDNode *N) {
1657      return N->getOpcode() == X86ISD::MGATHER;
1658    }
1659  };
1660
1661  class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
1662  public:
1663    X86MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
1664                           EVT MemVT, MachineMemOperand *MMO)
1665        : X86MaskedGatherScatterSDNode(X86ISD::MSCATTER, Order, dl, VTs, MemVT,
1666                                       MMO) {}
1667
1668    const SDValue &getValue() const { return getOperand(1); }
1669
1670    static bool classof(const SDNode *N) {
1671      return N->getOpcode() == X86ISD::MSCATTER;
1672    }
1673  };
1674
1675  /// Generate unpacklo/unpackhi shuffle mask.
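  /// e.g. for v8i16 the binary unpacklo mask is <0,8,1,9,2,10,3,11> (the
  /// punpcklwd pattern) and the unary form is <0,0,1,1,2,2,3,3>.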
1676  template <typename T = int>
1677  void createUnpackShuffleMask(MVT VT, SmallVectorImpl<T> &Mask, bool Lo,
1678                               bool Unary) {
1679    assert(Mask.empty() && "Expected an empty shuffle mask vector");
1680    int NumElts = VT.getVectorNumElements();
1681    int NumEltsInLane = 128 / VT.getScalarSizeInBits();
1682    for (int i = 0; i < NumElts; ++i) {
1683      unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
1684      int Pos = (i % NumEltsInLane) / 2 + LaneStart;
1685      Pos += (Unary ? 0 : NumElts * (i % 2));
1686      Pos += (Lo ? 0 : NumEltsInLane / 2);
1687      Mask.push_back(Pos);
1688    }
1689  }
1690
1691  /// Helper function to scale a shuffle or target shuffle mask, replacing each
1692  /// mask index with the scaled sequential indices for an equivalent narrowed
1693  /// mask. This is the reverse process to canWidenShuffleElements, but can
1694  /// always succeed.
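  /// e.g. scaleShuffleMask(2, {1, 2, 3}, M) produces M = {2, 3, 4, 5, 6, 7},
  /// and sentinel values are repeated: {0, -1} becomes {0, 1, -1, -1}.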
1695  template <typename T>
1696  void scaleShuffleMask(size_t Scale, ArrayRef<T> Mask,
1697                        SmallVectorImpl<T> &ScaledMask) {
1698    assert(0 < Scale && "Unexpected scaling factor");
1699    size_t NumElts = Mask.size();
1700    ScaledMask.assign(NumElts * Scale, -1);
1701
1702    for (size_t i = 0; i != NumElts; ++i) {
1703      int M = Mask[i];
1704
1705      // Repeat sentinel values in every mask element.
1706      if (M < 0) {
1707        for (size_t s = 0; s != Scale; ++s)
1708          ScaledMask[(Scale * i) + s] = M;
1709        continue;
1710      }
1711
1712      // Scale mask element and increment across each mask element.
1713      for (size_t s = 0; s != Scale; ++s)
1714        ScaledMask[(Scale * i) + s] = (Scale * M) + s;
1715    }
1716  }
1717} // end namespace llvm
1718
1719#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
1720