1//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file describes the X86 SSE instruction set, defining the instructions,
11// and properties of the instructions which are needed for code generation,
12// machine code emission, and analysis.
13//
14//===----------------------------------------------------------------------===//
15
// Pairs the itinerary classes for the two operand forms of an instruction:
// register-register (rr) and register-memory (rm).
class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
  InstrItinClass rr = arg_rr;
  InstrItinClass rm = arg_rm;
}
20
// Pairs the operand itineraries for the two scalar/packed element sizes:
// single precision (s) and double precision (d).
class SizeItins<OpndItins arg_s, OpndItins arg_d> {
  OpndItins s = arg_s;
  OpndItins d = arg_d;
}
25
26
// Like OpndItins, but with a third itinerary for the register-immediate (ri)
// form used by the vector shift instructions.
class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
  InstrItinClass arg_ri> {
  InstrItinClass rr = arg_rr;
  InstrItinClass rm = arg_rm;
  InstrItinClass ri = arg_ri;
}
33
34
// Scalar (SS/SD) itineraries.
def SSE_ALU_F32S : OpndItins<
  IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
>;

def SSE_ALU_F64S : OpndItins<
  IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
>;

// Scalar add/sub/min/max, both element sizes.
def SSE_ALU_ITINS_S : SizeItins<
  SSE_ALU_F32S, SSE_ALU_F64S
>;
47
// Scalar single-precision multiply itinerary.
// Fix: the RM slot previously referenced IIC_SSE_MUL_F64S_RM (copy-paste
// from the F64S def below); it must use the matching F32S RM itinerary,
// consistent with every other F32 itinerary pair in this file.
def SSE_MUL_F32S : OpndItins<
  IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F32S_RM
>;
51
// Scalar double-precision multiply itinerary.
def SSE_MUL_F64S : OpndItins<
  IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
>;

// Scalar multiply, both element sizes.
def SSE_MUL_ITINS_S : SizeItins<
  SSE_MUL_F32S, SSE_MUL_F64S
>;
59
// Scalar single-precision divide itinerary.
// Fix: the RM slot previously referenced IIC_SSE_DIV_F64S_RM (copy-paste
// from the F64S def below); it must use the matching F32S RM itinerary.
def SSE_DIV_F32S : OpndItins<
  IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F32S_RM
>;
63
// Scalar double-precision divide itinerary.
def SSE_DIV_F64S : OpndItins<
  IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
>;

// Scalar divide, both element sizes.
def SSE_DIV_ITINS_S : SizeItins<
  SSE_DIV_F32S, SSE_DIV_F64S
>;
71
// Parallel (packed PS/PD) itineraries.
def SSE_ALU_F32P : OpndItins<
  IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
>;

def SSE_ALU_F64P : OpndItins<
  IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
>;

// Packed add/sub/min/max, both element sizes.
def SSE_ALU_ITINS_P : SizeItins<
  SSE_ALU_F32P, SSE_ALU_F64P
>;
84
// Packed single-precision multiply itinerary.
// Fix: the RM slot previously referenced IIC_SSE_MUL_F64P_RM (copy-paste
// from the F64P def below); it must use the matching F32P RM itinerary.
def SSE_MUL_F32P : OpndItins<
  IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F32P_RM
>;
88
// Packed double-precision multiply itinerary.
def SSE_MUL_F64P : OpndItins<
  IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
>;

// Packed multiply, both element sizes.
def SSE_MUL_ITINS_P : SizeItins<
  SSE_MUL_F32P, SSE_MUL_F64P
>;
96
// Packed single-precision divide itinerary.
// Fix: the RM slot previously referenced IIC_SSE_DIV_F64P_RM (copy-paste
// from the F64P def below); it must use the matching F32P RM itinerary.
def SSE_DIV_F32P : OpndItins<
  IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F32P_RM
>;
100
// Packed double-precision divide itinerary.
def SSE_DIV_F64P : OpndItins<
  IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
>;

// Packed divide, both element sizes.
def SSE_DIV_ITINS_P : SizeItins<
  SSE_DIV_F32P, SSE_DIV_F64P
>;
108
// Packed bitwise logic (and/or/xor/andn).
def SSE_BIT_ITINS_P : OpndItins<
  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
>;

// Packed integer ALU ops (non-quadword).
def SSE_INTALU_ITINS_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

// Packed integer ALU ops on quadword elements.
def SSE_INTALUQ_ITINS_P : OpndItins<
  IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
>;

// Packed integer multiply.
def SSE_INTMUL_ITINS_P : OpndItins<
  IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
>;

// Packed integer shifts; includes the register-immediate (ri) form.
def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
  IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
>;

// Aligned packed moves (MOVAPS/MOVAPD/MOVDQA).
def SSE_MOVA_ITINS : OpndItins<
  IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
>;

// Unaligned packed moves (MOVUPS/MOVUPD/MOVDQU).
def SSE_MOVU_ITINS : OpndItins<
  IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
>;
136
137//===----------------------------------------------------------------------===//
138// SSE 1 & 2 Instructions Classes
139//===----------------------------------------------------------------------===//
140
/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class.
/// Instantiates the rr and rm forms of a two-operand scalar FP instruction.
/// When Is2Addr is set (non-VEX encoding), $src1 is tied to $dst and only
/// $src2 appears in the asm string; otherwise the three-operand VEX syntax
/// is emitted. The rr form is marked commutable so the register allocator
/// may swap operands.
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           OpndItins itins,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr>;
  }
  // Memory form: the second operand is loaded and folded into the op.
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm>;
}
159
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class.
/// Like sse12_fp_scalar, but matches the corresponding intrinsic instead of
/// an SDNode. The intrinsic name is assembled at TableGen time as
/// "int_x86_sse" # SSEVer # "_" # OpcodeStr # FPSizeStr and resolved via
/// !cast<Intrinsic>.
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
                             string asm, string SSEVer, string FPSizeStr,
                             Operand memopr, ComplexPattern mem_cpat,
                             OpndItins itins,
                             bit Is2Addr = 1> {
  def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(
                 !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, RC:$src2))], itins.rr>;
  // Memory form: $src2 is matched through the mem_cpat complex pattern.
  def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
                                          SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, mem_cpat:$src2))], itins.rm>;
}
181
/// sse12_fp_packed - SSE 1 & 2 packed instructions class.
/// Instantiates the rr and rm forms of a two-operand packed FP instruction
/// in execution domain d. The vt parameter pins the vector type of the
/// register-register pattern; mem_frag is the load fragment folded into the
/// rm form.
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, OpndItins itins, bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>;
  // mayLoad is stated explicitly because the pattern alone may not imply it.
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
          itins.rm, d>;
}
201
/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class.
/// Variant of sse12_fp_packed where the caller supplies the full pattern
/// lists (pat_rr/pat_rm) directly, used by the packed logical ops.
/// NOTE(review): the rr_hasSideEffects parameter is assigned to
/// neverHasSideEffects, so its name reads inverted relative to its effect
/// (0 here means "never has side effects" is cleared) — confirm intent at
/// the call sites before relying on the name.
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1,
                                      bit rr_hasSideEffects = 0> {
  let isCommutable = 1, neverHasSideEffects = rr_hasSideEffects in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rr, IIC_DEFAULT, d>;
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rm, IIC_DEFAULT, d>;
}
220
/// sse12_fp_packed_int - SSE 1 & 2 packed instructions intrinsics class.
/// Packed analogue of sse12_fp_scalar_int: matches the intrinsic named
/// "int_x86_" # SSEVer # "_" # OpcodeStr # FPSizeStr (note: no "sse" prefix
/// baked in here, unlike the scalar variant — SSEVer carries it).
multiclass sse12_fp_packed_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
                           string asm, string SSEVer, string FPSizeStr,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, OpndItins itins, bit Is2Addr = 1> {
  def rr_Int : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
           [(set RC:$dst, (!cast<Intrinsic>(
                     !strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr))
                 RC:$src1, RC:$src2))], IIC_DEFAULT, d>;
  def rm_Int : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1,x86memop:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(
                     !strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, (mem_frag addr:$src2)))], IIC_DEFAULT, d>;
}
241
242//===----------------------------------------------------------------------===//
243//  Non-instruction patterns
244//===----------------------------------------------------------------------===//
245
// A vector extract of the first f32/f64 position is a subregister copy;
// no instruction is emitted.
def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;

// A 128-bit subvector extract from the first 256-bit vector position
// is a subregister copy that needs no instruction.
// One pattern per 128-bit element type: i32/f32, i64/f64, i16, i8.
def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;

def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
          (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;

def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
          (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
          (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
268
// A 128-bit subvector insert to the first 256-bit vector position
// is a subregister copy that needs no instruction.
// The upper half is left undefined (IMPLICIT_DEF) — these only match when
// the insert target is undef.
let AddedComplexity = 25 in { // to give priority over vinsertf128rm
def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
}
285
// Implicitly promote a 32-bit scalar to a vector (register class change
// only; no instruction).
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
// Implicitly promote a 64-bit scalar to a vector.
def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;
def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;
296
// Bitcasts between 128-bit vector types. Return the original value since
// no instruction is needed for the conversion (all pairs of the five
// 128-bit element types are covered).
let Predicates = [HasSSE2] in {
  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
}
331
// Bitcasts between 256-bit vector types. Return the original value since
// no instruction is needed for the conversion (all pairs of the six
// 256-bit element types are covered).
let Predicates = [HasAVX] in {
  def : Pat<(v4f64  (bitconvert (v8f32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v8i32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v4i64 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v32i8 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v8i32 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4i64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4f64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v32i8 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8f32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8i32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v4f64 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v32i8 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4f64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4i64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8f32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8i32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v32i8 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v8f32 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4i64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4f64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))),  (v16i16 VR256:$src)>;
}
366
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// These are pseudos with no encoding; they are expanded by
// ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1 in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
}
376
377//===----------------------------------------------------------------------===//
378// AVX & SSE - Zero/One Vectors
379//===----------------------------------------------------------------------===//
380
// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDepsFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1 in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

// V_SET0 produces an all-zeros register regardless of element type, so map
// every other 128-bit zero-vector type to the same pseudo.
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
397
398
// The same as done above but for AVX.  The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on sandy bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [HasAVX] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8f32 immAllZerosV))]>;
}

// f64 zeros only need AVX1.
let Predicates = [HasAVX] in
  def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;

// 256-bit integer zeros use AVX_SET0 directly only on AVX2 (AVX1 lacks
// 256-bit integer ops; see the HasAVX1Only patterns below).
let Predicates = [HasAVX2] in {
  def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
}
418
// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
// VPXOR instruction writes zero to its upper part, it's safe to build zeros
// by zeroing an xmm register and widening it with SUBREG_TO_REG.
let Predicates = [HasAVX1Only] in {
def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;

def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v16i16 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;

def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;

def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
}
438
// All-ones pseudos (expanded to pcmpeqd-style idioms post-RA).
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1 in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  // 256-bit all-ones requires AVX2 (integer compare on ymm).
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}
449
450
451//===----------------------------------------------------------------------===//
452// SSE 1 & 2 - Move FP Scalar Instructions
453//
454// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
455// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr
456// is used instead. Register-to-register movss/movsd is not modeled as an
457// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
// in terms of a copy, and, as just mentioned, we don't use movss/movsd for copies.
459//===----------------------------------------------------------------------===//
460
// Register-form MOVSS/MOVSD: merges the scalar $src2 into the low element
// of $src1 via the OpNode shuffle (X86Movss/X86Movsd).
class sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, string asm> :
      SI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), asm,
      [(set VR128:$dst, (vt (OpNode VR128:$src1,
                             (scalar_to_vector RC:$src2))))],
      IIC_SSE_MOV_S_RR>;

// Loading from memory automatically zeroing upper bits.
class sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
                    PatFrag mem_pat, string OpcodeStr> :
      SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                        [(set RC:$dst, (mem_pat addr:$src))],
                        IIC_SSE_MOV_S_RM>;
474
// AVX (VEX-encoded, three-operand) variants.
def VMOVSSrr : sse12_move_rr<FR32, X86Movss, v4f32,
                "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V,
                VEX_LIG;
def VMOVSDrr : sse12_move_rr<FR64, X86Movsd, v2f64,
                "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V,
                VEX_LIG;

// For the disassembler: 0x11 reverse-direction register forms; no patterns.
let isCodeGenOnly = 1 in {
  def VMOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                        (ins VR128:$src1, FR32:$src2),
                        "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
                        IIC_SSE_MOV_S_RR>,
                        XS, VEX_4V, VEX_LIG;
  def VMOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                        (ins VR128:$src1, FR64:$src2),
                        "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
                        IIC_SSE_MOV_S_RR>,
                        XD, VEX_4V, VEX_LIG;
}

// Load forms; rematerializable since the load fully defines the register.
let canFoldAsLoad = 1, isReMaterializable = 1 in {
  def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX,
                 VEX_LIG;
  let AddedComplexity = 20 in
    def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX,
                   VEX_LIG;
}

// Store forms.
def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
                  "movss\t{$src, $dst|$dst, $src}",
                  [(store FR32:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
                  XS, VEX, VEX_LIG;
def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
                  "movsd\t{$src, $dst|$dst, $src}",
                  [(store FR64:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
                  XD, VEX, VEX_LIG;
513
// SSE1 & 2 (legacy-encoded, two-operand: $src1 tied to $dst).
let Constraints = "$src1 = $dst" in {
  def MOVSSrr : sse12_move_rr<FR32, X86Movss, v4f32,
                          "movss\t{$src2, $dst|$dst, $src2}">, XS;
  def MOVSDrr : sse12_move_rr<FR64, X86Movsd, v2f64,
                          "movsd\t{$src2, $dst|$dst, $src2}">, XD;

  // For the disassembler: 0x11 reverse-direction register forms; no patterns.
  let isCodeGenOnly = 1 in {
    def MOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                         (ins VR128:$src1, FR32:$src2),
                         "movss\t{$src2, $dst|$dst, $src2}", [],
                         IIC_SSE_MOV_S_RR>, XS;
    def MOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                         (ins VR128:$src1, FR64:$src2),
                         "movsd\t{$src2, $dst|$dst, $src2}", [],
                         IIC_SSE_MOV_S_RR>, XD;
  }
}

// Load forms; rematerializable since the load fully defines the register.
let canFoldAsLoad = 1, isReMaterializable = 1 in {
  def MOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS;

  let AddedComplexity = 20 in
    def MOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD;
}

// Store forms.
def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
                  "movss\t{$src, $dst|$dst, $src}",
                  [(store FR32:$src, addr:$dst)], IIC_SSE_MOV_S_MR>;
def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
                  "movsd\t{$src, $dst|$dst, $src}",
                  [(store FR64:$src, addr:$dst)], IIC_SSE_MOV_S_MR>;
547
// Patterns selecting the V* move instructions under AVX.
let Predicates = [HasAVX] in {
  let AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVS{S,D} to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (VMOVSSrr (v4f32 (V_SET0)),
                       (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (VMOVSSrr (v4i32 (V_SET0)),
                       (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>;
  }
572
  let AddedComplexity = 20 in {
  // MOVSSrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;

  // MOVSDrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
  }
  // 256-bit zero-extending scalar inserts from registers.
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0),
                           (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
                           sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
            (SUBREG_TO_REG (i64 0),
                           (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
                           sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;

  // Move low f64 and clear high bits.
  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (VMOVSDrr (v2f64 (V_SET0)),
                       (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>;

  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (VMOVSDrr (v2i64 (V_SET0)),
                       (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>;

  // Extract and store the low scalar element.
  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>;
640
  // Shuffle with VMOVSS
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4i32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4f32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;

  // 256-bit variants: operate on the low xmm halves and widen the result.
  def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
              sub_xmm)>;

  // Shuffle with VMOVSD
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // 256-bit variants: operate on the low xmm halves and widen the result.
  def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
              sub_xmm)>;


  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold cause
  // it has two uses through a bitcast. One use disappears at isel time and the
  // fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}
697
// Non-AVX (SSE1) counterparts of the MOVSS selection patterns above.
let Predicates = [UseSSE1] in {
  let AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  }

  let AddedComplexity = 20 in {
  // MOVSSrm already zeros the high parts of the register.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  }

  // Extract and store.
  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;

  // Shuffle with MOVSS
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
}
731
// Non-AVX (SSE2) counterparts of the MOVSD selection patterns above.
let Predicates = [UseSSE2] in {
  let AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSD to the lower bits.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
  }

  let AddedComplexity = 20 in {
  // MOVSDrm already zeros the high parts of the register.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  }

  // Extract and store.
  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>;

  // Shuffle with MOVSD
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold cause
  // it has two uses through a bitcast. One use disappears at isel time and the
  // fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}
782
783//===----------------------------------------------------------------------===//
784// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
785//===----------------------------------------------------------------------===//
786
// Packed FP full-register move. Emits:
//   rr: register-to-register copy (no pattern; marked side-effect free).
//   rm: load through ld_frag; foldable as a load and, when
//       IsReMaterializable, rematerializable.
// Whether the move is aligned/unaligned (and its mnemonic) is determined by
// the opcode and asm string passed by the instantiation site.
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            OpndItins itins,
                            bit IsReMaterializable = 1> {
let neverHasSideEffects = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>;
let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>;
}
800
// AVX 128-bit forms. MOVUPD is not rematerializable (IsReMaterializable = 0).
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              TB, VEX;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              TB, OpSize, VEX;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              TB, VEX;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              TB, OpSize, VEX;

// AVX 256-bit (YMM) forms.
defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              TB, VEX, VEX_L;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              TB, OpSize, VEX, VEX_L;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              TB, VEX, VEX_L;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              TB, OpSize, VEX, VEX_L;
// Legacy SSE forms.
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              TB;
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              TB, OpSize;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              TB;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              TB, OpSize;
838
// AVX store (memory-destination) forms of the packed moves above.
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v8f32 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v4f64 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;

// For disassembler
// Register-to-register encodings with reversed operand order (opcodes
// 0x29/0x11 use MRMDestReg); no selection patterns, isCodeGenOnly.
let isCodeGenOnly = 1 in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movaps\t{$src, $dst|$dst, $src}", [],
                          IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}
907
// Zero-extend a 128-bit vector into 256 bits: a single VEX-encoded 128-bit
// register move suffices because AVX defines VEX 128-bit ops to clear bits
// 255:128 of the destination YMM register.
let Predicates = [HasAVX] in {
def : Pat<(v8i32 (X86vzmovl
                  (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl
                  (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v8f32 (X86vzmovl
                  (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl
                  (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
}


// Map the 256-bit unaligned-store intrinsics onto the YMM store instructions.
def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
          (VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
          (VMOVUPDYmr addr:$dst, VR256:$src)>;
928
// Legacy SSE store (memory-destination) forms.
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;

// For disassembler
// Legacy SSE register-to-register encodings with reversed operand order.
let isCodeGenOnly = 1 in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
}
961
// 128-bit unaligned-store intrinsics, AVX and legacy-SSE variants.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (VMOVUPDmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE1] in
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (MOVUPSmr addr:$dst, VR128:$src)>;
let Predicates = [UseSSE2] in
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (MOVUPDmr addr:$dst, VR128:$src)>;
975
// Use vmovaps/vmovups for AVX integer load/store.
// FP moves are used for integer vectors too; the execution-domain pass can
// later rewrite them if the integer domain is preferable.
let Predicates = [HasAVX] in {
  // 128-bit load/store
  def : Pat<(alignedloadv2i64 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (VMOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;

  // 256-bit load/store
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;

  // Special patterns for storing subvector extracts of lower 128-bits
  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
  def : Pat<(alignedstore (v2f64 (extract_subvector
                                  (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4f32 (extract_subvector
                                  (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v2i64 (extract_subvector
                                  (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4i32 (extract_subvector
                                  (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v8i16 (extract_subvector
                                  (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v16i8 (extract_subvector
                                  (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;

  // Unaligned versions of the subvector-extract stores above.
  def : Pat<(store (v2f64 (extract_subvector
                           (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4f32 (extract_subvector
                           (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v2i64 (extract_subvector
                           (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4i32 (extract_subvector
                           (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
1056  def : Pat<(store (v8i16 (extract_subvector
1057                           (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
1058            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  // Unaligned v16i8 subvector-extract store.
  def : Pat<(store (v16i8 (extract_subvector
                           (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
}
1063
// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}
1090
// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper
// bits are disregarded. FIXME: Set encoding to pseudo!
let neverHasSideEffects = 1 in {
def FsVMOVAPSrr : VPSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
                       "movaps\t{$src, $dst|$dst, $src}", [],
                       IIC_SSE_MOVA_P_RR>, VEX;
def FsVMOVAPDrr : VPDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
                       "movapd\t{$src, $dst|$dst, $src}", [],
                       IIC_SSE_MOVA_P_RR>, VEX;
def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
                     "movaps\t{$src, $dst|$dst, $src}", [],
                     IIC_SSE_MOVA_P_RR>;
def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
                     "movapd\t{$src, $dst|$dst, $src}", [],
                     IIC_SSE_MOVA_P_RR>;
}

// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
// bits are disregarded. FIXME: Set encoding to pseudo!
let canFoldAsLoad = 1, isReMaterializable = 1 in {
let isCodeGenOnly = 1 in {
  def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                         "movaps\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
  def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                         "movapd\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
}
def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                     "movaps\t{$src, $dst|$dst, $src}",
                     [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                     IIC_SSE_MOVA_P_RM>;
def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                     "movapd\t{$src, $dst|$dst, $src}",
                     [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                     IIC_SSE_MOVA_P_RM>;
}
1130
1131//===----------------------------------------------------------------------===//
1132// SSE 1 & 2 - Move Low packed FP Instructions
1133//===----------------------------------------------------------------------===//
1134
// Helper for MOVLPS/MOVLPD and MOVHPS/MOVHPD: merge a 64-bit memory operand
// into one half of a register.
//   PSrm: packed-single form; the f64 load is bitcast to v4f32 and combined
//         by psnode (e.g. X86Movlps / X86Movlhps).
//   PDrm: packed-double form; the f64 load becomes a v2f64 element and is
//         combined by pdnode (e.g. X86Movlpd / X86Movlhpd).
// asm_opr supplies the operand string (2-operand SSE vs. 3-operand VEX).
multiclass sse12_mov_hilo_packed<bits<8>opc, RegisterClass RC,
                                 SDNode psnode, SDNode pdnode, string base_opc,
                                 string asm_opr, InstrItinClass itin> {
  // Use the RC parameter consistently: the PS form previously hard-coded
  // VR128 in its outs/ins while its pattern used RC. All instantiation
  // sites pass VR128, so the emitted records are unchanged.
  def PSrm : PI<opc, MRMSrcMem,
         (outs RC:$dst), (ins RC:$src1, f64mem:$src2),
         !strconcat(base_opc, "s", asm_opr),
     [(set RC:$dst,
       (psnode RC:$src1,
              (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
              itin, SSEPackedSingle>, TB;

  def PDrm : PI<opc, MRMSrcMem,
         (outs RC:$dst), (ins RC:$src1, f64mem:$src2),
         !strconcat(base_opc, "d", asm_opr),
     [(set RC:$dst, (v2f64 (pdnode RC:$src1,
                              (scalar_to_vector (loadf64 addr:$src2)))))],
              itin, SSEPackedDouble>, TB, OpSize;
}
1153
// MOVLPS/MOVLPD: load-and-merge into the low half. VEX form is 3-operand;
// the legacy SSE form ties $src1 to $dst.
let AddedComplexity = 20 in {
  defm VMOVL : sse12_mov_hilo_packed<0x12, VR128, X86Movlps, X86Movlpd, "movlp",
                     "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     IIC_SSE_MOV_LH>, VEX_4V;
}
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
  defm MOVL : sse12_mov_hilo_packed<0x12, VR128, X86Movlps, X86Movlpd, "movlp",
                                   "\t{$src2, $dst|$dst, $src2}",
                                   IIC_SSE_MOV_LH>;
}

// Store forms: write the low 64 bits of the register to memory.
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>, VEX;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>, VEX;
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>;
1185
// Extra selection patterns mapping X86Movlps/X86Movlpd shuffles with a
// memory operand onto the MOVLP load/store instructions.
let Predicates = [HasAVX] in {
  // Shuffle with VMOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;

  // Shuffle with VMOVLPD
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;

  // Store patterns
  // A shuffle that merges a register into a loaded value and stores it back
  // to the same address is a plain MOVLP store.
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
}

let Predicates = [UseSSE1] in {
  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
  def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)),
                                 (iPTR 0))), addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;

  // Shuffle with MOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlps VR128:$src1,
                      (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                                      addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
                              addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
}

let Predicates = [UseSSE2] in {
  // Shuffle with MOVLPD
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                           addr:$src1),
            (MOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                           addr:$src1),
            (MOVLPDmr addr:$src1, VR128:$src2)>;
}
1254
1255//===----------------------------------------------------------------------===//
1256// SSE 1 & 2 - Move Hi packed FP Instructions
1257//===----------------------------------------------------------------------===//
1258
// MOVHPS/MOVHPD load forms (opcode 0x16), instantiated through the shared
// sse12_mov_hilo_packed multiclass.  AddedComplexity = 20 biases isel toward
// these load-folding forms over competing patterns.
let AddedComplexity = 20 in {
  // AVX three-operand form ($src1 and $dst are separate registers).
  defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, X86Movlhps, X86Movlhpd, "movhp",
                     "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     IIC_SSE_MOV_LH>, VEX_4V;
}
// Legacy SSE two-operand form: $src1 is tied to $dst.
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
  defm MOVH : sse12_mov_hilo_packed<0x16, VR128, X86Movlhps, X86Movlhpd, "movhp",
                                   "\t{$src2, $dst|$dst, $src2}",
                                   IIC_SSE_MOV_LH>;
}
1269
// Store-high instructions (opcode 0x17, MRMDestMem): write the upper 64 bits
// of an XMM register to memory.  The patterns match an unpack-high of the
// source with itself followed by an extract of element 0.
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
// Legacy (non-VEX) encodings of the same stores.
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
1294
let Predicates = [HasAVX] in {
  // VMOVHPS patterns: fold a 64-bit load that fills the high half of the
  // destination (via X86Movlhps) into the VMOVHPSrm memory form.
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;
  // NOTE(review): this pattern uses bc_v4i32 where the UseSSE1 counterpart
  // below uses bc_v4f32 — looks inconsistent; confirm which type the vzload
  // combine actually produces before changing it.
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;

  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
  // is during lowering, where it's not possible to recognize the load fold
  // because it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (scalar_to_vector (loadf64 addr:$src2)))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;
}
1312
// Legacy-SSE counterparts of the AVX high-half load folds above.
let Predicates = [UseSSE1] in {
  // MOVHPS patterns: fold a 64-bit load filling the high half of the
  // destination (via X86Movlhps) into the MOVHPSrm memory form.
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2] in {
  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
  // is during lowering, where it's not possible to recognize the load fold
  // because it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (scalar_to_vector (loadf64 addr:$src2)))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;
}
1332
1333//===----------------------------------------------------------------------===//
1334// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
1335//===----------------------------------------------------------------------===//
1336
// Register-to-register low<->high 64-bit moves.  MOVLHPS (0x16) copies the
// low half of $src2 into the high half of the result; MOVHLPS (0x12) copies
// the high half of $src2 into the low half.  AddedComplexity = 20 biases
// isel toward these over generic shuffle lowering.
let AddedComplexity = 20 in {
  // AVX three-operand forms.
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                      VEX_4V;
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                      VEX_4V;
}
// Legacy SSE two-operand forms: $src1 is tied to $dst.
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>;
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>;
}
1367
// Extra integer-typed patterns: the instruction patterns above only cover
// v4f32, so map the v4i32/v2i64 forms of the same shuffles onto the FP moves.
let Predicates = [HasAVX] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
}

// Same mappings for legacy-SSE codegen.
let Predicates = [UseSSE1] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (MOVHLPSrr VR128:$src1, VR128:$src2)>;
}
1391
1392//===----------------------------------------------------------------------===//
1393// SSE 1 & 2 - Conversion Instructions
1394//===----------------------------------------------------------------------===//
1395
// Itinerary bundles for the conversion instructions below.  Each OpndItins
// pairs the register-register (rr) and register-memory (rm) itinerary
// classes for one family of conversions.
def SSE_CVT_PD : OpndItins<
  IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
>;

def SSE_CVT_PS : OpndItins<
  IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
>;

def SSE_CVT_Scalar : OpndItins<
  IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
>;

// ss -> 32-bit and 64-bit integer conversions get distinct itineraries.
def SSE_CVT_SS2SI_32 : OpndItins<
  IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
>;

def SSE_CVT_SS2SI_64 : OpndItins<
  IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
>;

def SSE_CVT_SD2SI : OpndItins<
  IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
>;
1419
// Scalar conversion: emits a reg-reg (rr) and a reg-mem (rm) instruction
// whose patterns apply OpNode directly (fp_to_sint / sint_to_fp / etc.).
//   SrcRC/DstRC - source/destination register classes
//   ld_frag     - load fragment used by the rm pattern
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                     SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                     string asm, OpndItins itins> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
                        [(set DstRC:$dst, (OpNode SrcRC:$src))],
                        itins.rr>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
                        [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
                        itins.rm>;
}

// Packed conversion: rr/rm forms with no isel patterns (selection is done
// elsewhere), hence neverHasSideEffects/mayLoad annotations for correctness.
multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       X86MemOperand x86memop, string asm, Domain d,
                       OpndItins itins> {
let neverHasSideEffects = 1 in {
  def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
             [], itins.rr, d>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
             [], itins.rm, d>;
}
}

// AVX three-operand scalar conversion: $src1 supplies the upper bits of the
// destination.  Pattern-less; patterns are attached separately below.
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm> {
let neverHasSideEffects = 1 in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>;
} // neverHasSideEffects = 1
}
1454
// AVX truncating scalar fp -> signed int conversions (opcode 0x2C).
defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SS2SI_32>,
                                XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SS2SI_64>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SD2SI>,
                                XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SD2SI>,
                                XD, VEX, VEX_W, VEX_LIG;

// Assembler aliases accepting explicit {l}/{q} size suffixes.  The trailing
// 0 (EmitAlias) keeps the printer on the canonical suffix-less mnemonic.
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
1488
// The assembler can recognize rr 64-bit instructions by seeing a rxx
// register, but the same isn't true when only using memory operands,
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate to do so.
defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
                                  XS, VEX_4V, VEX_LIG;
defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
                                  XS, VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
                                  XD, VEX_4V, VEX_LIG;
defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
                                  XD, VEX_4V, VEX_W, VEX_LIG;

// Suffix-less aliases for the memory forms (the memory operand alone does
// not determine the size suffix).
// NOTE(review): VCVTSI2SSrm is declared above with FR32 dst/src1, yet this
// alias names FR64 register classes — looks suspicious; verify against the
// alias-matching rules before relying on it.
def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src)>;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>;
1506
// sint_to_fp selection for AVX.  The VCVTSI2* instructions are pattern-less
// three-operand forms, so patterns here supply an IMPLICIT_DEF for the
// pass-through operand ($src1).
let Predicates = [HasAVX] in {
  // Memory-source forms: fold the integer load.
  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;

  // Register-source forms.
  def : Pat<(f32 (sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (sint_to_fp GR64:$src)),
            (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (sint_to_fp GR64:$src)),
            (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
1526
// Legacy-SSE truncating fp->int (0x2C) and int->fp (0x2A) scalar
// conversions; 64-bit integer forms carry REX_W.
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SS2SI_32>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SS2SI_64>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SD2SI>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SD2SI>, XD, REX_W;
defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
                      "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XS;
defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
                      "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XS, REX_W;
defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
                      "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XD;
defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
                      "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XD, REX_W;

// Assembler aliases accepting explicit {l}/{q} size suffixes; the trailing
// 0 (EmitAlias) keeps the printer on the canonical suffix-less mnemonic.
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;

// Suffix-less aliases for the 32-bit memory int->fp forms.
def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrm FR64:$dst, i32mem:$src)>;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrm FR64:$dst, i32mem:$src)>;
1573
1574// Conversion Instructions Intrinsics - Match intrinsics which expect MM
1575// and/or XMM operand(s).
1576
// Intrinsic-based scalar conversion: rr/rm forms whose patterns call the
// given intrinsic.  mem_cpat is the ComplexPattern used to match the memory
// operand in the rm pattern (e.g. sse_load_f32/sse_load_f64).
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                         Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
                         string asm, OpndItins itins> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>;
}

// Intrinsic-based 3-address scalar conversion: $src1 passes through the
// destination's upper elements.  Is2Addr selects the 2-operand (legacy SSE,
// $src1 tied to $dst) vs 3-operand (AVX) assembly string.
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                    RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
                    PatFrag ld_frag, string asm, OpndItins itins,
                    bit Is2Addr = 1> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
              !if(Is2Addr,
                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
              itins.rr>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
              itins.rm>;
}
1606
// Non-truncating sd -> signed int conversions (opcode 0x2D), intrinsic-based.
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
                  int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
                  SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
                    int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
                    SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;

// Legacy-SSE encodings of the same conversions.
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
                 sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
                   sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
1618
1619
// Intrinsic int -> scalar-fp conversions (opcode 0x2A), 3-address forms.
// AVX variants pass Is2Addr = 0 for the 3-operand assembly syntax.
defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
          int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
          SSE_CVT_Scalar, 0>, XS, VEX_4V;
defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
          int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
          SSE_CVT_Scalar, 0>, XS, VEX_4V,
          VEX_W;
defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
          int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
          SSE_CVT_Scalar, 0>, XD, VEX_4V;
defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
          int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
          SSE_CVT_Scalar, 0>, XD,
          VEX_4V, VEX_W;

// Legacy-SSE variants use the default Is2Addr = 1 with $src1 tied to $dst.
let Constraints = "$src1 = $dst" in {
  defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                        int_x86_sse_cvtsi2ss, i32mem, loadi32,
                        "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
  defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                        int_x86_sse_cvtsi642ss, i64mem, loadi64,
                        "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
  defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                        int_x86_sse2_cvtsi2sd, i32mem, loadi32,
                        "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
  defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                        int_x86_sse2_cvtsi642sd, i64mem, loadi64,
                        "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
}
1649
/// SSE 1 Only

// Aliases for intrinsics: intrinsic-based truncating fp->int (0x2C) with
// XMM (VR128) sources, AVX then legacy-SSE encodings.
defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                    ssmem, sse_load_f32, "cvttss2si",
                                    SSE_CVT_SS2SI_32>, XS, VEX;
defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                                   "cvttss2si", SSE_CVT_SS2SI_64>,
                                   XS, VEX, VEX_W;
defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                    sdmem, sse_load_f64, "cvttsd2si",
                                    SSE_CVT_SD2SI>, XD, VEX;
defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                  "cvttsd2si", SSE_CVT_SD2SI>,
                                  XD, VEX, VEX_W;
defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                    ssmem, sse_load_f32, "cvttss2si",
                                    SSE_CVT_SS2SI_32>, XS;
defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                                   "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                    sdmem, sse_load_f64, "cvttsd2si",
                                    SSE_CVT_SD2SI>, XD;
defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                  "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;

// Non-truncating ss -> signed int conversions (0x2D), intrinsic-based.
defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;

defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               SSE_CVT_SS2SI_32>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 SSE_CVT_SS2SI_64>, XS, REX_W;
1693
// Packed dword -> packed single conversions (opcode 0x5B), pattern-less
// (selected elsewhere); 128-bit and 256-bit AVX forms plus legacy SSE2.
defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, SSE_CVT_PS>,
                               TB, VEX, Requires<[HasAVX]>;
defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, SSE_CVT_PS>,
                               TB, VEX, VEX_L, Requires<[HasAVX]>;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, SSE_CVT_PS>,
                            TB, Requires<[UseSSE2]>;
1707
// AVX assembler aliases accepting explicit {l}/{q} size suffixes on the
// non-truncating vcvt[sd|ss]2si forms; trailing 0 (EmitAlias) keeps the
// printer on the canonical suffix-less mnemonic.
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
1724
// Legacy-SSE assembler aliases accepting explicit {l}/{q} size suffixes on
// the non-truncating cvt[sd|ss]2si forms.  Every alias carries a trailing 0
// (EmitAlias) so the instruction printer keeps using the canonical
// suffix-less mnemonic; the final alias was previously missing it, which
// would have made the printer emit "cvtsd2si{q}" for CVTSD2SI64rm.
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
1741
1742/// SSE 2 Only
1743
// Convert scalar double to scalar single (opcode 0x5A).
// AVX three-operand forms are pattern-less; patterns are attached below.
let neverHasSideEffects = 1 in {
def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                       (ins FR64:$src1, FR64:$src2),
                      "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
                      IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG;
// Memory form only used when optimizing for size (see OptForSize predicate).
let mayLoad = 1 in
def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                       (ins FR64:$src1, f64mem:$src2),
                      "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [], IIC_SSE_CVT_Scalar_RM>,
                      XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG;
}

// fround selection for AVX: reuse $src as the pass-through operand.
def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
          Requires<[HasAVX]>;

// Legacy-SSE forms carry the fround patterns directly.
def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround FR64:$src))],
                      IIC_SSE_CVT_Scalar_RR>;
def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround (loadf64 addr:$src)))],
                      IIC_SSE_CVT_Scalar_RM>,
                      XD,
                  Requires<[UseSSE2, OptForSize]>;
1771
// Intrinsic forms of vcvtsd2ss on full XMM registers (AVX encoding).
def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
                       IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>;
// Memory-source form: this takes sdmem:$src2, so the ModRM format must be
// MRMSrcMem (it was incorrectly MRMSrcReg, which encodes the memory operand
// as a register); matches Int_VCVTSS2SDrm below.
def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                          VR128:$src1, sse_load_f64:$src2))],
                       IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>;
1784
// Intrinsic forms of cvtsd2ss, legacy-SSE encoding ($src1 tied to $dst).
let Constraints = "$src1 = $dst" in {
def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
                       IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>;
// Memory-source form: takes sdmem:$src2, so the ModRM format must be
// MRMSrcMem (it was incorrectly MRMSrcReg); matches Int_CVTSS2SDrm below.
def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                          VR128:$src1, sse_load_f64:$src2))],
                       IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>;
}
1799
// Convert scalar single to scalar double (opcode 0x5A with XS prefix).
// SSE2 instructions with XS prefix
let neverHasSideEffects = 1 in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR32:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [], IIC_SSE_CVT_Scalar_RR>,
                    XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG;
// Memory form only used when optimizing for size (OptForSize predicate).
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR32:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [], IIC_SSE_CVT_Scalar_RM>,
                    XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>;
}

// fextend selection for AVX; the pass-through operand is IMPLICIT_DEF or
// the source itself.
def : Pat<(f64 (fextend FR32:$src)),
    (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>;
def : Pat<(fextend (loadf32 addr:$src)),
    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>;

// extloadf32: fold the load when optimizing for size, otherwise do an
// explicit MOVSS load followed by the reg-reg convert.
def : Pat<(extloadf32 addr:$src),
    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
    Requires<[HasAVX, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
    (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
    Requires<[HasAVX, OptForSpeed]>;

// Legacy-SSE forms with fextend/extloadf32 patterns attached directly.
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (fextend FR32:$src))],
                   IIC_SSE_CVT_Scalar_RR>, XS,
                 Requires<[UseSSE2]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (extloadf32 addr:$src))],
                   IIC_SSE_CVT_Scalar_RM>, XS,
                 Requires<[UseSSE2, OptForSize]>;

// extload f32 -> f64.  This matches load+fextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
// combine.
// Since these loads aren't folded into the fextend, we have to match it
// explicitly here.
def : Pat<(fextend (loadf32 addr:$src)),
          (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
def : Pat<(extloadf32 addr:$src),
          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
1848
// Intrinsic forms of [v]cvtss2sd on full XMM registers: AVX three-operand
// encodings first, then legacy-SSE two-operand ($src1 tied to $dst).
def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
                    IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>;
def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
                    IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
                    IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>;
def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
                    IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>;
}
1875
// Convert packed single/double fp to doubleword
// 128-bit VEX, 256-bit VEX (VEX_L), and legacy SSE2 encodings of cvtps2dq,
// all matched via the intrinsics.
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
                       IIC_SSE_CVT_PS_RR>, VEX;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
                       IIC_SSE_CVT_PS_RM>, VEX;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
                        IIC_SSE_CVT_PS_RR>, VEX, VEX_L;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)))],
                        IIC_SSE_CVT_PS_RM>, VEX, VEX_L;
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
                     IIC_SSE_CVT_PS_RR>;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
                     IIC_SSE_CVT_PS_RM>;
1905
1906
// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX] in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
// NOTE: itineraries added to match the SSE CVTPD2DQ defs and the parallel
// VCVTTPD2DQ group, which already carry IIC_SSE_CVT_PD_RR/RM.
def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, VEX;

// XMM only
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQrr VR128:$dst, VR128:$src)>;
def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))],
                       IIC_SSE_CVT_PD_RM>, VEX;

// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_avx_cvt_pd2dq_256 VR256:$src))],
                       IIC_SSE_CVT_PD_RR>, VEX, VEX_L;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)))],
                       IIC_SSE_CVT_PD_RM>, VEX, VEX_L;
def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQYrr VR128:$dst, VR256:$src)>;
}
1938
// Legacy SSE2 cvtpd2dq (XMM only): two f64 -> two i32 in the low half of
// the destination.
def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))],
                      IIC_SSE_CVT_PD_RM>;
def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
                      IIC_SSE_CVT_PD_RR>;
1948
// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (int_x86_sse2_cvttps2dq VR128:$src))],
                         IIC_SSE_CVT_PS_RR>, VEX;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvttps2dq
                                            (memopv4f32 addr:$src)))],
                         IIC_SSE_CVT_PS_RM>, VEX;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
                          IIC_SSE_CVT_PS_RR>, VEX, VEX_L;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
                                             (memopv8f32 addr:$src)))],
                          IIC_SSE_CVT_PS_RM>, VEX, VEX_L;

// Legacy SSE2 encodings.
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
                       IIC_SSE_CVT_PS_RR>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
                       IIC_SSE_CVT_PS_RM>;
1981
// Generic sint_to_fp / fp_to_sint (truncating) DAG patterns, mapped onto the
// VEX-encoded conversions when AVX is available and the legacy encodings
// otherwise.
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
            (VCVTDQ2PSrr VR128:$src)>;
  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
            (VCVTDQ2PSrm addr:$src)>;

  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
            (VCVTDQ2PSrr VR128:$src)>;
  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
            (VCVTDQ2PSrm addr:$src)>;

  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (VCVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
            (VCVTTPS2DQrm addr:$src)>;

  // 256-bit forms.
  def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
            (VCVTDQ2PSYrr VR256:$src)>;
  def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (memopv4i64 addr:$src)))),
            (VCVTDQ2PSYrm addr:$src)>;

  def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
            (VCVTTPS2DQYrr VR256:$src)>;
  def : Pat<(v8i32 (fp_to_sint (memopv8f32 addr:$src))),
            (VCVTTPS2DQYrm addr:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
            (CVTDQ2PSrr VR128:$src)>;
  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
            (CVTDQ2PSrm addr:$src)>;

  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
            (CVTDQ2PSrr VR128:$src)>;
  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
            (CVTDQ2PSrm addr:$src)>;

  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (CVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
            (CVTTPS2DQrm addr:$src)>;
}
2025
// Truncating packed double -> doubleword conversions.
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                              (int_x86_sse2_cvttpd2dq VR128:$src))],
                              IIC_SSE_CVT_PD_RR>, VEX;

// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.

// XMM only
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrr VR128:$dst, VR128:$src)>;
def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttpd2dqx\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                            (memopv2f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX;

// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
                         IIC_SSE_CVT_PD_RR>, VEX, VEX_L;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                          (int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX, VEX_L;
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>;

// Map generic truncating fp_to_sint of v4f64 onto the 256-bit form.
let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
            (VCVTTPD2DQYrr VR256:$src)>;
  def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))),
            (VCVTTPD2DQYrm addr:$src)>;
} // Predicates = [HasAVX]

// Legacy SSE2 encodings.
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
                      IIC_SSE_CVT_PD_RR>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                        (memopv2f64 addr:$src)))],
                                        IIC_SSE_CVT_PD_RM>;
2075
// Convert packed single to packed double
let Predicates = [HasAVX] in {
                  // SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, TB, VEX;
// Note: the 128-bit rm form reads only 64 bits (two f32) from memory, hence
// f64mem and the extloadv2f32 pattern.
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
                    IIC_SSE_CVT_PD_RM>, TB, VEX;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, TB, VEX, VEX_L;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)))],
                     IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L;
}

let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, TB;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
                   IIC_SSE_CVT_PD_RM>, TB;
}
2109
// Convert Packed DW Integers to Packed Double FP
// NOTE: itineraries added to match the legacy CVTDQ2PD defs below, which
// already carry IIC_SSE_CVT_PD_RR/RM.
let Predicates = [HasAVX] in {
// The rm form has no pattern (the load is only 64 bits wide), so mark it
// load-only with no other side effects.
let neverHasSideEffects = 1, mayLoad = 1 in
def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     [], IIC_SSE_CVT_PD_RM>, VEX;
def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtdq2pd VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, VEX;
def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvtdq2_pd_256
                        (bitconvert (memopv2i64 addr:$src))))],
                     IIC_SSE_CVT_PD_RM>, VEX, VEX_L;
def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvtdq2_pd_256 VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, VEX, VEX_L;
}
2130
// Legacy SSE2 cvtdq2pd.  The rm form has no pattern (64-bit load only), so
// it is marked load-only with no other side effects.
// FIX: the RR/RM itineraries were swapped (the memory form carried
// IIC_SSE_CVT_PD_RR and the register form IIC_SSE_CVT_PD_RM); corrected to
// match every other conversion def in this file.
let neverHasSideEffects = 1, mayLoad = 1 in
def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
                       IIC_SSE_CVT_PD_RM>;
def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
                       IIC_SSE_CVT_PD_RR>;
2139
// AVX 256-bit register conversion intrinsics
// Map generic v4i32 -> v4f64 sint_to_fp onto the 256-bit VEX form.
let Predicates = [HasAVX] in {
  def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
            (VCVTDQ2PDYrr VR128:$src)>;
  def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
            (VCVTDQ2PDYrm addr:$src)>;
} // Predicates = [HasAVX]
2147
// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, VEX;

// XMM only
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrr VR128:$dst, VR128:$src)>;
def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvtpd2psx\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
                        IIC_SSE_CVT_PD_RM>, VEX;

// YMM only
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
                        IIC_SSE_CVT_PD_RR>, VEX, VEX_L;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)))],
                        IIC_SSE_CVT_PD_RM>, VEX, VEX_L;
def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrr VR128:$dst, VR256:$src)>;

// Legacy SSE2 encodings.
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
                     IIC_SSE_CVT_PD_RR>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
                     IIC_SSE_CVT_PD_RM>;
2189
2190
// AVX 256-bit register conversion intrinsics
// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
// whenever possible to avoid declaring two versions of each one.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
            (VCVTDQ2PSYrr VR256:$src)>;
  def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))),
            (VCVTDQ2PSYrm addr:$src)>;

  // Match fround and fextend for 128/256-bit conversions
  def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
            (VCVTPD2PSYrr VR256:$src)>;
  def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
            (VCVTPD2PSYrm addr:$src)>;

  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
            (VCVTPS2PDrr VR128:$src)>;
  def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
            (VCVTPS2PDYrr VR128:$src)>;
  def : Pat<(v4f64 (extloadv4f32 addr:$src)),
            (VCVTPS2PDYrm addr:$src)>;
}

let Predicates = [UseSSE2] in {
  // Match fextend for 128 conversions
  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
            (CVTPS2PDrr VR128:$src)>;
}
2219
2220//===----------------------------------------------------------------------===//
2221// SSE 1 & 2 - Compare Instructions
2222//===----------------------------------------------------------------------===//
2223
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
// Emits four forms: rr/rm with a pattern (mnemonic encodes the condition
// code), plus rr_alt/rm_alt which take the condition code as an explicit
// immediate and carry no pattern (assembler/disassembler only).
// FIX: the _alt forms previously hard-coded IIC_SSE_ALU_F32S_RR/RM instead
// of using the itins parameter like the primary forms; behavior-neutral for
// current instantiations (all pass SSE_ALU_F32S) but now consistent.
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC, SDNode OpNode, ValueType VT,
                            PatFrag ld_frag, string asm, string asm_alt,
                            OpndItins itins> {
  def rr : SIi8<0xC2, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
                itins.rr>;
  def rm : SIi8<0xC2, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1),
                                         (ld_frag addr:$src2), imm:$cc))],
                                         itins.rm>;

  // Accept explicit immediate argument form instead of comparison code.
  let neverHasSideEffects = 1 in {
    def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
                      (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [],
                      itins.rr>;
    let mayLoad = 1 in
    def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
                      (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, [],
                      itins.rm>;
  }
}
2250
// VEX-encoded (three-operand) and legacy (two-operand, tied) scalar compares.
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmpss, f32, loadf32,
                 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SSE_ALU_F32S>,
                 XS, VEX_4V, VEX_LIG;
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmpsd, f64, loadf64,
                 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SSE_ALU_F32S>, // same latency as 32 bit compare
                 XD, VEX_4V, VEX_LIG;

let Constraints = "$src1 = $dst" in {
  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmpss, f32, loadf32,
                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>,
                  XS;
  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmpsd, f64, loadf64,
                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SSE_ALU_F32S>, // same latency as 32 bit compare
                  XD;
}
2273
// sse12_cmp_scalar_int - scalar compares matched via the cmp_ss/cmp_sd
// intrinsics, which take and return full VR128 operands.
multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
                         Intrinsic Int, string asm, OpndItins itins> {
  def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src, CC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               VR128:$src, imm:$cc))],
                                               itins.rr>;
  def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, x86memop:$src, CC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               (load addr:$src), imm:$cc))],
                                               itins.rm>;
}

// Aliases to match intrinsics which expect XMM operand(s).
defm Int_VCMPSS  : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
                     "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                     SSE_ALU_F32S>,
                     XS, VEX_4V;
defm Int_VCMPSD  : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
                     "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                     SSE_ALU_F32S>, // same latency as f32
                     XD, VEX_4V;
let Constraints = "$src1 = $dst" in {
  defm Int_CMPSS  : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
                       "cmp${cc}ss\t{$src, $dst|$dst, $src}",
                       SSE_ALU_F32S>, XS;
  defm Int_CMPSD  : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
                       "cmp${cc}sd\t{$src, $dst|$dst, $src}",
                       SSE_ALU_F32S>, // same latency as f32
                       XD;
}
2306
2307
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
// No register result: the compare only updates EFLAGS (callers place these
// defms inside a `let Defs = [EFLAGS]` block).
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
                            ValueType vt, X86MemOperand x86memop,
                            PatFrag ld_frag, string OpcodeStr, Domain d> {
  def rr: PI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
                     IIC_SSE_COMIS_RR, d>;
  def rm: PI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1),
                                           (ld_frag addr:$src2)))],
                                           IIC_SSE_COMIS_RM, d>;
}
2322
// All (U)COMIS variants write EFLAGS.  Plain FR32/FR64 forms match the
// X86cmp node; Int_ forms match the X86(u)comi nodes on full VR128 operands;
// the pattern-less COMIS* defs exist for assembly only.
let Defs = [EFLAGS] in {
  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG;
  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                  "ucomisd", SSEPackedDouble>, TB, OpSize, VEX,
                                  VEX_LIG;
  // Assembly-only: clear the inherited patterns.
  let Pattern = []<dag> in {
    defm VCOMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
                                    "comiss", SSEPackedSingle>, TB, VEX,
                                    VEX_LIG;
    defm VCOMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
                                    "comisd", SSEPackedDouble>, TB, OpSize, VEX,
                                    VEX_LIG;
  }

  defm Int_VUCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
                            load, "ucomiss", SSEPackedSingle>, TB, VEX;
  defm Int_VUCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
                            load, "ucomisd", SSEPackedDouble>, TB, OpSize, VEX;

  defm Int_VCOMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
                            load, "comiss", SSEPackedSingle>, TB, VEX;
  defm Int_VCOMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
                            load, "comisd", SSEPackedDouble>, TB, OpSize, VEX;
  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss", SSEPackedSingle>, TB;
  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                  "ucomisd", SSEPackedDouble>, TB, OpSize;

  let Pattern = []<dag> in {
    defm COMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
                                    "comiss", SSEPackedSingle>, TB;
    defm COMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
                                    "comisd", SSEPackedDouble>, TB, OpSize;
  }

  defm Int_UCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
                              load, "ucomiss", SSEPackedSingle>, TB;
  defm Int_UCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
                              load, "ucomisd", SSEPackedDouble>, TB, OpSize;

  defm Int_COMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
                                  "comiss", SSEPackedSingle>, TB;
  defm Int_COMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
                                  "comisd", SSEPackedDouble>, TB, OpSize;
} // Defs = [EFLAGS]
2369
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
// Like sse12_cmp_scalar: rri/rmi match the intrinsic with the condition code
// folded into the mnemonic; rri_alt/rmi_alt take the condition code as an
// explicit immediate and carry no pattern (assembler/disassembler only).
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC, Intrinsic Int, string asm, 
                            string asm_alt, Domain d> {
  def rri : PIi8<0xC2, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
             [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))],
             IIC_SSE_CMPP_RR, d>;
  def rmi : PIi8<0xC2, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
             [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))],
             IIC_SSE_CMPP_RM, d>;

  // Accept explicit immediate argument form instead of comparison code.
  let neverHasSideEffects = 1 in {
    def rri_alt : PIi8<0xC2, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
               asm_alt, [], IIC_SSE_CMPP_RR, d>;
    def rmi_alt : PIi8<0xC2, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
               asm_alt, [], IIC_SSE_CMPP_RM, d>;
  }
}
2393
// VEX-encoded (three-operand, 128- and 256-bit) and legacy (two-operand,
// tied) packed compares.
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle>, TB, VEX_4V;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble>, TB, OpSize, VEX_4V;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle>, TB, VEX_4V, VEX_L;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
                 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedSingle>, TB;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
                 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedDouble>, TB, OpSize;
}
2420
// Map the X86cmpp DAG node (integer-typed result of a packed fp compare)
// onto the cmp instructions defined above.
let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;

def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
}

// CMPPS needs only SSE1; CMPPD needs SSE2.
let Predicates = [UseSSE1] in {
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
}

let Predicates = [UseSSE2] in {
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
}
2454
2455//===----------------------------------------------------------------------===//
2456// SSE 1 & 2 - Shuffle Instructions
2457//===----------------------------------------------------------------------===//
2458
/// sse12_shuffle - sse 1 & 2 shuffle instructions
/// Emits the reg/mem (rmi) and reg/reg (rri) forms of SHUFPS/SHUFPD
/// (opcode 0xC6). The i8 immediate ($src3) selects the element permutation.
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                         ValueType vt, string asm, PatFrag mem_frag,
                         Domain d, bit IsConvertibleToThreeAddress = 0> {
  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm,
                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>;
  // The rr form can be rewritten to three-address (PSHUFD) by the
  // two-address pass when the instantiation requests it.
  let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
    def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2, i8imm:$src3), asm,
                   [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>;
}
2473
// AVX shuffles: VEX-encoded, non-destructive three-operand forms.
defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           memopv4f32, SSEPackedSingle>, TB, VEX_4V;
defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           memopv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L;
// Note: the Intel-syntax operand list is "$dst, $src1, $src2, $src3",
// matching the shufps forms above (the pd forms previously printed
// "$src2, $src2" by mistake, misassembling/misprinting the first source).
defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
2486
// Legacy SSE shuffles: destructive two-address forms; flagged convertible to
// three-address so the two-address pass may rewrite them as PSHUFD.
let Constraints = "$src1 = $dst" in {
  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>,
                    TB;
  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>,
                    TB, OpSize;
}

// Select integer-typed X86Shufp nodes onto the FP shuffle instructions,
// register and load-folded forms.
let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Shufp VR128:$src1,
                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;

  def : Pat<(v2i64 (X86Shufp VR128:$src1,
                       (memopv2i64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;

  // 256-bit patterns
  def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v8i32 (X86Shufp VR256:$src1,
                      (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;

  def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v4i64 (X86Shufp VR256:$src1,
                              (memopv4i64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
}

let Predicates = [UseSSE1] in {
  def : Pat<(v4i32 (X86Shufp VR128:$src1,
                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
            (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
}

let Predicates = [UseSSE2] in {
  // Generic SHUFPD patterns
  def : Pat<(v2i64 (X86Shufp VR128:$src1,
                       (memopv2i64 addr:$src2), (i8 imm:$imm))),
            (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
2541
2542//===----------------------------------------------------------------------===//
2543// SSE 1 & 2 - Unpack Instructions
2544//===----------------------------------------------------------------------===//
2545
/// sse12_unpack_interleave - sse 1 & 2 unpack and interleave
/// Emits the reg/reg (rr) and reg/mem (rm) forms of UNPCKL/UNPCKH; the
/// SDNode OpNode (X86Unpckl/X86Unpckh) provides the selection pattern.
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
                                   PatFrag mem_frag, RegisterClass RC,
                                   X86MemOperand x86memop, string asm,
                                   Domain d> {
    def rr : PI<opc, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1, RC:$src2)))],
                           IIC_SSE_UNPCK, d>;
    // Memory form folds a load of the second source.
    def rm : PI<opc, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1,
                                       (mem_frag addr:$src2))))],
                                       IIC_SSE_UNPCK, d>;
}
2563
// AVX unpack/interleave: VEX-encoded three-operand 128-bit forms.
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, TB, VEX_4V;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, TB, OpSize, VEX_4V;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, TB, VEX_4V;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, TB, OpSize, VEX_4V;

// 256-bit AVX forms.
defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, memopv8f32,
      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, TB, VEX_4V, VEX_L;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, memopv4f64,
      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, memopv8f32,
      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, TB, VEX_4V, VEX_L;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, memopv4f64,
      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;

// Legacy SSE unpacks: destructive two-address forms.
let Constraints = "$src1 = $dst" in {
  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
                       SSEPackedSingle>, TB;
  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
                       SSEPackedDouble>, TB, OpSize;
  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                       SSEPackedSingle>, TB;
  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                       SSEPackedDouble>, TB, OpSize;
} // Constraints = "$src1 = $dst"

// With AVX1 only (no AVX2), 256-bit integer unpacks are lowered onto the FP
// unpack instructions; presumably AVX2's VPUNPCK* forms take over otherwise
// — see the HasAVX1Only predicate.
let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}

let Predicates = [HasAVX] in {
  // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
  // problem is during lowering, where it's not possible to recognize the load
  // fold cause it has two uses through a bitcast. One use disappears at isel
  // time and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Movddup VR128:$src)),
            (VUNPCKLPDrr VR128:$src, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
  // problem is during lowering, where it's not possible to recognize the load
  // fold cause it has two uses through a bitcast. One use disappears at isel
  // time and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Movddup VR128:$src)),
            (UNPCKLPDrr VR128:$src, VR128:$src)>;
}
2642
2643//===----------------------------------------------------------------------===//
2644// SSE 1 & 2 - Extract Floating-Point Sign mask
2645//===----------------------------------------------------------------------===//
2646
/// sse12_extr_sign_mask - sse 1 & 2 sign-mask extraction (movmskps/movmskpd)
multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
                                Domain d> {
  // GR32 destination: pattern sets the result from the movmsk intrinsic.
  def rr32 : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins RC:$src),
                !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                     [(set GR32:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>;
  // GR64 destination (REX.W): no pattern; selected explicitly (see the
  // X86fgetsign patterns below).
  def rr64 : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins RC:$src),
                !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [],
                IIC_SSE_MOVMSK, d>, REX_W;
}
2657
let Predicates = [HasAVX] in {
  // AVX sign-mask extraction, 128- and 256-bit.
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
                                        "movmskps", SSEPackedSingle>, TB, VEX;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
                                        "movmskpd", SSEPackedDouble>, TB,
                                        OpSize, VEX;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
                                        "movmskps", SSEPackedSingle>, TB,
                                        VEX, VEX_L;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
                                        "movmskpd", SSEPackedDouble>, TB,
                                        OpSize, VEX, VEX_L;

  // Extract the sign of a scalar FP value by viewing it as the low element
  // of a vector register and taking the packed sign mask.
  def : Pat<(i32 (X86fgetsign FR32:$src)),
            (VMOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(i64 (X86fgetsign FR32:$src)),
            (VMOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(i32 (X86fgetsign FR64:$src)),
            (VMOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>;
  def : Pat<(i64 (X86fgetsign FR64:$src)),
            (VMOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>;

  // Assembler Only
  def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
             "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
             SSEPackedSingle>, TB, VEX;
  def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
             "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
             SSEPackedDouble>, TB,
             OpSize, VEX;
  def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
             "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
             SSEPackedSingle>, TB, VEX, VEX_L;
  def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
             "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
             SSEPackedDouble>, TB,
             OpSize, VEX, VEX_L;
}

// Legacy SSE sign-mask extraction.
defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
                                     SSEPackedSingle>, TB;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
                                     SSEPackedDouble>, TB, OpSize;

// Scalar X86fgetsign via the legacy movmsk instructions.
def : Pat<(i32 (X86fgetsign FR32:$src)),
          (MOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>,
      Requires<[UseSSE1]>;
def : Pat<(i64 (X86fgetsign FR32:$src)),
          (MOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>,
      Requires<[UseSSE1]>;
def : Pat<(i32 (X86fgetsign FR64:$src)),
          (MOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>,
      Requires<[UseSSE2]>;
def : Pat<(i64 (X86fgetsign FR64:$src)),
          (MOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>,
      Requires<[UseSSE2]>;
2714
2715//===---------------------------------------------------------------------===//
2716// SSE2 - Packed Integer Logical Instructions
2717//===---------------------------------------------------------------------===//
2718
let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
/// Emits rr and rm forms; Is2Addr selects the destructive two-operand asm
/// syntax (legacy SSE) versus the three-operand syntax (VEX encodings).
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                        X86MemOperand x86memop,
                        OpndItins itins,
                        bit IsCommutable = 0,
                        bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>;
  // Memory form: the loaded operand is bitconverted to the op's vector type.
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                     (bitconvert (memop_frag addr:$src2)))))],
                                     itins.rm>;
}
} // ExeDomain = SSEPackedInt
2745
// These are ordered here for pattern ordering requirements with the fp versions

// AVX 128-bit integer logical ops (non-destructive three-operand forms).
let Predicates = [HasAVX] in {
defm VPAND : PDI_binop_rm<0xDB, "vpand", and, v2i64, VR128, memopv2i64,
                          i128mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
defm VPOR  : PDI_binop_rm<0xEB, "vpor" , or, v2i64, VR128, memopv2i64,
                          i128mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
defm VPXOR : PDI_binop_rm<0xEF, "vpxor", xor, v2i64, VR128, memopv2i64,
                          i128mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
// ANDN is not commutable (it complements only its first operand).
defm VPANDN : PDI_binop_rm<0xDF, "vpandn", X86andnp, v2i64, VR128, memopv2i64,
                          i128mem, SSE_BIT_ITINS_P, 0, 0>, VEX_4V;
}

// Legacy SSE2 integer logical ops (destructive two-operand forms).
let Constraints = "$src1 = $dst" in {
defm PAND : PDI_binop_rm<0xDB, "pand", and, v2i64, VR128, memopv2i64,
                         i128mem, SSE_BIT_ITINS_P, 1>;
defm POR  : PDI_binop_rm<0xEB, "por" , or, v2i64, VR128, memopv2i64,
                         i128mem, SSE_BIT_ITINS_P, 1>;
defm PXOR : PDI_binop_rm<0xEF, "pxor", xor, v2i64, VR128, memopv2i64,
                         i128mem, SSE_BIT_ITINS_P, 1>;
defm PANDN : PDI_binop_rm<0xDF, "pandn", X86andnp, v2i64, VR128, memopv2i64,
                          i128mem, SSE_BIT_ITINS_P, 0>;
} // Constraints = "$src1 = $dst"

// AVX2 256-bit integer logical ops.
let Predicates = [HasAVX2] in {
defm VPANDY : PDI_binop_rm<0xDB, "vpand", and, v4i64, VR256, memopv4i64,
                           i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPORY  : PDI_binop_rm<0xEB, "vpor", or, v4i64, VR256, memopv4i64,
                           i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPXORY : PDI_binop_rm<0xEF, "vpxor", xor, v4i64, VR256, memopv4i64,
                           i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPANDNY : PDI_binop_rm<0xDF, "vpandn", X86andnp, v4i64, VR256, memopv4i64,
                            i256mem, SSE_BIT_ITINS_P, 0, 0>, VEX_4V, VEX_L;
}
2780
2781//===----------------------------------------------------------------------===//
2782// SSE 1 & 2 - Logical Instructions
2783//===----------------------------------------------------------------------===//
2784
/// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops
/// Instantiates scalar-register (FR32/FR64) forms of the packed logical
/// instructions, in both AVX three-operand and legacy two-address flavors.
multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
                                       SDNode OpNode, OpndItins itins> {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
              FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>,
              TB, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
        FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>,
        TB, OpSize, VEX_4V;

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
                f32, f128mem, memopfsf32, SSEPackedSingle, itins>,
                TB;

    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
                f64, f128mem, memopfsf64, SSEPackedDouble, itins>,
                TB, OpSize;
  }
}

// Alias bitwise logical operations using SSE logical ops on packed FP values.
defm FsAND  : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
              SSE_BIT_ITINS_P>;
defm FsOR   : sse12_fp_alias_pack_logical<0x56, "or", X86for,
              SSE_BIT_ITINS_P>;
defm FsXOR  : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
              SSE_BIT_ITINS_P>;

// ANDN has no SDNode here; instantiated with no patterns (undef, Pattern=[]).
let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in
  defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef,
                SSE_BIT_ITINS_P>;
2819
/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
/// Instantiates the VR128 forms, AVX (three-operand) and legacy
/// (two-address); the rr/rm pattern lists operate on v2i64 via bitcasts.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   SDNode OpNode> {
  // In AVX no need to add a pattern for 128-bit logical rr ps, because they
  // are all promoted to v2i64, and the patterns are covered by the int
  // version. This is needed in SSE only, because v2i64 isn't supported on
  // SSE1, but only on SSE2.
  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
       !strconcat(OpcodeStr, "ps"), f128mem, [],
       [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
                                 (memopv2i64 addr:$src2)))], 0, 1>, TB, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
       !strconcat(OpcodeStr, "pd"), f128mem,
       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                 (bc_v2i64 (v2f64 VR128:$src2))))],
       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                 (memopv2i64 addr:$src2)))], 0>,
                                                 TB, OpSize, VEX_4V;
  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
         !strconcat(OpcodeStr, "ps"), f128mem,
         [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))],
         [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
                                   (memopv2i64 addr:$src2)))]>, TB;

    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
         !strconcat(OpcodeStr, "pd"), f128mem,
         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                   (bc_v2i64 (v2f64 VR128:$src2))))],
         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                   (memopv2i64 addr:$src2)))]>, TB, OpSize;
  }
}
2855
/// sse12_fp_packed_logical_y - AVX 256-bit SSE 1 & 2 logical ops forms
/// VR256 counterparts of sse12_fp_packed_logical; patterns operate on v4i64
/// via bitcasts of the FP vector types.
multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr,
                                     SDNode OpNode> {
    defm PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
          !strconcat(OpcodeStr, "ps"), f256mem,
          [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
          [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
                             (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L;

    defm PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
          !strconcat(OpcodeStr, "pd"), f256mem,
          [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                    (bc_v4i64 (v4f64 VR256:$src2))))],
          [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                    (memopv4i64 addr:$src2)))], 0>,
                                    TB, OpSize, VEX_4V, VEX_L;
}

// AVX 256-bit packed logical ops forms
defm VAND  : sse12_fp_packed_logical_y<0x54, "and", and>;
defm VOR   : sse12_fp_packed_logical_y<0x56, "or", or>;
defm VXOR  : sse12_fp_packed_logical_y<0x57, "xor", xor>;
defm VANDN : sse12_fp_packed_logical_y<0x55, "andn", X86andnp>;

// 128-bit packed logical ops (AVX and legacy SSE forms).
defm AND  : sse12_fp_packed_logical<0x54, "and", and>;
defm OR   : sse12_fp_packed_logical<0x56, "or", or>;
defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor>;
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
2886
2887//===----------------------------------------------------------------------===//
2888// SSE 1 & 2 - Arithmetic Instructions
2889//===----------------------------------------------------------------------===//
2890
/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
/// vector forms.
///
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation.  This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a scalar)
/// and leaves the top elements unmodified (therefore these cannot be commuted).
///
/// These three forms can each be reg+reg or reg+mem.
///

/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// classes below

/// Scalar (ss/sd) forms operating on FR32/FR64.
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                  SizeItins itins,
                                  bit Is2Addr = 1> {
  defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                            OpNode, FR32, f32mem,
                            itins.s, Is2Addr>, XS;
  defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                            OpNode, FR64, f64mem,
                            itins.d, Is2Addr>, XD;
}
2914
/// Packed 128-bit (ps/pd) forms operating on VR128.
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                   SizeItins itins,
                                   bit Is2Addr = 1> {
  defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
              v4f32, f128mem, memopv4f32, SSEPackedSingle, itins.s, Is2Addr>,
              TB;
  defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
              v2f64, f128mem, memopv2f64, SSEPackedDouble, itins.d, Is2Addr>,
              TB, OpSize;
}
2925
/// Packed 256-bit AVX (ps/pd) forms operating on VR256; always VEX
/// three-operand (Is2Addr fixed at 0).
multiclass basic_sse12_fp_binop_p_y<bits<8> opc, string OpcodeStr,
                                    SDNode OpNode,
                                    SizeItins itins> {
  defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256,
                v8f32, f256mem, memopv8f32, SSEPackedSingle, itins.s, 0>,
                TB, VEX_L;
  defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256,
                v4f64, f256mem, memopv4f64, SSEPackedDouble, itins.d, 0>,
                TB, OpSize, VEX_L;
}
2936
/// Scalar intrinsic (ss/sd) forms: take a whole vector and leave the upper
/// elements unmodified, so they cannot be commuted.
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SizeItins itins,
                                      bit Is2Addr = 1> {
  defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
     !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
     itins.s, Is2Addr>, XS;
  defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
     !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
     itins.d, Is2Addr>, XD;
}
2947
/// Packed 128-bit intrinsic (ps/pd) forms (int_x86_sse*_ps / int_x86_sse2*_pd).
multiclass basic_sse12_fp_binop_p_int<bits<8> opc, string OpcodeStr,
                                      SizeItins itins,
                                      bit Is2Addr = 1> {
  defm PS : sse12_fp_packed_int<opc, OpcodeStr, VR128,
     !strconcat(OpcodeStr, "ps"), "sse", "_ps", f128mem, memopv4f32,
                              SSEPackedSingle, itins.s, Is2Addr>,
                              TB;

  defm PD : sse12_fp_packed_int<opc, OpcodeStr, VR128,
     !strconcat(OpcodeStr, "pd"), "sse2", "_pd", f128mem, memopv2f64,
                              SSEPackedDouble, itins.d, Is2Addr>,
                              TB, OpSize;
}
2961
/// Packed 256-bit AVX intrinsic forms (int_x86_avx_*_ps_256 / _pd_256).
multiclass basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr,
                                        SizeItins itins> {
  defm PSY : sse12_fp_packed_int<opc, OpcodeStr, VR256,
     !strconcat(OpcodeStr, "ps"), "avx", "_ps_256", f256mem, memopv8f32,
      SSEPackedSingle, itins.s, 0>, TB, VEX_L;

  defm PDY : sse12_fp_packed_int<opc, OpcodeStr, VR256,
     !strconcat(OpcodeStr, "pd"), "avx", "_pd_256", f256mem, memopv4f64,
      SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_L;
}
2972
// Binary Arithmetic instructions (AVX, non-destructive three-operand forms).
defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S, 0>,
            basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S, 0>,
              VEX_4V, VEX_LIG;
defm VADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P, 0>,
            basic_sse12_fp_binop_p_y<0x58, "add", fadd, SSE_ALU_ITINS_P>,
              VEX_4V;
defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S, 0>,
            basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S, 0>,
              VEX_4V, VEX_LIG;
defm VMUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P, 0>,
            basic_sse12_fp_binop_p_y<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
              VEX_4V;

let isCommutable = 0 in {
  defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S, 0>,
              basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S, 0>,
                VEX_4V, VEX_LIG;
  defm VSUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P, 0>,
              basic_sse12_fp_binop_p_y<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
                VEX_4V;
  defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S, 0>,
              basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S, 0>,
                VEX_4V, VEX_LIG;
  // The packed divide uses the DIV itineraries like every other VDIV form
  // above and the legacy DIV below (this previously said SSE_ALU_ITINS_P,
  // an apparent copy/paste slip from the VSUB lines).
  defm VDIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P, 0>,
              basic_sse12_fp_binop_p_y<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
                VEX_4V;
  defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S, 0>,
              basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S, 0>,
                VEX_4V, VEX_LIG;
  defm VMAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P, 0>,
              basic_sse12_fp_binop_p_int<0x5F, "max", SSE_ALU_ITINS_P, 0>,
              basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
              basic_sse12_fp_binop_p_y_int<0x5F, "max", SSE_ALU_ITINS_P>,
                VEX_4V;
  defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S, 0>,
              basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S, 0>,
                VEX_4V, VEX_LIG;
  defm VMIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P, 0>,
              basic_sse12_fp_binop_p_int<0x5D, "min", SSE_ALU_ITINS_P, 0>,
              basic_sse12_fp_binop_p_y_int<0x5D, "min", SSE_ALU_ITINS_P>,
              basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
                VEX_4V;
}
3017
// Legacy SSE binary arithmetic: destructive two-address forms.
let Constraints = "$src1 = $dst" in {
  defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
  defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
             basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
             basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;

  // sub/div are not commutable; min/max are not either because of NaN and
  // signed-zero operand-order sensitivity (X86fmax/X86fmin semantics).
  let isCommutable = 0 in {
    defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
               basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
               basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
    defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
               basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
               basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
    defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
               basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
               basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>,
               basic_sse12_fp_binop_p_int<0x5F, "max", SSE_ALU_ITINS_P>;
    defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
               basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
               basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>,
               basic_sse12_fp_binop_p_int<0x5D, "min", SSE_ALU_ITINS_P>;
  }
}
3043
// max/min variants selected via the X86fmaxc/X86fminc nodes. They reuse the
// 0x5F/0x5D opcodes of the defs above, so isCodeGenOnly keeps them from
// conflicting in the disassembler tables.
let isCodeGenOnly = 1 in {
  defm VMAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S, 0>,
       VEX_4V, VEX_LIG;
  defm VMAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P, 0>,
       basic_sse12_fp_binop_p_y<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>, VEX_4V;
  defm VMINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S, 0>,
       VEX_4V, VEX_LIG;
  defm VMINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P, 0>,
       basic_sse12_fp_binop_p_y<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>, VEX_4V;
  // Non-VEX two-address counterparts.
  let Constraints = "$src1 = $dst" in {
    defm MAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>,
         basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>;
    defm MINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>,
         basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>;
  }
}
3060
3061/// Unop Arithmetic
3062/// In addition, we also have a special variant of the scalar form here to
3063/// represent the associated intrinsic operation.  This form is unlike the
3064/// plain scalar form, in that it takes an entire vector (instead of a
3065/// scalar) and leaves the top elements undefined.
3066///
3067/// And, we have a special variant form for a full-vector intrinsic form.
3068
// Itinerary bundles (reg-reg, reg-mem) for the unary FP ops defined below:
// packed/scalar square root and the rcp/rsqrt approximations.
def SSE_SQRTP : OpndItins<
  IIC_SSE_SQRTP_RR, IIC_SSE_SQRTP_RM
>;

def SSE_SQRTS : OpndItins<
  IIC_SSE_SQRTS_RR, IIC_SSE_SQRTS_RM
>;

def SSE_RCPP : OpndItins<
  IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
>;

def SSE_RCPS : OpndItins<
  IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
>;
3084
/// sse1_fp_unop_s - SSE1 unops in scalar form.
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, Intrinsic F32Int, OpndItins itins> {
  // Pass itins.rr so the reg-reg form carries scheduling info like every
  // other form in this multiclass (and like sse2_fp_unop_s below); it was
  // previously omitted.
  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                [(set FR32:$dst, (OpNode FR32:$src))], itins.rr>;
  // For scalar unary operations, fold a load into the operation
  // only in OptForSize mode. It eliminates an instruction, but it also
  // eliminates a whole-register clobber (the load), so it introduces a
  // partial register update condition.
  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
            Requires<[UseSSE1, OptForSize]>;
  // Intrinsic forms: take/return a whole VR128, upper elements undefined.
  def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>;
  def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>;
}
3106
/// sse1_fp_unop_s_avx - AVX SSE1 unops in scalar form.
// All defs here carry empty patterns; mark the whole multiclass as having no
// side effects, matching its SSE2 counterpart sse2_fp_unop_s_avx below.
let hasSideEffects = 0 in
multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
                !strconcat(OpcodeStr,
                           "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
  let mayLoad = 1 in {
  def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1,f32mem:$src2),
                !strconcat(OpcodeStr,
                           "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
  def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, ssmem:$src2),
                !strconcat(OpcodeStr,
                           "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
  }
}
3122
/// sse1_fp_unop_p - SSE1 unops in packed form.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          OpndItins itins> {
  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
              !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>;
  // Memory form folds a 128-bit load (memopv4f32) into the operation.
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>;
}
3133
/// sse1_fp_unop_p_y - AVX 256-bit SSE1 unops in packed form.
multiclass sse1_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode,
                            OpndItins itins> {
  // VEX_L selects the 256-bit (VR256 / v8f32) encoding.
  def PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
              !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
              itins.rr>, VEX_L;
  def PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))],
                itins.rm>, VEX_L;
}
3146
/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms.
// Like sse1_fp_unop_p but selected from an intrinsic (V4F32Int) instead of
// a generic SDNode.
multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr,
                              Intrinsic V4F32Int, OpndItins itins> {
  def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V4F32Int VR128:$src))],
                    itins.rr>;
  def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
                    itins.rm>;
}
3159
/// sse1_fp_unop_p_y_int - AVX 256-bit intrinsics unops in packed forms.
// The intrinsic argument is renamed V8F32Int (was V4F32Int): these forms
// operate on 256-bit vectors (VR256 / memopv8f32). Callers pass the argument
// positionally, so the rename is interface-safe.
multiclass sse1_fp_unop_p_y_int<bits<8> opc, string OpcodeStr,
                                Intrinsic V8F32Int, OpndItins itins> {
  def PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (V8F32Int VR256:$src))],
                    itins.rr>, VEX_L;
  def PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (V8F32Int (memopv8f32 addr:$src)))],
                    itins.rm>, VEX_L;
}
3172
/// sse2_fp_unop_s - SSE2 unops in scalar form.
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, Intrinsic F64Int, OpndItins itins> {
  def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                [(set FR64:$dst, (OpNode FR64:$src))], itins.rr>;
  // See the comments in sse1_fp_unop_s for why this is OptForSize.
  def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD,
            Requires<[UseSSE2, OptForSize]>;
  // Intrinsic forms: take/return a whole VR128, upper element undefined.
  def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>;
  def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>;
}
3191
/// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form.
// Empty patterns throughout, hence hasSideEffects = 0; selection is done via
// explicit Pat<>s later in the file.
let hasSideEffects = 0 in
multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
  def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
               !strconcat(OpcodeStr,
                          "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
  let mayLoad = 1 in {
  def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1,f64mem:$src2),
               !strconcat(OpcodeStr,
                          "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
  def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
               (ins VR128:$src1, sdmem:$src2),
               !strconcat(OpcodeStr,
                          "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
  }
}
3208
/// sse2_fp_unop_p - SSE2 unops in vector forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, OpndItins itins> {
  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>;
  // Memory form folds a 128-bit load (memopv2f64) into the operation.
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>;
}
3219
/// sse2_fp_unop_p_y - AVX SSE2 256-bit unops in vector forms.
multiclass sse2_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          OpndItins itins> {
  // VEX_L selects the 256-bit (VR256 / v4f64) encoding.
  def PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
              itins.rr>, VEX_L;
  def PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))],
                itins.rm>, VEX_L;
}
3232
/// sse2_fp_unop_p_int - SSE2 intrinsic unops in vector forms.
// Like sse2_fp_unop_p but selected from an intrinsic (V2F64Int) instead of
// a generic SDNode.
multiclass sse2_fp_unop_p_int<bits<8> opc, string OpcodeStr,
                              Intrinsic V2F64Int, OpndItins itins> {
  def PDr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V2F64Int VR128:$src))],
                    itins.rr>;
  def PDm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                    !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))],
                    itins.rm>;
}
3245
/// sse2_fp_unop_p_y_int - AVX 256-bit intrinsic unops in vector forms.
// The intrinsic argument is renamed V4F64Int (was V2F64Int): these forms
// operate on 256-bit vectors (VR256 / memopv4f64). Callers pass the argument
// positionally, so the rename is interface-safe.
multiclass sse2_fp_unop_p_y_int<bits<8> opc, string OpcodeStr,
                                Intrinsic V4F64Int, OpndItins itins> {
  def PDYr_Int : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (V4F64Int VR256:$src))],
                    itins.rr>, VEX_L;
  def PDYm_Int : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                    !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (V4F64Int (memopv4f64 addr:$src)))],
                    itins.rm>, VEX_L;
}
3258
let Predicates = [HasAVX] in {
  // Square root: scalar AVX forms (encode-only; matched via Pat<>s below).
  defm VSQRT  : sse1_fp_unop_s_avx<0x51, "vsqrt">,
                sse2_fp_unop_s_avx<0x51, "vsqrt">, VEX_4V, VEX_LIG;

  // Packed square root: 128/256-bit, plain and intrinsic, ps and pd flavors,
  // all sharing opcode 0x51.
  defm VSQRT  : sse1_fp_unop_p<0x51, "vsqrt", fsqrt, SSE_SQRTP>,
                sse2_fp_unop_p<0x51, "vsqrt", fsqrt, SSE_SQRTP>,
                sse1_fp_unop_p_y<0x51, "vsqrt", fsqrt, SSE_SQRTP>,
                sse2_fp_unop_p_y<0x51, "vsqrt", fsqrt, SSE_SQRTP>,
                sse1_fp_unop_p_int<0x51, "vsqrt", int_x86_sse_sqrt_ps,
                                   SSE_SQRTP>,
                sse2_fp_unop_p_int<0x51, "vsqrt", int_x86_sse2_sqrt_pd,
                                    SSE_SQRTP>,
                sse1_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_ps_256,
                                    SSE_SQRTP>,
                sse2_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_pd_256,
                                    SSE_SQRTP>,
                VEX;

  // Reciprocal approximations. Note that these typically require refinement
  // in order to obtain suitable precision.
  defm VRSQRT : sse1_fp_unop_s_avx<0x52, "vrsqrt">, VEX_4V, VEX_LIG;
  defm VRSQRT : sse1_fp_unop_p<0x52, "vrsqrt", X86frsqrt, SSE_SQRTP>,
                sse1_fp_unop_p_y<0x52, "vrsqrt", X86frsqrt, SSE_SQRTP>,
                sse1_fp_unop_p_y_int<0x52, "vrsqrt", int_x86_avx_rsqrt_ps_256,
                                    SSE_SQRTP>,
                sse1_fp_unop_p_int<0x52, "vrsqrt", int_x86_sse_rsqrt_ps,
                                    SSE_SQRTP>, VEX;

  defm VRCP   : sse1_fp_unop_s_avx<0x53, "vrcp">, VEX_4V, VEX_LIG;
  defm VRCP   : sse1_fp_unop_p<0x53, "vrcp", X86frcp, SSE_RCPP>,
                sse1_fp_unop_p_y<0x53, "vrcp", X86frcp, SSE_RCPP>,
                sse1_fp_unop_p_y_int<0x53, "vrcp", int_x86_avx_rcp_ps_256,
                                    SSE_RCPP>,
                sse1_fp_unop_p_int<0x53, "vrcp", int_x86_sse_rcp_ps,
                                    SSE_RCPP>, VEX;
}
3296
// Map plain scalar nodes onto the three-operand AVX instructions; the first
// (pass-through) source operand is left undefined via IMPLICIT_DEF.
// Memory forms are only used under OptForSize, matching the comment in
// sse1_fp_unop_s about partial register updates.
def : Pat<(f32 (fsqrt FR32:$src)),
          (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
def : Pat<(f32 (fsqrt (load addr:$src))),
          (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
          Requires<[HasAVX, OptForSize]>;
def : Pat<(f64 (fsqrt FR64:$src)),
          (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>;
def : Pat<(f64 (fsqrt (load addr:$src))),
          (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>,
          Requires<[HasAVX, OptForSize]>;

def : Pat<(f32 (X86frsqrt FR32:$src)),
          (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
def : Pat<(f32 (X86frsqrt (load addr:$src))),
          (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
          Requires<[HasAVX, OptForSize]>;

def : Pat<(f32 (X86frcp FR32:$src)),
          (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
def : Pat<(f32 (X86frcp (load addr:$src))),
          (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
          Requires<[HasAVX, OptForSize]>;
3319
// Scalar intrinsic forms: bridge the VR128-typed intrinsics onto the
// FR32/FR64 register forms with COPY_TO_REGCLASS, or onto the _Int memory
// forms for folded loads.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
            (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)),
                                        (COPY_TO_REGCLASS VR128:$src, FR32)),
                              VR128)>;
  def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
            (VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;

  def : Pat<(int_x86_sse2_sqrt_sd VR128:$src),
            (COPY_TO_REGCLASS (VSQRTSDr (f64 (IMPLICIT_DEF)),
                                        (COPY_TO_REGCLASS VR128:$src, FR64)),
                              VR128)>;
  def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
            (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;

  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
            (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)),
                                         (COPY_TO_REGCLASS VR128:$src, FR32)),
                              VR128)>;
  def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src),
            (VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;

  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
            (COPY_TO_REGCLASS (VRCPSSr (f32 (IMPLICIT_DEF)),
                                       (COPY_TO_REGCLASS VR128:$src, FR32)),
                              VR128)>;
  def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src),
            (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
}
3349
// Square root.
// Packed forms now use the packed-sqrt itineraries (SSE_SQRTP), matching the
// AVX instantiations of the same multiclasses above; scalar forms keep
// SSE_SQRTS. Previously all forms used the scalar itinerary.
defm SQRT  : sse1_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse_sqrt_ss,
                            SSE_SQRTS>,
             sse1_fp_unop_p<0x51, "sqrt",  fsqrt, SSE_SQRTP>,
             sse1_fp_unop_p_int<0x51, "sqrt",  int_x86_sse_sqrt_ps, SSE_SQRTP>,
             sse2_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse2_sqrt_sd,
                            SSE_SQRTS>,
             sse2_fp_unop_p<0x51, "sqrt",  fsqrt, SSE_SQRTP>,
             sse2_fp_unop_p_int<0x51, "sqrt", int_x86_sse2_sqrt_pd, SSE_SQRTP>;
3359
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
// Packed forms take the packed itineraries (SSE_SQRTP / SSE_RCPP), matching
// the AVX instantiations of the same multiclasses above.
defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss,
                            SSE_SQRTS>,
             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTP>,
             sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
                            SSE_SQRTP>;
defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss,
                            SSE_RCPS>,
             sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
             sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, SSE_RCPP>;
3371
3372// There is no f64 version of the reciprocal approximation instructions.
3373
3374//===----------------------------------------------------------------------===//
3375// SSE 1 & 2 - Non-temporal stores
3376//===----------------------------------------------------------------------===//
3377
// VEX-encoded non-temporal stores, 128- and 256-bit.
let AddedComplexity = 400 in { // Prefer non-temporal versions
  def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                       (ins f128mem:$dst, VR128:$src),
                       "movntps\t{$src, $dst|$dst, $src}",
                       [(alignednontemporalstore (v4f32 VR128:$src),
                                                 addr:$dst)],
                                                 IIC_SSE_MOVNT>, VEX;
  def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
                       (ins f128mem:$dst, VR128:$src),
                       "movntpd\t{$src, $dst|$dst, $src}",
                       [(alignednontemporalstore (v2f64 VR128:$src),
                                                 addr:$dst)],
                                                 IIC_SSE_MOVNT>, VEX;

  let ExeDomain = SSEPackedInt in
  def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
                           (ins f128mem:$dst, VR128:$src),
                           "movntdq\t{$src, $dst|$dst, $src}",
                           [(alignednontemporalstore (v2i64 VR128:$src),
                                                     addr:$dst)],
                                                     IIC_SSE_MOVNT>, VEX;

  // NOTE(review): this Pat repeats the pattern already attached to
  // VMOVNTDQmr above; it appears redundant but is harmless.
  def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>;

  def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                       (ins f256mem:$dst, VR256:$src),
                       "movntps\t{$src, $dst|$dst, $src}",
                       [(alignednontemporalstore (v8f32 VR256:$src),
                                                 addr:$dst)],
                                                 IIC_SSE_MOVNT>, VEX, VEX_L;
  def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                       (ins f256mem:$dst, VR256:$src),
                       "movntpd\t{$src, $dst|$dst, $src}",
                       [(alignednontemporalstore (v4f64 VR256:$src),
                                                 addr:$dst)],
                                                 IIC_SSE_MOVNT>, VEX, VEX_L;
  let ExeDomain = SSEPackedInt in
  def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                      (ins f256mem:$dst, VR256:$src),
                      "movntdq\t{$src, $dst|$dst, $src}",
                      [(alignednontemporalstore (v4i64 VR256:$src),
                                                addr:$dst)],
                                                IIC_SSE_MOVNT>, VEX, VEX_L;
}
3423
// Legacy SSE non-temporal stores.
let AddedComplexity = 400 in { // Prefer non-temporal versions
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntps\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntpd\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;

let ExeDomain = SSEPackedInt in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;

def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
          (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;

// There is no AVX form for instructions below this point
// (hence Requires<[HasSSE2]> rather than UseSSE2).
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                 "movnti{l}\t{$src, $dst|$dst, $src}",
                 [(nontemporalstore (i32 GR32:$src), addr:$dst)],
                 IIC_SSE_MOVNT>,
               TB, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                     "movnti{q}\t{$src, $dst|$dst, $src}",
                     [(nontemporalstore (i64 GR64:$src), addr:$dst)],
                     IIC_SSE_MOVNT>,
                  TB, Requires<[HasSSE2]>;
}
3455
3456//===----------------------------------------------------------------------===//
3457// SSE 1 & 2 - Prefetch and memory fence
3458//===----------------------------------------------------------------------===//
3459
// Prefetch intrinsic. The third prefetch operand encodes the locality hint
// (3 = T0 ... 0 = NTA); the fourth marks these as read prefetches.
let Predicates = [HasSSE1] in {
def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
}
3475
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
               IIC_SSE_PREFETCH>, TB, Requires<[HasSSE2]>;

// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", [], IIC_SSE_PAUSE>, REP;

// Load, store, and memory fence
def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
               "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
               TB, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
               "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
               TB, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
               "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
               TB, Requires<[HasSSE2]>;

// Select the fence instructions from the target-specific fence nodes too.
def : Pat<(X86SFence), (SFENCE)>;
def : Pat<(X86LFence), (LFENCE)>;
def : Pat<(X86MFence), (MFENCE)>;
3499
3500//===----------------------------------------------------------------------===//
3501// SSE 1 & 2 - Load/Store XCSR register
3502//===----------------------------------------------------------------------===//
3503
// Load/store of the MXCSR control/status register, VEX and legacy encodings.
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
                  IIC_SSE_LDMXCSR>, VEX;
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
                  IIC_SSE_STMXCSR>, VEX;

def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
                  IIC_SSE_LDMXCSR>;
def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
                  IIC_SSE_STMXCSR>;
3517
3518//===---------------------------------------------------------------------===//
3519// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3520//===---------------------------------------------------------------------===//
3521
3522let ExeDomain = SSEPackedInt in { // SSE integer instructions
3523
// VEX reg-reg moves of the integer domain; no patterns, selection is done
// elsewhere.
let neverHasSideEffects = 1 in {
def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
                    VEX;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
                    VEX, VEX_L;
}
def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
                    VEX;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
                    VEX, VEX_L;

// For Disassembler
// _REV forms use the store-direction opcode (0x7F) with a register
// destination, so both encodings of each move can be disassembled.
let isCodeGenOnly = 1 in {
def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movdqa\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVA_P_RR>,
                        VEX;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                        "movdqa\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movdqu\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVU_P_RR>,
                        VEX;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                        "movdqu\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}
3556
// VEX aligned (movdqa) and unaligned (movdqu) loads. Patterns are empty;
// canFoldAsLoad marks them as load-foldable anyway.
let canFoldAsLoad = 1, mayLoad = 1 in {
def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
                   VEX;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
                   VEX, VEX_L;
let Predicates = [HasAVX] in {
  def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
                    XS, VEX;
  def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
                    XS, VEX, VEX_L;
}
}

// Matching VEX stores.
let mayStore = 1 in {
def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
                     (ins i128mem:$dst, VR128:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
                     VEX;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
                     (ins i256mem:$dst, VR256:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
                     VEX, VEX_L;
let Predicates = [HasAVX] in {
def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
                  XS, VEX;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
                  XS, VEX, VEX_L;
}
}
3592
3593let neverHasSideEffects = 1 in
3594def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3595                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
3596
3597def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3598                   "movdqu\t{$src, $dst|$dst, $src}",
3599                   [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
3600
3601// For Disassembler
3602let isCodeGenOnly = 1 in {
3603def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3604                       "movdqa\t{$src, $dst|$dst, $src}", [],
3605                       IIC_SSE_MOVA_P_RR>;
3606
3607def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3608                       "movdqu\t{$src, $dst|$dst, $src}",
3609                       [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
3610}
3611
// SSE2 memory-to-register forms. The selection patterns are intentionally
// commented out (loads are matched elsewhere); mayLoad/canFoldAsLoad carry
// the side-effect information instead.
let canFoldAsLoad = 1, mayLoad = 1 in {
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
                   IIC_SSE_MOVA_P_RM>;
def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
                   IIC_SSE_MOVU_P_RM>,
                 XS, Requires<[UseSSE2]>;
}
3623
// SSE2 register-to-memory forms. As with the loads above, the patterns are
// commented out and mayStore carries the side-effect information.
let mayStore = 1 in {
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
                   IIC_SSE_MOVA_P_MR>;
def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(store (v2i64 VR128:$src), addr:$dst)*/],
                   IIC_SSE_MOVU_P_MR>,
                 XS, Requires<[UseSSE2]>;
}
3635
// Intrinsic forms of the MOVDQU store (int_x86_sse2_storeu_dq). Only store
// forms are defined in this section; there is no intrinsic load form here.
def VMOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                       "vmovdqu\t{$src, $dst|$dst, $src}",
                       [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)],
                       IIC_SSE_MOVU_P_MR>,
                     XS, VEX, Requires<[HasAVX]>;

def MOVDQUmr_Int :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                       "movdqu\t{$src, $dst|$dst, $src}",
                       [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)],
                       IIC_SSE_MOVU_P_MR>,
                     XS, Requires<[UseSSE2]>;
3648
3649} // ExeDomain = SSEPackedInt
3650
let Predicates = [HasAVX] in {
  // Lower the 256-bit unaligned-store intrinsic onto the VMOVDQUYmr
  // instruction defined above (which itself has no pattern).
  def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
            (VMOVDQUYmr addr:$dst, VR256:$src)>;
}
3655
3656//===---------------------------------------------------------------------===//
3657// SSE2 - Packed Integer Arithmetic Instructions
3658//===---------------------------------------------------------------------===//
3659
// pmaddwd itineraries: the same class is used for both the reg-reg and
// reg-mem forms.
def SSE_PMADD : OpndItins<
  IIC_SSE_PMADD, IIC_SSE_PMADD
>;
3663
3664let ExeDomain = SSEPackedInt in { // SSE integer instructions
3665
/// PDI_binop_rm_int - Packed-integer binop whose semantics are given by an
/// intrinsic. Emits a reg-reg ('rr') and a reg-mem ('rm') form. Is2Addr
/// selects the two-operand (SSE, $src1 tied to $dst) vs. three-operand (VEX)
/// assembly string; IsCommutable marks the rr form commutable so the
/// two-address pass may swap operands.
multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                            RegisterClass RC, PatFrag memop_frag,
                            X86MemOperand x86memop,
                            OpndItins itins,
                            bit IsCommutable = 0,
                            bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>;
  // Memory form: the load is bitconverted to the intrinsic's operand type.
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))],
       itins.rm>;
}
3687
/// PDI_binop_rmi - Packed-integer shift with three count forms: a vector
/// count in a register ('rr'), a vector count loaded from memory ('rm'), and
/// an immediate count ('ri'). OpNode matches the vector-count forms, OpNode2
/// the immediate form. The vector count is always a full 128-bit operand,
/// even when RC is a 256-bit class (see comment below).
multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
                         string OpcodeStr, SDNode OpNode,
                         SDNode OpNode2, RegisterClass RC,
                         ValueType DstVT, ValueType SrcVT, PatFrag bc_frag,
                         ShiftOpndItins itins,
                         bit Is2Addr = 1> {
  // src2 is always 128-bit
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
        itins.rr>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1,
                       (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>;
  // Immediate-count form (opcode opc2 with ImmForm selecting the /r field).
  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
       (ins RC:$src1, i32i8imm:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i32 imm:$src2))))], itins.ri>;
}
3716
/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst
/// types (e.g. pmuludq: v4i32 sources, v2i64 result). Emits reg-reg ('rr')
/// and reg-mem ('rm') forms; Is2Addr selects the two-operand (SSE) vs.
/// three-operand (VEX) assembly string.
/// Fix: attach itins.rr/itins.rm to the defs — the itins parameter was
/// previously accepted but dropped, so these instructions fell back to the
/// default itinerary (unlike PDI_binop_rm_int / PDI_binop_rmi above).
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         OpndItins itins,
                         bit IsCommutable = 0, bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))],
       itins.rr>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                     (bitconvert (memop_frag addr:$src2)))))],
       itins.rm>;
}
3738} // ExeDomain = SSEPackedInt
3739
3740// 128-bit Integer Arithmetic
3741
// AVX (VEX-encoded, non-destructive three-operand: Is2Addr = 0) 128-bit
// integer arithmetic.
let Predicates = [HasAVX] in {
defm VPADDB  : PDI_binop_rm<0xFC, "vpaddb", add, v16i8, VR128, memopv2i64,
                            i128mem, SSE_INTALU_ITINS_P, 1, 0 /*3addr*/>,
                            VEX_4V;
defm VPADDW  : PDI_binop_rm<0xFD, "vpaddw", add, v8i16, VR128, memopv2i64,
                            i128mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPADDD  : PDI_binop_rm<0xFE, "vpaddd", add, v4i32, VR128, memopv2i64,
                            i128mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPADDQ  : PDI_binop_rm<0xD4, "vpaddq", add, v2i64, VR128, memopv2i64,
                            i128mem, SSE_INTALUQ_ITINS_P, 1, 0>, VEX_4V;
defm VPMULLW : PDI_binop_rm<0xD5, "vpmullw", mul, v8i16, VR128, memopv2i64,
                            i128mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
defm VPSUBB : PDI_binop_rm<0xF8, "vpsubb", sub, v16i8, VR128, memopv2i64,
                            i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBW : PDI_binop_rm<0xF9, "vpsubw", sub, v8i16, VR128, memopv2i64,
                            i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBD : PDI_binop_rm<0xFA, "vpsubd", sub, v4i32, VR128, memopv2i64,
                            i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBQ : PDI_binop_rm<0xFB, "vpsubq", sub, v2i64, VR128, memopv2i64,
                            i128mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V;
// pmuludq widens: v4i32 sources, v2i64 result.
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
                              memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
                              VEX_4V;

// Intrinsic forms
defm VPSUBSB  : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_sse2_psubs_b,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBSW  : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_sse2_psubs_w,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBUSB : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_sse2_psubus_b,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBUSW : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_sse2_psubus_w,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPADDSB  : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_sse2_padds_b,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPADDSW  : PDI_binop_rm_int<0xED, "vpaddsw" , int_x86_sse2_padds_w,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPADDUSB : PDI_binop_rm_int<0xDC, "vpaddusb", int_x86_sse2_paddus_b,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPADDUSW : PDI_binop_rm_int<0xDD, "vpaddusw", int_x86_sse2_paddus_w,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPMULHUW : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_sse2_pmulhu_w,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
defm VPMULHW  : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_sse2_pmulh_w,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
defm VPMADDWD : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_sse2_pmadd_wd,
                                 VR128, memopv2i64, i128mem,
                                 SSE_PMADD, 1, 0>, VEX_4V;
defm VPAVGB   : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_sse2_pavg_b,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPAVGW   : PDI_binop_rm_int<0xE3, "vpavgw", int_x86_sse2_pavg_w,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPMINUB  : PDI_binop_rm_int<0xDA, "vpminub", int_x86_sse2_pminu_b,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPMINSW  : PDI_binop_rm_int<0xEA, "vpminsw", int_x86_sse2_pmins_w,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPMAXUB  : PDI_binop_rm_int<0xDE, "vpmaxub", int_x86_sse2_pmaxu_b,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPMAXSW  : PDI_binop_rm_int<0xEE, "vpmaxsw", int_x86_sse2_pmaxs_w,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPSADBW  : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_sse2_psad_bw,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
}
3822
// AVX2 256-bit integer arithmetic: same opcodes as the 128-bit AVX forms
// above, widened to VR256/i256mem via VEX_L.
let Predicates = [HasAVX2] in {
defm VPADDBY  : PDI_binop_rm<0xFC, "vpaddb", add, v32i8, VR256, memopv4i64,
                             i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDWY  : PDI_binop_rm<0xFD, "vpaddw", add, v16i16, VR256, memopv4i64,
                             i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDDY  : PDI_binop_rm<0xFE, "vpaddd", add, v8i32, VR256, memopv4i64,
                             i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDQY  : PDI_binop_rm<0xD4, "vpaddq", add, v4i64, VR256, memopv4i64,
                             i256mem, SSE_INTALUQ_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMULLWY : PDI_binop_rm<0xD5, "vpmullw", mul, v16i16, VR256, memopv4i64,
                             i256mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPSUBBY  : PDI_binop_rm<0xF8, "vpsubb", sub, v32i8, VR256, memopv4i64,
                             i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBWY  : PDI_binop_rm<0xF9, "vpsubw", sub, v16i16,VR256, memopv4i64,
                             i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBDY  : PDI_binop_rm<0xFA, "vpsubd", sub, v8i32, VR256, memopv4i64,
                             i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBQY  : PDI_binop_rm<0xFB, "vpsubq", sub, v4i64, VR256, memopv4i64,
                             i256mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V, VEX_L;
// pmuludq widens: v8i32 sources, v4i64 result.
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
                               VR256, memopv4i64, i256mem,
                               SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;

// Intrinsic forms
defm VPSUBSBY  : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_avx2_psubs_b,
                                  VR256, memopv4i64, i256mem,
                                  SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBSWY  : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_avx2_psubs_w,
                                  VR256, memopv4i64, i256mem,
                                  SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBUSBY : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_avx2_psubus_b,
                                  VR256, memopv4i64, i256mem,
                                  SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBUSWY : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_avx2_psubus_w,
                                  VR256, memopv4i64, i256mem,
                                  SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPADDSBY  : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_avx2_padds_b,
                                  VR256, memopv4i64, i256mem,
                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDSWY  : PDI_binop_rm_int<0xED, "vpaddsw" , int_x86_avx2_padds_w,
                                  VR256, memopv4i64, i256mem,
                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDUSBY : PDI_binop_rm_int<0xDC, "vpaddusb", int_x86_avx2_paddus_b,
                                  VR256, memopv4i64, i256mem,
                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDUSWY : PDI_binop_rm_int<0xDD, "vpaddusw", int_x86_avx2_paddus_w,
                                  VR256, memopv4i64, i256mem,
                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMULHUWY : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_avx2_pmulhu_w,
                                  VR256, memopv4i64, i256mem,
                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMULHWY  : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_avx2_pmulh_w,
                                  VR256, memopv4i64, i256mem,
                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMADDWDY : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_avx2_pmadd_wd,
                                  VR256, memopv4i64, i256mem,
                                  SSE_PMADD, 1, 0>, VEX_4V, VEX_L;
defm VPAVGBY   : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_avx2_pavg_b,
                                  VR256, memopv4i64, i256mem,
                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPAVGWY   : PDI_binop_rm_int<0xE3, "vpavgw", int_x86_avx2_pavg_w,
                                  VR256, memopv4i64, i256mem,
                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMINUBY  : PDI_binop_rm_int<0xDA, "vpminub", int_x86_avx2_pminu_b,
                                  VR256, memopv4i64, i256mem,
                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMINSWY  : PDI_binop_rm_int<0xEA, "vpminsw", int_x86_avx2_pmins_w,
                                  VR256, memopv4i64, i256mem,
                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMAXUBY  : PDI_binop_rm_int<0xDE, "vpmaxub", int_x86_avx2_pmaxu_b,
                                  VR256, memopv4i64, i256mem,
                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMAXSWY  : PDI_binop_rm_int<0xEE, "vpmaxsw", int_x86_avx2_pmaxs_w,
                                  VR256, memopv4i64, i256mem,
                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPSADBWY  : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_avx2_psad_bw,
                                  VR256, memopv4i64, i256mem,
                                  SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
}
3902
// SSE2 two-operand (destructive) forms: $src1 is tied to $dst, Is2Addr
// defaults to 1 so the two-operand assembly string is used.
let Constraints = "$src1 = $dst" in {
defm PADDB  : PDI_binop_rm<0xFC, "paddb", add, v16i8, VR128, memopv2i64,
                           i128mem, SSE_INTALU_ITINS_P, 1>;
defm PADDW  : PDI_binop_rm<0xFD, "paddw", add, v8i16, VR128, memopv2i64,
                           i128mem, SSE_INTALU_ITINS_P, 1>;
defm PADDD  : PDI_binop_rm<0xFE, "paddd", add, v4i32, VR128, memopv2i64,
                           i128mem, SSE_INTALU_ITINS_P, 1>;
defm PADDQ  : PDI_binop_rm<0xD4, "paddq", add, v2i64, VR128, memopv2i64,
                           i128mem, SSE_INTALUQ_ITINS_P, 1>;
defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, VR128, memopv2i64,
                           i128mem, SSE_INTMUL_ITINS_P, 1>;
defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8, VR128, memopv2i64,
                          i128mem, SSE_INTALU_ITINS_P>;
defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16, VR128, memopv2i64,
                          i128mem, SSE_INTALU_ITINS_P>;
defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32, VR128, memopv2i64,
                          i128mem, SSE_INTALU_ITINS_P>;
defm PSUBQ : PDI_binop_rm<0xFB, "psubq", sub, v2i64, VR128, memopv2i64,
                          i128mem, SSE_INTALUQ_ITINS_P>;
// pmuludq widens: v4i32 sources, v2i64 result.
defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
                             memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;

// Intrinsic forms
defm PSUBSB  : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P>;
defm PSUBSW  : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P>;
defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P>;
defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P>;
defm PADDSB  : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P, 1>;
defm PADDSW  : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P, 1>;
defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P, 1>;
defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P, 1>;
defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w,
                                VR128, memopv2i64, i128mem,
                                SSE_INTMUL_ITINS_P, 1>;
defm PMULHW  : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w,
                                VR128, memopv2i64, i128mem,
                                SSE_INTMUL_ITINS_P, 1>;
defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
                                VR128, memopv2i64, i128mem,
                                SSE_PMADD, 1>;
defm PAVGB   : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P, 1>;
defm PAVGW   : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P, 1>;
defm PMINUB  : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P, 1>;
defm PMINSW  : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P, 1>;
defm PMAXUB  : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P, 1>;
defm PMAXSW  : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P, 1>;
defm PSADBW  : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P, 1>;

} // Constraints = "$src1 = $dst"
3982
3983//===---------------------------------------------------------------------===//
3984// SSE2 - Packed Integer Logical Instructions
3985//===---------------------------------------------------------------------===//
3986
// AVX 128-bit shifts: each defm yields rr/rm (vector count) and ri
// (immediate count) forms via PDI_binop_rmi.
let Predicates = [HasAVX] in {
defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
                            VR128, v8i16, v8i16, bc_v8i16,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
                            VR128, v4i32, v4i32, bc_v4i32,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
                            VR128, v2i64, v2i64, bc_v2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
                            VR128, v8i16, v8i16, bc_v8i16,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
                            VR128, v4i32, v4i32, bc_v4i32,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
                            VR128, v2i64, v2i64, bc_v2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

// Arithmetic right shifts: no 64-bit element variant exists in SSE/AVX.
defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
                            VR128, v8i16, v8i16, bc_v8i16,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                            VR128, v4i32, v4i32, bc_v4i32,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

let ExeDomain = SSEPackedInt in {
  // 128-bit logical shifts.
  // These shift the whole register by a byte count ("_bs" intrinsics).
  def VPSLLDQri : PDIi8<0x73, MRM7r,
                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>,
                    VEX_4V;
  def VPSRLDQri : PDIi8<0x73, MRM3r,
                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>,
                    VEX_4V;
  // PSRADQri doesn't exist in SSE[1-3].
}
} // Predicates = [HasAVX]
4032
// AVX2 256-bit shifts. Note the vector shift count stays a 128-bit operand
// (SrcVT is the 128-bit type) even though the data register class is VR256.
let Predicates = [HasAVX2] in {
defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
                             VR256, v16i16, v8i16, bc_v8i16,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
                             VR256, v8i32, v4i32, bc_v4i32,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
                             VR256, v4i64, v2i64, bc_v2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
                             VR256, v16i16, v8i16, bc_v8i16,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
                             VR256, v8i32, v4i32, bc_v4i32,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
                             VR256, v4i64, v2i64, bc_v2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
                             VR256, v16i16, v8i16, bc_v8i16,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                             VR256, v8i32, v4i32, bc_v4i32,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

let ExeDomain = SSEPackedInt in {
  // 256-bit logical shifts.
  // Whole-register byte shifts ("_bs" intrinsics), AVX2 variants.
  def VPSLLDQYri : PDIi8<0x73, MRM7r,
                    (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR256:$dst,
                      (int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>,
                    VEX_4V, VEX_L;
  def VPSRLDQYri : PDIi8<0x73, MRM3r,
                    (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR256:$dst,
                      (int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>,
                    VEX_4V, VEX_L;
  // PSRADQYri doesn't exist in SSE[1-3].
}
} // Predicates = [HasAVX2]
4078
// SSE2 two-operand (destructive) shift forms.
let Constraints = "$src1 = $dst" in {
defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
                           VR128, v8i16, v8i16, bc_v8i16,
                           SSE_INTSHIFT_ITINS_P>;
defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
                           VR128, v4i32, v4i32, bc_v4i32,
                           SSE_INTSHIFT_ITINS_P>;
defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
                           VR128, v2i64, v2i64, bc_v2i64,
                           SSE_INTSHIFT_ITINS_P>;

defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
                           VR128, v8i16, v8i16, bc_v8i16,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
                           VR128, v4i32, v4i32, bc_v4i32,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
                           VR128, v2i64, v2i64, bc_v2i64,
                           SSE_INTSHIFT_ITINS_P>;

defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
                           VR128, v8i16, v8i16, bc_v8i16,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
                           VR128, v4i32, v4i32, bc_v4i32,
                           SSE_INTSHIFT_ITINS_P>;

let ExeDomain = SSEPackedInt in {
  // 128-bit logical shifts.
  // Whole-register byte shifts ("_bs" intrinsics); two-operand asm strings
  // since $src1 is tied to $dst.
  def PSLLDQri : PDIi8<0x73, MRM7r,
                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                       "pslldq\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>;
  def PSRLDQri : PDIi8<0x73, MRM3r,
                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                       "psrldq\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>;
  // PSRADQri doesn't exist in SSE[1-3].
}
} // Constraints = "$src1 = $dst"
4122
// Lower the non-"_bs" psll_dq/psrl_dq intrinsics and the shuffle-lowering
// nodes onto the byte-shift instructions. BYTE_imm rescales the immediate —
// presumably converting a bit count to a byte count; confirm its definition.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
            (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
            (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
            (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;

  // Shift up / down and insert zero's.
  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
            (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
            (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
}
4137
// 256-bit counterparts of the byte-shift intrinsic lowerings above.
let Predicates = [HasAVX2] in {
  def : Pat<(int_x86_avx2_psll_dq VR256:$src1, imm:$src2),
            (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
            (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
}
4144
// Non-VEX lowerings, mirroring the HasAVX patterns above but targeting the
// SSE2 two-operand instructions.
let Predicates = [UseSSE2] in {
  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
            (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
            (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
            (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;

  // Shift up / down and insert zero's.
  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
            (PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
            (PSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
}
4159
4160//===---------------------------------------------------------------------===//
4161// SSE2 - Packed Integer Comparison Instructions
4162//===---------------------------------------------------------------------===//
4163
let Predicates = [HasAVX] in {
  // VEX-encoded 128-bit packed integer compares (three-operand form).
  // The equality compares pass a trailing "1, 0" while the greater-than
  // compares pass "0, 0": the first flag marks EQ as commutable, the second
  // selects the non-2-address (AVX) assembly form.
  defm VPCMPEQB  : PDI_binop_rm<0x74, "vpcmpeqb", X86pcmpeq, v16i8,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
  defm VPCMPEQW  : PDI_binop_rm<0x75, "vpcmpeqw", X86pcmpeq, v8i16,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
  defm VPCMPEQD  : PDI_binop_rm<0x76, "vpcmpeqd", X86pcmpeq, v4i32,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
  defm VPCMPGTB  : PDI_binop_rm<0x64, "vpcmpgtb", X86pcmpgt, v16i8,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
  defm VPCMPGTW  : PDI_binop_rm<0x65, "vpcmpgtw", X86pcmpgt, v8i16,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
  defm VPCMPGTD  : PDI_binop_rm<0x66, "vpcmpgtd", X86pcmpgt, v4i32,
                                VR128, memopv2i64, i128mem,
                                SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
}
4184
let Predicates = [HasAVX2] in {
  // AVX2 256-bit packed integer compares ("Y" variants): same opcodes as the
  // 128-bit forms above, widened to VR256/i256mem and tagged VEX_L for the
  // 256-bit vector length.
  defm VPCMPEQBY : PDI_binop_rm<0x74, "vpcmpeqb", X86pcmpeq, v32i8,
                                VR256, memopv4i64, i256mem,
                                SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
  defm VPCMPEQWY : PDI_binop_rm<0x75, "vpcmpeqw", X86pcmpeq, v16i16,
                                VR256, memopv4i64, i256mem,
                                SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
  defm VPCMPEQDY : PDI_binop_rm<0x76, "vpcmpeqd", X86pcmpeq, v8i32,
                                VR256, memopv4i64, i256mem,
                                SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
  defm VPCMPGTBY : PDI_binop_rm<0x64, "vpcmpgtb", X86pcmpgt, v32i8,
                                VR256, memopv4i64, i256mem,
                                SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
  defm VPCMPGTWY : PDI_binop_rm<0x65, "vpcmpgtw", X86pcmpgt, v16i16,
                                VR256, memopv4i64, i256mem,
                                SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
  defm VPCMPGTDY : PDI_binop_rm<0x66, "vpcmpgtd", X86pcmpgt, v8i32,
                                VR256, memopv4i64, i256mem,
                                SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
}
4205
// Legacy SSE2 encodings: 2-address form, so the first source is tied to the
// destination.  The EQ compares additionally pass a trailing "1" marking them
// commutable.
let Constraints = "$src1 = $dst" in {
  defm PCMPEQB  : PDI_binop_rm<0x74, "pcmpeqb", X86pcmpeq, v16i8,
                               VR128, memopv2i64, i128mem,
                               SSE_INTALU_ITINS_P, 1>;
  defm PCMPEQW  : PDI_binop_rm<0x75, "pcmpeqw", X86pcmpeq, v8i16,
                               VR128, memopv2i64, i128mem,
                               SSE_INTALU_ITINS_P, 1>;
  defm PCMPEQD  : PDI_binop_rm<0x76, "pcmpeqd", X86pcmpeq, v4i32,
                               VR128, memopv2i64, i128mem,
                               SSE_INTALU_ITINS_P, 1>;
  defm PCMPGTB  : PDI_binop_rm<0x64, "pcmpgtb", X86pcmpgt, v16i8,
                               VR128, memopv2i64, i128mem,
                               SSE_INTALU_ITINS_P>;
  defm PCMPGTW  : PDI_binop_rm<0x65, "pcmpgtw", X86pcmpgt, v8i16,
                               VR128, memopv2i64, i128mem,
                               SSE_INTALU_ITINS_P>;
  defm PCMPGTD  : PDI_binop_rm<0x66, "pcmpgtd", X86pcmpgt, v4i32,
                               VR128, memopv2i64, i128mem,
                               SSE_INTALU_ITINS_P>;
} // Constraints = "$src1 = $dst"
4226
4227//===---------------------------------------------------------------------===//
4228// SSE2 - Packed Integer Pack Instructions
4229//===---------------------------------------------------------------------===//
4230
let Predicates = [HasAVX] in {
// VEX-encoded 128-bit saturating packs: PACKSSWB/PACKSSDW narrow with signed
// saturation, PACKUSWB with unsigned saturation.  Matched only via their
// intrinsics (PDI_binop_rm_int); three-operand AVX form ("0, 0" trailing
// flags).
defm VPACKSSWB : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_sse2_packsswb_128,
                                  VR128, memopv2i64, i128mem,
                                  SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPACKSSDW : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_sse2_packssdw_128,
                                  VR128, memopv2i64, i128mem,
                                  SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPACKUSWB : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_sse2_packuswb_128,
                                  VR128, memopv2i64, i128mem,
                                  SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
}
4242
let Predicates = [HasAVX2] in {
// AVX2 256-bit saturating packs, matched via the avx2 intrinsics and tagged
// VEX_L for the 256-bit vector length.
defm VPACKSSWBY : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_avx2_packsswb,
                                   VR256, memopv4i64, i256mem,
                                   SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPACKSSDWY : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_avx2_packssdw,
                                   VR256, memopv4i64, i256mem,
                                   SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPACKUSWBY : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_avx2_packuswb,
                                   VR256, memopv4i64, i256mem,
                                   SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
}
4254
// Legacy SSE2 saturating packs: 2-address form with $src1 tied to $dst.
let Constraints = "$src1 = $dst" in {
defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P>;
defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P>;
defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128,
                                 VR128, memopv2i64, i128mem,
                                 SSE_INTALU_ITINS_P>;
} // Constraints = "$src1 = $dst"
4266
4267//===---------------------------------------------------------------------===//
4268// SSE2 - Packed Integer Shuffle Instructions
4269//===---------------------------------------------------------------------===//
4270
let ExeDomain = SSEPackedInt in {
// Common skeleton for the PSHUF* family (opcode 0x70): a shuffle of one
// vector controlled by an 8-bit immediate, with register- and memory-source
// forms.  The concrete mnemonic/prefix (pshufd/pshufhw/pshuflw) is supplied
// at instantiation time.
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt, SDNode OpNode> {
// Register source: dst = OpNode(src1, imm8).
def ri : Ii8<0x70, MRMSrcReg,
             (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
             !strconcat(OpcodeStr,
                        "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR128:$dst, (vt (OpNode VR128:$src1, (i8 imm:$src2))))],
              IIC_SSE_PSHUF>;
// Memory source: the 128-bit load is bitconverted to the element type the
// shuffle node expects.
def mi : Ii8<0x70, MRMSrcMem,
             (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
             !strconcat(OpcodeStr,
                        "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR128:$dst,
                (vt (OpNode (bitconvert (memopv2i64 addr:$src1)),
                             (i8 imm:$src2))))],
                             IIC_SSE_PSHUF>;
}

// 256-bit (VR256) counterpart used by the AVX2 "Y" variants.  Note these
// forms specify no instruction itinerary.
multiclass sse2_pshuffle_y<string OpcodeStr, ValueType vt, SDNode OpNode> {
def Yri : Ii8<0x70, MRMSrcReg,
              (outs VR256:$dst), (ins VR256:$src1, i8imm:$src2),
              !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR256:$dst, (vt (OpNode VR256:$src1, (i8 imm:$src2))))]>;
def Ymi : Ii8<0x70, MRMSrcMem,
              (outs VR256:$dst), (ins i256mem:$src1, i8imm:$src2),
              !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR256:$dst,
                (vt (OpNode (bitconvert (memopv4i64 addr:$src1)),
                             (i8 imm:$src2))))]>;
}
} // ExeDomain = SSEPackedInt
4304
let Predicates = [HasAVX] in {
 // Slightly prefer VPSHUFD over competing shuffle selections.
 let AddedComplexity = 5 in
  defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, X86PShufd>, TB, OpSize, VEX;

 // SSE2 with ImmT == Imm8 and XS prefix.
  defm VPSHUFHW : sse2_pshuffle<"vpshufhw", v8i16, X86PShufhw>, XS, VEX;

 // SSE2 with ImmT == Imm8 and XD prefix.
  defm VPSHUFLW : sse2_pshuffle<"vpshuflw", v8i16, X86PShuflw>, XD, VEX;

 // Let VPSHUFD also implement v4f32 X86PShufd shuffles (same lane
 // permutation, float-typed operands).
 def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
           (VPSHUFDmi addr:$src1, imm:$imm)>;
 def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
           (VPSHUFDri VR128:$src1, imm:$imm)>;
}
4320
let Predicates = [HasAVX2] in {
  // 256-bit AVX2 shuffles; VEX_L selects the 256-bit vector length and the
  // prefix (OpSize/XS/XD) distinguishes vpshufd/vpshufhw/vpshuflw as in the
  // 128-bit case.
  defm VPSHUFD : sse2_pshuffle_y<"vpshufd", v8i32, X86PShufd>,
                                TB, OpSize, VEX,VEX_L;
  defm VPSHUFHW : sse2_pshuffle_y<"vpshufhw", v16i16, X86PShufhw>,
                                  XS, VEX, VEX_L;
  defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, X86PShuflw>,
                                  XD, VEX, VEX_L;
}
4329
let Predicates = [UseSSE2] in {
 // Slightly prefer PSHUFD over competing shuffle selections.
 let AddedComplexity = 5 in
  defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, X86PShufd>, TB, OpSize;

 // SSE2 with ImmT == Imm8 and XS prefix.
  defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, X86PShufhw>, XS;

 // SSE2 with ImmT == Imm8 and XD prefix.
  defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, X86PShuflw>, XD;

 // Let PSHUFD also implement v4f32 X86PShufd shuffles, mirroring the HasAVX
 // patterns above.
 def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
           (PSHUFDmi addr:$src1, imm:$imm)>;
 def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
           (PSHUFDri VR128:$src1, imm:$imm)>;
}
4345
4346//===---------------------------------------------------------------------===//
4347// SSE2 - Packed Integer Unpack Instructions
4348//===---------------------------------------------------------------------===//
4349
let ExeDomain = SSEPackedInt in {
// Skeleton for the PUNPCK* interleave family: reg-reg and reg-mem forms.
// Is2Addr selects between the 2-operand SSE asm string and the 3-operand
// AVX one.
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
                       SDNode OpNode, PatFrag bc_frag, bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
      IIC_SSE_UNPCK>;
  // Memory form: the v2i64 load is bitcast (bc_frag) to the element type
  // the unpack node operates on.
  def rm : PDI<opc, MRMSrcMem,
      (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst, (OpNode VR128:$src1,
                                  (bc_frag (memopv2i64
                                               addr:$src2))))],
                                               IIC_SSE_UNPCK>;
}

// 256-bit (VR256) counterpart for the AVX2 "Y" variants; always the
// 3-operand asm form, and no itinerary is specified.
multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
                         SDNode OpNode, PatFrag bc_frag> {
  def Yrr : PDI<opc, MRMSrcReg,
      (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
      [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>;
  def Yrm : PDI<opc, MRMSrcMem,
      (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
      [(set VR256:$dst, (OpNode VR256:$src1,
                                  (bc_frag (memopv4i64 addr:$src2))))]>;
}

// VEX-encoded 128-bit unpacks (low halves 0x60-0x6C, high halves 0x68-0x6D),
// non-destructive 3-operand form (Is2Addr = 0).
let Predicates = [HasAVX] in {
  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
                                 bc_v16i8, 0>, VEX_4V;
  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
                                 bc_v8i16, 0>, VEX_4V;
  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
                                 bc_v4i32, 0>, VEX_4V;
  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
                                 bc_v2i64, 0>, VEX_4V;

  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
                                 bc_v16i8, 0>, VEX_4V;
  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
                                 bc_v8i16, 0>, VEX_4V;
  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
                                 bc_v4i32, 0>, VEX_4V;
  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
                                 bc_v2i64, 0>, VEX_4V;
}

// AVX2 256-bit unpacks, tagged VEX_L for the 256-bit vector length.
let Predicates = [HasAVX2] in {
  defm VPUNPCKLBW  : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl,
                                   bc_v32i8>, VEX_4V, VEX_L;
  defm VPUNPCKLWD  : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl,
                                   bc_v16i16>, VEX_4V, VEX_L;
  defm VPUNPCKLDQ  : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl,
                                   bc_v8i32>, VEX_4V, VEX_L;
  defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
                                   bc_v4i64>, VEX_4V, VEX_L;

  defm VPUNPCKHBW  : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh,
                                   bc_v32i8>, VEX_4V, VEX_L;
  defm VPUNPCKHWD  : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh,
                                   bc_v16i16>, VEX_4V, VEX_L;
  defm VPUNPCKHDQ  : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh,
                                   bc_v8i32>, VEX_4V, VEX_L;
  defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh,
                                   bc_v4i64>, VEX_4V, VEX_L;
}

// Legacy SSE2 unpacks: 2-address form (default Is2Addr = 1), $src1 tied to
// $dst.
let Constraints = "$src1 = $dst" in {
  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
                                bc_v16i8>;
  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
                                bc_v8i16>;
  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
                                bc_v4i32>;
  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
                                bc_v2i64>;

  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
                                bc_v16i8>;
  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
                                bc_v8i16>;
  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
                                bc_v4i32>;
  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
                                bc_v2i64>;
}
} // ExeDomain = SSEPackedInt
4444
4445//===---------------------------------------------------------------------===//
4446// SSE2 - Packed Integer Extract and Insert
4447//===---------------------------------------------------------------------===//
4448
let ExeDomain = SSEPackedInt in {
// PINSRW (opcode 0xC4): insert a 16-bit element, selected by imm $src3, from
// a GR32 or a 16-bit memory location.  Is2Addr picks the "pinsrw" 2-operand
// vs "vpinsrw" 3-operand asm string.
multiclass sse2_pinsrw<bit Is2Addr = 1> {
  def rri : Ii8<0xC4, MRMSrcReg,
       (outs VR128:$dst), (ins VR128:$src1,
        GR32:$src2, i32i8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))], IIC_SSE_PINSRW>;
  // Memory form: the word is anyext-loaded (extloadi16) before insertion.
  def rmi : Ii8<0xC4, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1,
                        i16mem:$src2, i32i8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
                    imm:$src3))], IIC_SSE_PINSRW>;
}

// Extract
// PEXTRW (opcode 0xC5): extract the 16-bit element selected by imm $src2
// into a GR32.
let Predicates = [HasAVX] in
def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
                    (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
                                                imm:$src2))]>, TB, OpSize, VEX;
def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
                    (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
                                                imm:$src2))], IIC_SSE_PEXTRW>;

// Insert
let Predicates = [HasAVX] in {
  defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V;
  // GR64-source variant: assembler/disassembler only (no selection pattern).
  def  VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
       "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
       []>, TB, OpSize, VEX_4V;
}

// Legacy SSE2 PINSRW: 2-address form.
let Constraints = "$src1 = $dst" in
  defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[UseSSE2]>;

} // ExeDomain = SSEPackedInt
4496
4497//===---------------------------------------------------------------------===//
4498// SSE2 - Packed Mask Creation
4499//===---------------------------------------------------------------------===//
4500
let ExeDomain = SSEPackedInt in {

// PMOVMSKB (opcode 0xD7): gather the sign bits of each byte into a GPR.
// Matched through the pmovmskb intrinsics; the GR64-destination forms carry
// no pattern (assembler/disassembler only).
def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
           IIC_SSE_MOVMSK>, VEX;
def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK>, VEX;

let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX, VEX_L;
def VPMOVMSKBYr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
}

// Legacy SSE2 encoding.
def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
           IIC_SSE_MOVMSK>;

} // ExeDomain = SSEPackedInt
4524
4525//===---------------------------------------------------------------------===//
4526// SSE2 - Conditional Store
4527//===---------------------------------------------------------------------===//
4528
let ExeDomain = SSEPackedInt in {

// MASKMOVDQU (opcode 0xF7): byte-masked store of $src through the implicit
// pointer register (EDI in 32-bit mode, RDI in 64-bit mode), modeled here as
// an explicit Uses of that register and matched via the maskmov intrinsic.
let Uses = [EDI] in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
           IIC_SSE_MASKMOV>, VEX;
let Uses = [RDI] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
           IIC_SSE_MASKMOV>, VEX;

// Legacy SSE2 encodings.
let Uses = [EDI] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
           IIC_SSE_MASKMOV>;
let Uses = [RDI] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
           IIC_SSE_MASKMOV>;

} // ExeDomain = SSEPackedInt
4556
4557//===---------------------------------------------------------------------===//
4558// SSE2 - Move Doubleword
4559//===---------------------------------------------------------------------===//
4560
4561//===---------------------------------------------------------------------===//
4562// Move Int Doubleword to Packed Double Int
4563//
// MOVD/MOVQ (opcode 0x6E) from a GPR or 32-bit memory into the low element
// of an XMM register (modeled as scalar_to_vector), and the GR64 -> FR64
// bitcast form.  VEX-encoded variants first, then the legacy SSE2 ones.
def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
                        VEX;
def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
                        IIC_SSE_MOVDQ>,
                      VEX;
def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))],
                          IIC_SSE_MOVDQ>, VEX;
def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))],
                       IIC_SSE_MOVDQ>, VEX;

// Legacy SSE2 encodings of the same operations.
def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>;
def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
                        IIC_SSE_MOVDQ>;
def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))],
                          IIC_SSE_MOVDQ>;
def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))],
                       IIC_SSE_MOVDQ>;
4603
4604//===---------------------------------------------------------------------===//
4605// Move Int Doubleword to Single Scalar
4606//
// MOVD from GR32 / 32-bit memory into an FR32, modeled as a plain bitcast
// (no conversion).  VEX-encoded variants first, then legacy SSE2.
def VMOVDI2SSrr  : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (bitconvert GR32:$src))],
                      IIC_SSE_MOVDQ>, VEX;

def VMOVDI2SSrm  : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
                      IIC_SSE_MOVDQ>,
                      VEX;
def MOVDI2SSrr  : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (bitconvert GR32:$src))],
                      IIC_SSE_MOVDQ>;

def MOVDI2SSrm  : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
                      IIC_SSE_MOVDQ>;
4626
4627//===---------------------------------------------------------------------===//
4628// Move Packed Doubleword Int to Packed Double Int
4629//
// MOVD (opcode 0x7E, dest form): extract element 0 of a v4i32 into a GR32 or
// store it to 32-bit memory.  VEX-encoded variants first, then legacy SSE2.
def VMOVPDI2DIrr  : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX;
def VMOVPDI2DImr  : VPDI<0x7E, MRMDestMem, (outs),
                       (ins i32mem:$dst, VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (vector_extract (v4i32 VR128:$src),
                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
                                     VEX;
def MOVPDI2DIrr  : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>;
def MOVPDI2DImr  : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (vector_extract (v4i32 VR128:$src),
                                     (iPTR 0))), addr:$dst)],
                                     IIC_SSE_MOVDQ>;
4649
4650//===---------------------------------------------------------------------===//
4651// Move Packed Doubleword Int first element to Doubleword Int
4652//
// MOVQ: extract element 0 of a v2i64 into a GR64.  The VEX form requires
// both AVX and 64-bit mode (REX.W via VEX_W).
def VMOVPQIto64rr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                          "vmov{d|q}\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
                                                           (iPTR 0)))],
                                                           IIC_SSE_MOVD_ToGP>,
                      TB, OpSize, VEX, VEX_W, Requires<[HasAVX, In64BitMode]>;

def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
                                                         (iPTR 0)))],
                                                         IIC_SSE_MOVD_ToGP>;
4665
4666//===---------------------------------------------------------------------===//
4667// Bitcast FR64 <-> GR64
4668//
// FR64 <-> GR64 bitcast moves (and the i64 load/store forms), opcode 0x7E.
// NOTE(review): VMOVSDto64rr below uses IIC_SSE_MOVDQ while its legacy twin
// MOVSDto64rr uses IIC_SSE_MOVD_ToGP — the two itineraries look inconsistent
// for the same operation; confirm which is intended before changing either.
let Predicates = [HasAVX] in
def VMOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                        "vmovq\t{$src, $dst|$dst, $src}",
                        [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
                        VEX;
def VMOVSDto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                         "mov{d|q}\t{$src, $dst|$dst, $src}",
                         [(set GR64:$dst, (bitconvert FR64:$src))],
                         IIC_SSE_MOVDQ>, VEX;
def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
                         IIC_SSE_MOVDQ>, VEX;

// Legacy SSE2 encodings.
def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
                       IIC_SSE_MOVDQ>;
def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}",
                       [(set GR64:$dst, (bitconvert FR64:$src))],
                       IIC_SSE_MOVD_ToGP>;
def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
                       IIC_SSE_MOVDQ>;
4695
4696//===---------------------------------------------------------------------===//
4697// Move Scalar Single to Double Int
4698//
// FR32 -> GR32 / 32-bit memory bitcast moves (opcode 0x7E).  VEX-encoded
// variants first, then legacy SSE2.
def VMOVSS2DIrr  : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set GR32:$dst, (bitconvert FR32:$src))],
                      IIC_SSE_MOVD_ToGP>, VEX;
def VMOVSS2DImr  : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
                      IIC_SSE_MOVDQ>, VEX;
def MOVSS2DIrr  : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set GR32:$dst, (bitconvert FR32:$src))],
                      IIC_SSE_MOVD_ToGP>;
def MOVSS2DImr  : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
                      IIC_SSE_MOVDQ>;
4715
4716//===---------------------------------------------------------------------===//
4717// Patterns and instructions to describe movd/movq to XMM register zero-extends
4718//
// movd/movq GPR -> XMM where the upper vector elements are zeroed, modeled
// with X86vzmovl around scalar_to_vector.  AddedComplexity = 15 prefers
// these over the plain (non-zeroing) scalar_to_vector selections above.
let AddedComplexity = 15 in {
def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4i32 (X86vzmovl
                                      (v4i32 (scalar_to_vector GR32:$src)))))],
                                      IIC_SSE_MOVDQ>, VEX;
def VMOVZQI2PQIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
                       [(set VR128:$dst, (v2i64 (X86vzmovl
                                      (v2i64 (scalar_to_vector GR64:$src)))))],
                                      IIC_SSE_MOVDQ>,
                                      VEX, VEX_W;
}
// Legacy SSE2 encodings of the same zero-extending moves.
let AddedComplexity = 15 in {
def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4i32 (X86vzmovl
                                      (v4i32 (scalar_to_vector GR32:$src)))))],
                                      IIC_SSE_MOVDQ>;
def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
                       [(set VR128:$dst, (v2i64 (X86vzmovl
                                      (v2i64 (scalar_to_vector GR64:$src)))))],
                                      IIC_SSE_MOVDQ>;
}
4744
// Memory forms: load an i32 and zero-extend it into an XMM register.
let AddedComplexity = 20 in {
def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
                                                   (loadi32 addr:$src))))))],
                                                   IIC_SSE_MOVDQ>, VEX;
def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
                                                   (loadi32 addr:$src))))))],
                                                   IIC_SSE_MOVDQ>;
}
4759
let Predicates = [HasAVX] in {
  // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
  let AddedComplexity = 20 in {
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
              (VMOVZDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
              (VMOVZDI2PDIrm addr:$src)>;
  }
  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                               (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
            (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
}

// SSE2 versions of the zero-extending load patterns above.
let Predicates = [UseSSE2], AddedComplexity = 20 in {
  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
            (MOVZDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
            (MOVZDI2PDIrm addr:$src)>;
}
4783
// These are the correct encodings of the instructions so that we know how to
// read correct assembly, even though we continue to emit the wrong ones for
// compatibility with Darwin's buggy assembler.
// (The trailing 0 disables these aliases for emission; they are parse-only.)
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOV64toSDrr FR64:$dst, GR64:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOVSDto64rr GR64:$dst, FR64:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (VMOVZQI2PQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOVZQI2PQIrr VR128:$dst, GR64:$src), 0>;
4799
4800//===---------------------------------------------------------------------===//
4801// SSE2 - Move Quadword
4802//===---------------------------------------------------------------------===//
4803
4804//===---------------------------------------------------------------------===//
4805// Move Quadword Int to Packed Quadword Int
4806//
4807def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4808                    "vmovq\t{$src, $dst|$dst, $src}",
4809                    [(set VR128:$dst,
4810                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
4811                    VEX, Requires<[HasAVX]>;
4812def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4813                    "movq\t{$src, $dst|$dst, $src}",
4814                    [(set VR128:$dst,
4815                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
4816                      IIC_SSE_MOVDQ>, XS,
4817                    Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
4818
4819//===---------------------------------------------------------------------===//
4820// Move Packed Quadword Int to Quadword Int
4821//
4822def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4823                      "movq\t{$src, $dst|$dst, $src}",
4824                      [(store (i64 (vector_extract (v2i64 VR128:$src),
4825                                    (iPTR 0))), addr:$dst)],
4826                                    IIC_SSE_MOVDQ>, VEX;
4827def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4828                      "movq\t{$src, $dst|$dst, $src}",
4829                      [(store (i64 (vector_extract (v2i64 VR128:$src),
4830                                    (iPTR 0))), addr:$dst)],
4831                                    IIC_SSE_MOVDQ>;
4832
4833//===---------------------------------------------------------------------===//
4834// Store / copy lower 64-bits of a XMM register.
4835//
4836def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4837                     "movq\t{$src, $dst|$dst, $src}",
4838                     [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX;
4839def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4840                     "movq\t{$src, $dst|$dst, $src}",
4841                     [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)],
4842                     IIC_SSE_MOVDQ>;
4843
// Load an i64 from memory into the low element of an XMM register, zeroing
// the upper element (X86vzmovl).
let AddedComplexity = 20 in
def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                 (loadi64 addr:$src))))))],
                                                 IIC_SSE_MOVDQ>,
                     XS, VEX, Requires<[HasAVX]>;

let AddedComplexity = 20 in
def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "movq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                 (loadi64 addr:$src))))))],
                                                 IIC_SSE_MOVDQ>,
                     XS, Requires<[UseSSE2]>;
4861
// Select the zero-extending quadword loads for vzmovl-of-load and X86vzload.
let Predicates = [HasAVX], AddedComplexity = 20 in {
  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
            (VMOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
            (VMOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)),
            (VMOVZQI2PQIrm addr:$src)>;
}

let Predicates = [UseSSE2], AddedComplexity = 20 in {
  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
            (MOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
            (MOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
}

// 256-bit vzload: load 128 bits and let SUBREG_TO_REG imply zeroed high half.
let Predicates = [HasAVX] in {
def : Pat<(v4i64 (alignedX86vzload addr:$src)),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
          (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>;
}
4885
4886//===---------------------------------------------------------------------===//
4887// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
4888// IA32 document. movq xmm1, xmm2 does clear the high bits.
4889//
4890let AddedComplexity = 15 in
4891def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4892                        "vmovq\t{$src, $dst|$dst, $src}",
4893                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
4894                    IIC_SSE_MOVQ_RR>,
4895                      XS, VEX, Requires<[HasAVX]>;
4896let AddedComplexity = 15 in
4897def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4898                        "movq\t{$src, $dst|$dst, $src}",
4899                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
4900                    IIC_SSE_MOVQ_RR>,
4901                      XS, Requires<[UseSSE2]>;
4902
4903let AddedComplexity = 20 in
4904def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4905                        "vmovq\t{$src, $dst|$dst, $src}",
4906                    [(set VR128:$dst, (v2i64 (X86vzmovl
4907                                             (loadv2i64 addr:$src))))],
4908                                             IIC_SSE_MOVDQ>,
4909                      XS, VEX, Requires<[HasAVX]>;
4910let AddedComplexity = 20 in {
4911def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4912                        "movq\t{$src, $dst|$dst, $src}",
4913                    [(set VR128:$dst, (v2i64 (X86vzmovl
4914                                             (loadv2i64 addr:$src))))],
4915                                             IIC_SSE_MOVDQ>,
4916                      XS, Requires<[UseSSE2]>;
4917}
4918
// Extra selection patterns for the low-quadword zero-extending moves above.
let AddedComplexity = 20 in {
  let Predicates = [HasAVX] in {
    def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
              (VMOVZPQILo2PQIrm addr:$src)>;
    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
              (VMOVZPQILo2PQIrr VR128:$src)>;
  }
  let Predicates = [UseSSE2] in {
    def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
              (MOVZPQILo2PQIrm addr:$src)>;
    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
              (MOVZPQILo2PQIrr VR128:$src)>;
  }
}
4933
// Instructions to match in the assembler (no patterns; assembly-only forms).
def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                      "movq\t{$src, $dst|$dst, $src}", [],
                      IIC_SSE_MOVDQ>, VEX, VEX_W;
def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}", [],
                      IIC_SSE_MOVDQ>, VEX, VEX_W;
// Recognize "movd" with GR64 destination, but encode as a "movq"
def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                          "movd\t{$src, $dst|$dst, $src}", [],
                          IIC_SSE_MOVDQ>, VEX, VEX_W;

// Instructions for the disassembler
// xr = XMM register
// xm = mem64

let Predicates = [HasAVX] in
def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS;
def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, XS;
4955
4956//===---------------------------------------------------------------------===//
4957// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
4958//===---------------------------------------------------------------------===//
4959multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
4960                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
4961                              X86MemOperand x86memop> {
4962def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
4963                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4964                      [(set RC:$dst, (vt (OpNode RC:$src)))],
4965                      IIC_SSE_MOV_LH>;
4966def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
4967                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4968                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))],
4969                      IIC_SSE_MOV_LH>;
4970}
4971
// AVX 128/256-bit and SSE3 128-bit instantiations of the replicate ops.
let Predicates = [HasAVX] in {
  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v4f32, VR128, memopv4f32, f128mem>, VEX;
  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v4f32, VR128, memopv4f32, f128mem>, VEX;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                 v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                 v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                   memopv4f32, f128mem>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                                   memopv4f32, f128mem>;
4986
// Integer (v4i32/v8i32) patterns mapped onto the FP movshdup/movsldup
// instructions.
let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
            (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (memopv4i64 addr:$src)))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (memopv4i64 addr:$src)))),
            (VMOVSLDUPYrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSLDUPrm addr:$src)>;
}
5016
5017//===---------------------------------------------------------------------===//
5018// SSE3 - Replicate Double FP - MOVDDUP
5019//===---------------------------------------------------------------------===//
5020
5021multiclass sse3_replicate_dfp<string OpcodeStr> {
5022let neverHasSideEffects = 1 in
5023def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
5024                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5025                    [], IIC_SSE_MOV_LH>;
5026def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
5027                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5028                    [(set VR128:$dst,
5029                      (v2f64 (X86Movddup
5030                              (scalar_to_vector (loadf64 addr:$src)))))],
5031                              IIC_SSE_MOV_LH>;
5032}
5033
// FIXME: Merge with above classe when there're patterns for the ymm version
// 256-bit movddup variant (no itineraries attached, unlike the 128-bit class).
multiclass sse3_replicate_dfp_y<string OpcodeStr> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
                      (v4f64 (X86Movddup
                              (scalar_to_vector (loadf64 addr:$src)))))]>;
}
5045
// AVX and SSE3 instantiations of movddup.
let Predicates = [HasAVX] in {
  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
}

defm MOVDDUP : sse3_replicate_dfp<"movddup">;
5052
// Selection patterns for movddup under AVX. The per-pattern
// Requires<[HasAVX]> annotations were removed: the enclosing
// 'let Predicates = [HasAVX]' already applies the identical predicate,
// so they were redundant.
let Predicates = [HasAVX] in {
  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64
                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (VMOVDDUPrm addr:$src)>;

  // 256-bit version
  def : Pat<(X86Movddup (memopv4f64 addr:$src)),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (memopv4i64 addr:$src)),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (v4i64 VR256:$src)),
            (VMOVDDUPYrr VR256:$src)>;
}
5074
// SSE3 (non-VEX) movddup selection patterns; mirrors the 128-bit AVX set.
let Predicates = [UseSSE3] in {
  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64
                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (MOVDDUPrm addr:$src)>;
}
5086
5087//===---------------------------------------------------------------------===//
5088// SSE3 - Move Unaligned Integer
5089//===---------------------------------------------------------------------===//
5090
5091let Predicates = [HasAVX] in {
5092  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
5093                   "vlddqu\t{$src, $dst|$dst, $src}",
5094                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
5095  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
5096                   "vlddqu\t{$src, $dst|$dst, $src}",
5097                   [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
5098                   VEX, VEX_L;
5099}
5100def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
5101                   "lddqu\t{$src, $dst|$dst, $src}",
5102                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
5103                   IIC_SSE_LDDQU>;
5104
5105//===---------------------------------------------------------------------===//
5106// SSE3 - Arithmetic
5107//===---------------------------------------------------------------------===//
5108
5109multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
5110                       X86MemOperand x86memop, OpndItins itins,
5111                       bit Is2Addr = 1> {
5112  def rr : I<0xD0, MRMSrcReg,
5113       (outs RC:$dst), (ins RC:$src1, RC:$src2),
5114       !if(Is2Addr,
5115           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5116           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5117       [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>;
5118  def rm : I<0xD0, MRMSrcMem,
5119       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
5120       !if(Is2Addr,
5121           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5122           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5123       [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rr>;
5124}
5125
// AVX three-operand and SSE3 two-operand instantiations of ADDSUBPS/PD.
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
                                 f128mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V;
    defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
                               f256mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
                                 f128mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V;
    defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
                           f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V, VEX_L;
  }
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
                              f128mem, SSE_ALU_F32P>, TB, XD;
  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
                              f128mem, SSE_ALU_F64P>, TB, OpSize;
}
5148
5149//===---------------------------------------------------------------------===//
5150// SSE3 Instructions
5151//===---------------------------------------------------------------------===//
5152
5153// Horizontal ops
5154multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
5155                   X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
5156  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
5157       !if(Is2Addr,
5158         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5159         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5160      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>;
5161
5162  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
5163       !if(Is2Addr,
5164         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5165         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5166      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
5167        IIC_SSE_HADDSUB_RM>;
5168}
// Double-precision (S3I) horizontal add/sub multiclass; parallels S3D_Int.
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>;

  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
        IIC_SSE_HADDSUB_RM>;
}
5184
// AVX three-operand and SSE3 two-operand horizontal add/sub instantiations.
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                            X86fhadd, 0>, VEX_4V;
    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                            X86fhsub, 0>, VEX_4V;
    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                            X86fhadd, 0>, VEX_4V, VEX_L;
    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                            X86fhsub, 0>, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
                            X86fhadd, 0>, VEX_4V;
    defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
                            X86fhsub, 0>, VEX_4V;
    defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
                            X86fhadd, 0>, VEX_4V, VEX_L;
    defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
                            X86fhsub, 0>, VEX_4V, VEX_L;
  }
}

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>;
    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>;
    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>;
  }
}
5218
5219//===---------------------------------------------------------------------===//
5220// SSSE3 - Packed Absolute Instructions
5221//===---------------------------------------------------------------------===//
5222
5223
5224/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
5225multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
5226                            Intrinsic IntId128> {
5227  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
5228                    (ins VR128:$src),
5229                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5230                    [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>,
5231                    OpSize;
5232
5233  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
5234                    (ins i128mem:$src),
5235                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5236                    [(set VR128:$dst,
5237                      (IntId128
5238                       (bitconvert (memopv2i64 addr:$src))))], IIC_SSE_PABS_RM>,
5239                    OpSize;
5240}
5241
/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
/// 256-bit (AVX2) variant of SS3I_unop_rm_int; no itineraries attached.
multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId256> {
  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (IntId256 VR256:$src))]>,
                    OpSize;

  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                    (ins i256mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
                      (IntId256
                       (bitconvert (memopv4i64 addr:$src))))]>, OpSize;
}
5258
// AVX, AVX2, and SSSE3 instantiations of the packed-absolute intrinsics.
let Predicates = [HasAVX] in {
  defm VPABSB  : SS3I_unop_rm_int<0x1C, "vpabsb",
                                  int_x86_ssse3_pabs_b_128>, VEX;
  defm VPABSW  : SS3I_unop_rm_int<0x1D, "vpabsw",
                                  int_x86_ssse3_pabs_w_128>, VEX;
  defm VPABSD  : SS3I_unop_rm_int<0x1E, "vpabsd",
                                  int_x86_ssse3_pabs_d_128>, VEX;
}

let Predicates = [HasAVX2] in {
  defm VPABSB  : SS3I_unop_rm_int_y<0x1C, "vpabsb",
                                    int_x86_avx2_pabs_b>, VEX, VEX_L;
  defm VPABSW  : SS3I_unop_rm_int_y<0x1D, "vpabsw",
                                    int_x86_avx2_pabs_w>, VEX, VEX_L;
  defm VPABSD  : SS3I_unop_rm_int_y<0x1E, "vpabsd",
                                    int_x86_avx2_pabs_d>, VEX, VEX_L;
}

defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb",
                              int_x86_ssse3_pabs_b_128>;
defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw",
                              int_x86_ssse3_pabs_w_128>;
defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd",
                              int_x86_ssse3_pabs_d_128>;
5283
5284//===---------------------------------------------------------------------===//
5285// SSSE3 - Packed Binary Operator Instructions
5286//===---------------------------------------------------------------------===//
5287
5288def SSE_PHADDSUBD : OpndItins<
5289  IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
5290>;
5291def SSE_PHADDSUBSW : OpndItins<
5292  IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
5293>;
5294def SSE_PHADDSUBW : OpndItins<
5295  IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
5296>;
5297def SSE_PSHUFB : OpndItins<
5298  IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
5299>;
5300def SSE_PSIGN : OpndItins<
5301  IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
5302>;
5303def SSE_PMULHRSW : OpndItins<
5304  IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
5305>;
5306
/// SS3I_binop_rm - Simple SSSE3 bin op
/// Pattern-based (SDNode OpNode) binop; 'itins' supplies rr/rm itineraries.
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                         X86MemOperand x86memop, OpndItins itins,
                         bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
       OpSize;
  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1,
          (bitconvert (memop_frag addr:$src2)))))], itins.rm>, OpSize;
}
5329
/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
/// Fix: the 'itins' parameter was accepted but never used; the itineraries
/// are now attached to the rr/rm forms (itins.rr / itins.rm), matching the
/// sibling SS3I_binop_rm multiclass. All existing instantiations already
/// pass an OpndItins argument, so no caller changes are needed.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId128, OpndItins itins,
                             bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))], itins.rr>,
       OpSize;
  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1,
          (bitconvert (memopv2i64 addr:$src2))))], itins.rm>, OpSize;
}
5351
// 256-bit (AVX2) intrinsic binop; always three-operand, so no Is2Addr form.
multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256> {
  let isCommutable = 1 in
  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
       OpSize;
  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1,
          (bitconvert (memopv4i64 addr:$src2))))]>, OpSize;
}
5367
// AVX (three-operand, VEX_4V) instantiations of the SSSE3 binops.
// All but vpmulhrsw are non-commutative.
let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
                                  memopv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
                                  memopv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
                                  memopv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
                                  memopv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPSIGNB    : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128,
                                  memopv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSIGNW    : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128,
                                  memopv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSIGND    : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128,
                                  memopv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
                                  memopv2i64, i128mem,
                                  SSE_PSHUFB, 0>, VEX_4V;
  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                      int_x86_ssse3_phadd_sw_128,
                                      SSE_PHADDSUBSW, 0>, VEX_4V;
  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                      int_x86_ssse3_phsub_sw_128,
                                      SSE_PHADDSUBSW, 0>, VEX_4V;
  defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw",
                                      int_x86_ssse3_pmadd_ub_sw_128,
                                      SSE_PMADD, 0>, VEX_4V;
}
defm VPMULHRSW    : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
                                      int_x86_ssse3_pmul_hr_sw_128,
                                      SSE_PMULHRSW, 0>, VEX_4V;
}
5408
// AVX2 256-bit SSSE3 instructions.  Itineraries mirror the 128-bit AVX
// group above: PHADDSUBW/PHADDSUBD for the horizontal ops, PSIGN for the
// sign ops, PSHUFB for the byte shuffle.  (Previously every entry reused
// SSE_PHADDSUBW by copy/paste.)
let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
                                  memopv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
                                  memopv4i64, i256mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
                                  memopv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
                                  memopv4i64, i256mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
  defm VPSIGNBY   : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
                                  memopv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSIGNWY   : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
                                  memopv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSIGNDY   : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
                                  memopv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
                                  memopv4i64, i256mem,
                                  SSE_PSHUFB, 0>, VEX_4V, VEX_L;
  defm VPHADDSW   : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                        int_x86_avx2_phadd_sw>, VEX_4V, VEX_L;
  defm VPHSUBSW   : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                        int_x86_avx2_phsub_sw>, VEX_4V, VEX_L;
  defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
                                       int_x86_avx2_pmadd_ub_sw>, VEX_4V, VEX_L;
}
// Defined outside the isCommutable = 0 scope: pmulhrsw's operands commute.
defm VPMULHRSW    : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
                                        int_x86_avx2_pmul_hr_sw>, VEX_4V, VEX_L;
}
5445
// Legacy (non-VEX) SSSE3 forms: destructive two-operand encodings, hence the
// "$src1 = $dst" constraint.  None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PSIGNB    : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSIGNW    : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSIGND    : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128,
                                 memopv2i64, i128mem, SSE_PSHUFB>;
  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                     int_x86_ssse3_phadd_sw_128,
                                     SSE_PHADDSUBSW>;
  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                     int_x86_ssse3_phsub_sw_128,
                                     SSE_PHADDSUBSW>;
  defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw",
                                     int_x86_ssse3_pmadd_ub_sw_128, SSE_PMADD>;
}
// Defined outside the isCommutable = 0 scope: pmulhrsw's operands commute.
defm PMULHRSW    : SS3I_binop_rm_int<0x0B, "pmulhrsw",
                                     int_x86_ssse3_pmul_hr_sw_128,
                                     SSE_PMULHRSW>;
}
5478
5479//===---------------------------------------------------------------------===//
5480// SSSE3 - Packed Align Instruction Patterns
5481//===---------------------------------------------------------------------===//
5482
// ssse3_palign - PALIGNR: extract a byte-aligned 128-bit window, selected by
// the immediate, from the concatenation of the two sources.  No selection
// patterns here; the X86PAlign patterns further below handle matching.
multiclass ssse3_palign<string asm, bit Is2Addr = 1> {
  let neverHasSideEffects = 1 in {
  def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNR>, OpSize;
  let mayLoad = 1 in
  def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNR>, OpSize;
  }
}
5502
// ssse3_palign_y - 256-bit (AVX2) VPALIGNR forms.
// NOTE(review): Is2Addr is accepted but never used here — the 256-bit forms
// are always three-operand.  Also, unlike the 128-bit forms above, no
// IIC_SSE_PALIGNR itinerary is attached; confirm whether that is intended.
multiclass ssse3_palign_y<string asm, bit Is2Addr = 1> {
  let neverHasSideEffects = 1 in {
  def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
      (ins VR256:$src1, VR256:$src2, i8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, OpSize;
  let mayLoad = 1 in
  def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
      (ins VR256:$src1, i256mem:$src2, i8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, OpSize;
  }
}

// Instantiations: AVX/AVX2 use the three-operand form, legacy SSSE3 is
// destructive (tied $src1 = $dst).
let Predicates = [HasAVX] in
  defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V;
let Predicates = [HasAVX2] in
  defm VPALIGN : ssse3_palign_y<"vpalignr", 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGN : ssse3_palign<"palignr">;
5525
// Select X86PAlign DAG nodes to the PALIGNR instructions.  In every pattern
// the sources are deliberately passed in swapped order ($src2 first) —
// presumably to match the order in which palignr concatenates its operands;
// the swap is consistent across all twelve patterns.
let Predicates = [HasAVX2] in {
def : Pat<(v8i32 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v8f32 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v16i16 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v32i8 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
}

let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}

let Predicates = [UseSSSE3] in {
def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}
5558
5559//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization (MONITOR/MWAIT)
5561//===---------------------------------------------------------------------===//
5562
// MONITOR pseudo: takes the address as a generic memory operand plus two
// GR32 arguments and is expanded by a custom inserter into the real
// MONITORrrr below, which implicitly reads EAX/ECX/EDX.
let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
                Requires<[HasSSE3]>;
}

let Uses = [EAX, ECX, EDX] in
def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
                 TB, Requires<[HasSSE3]>;
let Uses = [ECX, EAX] in
def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
                [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
                TB, Requires<[HasSSE3]>;

// Assembler aliases accepting the explicit AT&T register operand spellings.
def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>;
def : InstAlias<"mwait %rax, %rcx", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor %eax, %ecx, %edx", (MONITORrrr)>,
      Requires<[In32BitMode]>;
def : InstAlias<"monitor %rax, %rcx, %rdx", (MONITORrrr)>,
      Requires<[In64BitMode]>;
5584
5585//===----------------------------------------------------------------------===//
5586// SSE4.1 - Packed Move with Sign/Zero Extend
5587//===----------------------------------------------------------------------===//
5588
// SS41I_binop_rm_int8 - pmovsx/pmovzx forms with 8 source elements: the
// memory form reads 64 bits (matched via a scalar_to_vector of loadi64).
multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;

  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
       [(set VR128:$dst,
         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
       OpSize;
}

// SS41I_binop_rm_int16_y - 256-bit (AVX2) forms with 16 source elements:
// the memory form reads a full 128 bits.
multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId> {
  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;

  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (IntId (load addr:$src)))]>, OpSize;
}
5611
// 128-bit AVX forms.
let Predicates = [HasAVX] in {
defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>,
                                     VEX;
defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", int_x86_sse41_pmovsxwd>,
                                     VEX;
defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq", int_x86_sse41_pmovsxdq>,
                                     VEX;
defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw", int_x86_sse41_pmovzxbw>,
                                     VEX;
defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd", int_x86_sse41_pmovzxwd>,
                                     VEX;
defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", int_x86_sse41_pmovzxdq>,
                                     VEX;
}

// 256-bit AVX2 forms: same opcodes with VEX.L set.
let Predicates = [HasAVX2] in {
defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw",
                                        int_x86_avx2_pmovsxbw>, VEX, VEX_L;
defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd",
                                        int_x86_avx2_pmovsxwd>, VEX, VEX_L;
defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq",
                                        int_x86_avx2_pmovsxdq>, VEX, VEX_L;
defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw",
                                        int_x86_avx2_pmovzxbw>, VEX, VEX_L;
defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd",
                                        int_x86_avx2_pmovzxwd>, VEX, VEX_L;
defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq",
                                        int_x86_avx2_pmovzxdq>, VEX, VEX_L;
}

// Legacy SSE4.1 forms.
defm PMOVSXBW   : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
defm PMOVSXWD   : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>;
defm PMOVSXDQ   : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>;
defm PMOVZXBW   : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
defm PMOVZXWD   : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
defm PMOVZXDQ   : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;
5648
// Fold partial-vector loads into the AVX pmovsx/pmovzx memory forms: the
// instruction only consumes the low 64 bits, so a zero-extending move-load
// (vzmovl/vzload) or a full 128-bit load of the source is equally valid.
let Predicates = [HasAVX] in {
  // Common patterns involving scalar load.
  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
            (VPMOVSXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
            (VPMOVSXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
            (VPMOVSXBWrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
            (VPMOVSXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
            (VPMOVSXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
            (VPMOVSXWDrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
            (VPMOVSXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
            (VPMOVSXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
            (VPMOVSXDQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
            (VPMOVZXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
            (VPMOVZXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
            (VPMOVZXBWrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
            (VPMOVZXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
            (VPMOVZXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
            (VPMOVZXWDrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
            (VPMOVZXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
            (VPMOVZXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
            (VPMOVZXDQrm addr:$src)>;
}
5693
// Same load-folding patterns for the legacy (non-VEX) SSE4.1 forms.
let Predicates = [UseSSE41] in {
  // Common patterns involving scalar load.
  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
            (PMOVSXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
            (PMOVSXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
            (PMOVSXBWrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
            (PMOVSXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
            (PMOVSXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
            (PMOVSXWDrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
            (PMOVSXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
            (PMOVSXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
            (PMOVSXDQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
            (PMOVZXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
            (PMOVZXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
            (PMOVZXBWrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
            (PMOVZXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
            (PMOVZXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
            (PMOVZXWDrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
            (PMOVZXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
            (PMOVZXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
            (PMOVZXDQrm addr:$src)>;
}
5738
// Select the custom widening-move DAG nodes (X86vzmovly / X86vsmovl) to the
// register forms of pmovzx/pmovsx.
let Predicates = [HasAVX2] in {
  let AddedComplexity = 15 in {
    def : Pat<(v4i64 (X86vzmovly (v4i32 VR128:$src))),
              (VPMOVZXDQYrr VR128:$src)>;
    def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))),
              (VPMOVZXWDYrr VR128:$src)>;
  }

  def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
  def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
}

let Predicates = [HasAVX] in {
  def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
  def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
}

let Predicates = [UseSSE41] in {
  def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
  def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
}
5760
5761
// SS41I_binop_rm_int4 - pmovsx/pmovzx forms with 4 source elements: the
// memory form reads 32 bits (matched via a scalar_to_vector of loadi32).
multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;

  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
       [(set VR128:$dst,
         (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
          OpSize;
}
5773
// SS41I_binop_rm_int8_y - 256-bit (AVX2) pmovsx/pmovzx forms with 8 source
// elements, widening the low part of an XMM register or a memory operand
// into a YMM result.
multiclass SS41I_binop_rm_int8_y<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId> {
  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;

  // The memory form reads 64 bits, as the loadi64 in the pattern shows, so
  // the operand must be i64mem (was i32mem, which misstated the access size
  // to the assembler/disassembler).
  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i64mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
       [(set VR256:$dst,
         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
          OpSize;
}
5786
// 128-bit AVX forms.
let Predicates = [HasAVX] in {
defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd>,
                                     VEX;
defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq>,
                                     VEX;
defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd>,
                                     VEX;
defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq>,
                                     VEX;
}

// 256-bit AVX2 forms.
let Predicates = [HasAVX2] in {
defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd",
                                       int_x86_avx2_pmovsxbd>, VEX, VEX_L;
defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq",
                                       int_x86_avx2_pmovsxwq>, VEX, VEX_L;
defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd",
                                       int_x86_avx2_pmovzxbd>, VEX, VEX_L;
defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq",
                                       int_x86_avx2_pmovzxwq>, VEX, VEX_L;
}

// Legacy SSE4.1 forms.
defm PMOVSXBD   : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
defm PMOVSXWQ   : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
defm PMOVZXBD   : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
defm PMOVZXWQ   : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;
5813
// Fold zero-extending 32-bit move-loads into the 4-element pmovsx/pmovzx
// memory forms (only the low 32 bits are consumed).
let Predicates = [HasAVX] in {
  // Common patterns involving scalar load
  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
            (VPMOVSXBDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
            (VPMOVSXWQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
            (VPMOVZXBDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
            (VPMOVZXWQrm addr:$src)>;
}

let Predicates = [UseSSE41] in {
  // Common patterns involving scalar load
  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
            (PMOVSXBDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
            (PMOVSXWQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
            (PMOVZXBDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
            (PMOVZXWQrm addr:$src)>;
}
5839
// SS41I_binop_rm_int2 - pmovsx/pmovzx forms with 2 source elements.
multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;

  // Expecting an i16 load any-extended to an i32 value.
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId (bitconvert
                     (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>,
                 OpSize;
}
5852
// SS41I_binop_rm_int4_y - 256-bit (AVX2) pmovsx/pmovzx forms with 4 source
// elements.
multiclass SS41I_binop_rm_int4_y<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId> {
  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;

  // The memory form reads 32 bits, as the loadi32 in the pattern shows, so
  // the operand must be i32mem (was i16mem, copied from the 128-bit int2
  // multiclass along with its now-stale comment).
  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i32mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (IntId (bitconvert
                      (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
                  OpSize;
}
5866
// pmovsxbq/pmovzxbq: 2-element 128-bit forms and 4-element 256-bit forms.
let Predicates = [HasAVX] in {
defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>,
                                     VEX;
defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>,
                                     VEX;
}
let Predicates = [HasAVX2] in {
defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq",
                                       int_x86_avx2_pmovsxbq>, VEX, VEX_L;
defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq",
                                       int_x86_avx2_pmovzxbq>, VEX, VEX_L;
}
defm PMOVSXBQ   : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
defm PMOVZXBQ   : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;

// Fold a zero-extended 32-bit scalar load feeding the bq intrinsics into the
// memory forms.
let Predicates = [HasAVX] in {
  // Common patterns involving scalar load
  def : Pat<(int_x86_sse41_pmovsxbq
              (bitconvert (v4i32 (X86vzmovl
                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
            (VPMOVSXBQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbq
              (bitconvert (v4i32 (X86vzmovl
                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
            (VPMOVZXBQrm addr:$src)>;
}

let Predicates = [UseSSE41] in {
  // Common patterns involving scalar load
  def : Pat<(int_x86_sse41_pmovsxbq
              (bitconvert (v4i32 (X86vzmovl
                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
            (PMOVSXBQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbq
              (bitconvert (v4i32 (X86vzmovl
                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
            (PMOVZXBQrm addr:$src)>;
}
5907
5908//===----------------------------------------------------------------------===//
5909// SSE4.1 - Extract Instructions
5910//===----------------------------------------------------------------------===//
5911
/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                 (ins VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
                 OpSize;
  let neverHasSideEffects = 1, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 []>, OpSize;
// FIXME:
// There's an AssertZext in the way of writing the store pattern
// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
}

let Predicates = [HasAVX] in {
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
  // GR64-destination variant; no selection pattern (assembler/disassembler
  // use only).
  def  VPEXTRBrr64 : SS4AIi8<0x14, MRMDestReg, (outs GR64:$dst),
         (ins VR128:$src1, i32i8imm:$src2),
         "vpextrb\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, OpSize, VEX;
}

defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
5939
5940
/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination.  Only
/// the store form is defined here; register extracts use PEXTRW from SSE2.
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  let neverHasSideEffects = 1, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 []>, OpSize;
// FIXME:
// There's an AssertZext in the way of writing the store pattern
// (store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))), addr:$dst)
}

let Predicates = [HasAVX] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;

defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
5958
5959
/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                 (ins VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32:$dst,
                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>, OpSize;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                          addr:$dst)]>, OpSize;
}

let Predicates = [HasAVX] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
5980
/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
/// (same opcode as pextrd; distinguished by REX.W / VEX.W).
multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                 (ins VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR64:$dst,
                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>, OpSize, REX_W;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
                          addr:$dst)]>, OpSize, REX_W;
}

let Predicates = [HasAVX] in
  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;

defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">;
6001
/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// destination.  The f32 lane is matched as an i32 extract via bc_v4i32.
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                 (ins VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32:$dst,
                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
           OpSize;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                          addr:$dst)]>, OpSize;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [HasAVX] in {
    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
    // GR64-destination variant; no selection pattern.
    // NOTE(review): the asm string below has a stray space before the tab
    // ("vextractps \t"); confirm whether the printed spacing is intended.
    def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst),
                    (ins VR128:$src1, i32i8imm:$src2),
                    "vextractps \t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    []>, OpSize, VEX;
  }
  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps">;
}
6030
// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
// The two patterns are identical except for the predicate: the AVX form is
// used when AVX is available, the legacy form when only SSE4.1 is in use.
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[HasAVX]>;
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[UseSSE41]>;
6042
6043//===----------------------------------------------------------------------===//
6044// SSE4.1 - Insert Instructions
6045//===----------------------------------------------------------------------===//
6046
/// SS41I_insert8 - SSE 4.1 insert a byte from a GPR or from memory (PINSRB).
/// Is2Addr selects the two-operand SSE assembly syntax (dest tied to $src1)
/// versus the three-operand AVX syntax.
multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  // Insert the low byte of a 32-bit GPR at element $src3.
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize;
  // Insert a byte loaded from memory (any-extended) at element $src3.
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
                   imm:$src3))]>, OpSize;
}

let Predicates = [HasAVX] in
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
6071
/// SS41I_insert32 - SSE 4.1 insert a 32-bit element from a GPR or from
/// memory (PINSRD). Is2Addr selects the SSE vs. AVX assembly syntax.
multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  // Insert a 32-bit GPR at element $src3 of the v4i32 vector.
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      OpSize;
  // Insert a 32-bit value loaded from memory at element $src3.
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
                          imm:$src3)))]>, OpSize;
}

let Predicates = [HasAVX] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
6097
/// SS41I_insert64 - SSE 4.1 insert a 64-bit element from a GPR or from
/// memory (PINSRQ). The instantiations below add REX_W/VEX_W for the
/// 64-bit operand size. Is2Addr selects the SSE vs. AVX assembly syntax.
multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  // Insert a 64-bit GPR at element $src3 of the v2i64 vector.
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
      OpSize;
  // Insert a 64-bit value loaded from memory at element $src3.
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
                          imm:$src3)))]>, OpSize;
}

let Predicates = [HasAVX] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
6123
// insertps has a few different modes, there's the first two here below which
// are optimized inserts that won't zero arbitrary elements in the destination
// vector. The next one matches the intrinsic and could zero arbitrary elements
// in the target vector.
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
  // Register form: both source and insert operand live in XMM registers;
  // $src3 is the INSERTPS control immediate.
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u32u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>,
      OpSize;
  // Memory form: a scalar f32 load is widened via scalar_to_vector so the
  // same X86insrtps node can be matched.
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insrtps VR128:$src1,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                    imm:$src3))]>, OpSize;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [HasAVX] in
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
  let Constraints = "$src1 = $dst" in
    defm INSERTPS : SS41I_insertf32<0x21, "insertps">;
}
6156
6157//===----------------------------------------------------------------------===//
6158// SSE4.1 - Round Instructions
6159//===----------------------------------------------------------------------===//
6160
/// sse41_fp_unop_rm - SSE 4.1 packed rounding (ROUNDPS/ROUNDPD and the AVX
/// forms). $src2 is the rounding-control immediate; the packed-single and
/// packed-double forms are emitted from the two opcodes.
multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
                            X86MemOperand x86memop, RegisterClass RC,
                            PatFrag mem_frag32, PatFrag mem_frag64,
                            Intrinsic V4F32Int, Intrinsic V2F64Int> {
let ExeDomain = SSEPackedSingle in {
  // Vector intrinsic operation, reg
  def PSr : SS4AIi8<opcps, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>,
                    OpSize;

  // Vector intrinsic operation, mem
  def PSm : SS4AIi8<opcps, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>,
                    OpSize;
} // ExeDomain = SSEPackedSingle

let ExeDomain = SSEPackedDouble in {
  // Vector intrinsic operation, reg
  def PDr : SS4AIi8<opcpd, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>,
                    OpSize;

  // Vector intrinsic operation, mem
  def PDm : SS4AIi8<opcpd, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>,
                    OpSize;
} // ExeDomain = SSEPackedDouble
}
6204
/// sse41_fp_binop_rm - SSE 4.1 scalar rounding (ROUNDSS/ROUNDSD). The plain
/// SSr/SDr register forms carry no pattern ([]) and are selected through
/// explicit Pat<> entries elsewhere; the *_Int forms match the intrinsics.
/// $src3 is the rounding-control immediate.
multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr,
                            Intrinsic F32Int,
                            Intrinsic F64Int, bit Is2Addr = 1> {
let ExeDomain = GenericDomain in {
  // Operation, reg. (no pattern; see the FR32 Pat<> definitions)
  def SSr : SS4AIi8<opcss, MRMSrcReg,
      (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32i8imm:$src3),
      !if(Is2Addr,
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      []>, OpSize;

  // Intrinsic operation, reg.
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
        OpSize;

  // Intrinsic operation, mem.
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
             (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
        OpSize;

  // Operation, reg. (no pattern; see the FR64 Pat<> definitions)
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        []>, OpSize;

  // Intrinsic operation, reg.
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
        OpSize;

  // Intrinsic operation, mem.
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
        OpSize;
} // ExeDomain = GenericDomain
}
6277
// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX] in {
  // Intrinsic form
  defm VROUND  : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
                                  memopv4f32, memopv2f64,
                                  int_x86_sse41_round_ps,
                                  int_x86_sse41_round_pd>, VEX;
  defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
                                  memopv8f32, memopv4f64,
                                  int_x86_avx_round_ps_256,
                                  int_x86_avx_round_pd_256>, VEX, VEX_L;
  defm VROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
                                  int_x86_sse41_round_ss,
                                  int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;

  // Map generic FP rounding nodes onto VROUNDSS/VROUNDSD with the matching
  // rounding-control immediate: 0x1 ffloor, 0x2 fceil, 0x3 ftrunc,
  // 0x4 frint, 0xC fnearbyint.
  // NOTE(review): the first pattern lacks the explicit (f32 ...) wrapper the
  // others have; the type is inferred from FR32 -- confirm intentional.
  def : Pat<(ffloor FR32:$src),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
  def : Pat<(f64 (ffloor FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
  def : Pat<(f32 (frint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
  def : Pat<(f64 (frint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;

  // Packed ffloor maps onto the 128/256-bit packed rounding instructions.
  def : Pat<(v4f32 (ffloor VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x1))>;
  def : Pat<(v2f64 (ffloor VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x1))>;
  def : Pat<(v8f32 (ffloor VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x1))>;
  def : Pat<(v4f64 (ffloor VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x1))>;
}
6323
// Legacy SSE4.1 rounding instructions (non-VEX).
defm ROUND  : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
                               memopv4f32, memopv2f64,
                               int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
let Constraints = "$src1 = $dst" in
defm ROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "round",
                               int_x86_sse41_round_ss, int_x86_sse41_round_sd>;

let Predicates = [UseSSE41] in {
  // Same generic-node -> ROUNDSS/ROUNDSD mapping as the AVX patterns above:
  // 0x1 ffloor, 0x2 fceil, 0x3 ftrunc, 0x4 frint, 0xC fnearbyint.
  def : Pat<(ffloor FR32:$src),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
  def : Pat<(f64 (ffloor FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
  def : Pat<(f32 (frint FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
  def : Pat<(f64 (frint FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;

  // Packed ffloor maps onto the 128-bit packed rounding instructions.
  def : Pat<(v4f32 (ffloor VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x1))>;
  def : Pat<(v2f64 (ffloor VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x1))>;
}
6358
6359//===----------------------------------------------------------------------===//
6360// SSE4.1 - Packed Bit Test
6361//===----------------------------------------------------------------------===//
6362
// ptest instruction we'll lower to this in X86ISelLowering primarily from
// the intel intrinsic that corresponds to this.
// All forms set EFLAGS from (X86ptest src1, src2).
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                OpSize, VEX;
def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
                OpSize, VEX;

// 256-bit AVX forms.
def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                OpSize, VEX, VEX_L;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR256:$src1, (memopv4i64 addr:$src2)))]>,
                OpSize, VEX, VEX_L;
}

// Legacy SSE4.1 PTEST.
let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
              OpSize;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
              OpSize;
}
6395
// The bit test instructions below are AVX only
/// avx_bittest - VTESTPS/VTESTPD: set EFLAGS from (X86testp src1, src2),
/// with register and memory source forms. VEX-encoded only.
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, OpSize, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
            OpSize, VEX;
}

// 128- and 256-bit single/double variants.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>,
                            VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>,
                            VEX_L;
}
}
6420
6421//===----------------------------------------------------------------------===//
6422// SSE4.1 - Misc Instructions
6423//===----------------------------------------------------------------------===//
6424
// POPCNT: population count of a 16/32/64-bit GPR or memory operand,
// matched from the generic ctpop node. All forms also write EFLAGS.
let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
                     OpSize, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)]>, OpSize, XS;

  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
                     XS;
  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)]>, XS;

  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
                      XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)]>, XS;
}
6453
6454
6455
// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
// Used for PHMINPOSUW; register and memory source forms.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId128> {
  def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))]>, OpSize;
  // Memory form: the v2i64 load is bitcast to the intrinsic's vector type.
  def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins i128mem:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst,
                       (IntId128
                        (bitconvert (memopv2i64 addr:$src))))]>, OpSize;
}

// AVX (VEX-encoded) form first, then the legacy SSE4.1 form.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
                                         int_x86_sse41_phminposuw>, VEX;
defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
                                         int_x86_sse41_phminposuw>;
6476
/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator matched from an
/// intrinsic. Is2Addr selects the two-operand SSE assembly syntax versus
/// the three-operand AVX syntax.
multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId128, bit Is2Addr = 1> {
  // Only the reg-reg form is commutable; the memory form fixes operand order.
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, OpSize;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1,
          (bitconvert (memopv2i64 addr:$src2))))]>, OpSize;
}
6496
/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator, 256-bit (VR256)
/// variant for the AVX2 instantiations. Always uses three-operand syntax.
multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                                Intrinsic IntId256> {
  // Only the reg-reg form is commutable; the memory form fixes operand order.
  let isCommutable = 1 in
  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, OpSize;
  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1,
          (bitconvert (memopv4i64 addr:$src2))))]>, OpSize;
}
6512
// 128-bit AVX forms. Note: a brace-less `let ... in` scopes over the next
// statement only, so `isCommutable = 0` applies just to VPACKUSDW (pack is
// not commutative); the min/max/mul defms keep the multiclass default.
let Predicates = [HasAVX] in {
  let isCommutable = 0 in
  defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
                                                         0>, VEX_4V;
  defm VPMINSB   : SS41I_binop_rm_int<0x38, "vpminsb",   int_x86_sse41_pminsb,
                                                         0>, VEX_4V;
  defm VPMINSD   : SS41I_binop_rm_int<0x39, "vpminsd",   int_x86_sse41_pminsd,
                                                         0>, VEX_4V;
  defm VPMINUD   : SS41I_binop_rm_int<0x3B, "vpminud",   int_x86_sse41_pminud,
                                                         0>, VEX_4V;
  defm VPMINUW   : SS41I_binop_rm_int<0x3A, "vpminuw",   int_x86_sse41_pminuw,
                                                         0>, VEX_4V;
  defm VPMAXSB   : SS41I_binop_rm_int<0x3C, "vpmaxsb",   int_x86_sse41_pmaxsb,
                                                         0>, VEX_4V;
  defm VPMAXSD   : SS41I_binop_rm_int<0x3D, "vpmaxsd",   int_x86_sse41_pmaxsd,
                                                         0>, VEX_4V;
  defm VPMAXUD   : SS41I_binop_rm_int<0x3F, "vpmaxud",   int_x86_sse41_pmaxud,
                                                         0>, VEX_4V;
  defm VPMAXUW   : SS41I_binop_rm_int<0x3E, "vpmaxuw",   int_x86_sse41_pmaxuw,
                                                         0>, VEX_4V;
  defm VPMULDQ   : SS41I_binop_rm_int<0x28, "vpmuldq",   int_x86_sse41_pmuldq,
                                                         0>, VEX_4V;
}

// 256-bit AVX2 forms (same isCommutable scoping note as above).
let Predicates = [HasAVX2] in {
  let isCommutable = 0 in
  defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
                                        int_x86_avx2_packusdw>, VEX_4V, VEX_L;
  defm VPMINSB   : SS41I_binop_rm_int_y<0x38, "vpminsb",
                                        int_x86_avx2_pmins_b>, VEX_4V, VEX_L;
  defm VPMINSD   : SS41I_binop_rm_int_y<0x39, "vpminsd",
                                        int_x86_avx2_pmins_d>, VEX_4V, VEX_L;
  defm VPMINUD   : SS41I_binop_rm_int_y<0x3B, "vpminud",
                                        int_x86_avx2_pminu_d>, VEX_4V, VEX_L;
  defm VPMINUW   : SS41I_binop_rm_int_y<0x3A, "vpminuw",
                                        int_x86_avx2_pminu_w>, VEX_4V, VEX_L;
  defm VPMAXSB   : SS41I_binop_rm_int_y<0x3C, "vpmaxsb",
                                        int_x86_avx2_pmaxs_b>, VEX_4V, VEX_L;
  defm VPMAXSD   : SS41I_binop_rm_int_y<0x3D, "vpmaxsd",
                                        int_x86_avx2_pmaxs_d>, VEX_4V, VEX_L;
  defm VPMAXUD   : SS41I_binop_rm_int_y<0x3F, "vpmaxud",
                                        int_x86_avx2_pmaxu_d>, VEX_4V, VEX_L;
  defm VPMAXUW   : SS41I_binop_rm_int_y<0x3E, "vpmaxuw",
                                        int_x86_avx2_pmaxu_w>, VEX_4V, VEX_L;
  defm VPMULDQ   : SS41I_binop_rm_int_y<0x28, "vpmuldq",
                                        int_x86_avx2_pmul_dq>, VEX_4V, VEX_L;
}

// Legacy SSE4.1 forms, destructive two-operand encoding ($src1 tied to $dst).
let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in
  defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>;
  defm PMINSB   : SS41I_binop_rm_int<0x38, "pminsb",   int_x86_sse41_pminsb>;
  defm PMINSD   : SS41I_binop_rm_int<0x39, "pminsd",   int_x86_sse41_pminsd>;
  defm PMINUD   : SS41I_binop_rm_int<0x3B, "pminud",   int_x86_sse41_pminud>;
  defm PMINUW   : SS41I_binop_rm_int<0x3A, "pminuw",   int_x86_sse41_pminuw>;
  defm PMAXSB   : SS41I_binop_rm_int<0x3C, "pmaxsb",   int_x86_sse41_pmaxsb>;
  defm PMAXSD   : SS41I_binop_rm_int<0x3D, "pmaxsd",   int_x86_sse41_pmaxsd>;
  defm PMAXUD   : SS41I_binop_rm_int<0x3F, "pmaxud",   int_x86_sse41_pmaxud>;
  defm PMAXUW   : SS41I_binop_rm_int<0x3E, "pmaxuw",   int_x86_sse41_pmaxuw>;
  defm PMULDQ   : SS41I_binop_rm_int<0x28, "pmuldq",   int_x86_sse41_pmuldq>;
}
6574
/// SS48I_binop_rm - Simple SSE41 binary operator.
/// Matched from a generic SDNode (not an intrinsic); parameterized over
/// vector type and register class so it serves both 128- and 256-bit forms.
/// Is2Addr selects the SSE vs. AVX assembly syntax.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, bit Is2Addr = 1> {
  // Only the reg-reg form is commutable; the memory form fixes operand order.
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, OpSize;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1,
          (bitconvert (memop_frag addr:$src2)))))]>, OpSize;
}
6595
// PMULLD (v4i32 multiply) and PCMPEQQ (v2i64 compare-equal):
// AVX 128-bit, AVX2 256-bit, and legacy SSE4.1 forms.
let Predicates = [HasAVX] in {
  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                memopv2i64, i128mem, 0>, VEX_4V;
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 memopv2i64, i128mem, 0>, VEX_4V;
}
let Predicates = [HasAVX2] in {
  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
}

let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                                memopv2i64, i128mem>;
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memopv2i64, i128mem>;
}
6615
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
/// ($src3), matched from an intrinsic. Used for the blend/dot-product/
/// mpsadbw families. Is2Addr selects the SSE vs. AVX assembly syntax.
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                 X86MemOperand x86memop, bit Is2Addr = 1> {
  // Only the reg-reg form is commutable; the memory form fixes operand order.
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
        OpSize;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (IntId RC:$src1,
           (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
        OpSize;
}
6642
// AVX immediate blends, PBLENDW, MPSADBW and dot products (DPPS/DPPD).
let Predicates = [HasAVX] in {
  let isCommutable = 0 in {
    let ExeDomain = SSEPackedSingle in {
    defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
                                        VR128, memopv4f32, f128mem, 0>, VEX_4V;
    defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
                                    int_x86_avx_blend_ps_256, VR256, memopv8f32,
                                    f256mem, 0>, VEX_4V, VEX_L;
    }
    let ExeDomain = SSEPackedDouble in {
    defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
                                        VR128, memopv2f64, f128mem, 0>, VEX_4V;
    defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
                                     int_x86_avx_blend_pd_256,VR256, memopv4f64,
                                     f256mem, 0>, VEX_4V, VEX_L;
    }
  defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
                                      VR128, memopv2i64, i128mem, 0>, VEX_4V;
  defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                      VR128, memopv2i64, i128mem, 0>, VEX_4V;
  }
  let ExeDomain = SSEPackedSingle in
  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                   VR128, memopv4f32, f128mem, 0>, VEX_4V;
  let ExeDomain = SSEPackedDouble in
  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                   VR128, memopv2f64, f128mem, 0>, VEX_4V;
  // Use f256mem here: the memop fragment is memopv8f32 (packed float) and
  // every other SSEPackedSingle instantiation above (VDPPS, VBLENDPSY) uses
  // an FP memory operand; i256mem was inconsistent with the SP domain.
  let ExeDomain = SSEPackedSingle in
  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                  VR256, memopv8f32, f256mem, 0>, VEX_4V, VEX_L;
}
6674
// AVX2 256-bit integer immediate blend and MPSADBW.
let Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
  defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
                                  VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                  VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
  }
}
6683
// Legacy SSE4.1 two-address forms of the immediate blends and dot products.
let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
  let ExeDomain = SSEPackedSingle in
  defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
                                     VR128, memopv4f32, f128mem>;
  let ExeDomain = SSEPackedDouble in
  defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd,
                                     VR128, memopv2f64, f128mem>;
  defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
                                     VR128, memopv2i64, i128mem>;
  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                     VR128, memopv2i64, i128mem>;
  }
  let ExeDomain = SSEPackedSingle in
  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                  VR128, memopv4f32, f128mem>;
  let ExeDomain = SSEPackedDouble in
  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                  VR128, memopv2f64, f128mem>;
}
6704
/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
/// VEX-only four-operand form: dst, two sources, plus a third register
/// source ($src3) encoded via VEX_I8IMM (used as the blend mask register).
multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                                    RegisterClass RC, X86MemOperand x86memop,
                                    PatFrag mem_frag, Intrinsic IntId> {
  // Register-register form.
  def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
                  IIC_DEFAULT, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;

  // Register-memory form; $src2 comes from memory.
  def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst,
                        (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
                               RC:$src3))],
                  IIC_DEFAULT, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
}
6725
// AVX/AVX2 variable blends: the blend mask lives in a register operand
// ($src3) rather than the implicit XMM0 used by the legacy SSE4.1 forms.
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
                                           memopv2f64, int_x86_sse41_blendvpd>;
defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
                                  memopv4f64, int_x86_avx_blendv_pd_256>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
                                           memopv4f32, int_x86_sse41_blendvps>;
defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
                                  memopv8f32, int_x86_avx_blendv_ps_256>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
                                           memopv2i64, int_x86_sse41_pblendvb>;
}

// 256-bit byte blend requires AVX2.
let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
                                      memopv4i64, int_x86_avx2_pblendvb>, VEX_L;
}
6747
// Map vselect with a register mask onto the AVX variable-blend instructions.
// Note the operand swap: vselect's "true" value ($src1) goes into the
// instruction's second source slot and the "false" value ($src2) into the
// first.
let Predicates = [HasAVX] in {
  def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                            (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
                            (v8f32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                            (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
                            (v4f64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;

  // X86Blend* nodes carry an immediate mask and map onto the immediate-blend
  // instructions, with the same source swap as above.
  def : Pat<(v8f32 (X86Blendps (v8f32 VR256:$src1), (v8f32 VR256:$src2),
                               (imm:$mask))),
            (VBLENDPSYrri VR256:$src2, VR256:$src1, imm:$mask)>;
  def : Pat<(v4f64 (X86Blendpd (v4f64 VR256:$src1), (v4f64 VR256:$src2),
                               (imm:$mask))),
            (VBLENDPDYrri VR256:$src2, VR256:$src1, imm:$mask)>;

  def : Pat<(v8i16 (X86Blendpw (v8i16 VR128:$src1), (v8i16 VR128:$src2),
                               (imm:$mask))),
            (VPBLENDWrri VR128:$src2, VR128:$src1, imm:$mask)>;
  def : Pat<(v4f32 (X86Blendps (v4f32 VR128:$src1), (v4f32 VR128:$src2),
                               (imm:$mask))),
            (VBLENDPSrri VR128:$src2, VR128:$src1, imm:$mask)>;
  def : Pat<(v2f64 (X86Blendpd (v2f64 VR128:$src1), (v2f64 VR128:$src2),
                               (imm:$mask))),
            (VBLENDPDrri VR128:$src2, VR128:$src1, imm:$mask)>;
}
6794
// AVX2 256-bit byte/word blend patterns (same operand swap as the AVX
// patterns above).
let Predicates = [HasAVX2] in {
  def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
                            (v32i8 VR256:$src2))),
            (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v16i16 (X86Blendpw (v16i16 VR256:$src1), (v16i16 VR256:$src2),
                               (imm:$mask))),
            (VPBLENDWYrri VR256:$src2, VR256:$src1, imm:$mask)>;
}
6803
/// SS41I_ternary_int - SSE 4.1 ternary operator
/// Legacy (non-VEX) variable blends: the blend mask is implicitly XMM0
/// (hence Uses = [XMM0]) and the first source is tied to $dst.
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                               X86MemOperand x86memop, Intrinsic IntId> {
    // Register-register form; "0" suffix marks the implicit-XMM0 variant.
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
                    OpSize;

    // Register-memory form.
    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst,
                      (IntId VR128:$src1,
                       (bitconvert (mem_frag addr:$src2)), XMM0))]>, OpSize;
  }
}

let ExeDomain = SSEPackedDouble in
defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
                                  int_x86_sse41_blendvpd>;
let ExeDomain = SSEPackedSingle in
defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
                                  int_x86_sse41_blendvps>;
defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
                                  int_x86_sse41_pblendvb>;
6833
// Aliases with the implicit xmm0 argument
// Let the assembler accept the three-operand spelling that names %xmm0
// explicitly, mapping it back to the two-operand instruction forms.
def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
                (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
                (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
                (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
                (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
                (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
                (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
6847
// Legacy SSE4.1 blend patterns; the mask register is pinned to XMM0 and the
// vselect sources are swapped as in the AVX patterns above.
let Predicates = [UseSSE41] in {
  def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;

  // Immediate-mask blend nodes map onto the immediate blend instructions.
  def : Pat<(v8i16 (X86Blendpw (v8i16 VR128:$src1), (v8i16 VR128:$src2),
                               (imm:$mask))),
            (PBLENDWrri VR128:$src2, VR128:$src1, imm:$mask)>;
  def : Pat<(v4f32 (X86Blendps (v4f32 VR128:$src1), (v4f32 VR128:$src2),
                               (imm:$mask))),
            (BLENDPSrri VR128:$src2, VR128:$src1, imm:$mask)>;
  def : Pat<(v2f64 (X86Blendpd (v2f64 VR128:$src1), (v2f64 VR128:$src2),
                               (imm:$mask))),
            (BLENDPDrri VR128:$src2, VR128:$src1, imm:$mask)>;

}
6876
// MOVNTDQA: non-temporal vector load (memory -> register), lowered from the
// corresponding intrinsics.
let Predicates = [HasAVX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "vmovntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
                       OpSize, VEX;
let Predicates = [HasAVX2] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                         "vmovntdqa\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
                         OpSize, VEX, VEX_L;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "movntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
                       OpSize;
6891
6892//===----------------------------------------------------------------------===//
6893// SSE4.2 - Compare Instructions
6894//===----------------------------------------------------------------------===//
6895
/// SS42I_binop_rm - Simple SSE 4.2 binary operator
/// Instantiates rr and rm forms lowered from an SDNode; Is2Addr selects the
/// two-address (legacy) versus three-address (VEX) assembly string.
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, bit Is2Addr = 1> {
  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       OpSize;
  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, OpSize;
}

// PCMPGTQ: packed 64-bit signed greater-than compare.
let Predicates = [HasAVX] in
  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
                                 memopv2i64, i128mem, 0>, VEX_4V;

let Predicates = [HasAVX2] in
  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;

let Constraints = "$src1 = $dst" in
  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
                                memopv2i64, i128mem>;
6927
6928//===----------------------------------------------------------------------===//
6929// SSE4.2 - String/text Processing Instructions
6930//===----------------------------------------------------------------------===//
6931
// Packed Compare Implicit Length Strings, Return Mask
// Pseudo-instructions carrying the selection patterns for pcmpistrm; they
// are expanded by a custom inserter (usesCustomInserter below) because the
// real instructions write their result to XMM0 rather than a virtual reg.
multiclass pseudo_pcmpistrm<string asm> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
                                                  imm:$src3))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128
                       VR128:$src1, (load addr:$src2), imm:$src3))]>;
}

let Defs = [EFLAGS], usesCustomInserter = 1 in {
  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>;
}
6948
// Real pcmpistrm instructions: assembler forms only (no patterns; selection
// goes through the pseudos above).  They define XMM0 and EFLAGS.
let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1, Predicates = [HasAVX] in {
  def VPCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
      "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX;
  let mayLoad = 1 in
  def VPCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs),
      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
      "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX;
}

let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in {
  def PCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
      "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
  let mayLoad = 1 in
  def PCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs),
      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
      "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
}
6968
// Packed Compare Explicit Length Strings, Return Mask
// Pseudos for pcmpestrm; the explicit string lengths come in via the
// implicit EAX/EDX uses, so those registers appear directly in the pattern.
multiclass pseudo_pcmpestrm<string asm> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
                       VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
                       VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5))]>;
}

let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>;
}
6985
// Real pcmpestrm instructions: assembler forms only; they read EAX/EDX
// (string lengths) and define XMM0 and EFLAGS.
let Predicates = [HasAVX],
    Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
  def VPCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src3, i8imm:$src5),
      "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX;
  let mayLoad = 1 in
  def VPCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs),
      (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
      "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX;
}

let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
  def PCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src3, i8imm:$src5),
      "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
  let mayLoad = 1 in
  def PCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs),
      (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
      "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
}
7006
// Packed Compare Implicit Length Strings, Return Index
// Assembler forms only (no patterns); these define ECX and EFLAGS.
let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
  multiclass SS42AI_pcmpistri<string asm> {
    def rr : SS42AI<0x63, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
      !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
      []>, OpSize;
    let mayLoad = 1 in
    def rm : SS42AI<0x63, MRMSrcMem, (outs),
      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
      !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
      []>, OpSize;
  }
}

let Predicates = [HasAVX] in
defm VPCMPISTRI  : SS42AI_pcmpistri<"vpcmpistri">, VEX;
defm PCMPISTRI   : SS42AI_pcmpistri<"pcmpistri">;
7025
// Packed Compare Explicit Length Strings, Return Index
// Assembler forms only; these read EAX/EDX and define ECX and EFLAGS.
let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
  multiclass SS42AI_pcmpestri<string asm> {
    def rr : SS42AI<0x61, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src3, i8imm:$src5),
      !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
      []>, OpSize;
    let mayLoad = 1 in
    def rm : SS42AI<0x61, MRMSrcMem, (outs),
      (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
      !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
      []>, OpSize;
  }
}

let Predicates = [HasAVX] in
defm VPCMPESTRI  : SS42AI_pcmpestri<"vpcmpestri">, VEX;
defm PCMPESTRI   : SS42AI_pcmpestri<"pcmpestri">;
7044
7045//===----------------------------------------------------------------------===//
7046// SSE4.2 - CRC Instructions
7047//===----------------------------------------------------------------------===//
7048
7049// No CRC instructions have AVX equivalents
7050
7051// crc intrinsic instruction
7052// This set of instructions are only rm, the only difference is the size
7053// of r and m.
// CRC32 accumulates into the destination, so the first source is tied to
// $dst.  Variants cover 8/16/32-bit sources into a 32-bit accumulator and
// 8/64-bit sources into a 64-bit accumulator (REX_W forms).
let Constraints = "$src1 = $dst" in {
  // 32-bit accumulator, 8-bit source.
  def CRC32r32m8  : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
                      (ins GR32:$src1, i8mem:$src2),
                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_32_8 GR32:$src1,
                         (load addr:$src2)))]>;
  def CRC32r32r8  : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
                      (ins GR32:$src1, GR8:$src2),
                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>;
  // 32-bit accumulator, 16-bit source (OpSize prefix selects 16-bit).
  def CRC32r32m16  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
                      (ins GR32:$src1, i16mem:$src2),
                      "crc32{w} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_32_16 GR32:$src1,
                         (load addr:$src2)))]>,
                         OpSize;
  def CRC32r32r16  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
                      (ins GR32:$src1, GR16:$src2),
                      "crc32{w} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>,
                         OpSize;
  // 32-bit accumulator, 32-bit source.
  def CRC32r32m32  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
                      (ins GR32:$src1, i32mem:$src2),
                      "crc32{l} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_32_32 GR32:$src1,
                         (load addr:$src2)))]>;
  def CRC32r32r32  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
                      (ins GR32:$src1, GR32:$src2),
                      "crc32{l} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>;
  // 64-bit accumulator, 8-bit source.
  def CRC32r64m8  : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
                      (ins GR64:$src1, i8mem:$src2),
                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
                       [(set GR64:$dst,
                         (int_x86_sse42_crc32_64_8 GR64:$src1,
                         (load addr:$src2)))]>,
                         REX_W;
  def CRC32r64r8  : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
                      (ins GR64:$src1, GR8:$src2),
                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
                       [(set GR64:$dst,
                         (int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>,
                         REX_W;
  // 64-bit accumulator, 64-bit source.
  def CRC32r64m64  : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst),
                      (ins GR64:$src1, i64mem:$src2),
                      "crc32{q} \t{$src2, $src1|$src1, $src2}",
                       [(set GR64:$dst,
                         (int_x86_sse42_crc32_64_64 GR64:$src1,
                         (load addr:$src2)))]>,
                         REX_W;
  def CRC32r64r64  : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst),
                      (ins GR64:$src1, GR64:$src2),
                      "crc32{q} \t{$src2, $src1|$src1, $src2}",
                       [(set GR64:$dst,
                         (int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>,
                         REX_W;
}
7117
7118//===----------------------------------------------------------------------===//
7119// AES-NI Instructions
7120//===----------------------------------------------------------------------===//
7121
/// AESI_binop_rm_int - AES-NI binary operator lowered from an intrinsic;
/// rr and rm forms, with Is2Addr selecting the two-address (legacy) versus
/// three-address (VEX) assembly string.
multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId128, bit Is2Addr = 1> {
  def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
       OpSize;
  def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>, OpSize;
}
7139
// Perform One Round of an AES Encryption/Decryption Flow
// VEX-encoded forms require both AVX and AES; legacy forms are two-address.
let Predicates = [HasAVX, HasAES] in {
  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
                         int_x86_aesni_aesenc, 0>, VEX_4V;
  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
                         int_x86_aesni_aesenclast, 0>, VEX_4V;
  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
                         int_x86_aesni_aesdec, 0>, VEX_4V;
  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
                         int_x86_aesni_aesdeclast, 0>, VEX_4V;
}

let Constraints = "$src1 = $dst" in {
  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
                         int_x86_aesni_aesenc>;
  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
                         int_x86_aesni_aesenclast>;
  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
                         int_x86_aesni_aesdec>;
  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
                         int_x86_aesni_aesdeclast>;
}
7162
// Perform the AES InvMixColumn Transformation
// Unary: single source (register or memory), no tied operands.
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst,
        (int_x86_aesni_aesimc VR128:$src1))]>,
      OpSize, VEX;
  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
      OpSize, VEX;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst,
    (int_x86_aesni_aesimc VR128:$src1))]>,
  OpSize;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
  OpSize;
7188
// AES Round Key Generation Assist
// Unary with an 8-bit round-constant immediate ($src2).
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, i8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
      OpSize, VEX;
  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1, i8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
      OpSize, VEX;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1, i8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
  OpSize;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1, i8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
  OpSize;
7216
7217//===----------------------------------------------------------------------===//
7218// PCLMUL Instructions
7219//===----------------------------------------------------------------------===//
7220
7221// AVX carry-less Multiplication instructions
7222def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
7223           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
7224           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7225           [(set VR128:$dst,
7226             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>;
7227
7228def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
7229           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
7230           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7231           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
7232                              (memopv2i64 addr:$src2), imm:$src3))]>;
7233
7234// Carry-less Multiplication instructions
7235let Constraints = "$src1 = $dst" in {
7236def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
7237           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
7238           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
7239           [(set VR128:$dst,
7240             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>;
7241
7242def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
7243           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
7244           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
7245           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
7246                              (memopv2i64 addr:$src2), imm:$src3))]>;
7247} // Constraints = "$src1 = $dst"


// Assembler aliases mapping the pclmul{lqlq,lqhq,hqlq,hqhq}dq pseudo-mnemonics
// onto (V)PCLMULQDQ with the corresponding fixed immediate selector.
multiclass pclmul_alias<string asm, int immop> {
  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop)>;

  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop)>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop)>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop)>;
}
// Immediate encodes which qword half of each source is multiplied.
defm : pclmul_alias<"hqhq", 0x11>;
defm : pclmul_alias<"hqlq", 0x01>;
defm : pclmul_alias<"lqhq", 0x10>;
defm : pclmul_alias<"lqlq", 0x00>;

//===----------------------------------------------------------------------===//
// SSE4A Instructions
//===----------------------------------------------------------------------===//

let Predicates = [HasSSE4A] in {

// EXTRQ/INSERTQ read-modify the destination register, so $src is tied to $dst.
let Constraints = "$src = $dst" in {
// Immediate form: bit length and bit index given as 8-bit immediates.
def EXTRQI : Ii8<0x78, MRM0r, (outs VR128:$dst),
                 (ins VR128:$src, i8imm:$len, i8imm:$idx),
                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
                 [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len,
                                    imm:$idx))]>, TB, OpSize;
// Register form: length/index packed into the $mask register operand.
def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src, VR128:$mask),
              "extrq\t{$mask, $src|$src, $mask}",
              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
                                 VR128:$mask))]>, TB, OpSize;

def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx),
                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
                   [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src,
                                      VR128:$src2, imm:$len, imm:$idx))]>, XD;
def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src, VR128:$mask),
                 "insertq\t{$mask, $src|$src, $mask}",
                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
                                    VR128:$mask))]>, XD;
}

// Non-temporal scalar stores of the low element of an XMM register.
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
                "movntss\t{$src, $dst|$dst, $src}",
                [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;

def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                "movntsd\t{$src, $dst|$dst, $src}",
                [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
}

//===----------------------------------------------------------------------===//
// AVX Instructions
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// VBROADCAST - Load from memory and broadcast to all elements of the
//              destination operand
//
// Memory-source broadcast, matched through the given intrinsic.
class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
                    X86MemOperand x86memop, Intrinsic Int> :
  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set RC:$dst, (Int addr:$src))]>, VEX;

// AVX2 adds register forms: the source is always a 128-bit register even when
// broadcasting into a 256-bit destination.
class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
                         Intrinsic Int> :
  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
         [(set RC:$dst, (Int VR128:$src))]>, VEX;

// AVX memory-source broadcasts; the AVX2 register-source forms follow.
let ExeDomain = SSEPackedSingle in {
  def VBROADCASTSSrm  : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem,
                                      int_x86_avx_vbroadcast_ss>;
  def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem,
                                      int_x86_avx_vbroadcast_ss_256>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrm  : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
                                    int_x86_avx_vbroadcast_sd_256>, VEX_L;
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
                                   int_x86_avx_vbroadcastf128_pd_256>, VEX_L;

let ExeDomain = SSEPackedSingle in {
  def VBROADCASTSSrr  : avx2_broadcast_reg<0x18, "vbroadcastss", VR128,
                                           int_x86_avx2_vbroadcast_ss_ps>;
  def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256,
                                      int_x86_avx2_vbroadcast_ss_ps_256>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrr  : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
                                      int_x86_avx2_vbroadcast_sd_pd_256>, VEX_L;

let Predicates = [HasAVX2] in
def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
                                   int_x86_avx2_vbroadcasti128>, VEX_L;

// The _ps 128-bit-broadcast intrinsic selects to the same instruction as the
// _pd variant used in the VBROADCASTF128 definition above.
let Predicates = [HasAVX] in
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
          (VBROADCASTF128 addr:$src)>;


//===----------------------------------------------------------------------===//
// VINSERTF128 - Insert packed floating-point values
//
// Defined with empty patterns; selection happens via the Pat<> entries below.
let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, i8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f128mem:$src2, i8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, VEX_4V, VEX_L;
}

// Float insert_subvector patterns (available on all AVX targets); the
// INSERT_get_vinsertf128_imm transform converts the lane index to the imm8.
let Predicates = [HasAVX] in {
def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;

def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
}

// Integer insert_subvector patterns restricted to AVX1-only targets; with
// AVX2 the integer VINSERTI128 forms are preferred instead.
let Predicates = [HasAVX1Only] in {
def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;

// Memory forms: integer loads appear as (bitcast (memopv2i64 ...)).
def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
                                   (bc_v4i32 (memopv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
                                   (bc_v16i8 (memopv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
                                   (bc_v8i16 (memopv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTF128 - Extract packed floating-point values
//
// Defined with empty patterns; selection happens via the Pat<> entries below.
let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, i8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
          (ins f128mem:$dst, VR256:$src1, i8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, VEX, VEX_L;
}

// AVX1 patterns
// Float extract_subvector patterns; the EXTRACT_get_vextractf128_imm
// transform converts the lane index to the imm8 operand.
let Predicates = [HasAVX] in {
def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4f32 (VEXTRACTF128rr
                    (v8f32 VR256:$src1),
                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2f64 (VEXTRACTF128rr
                    (v4f64 VR256:$src1),
                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;

// Extract-and-store folds into the memory form (aligned stores only).
def : Pat<(alignedstore (v4f32 (vextractf128_extract:$ext (v8f32 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
def : Pat<(alignedstore (v2f64 (vextractf128_extract:$ext (v4f64 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
}

// Integer extract_subvector patterns restricted to AVX1-only targets; with
// AVX2 the integer VEXTRACTI128 forms are preferred instead.
let Predicates = [HasAVX1Only] in {
def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2i64 (VEXTRACTF128rr
                  (v4i64 VR256:$src1),
                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4i32 (VEXTRACTF128rr
                  (v8i32 VR256:$src1),
                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
          (v8i16 (VEXTRACTF128rr
                  (v16i16 VR256:$src1),
                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
          (v16i8 (VEXTRACTF128rr
                  (v32i8 VR256:$src1),
                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;

// Extract-and-store folds into the memory form (aligned stores only).
def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
}

//===----------------------------------------------------------------------===//
// VMASKMOV - Conditional SIMD Packed Loads and Stores
//
// Instantiates the four forms (128/256-bit load and store) from one opcode
// pair; the mask register is $src1 and comes first in the intrinsic's
// store signature but second in its load signature.
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                          Intrinsic IntLd, Intrinsic IntLd256,
                          Intrinsic IntSt, Intrinsic IntSt256> {
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, f128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
             VEX_4V;
  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, f256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L;
  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
}

let ExeDomain = SSEPackedSingle in
defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
                                 int_x86_avx_maskload_ps,
                                 int_x86_avx_maskload_ps_256,
                                 int_x86_avx_maskstore_ps,
                                 int_x86_avx_maskstore_ps_256>;
let ExeDomain = SSEPackedDouble in
defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
                                 int_x86_avx_maskload_pd,
                                 int_x86_avx_maskload_pd_256,
                                 int_x86_avx_maskstore_pd,
                                 int_x86_avx_maskstore_pd_256>;

//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//
// Two opcodes per instantiation: opc_rm is the variable (register-controlled)
// form matched by the intrinsic; opc_rmi is the immediate form matched by the
// X86VPermilp DAG node.
multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                      RegisterClass RC, X86MemOperand x86memop_f,
                      X86MemOperand x86memop_i, PatFrag i_frag,
                      Intrinsic IntVar, ValueType vt> {
  def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, RC:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V;
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
             (ins RC:$src1, x86memop_i:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (IntVar RC:$src1,
                             (bitconvert (i_frag addr:$src2))))]>, VEX_4V;

  def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, i8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX;
  def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
             (ins x86memop_f:$src1, i8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst,
               (vt (X86VPermilp (memop addr:$src1), (i8 imm:$src2))))]>, VEX;
}

// VPERMILPS/PD instantiations for 128- and 256-bit operands.
let ExeDomain = SSEPackedSingle in {
  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
                               memopv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
                       memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
                               memopv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
                       memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
}

// Integer-typed X86VPermilp nodes reuse the FP permute instructions.
let Predicates = [HasAVX] in {
def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
          (VPERMILPSYri VR256:$src1, imm:$imm)>;
def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
          (VPERMILPDYri VR256:$src1, imm:$imm)>;
def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (memopv4i64 addr:$src1)),
                               (i8 imm:$imm))),
          (VPERMILPSYmi addr:$src1, imm:$imm)>;
def : Pat<(v4i64 (X86VPermilp (memopv4i64 addr:$src1), (i8 imm:$imm))),
          (VPERMILPDYmi addr:$src1, imm:$imm)>;

def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))),
          (VPERMILPDri VR128:$src1, imm:$imm)>;
def : Pat<(v2i64 (X86VPermilp (memopv2i64 addr:$src1), (i8 imm:$imm))),
          (VPERMILPDmi addr:$src1, imm:$imm)>;
}

//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//
// Patterns are written for v8f32; other element types are matched by the
// Pat<> entries below.
let ExeDomain = SSEPackedSingle in {
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, i8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                              (i8 imm:$src3))))]>, VEX_4V, VEX_L;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv8f32 addr:$src2),
                             (i8 imm:$src3)))]>, VEX_4V, VEX_L;
}

// v4f64 variants of X86VPerm2x128 map onto the same instructions.
let Predicates = [HasAVX] in {
def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
                  (memopv4f64 addr:$src2), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
}

// Integer 128-bit-lane permutes on AVX1-only targets use VPERM2F128; with
// AVX2 the integer VPERM2I128 forms are preferred instead.
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;

// Memory forms: integer loads appear as (bitcast (memopv4i64 ...)).
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
                  (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
                  (memopv4i64 addr:$src2), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
                  (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
                  (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
}

//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
//
// Both instructions clobber all sixteen YMM registers (VZEROUPPER only
// clears the upper halves, but the whole registers are listed as defs).
let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
  // Zero All YMM registers
  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
                  [(int_x86_avx_vzeroall)]>, TB, VEX, VEX_L, Requires<[HasAVX]>;

  // Zero Upper bits of YMM registers
  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
                     [(int_x86_avx_vzeroupper)]>, TB, VEX, Requires<[HasAVX]>;
}

//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//===----------------------------------------------------------------------===//
// VCVTPH2PS: half -> single conversion; source is always a 128-bit register
// or memory operand, destination width is the instantiating RC.
multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             [(set RC:$dst, (Int VR128:$src))]>,
             T8, OpSize, VEX;
  let neverHasSideEffects = 1, mayLoad = 1 in
  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
}

// VCVTPS2PH: single -> half conversion; imm8 selects the rounding control.
multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
               (ins RC:$src1, i32i8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
               TA, OpSize, VEX;
  let neverHasSideEffects = 1, mayStore = 1 in
  def mr : Ii8<0x1D, MRMDestMem, (outs),
               (ins x86memop:$dst, RC:$src1, i32i8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
               TA, OpSize, VEX;
}

let Predicates = [HasAVX, HasF16C] in {
  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
}

//===----------------------------------------------------------------------===//
// AVX2 Instructions
//===----------------------------------------------------------------------===//

/// AVX2_binop_rmi_int - AVX2 binary operator with 8-bit immediate
multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                 X86MemOperand x86memop> {
  let isCommutable = 1 in
  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u32u8imm:$src3),
        !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
        VEX_4V;
  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
        !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set RC:$dst,
          (IntId RC:$src1,
           (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
        VEX_4V;
}

// VPBLENDD selects per-dword, so operands are not commutable with the same
// immediate; override the multiclass's isCommutable default.
let isCommutable = 0 in {
defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128,
                                   VR128, memopv2i64, i128mem>;
defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256,
                                    VR256, memopv4i64, i256mem>, VEX_L;
}

//===----------------------------------------------------------------------===//
// VPBROADCAST - Load from memory and broadcast to all elements of the
//               destination operand
//
// Four forms per element size: 128/256-bit destination, register or memory
// source; memory sources are matched as scalar loads wrapped in
// scalar_to_vector.
multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
                          X86MemOperand x86memop, PatFrag ld_frag,
                          Intrinsic Int128, Intrinsic Int256> {
  def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst, (Int128 VR128:$src))]>, VEX;
  def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
                    (Int128 (scalar_to_vector (ld_frag addr:$src))))]>, VEX;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR256:$dst, (Int256 VR128:$src))]>, VEX, VEX_L;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR256:$dst,
                    (Int256 (scalar_to_vector (ld_frag addr:$src))))]>,
                   VEX, VEX_L;
}

defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
                                    int_x86_avx2_pbroadcastb_128,
                                    int_x86_avx2_pbroadcastb_256>;
defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
                                    int_x86_avx2_pbroadcastw_128,
                                    int_x86_avx2_pbroadcastw_256>;
defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
                                    int_x86_avx2_pbroadcastd_128,
                                    int_x86_avx2_pbroadcastd_256>;
defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
                                    int_x86_avx2_pbroadcastq_128,
                                    int_x86_avx2_pbroadcastq_256>;

// X86VBroadcast selection for AVX2: load sources, 128-bit register sources,
// and scalar-register fallbacks.
let Predicates = [HasAVX2] in {
  def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))),
          (VPBROADCASTBrm addr:$src)>;
  def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))),
          (VPBROADCASTBYrm addr:$src)>;
  def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
          (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
          (VPBROADCASTWYrm addr:$src)>;
  def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
          (VPBROADCASTDrm addr:$src)>;
  def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
          (VPBROADCASTDYrm addr:$src)>;
  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
          (VPBROADCASTQrm addr:$src)>;
  def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
          (VPBROADCASTQYrm addr:$src)>;

  def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))),
          (VPBROADCASTBrr VR128:$src)>;
  def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))),
          (VPBROADCASTBYrr VR128:$src)>;
  def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))),
          (VPBROADCASTWrr VR128:$src)>;
  def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))),
          (VPBROADCASTWYrr VR128:$src)>;
  def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))),
          (VPBROADCASTDrr VR128:$src)>;
  def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))),
          (VPBROADCASTDYrr VR128:$src)>;
  def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))),
          (VPBROADCASTQrr VR128:$src)>;
  def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))),
          (VPBROADCASTQYrr VR128:$src)>;
  def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))),
          (VBROADCASTSSrr VR128:$src)>;
  def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))),
          (VBROADCASTSSYrr VR128:$src)>;
  // 128-bit f64 broadcast reuses VPBROADCASTQ (same bit pattern, no
  // 128-bit vbroadcastsd register form exists).
  def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))),
          (VPBROADCASTQrr VR128:$src)>;
  def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
          (VBROADCASTSDYrr VR128:$src)>;

  // Provide fallback in case the load node that is used in the patterns above
  // is used by additional users, which prevents the pattern selection.
  let AddedComplexity = 20 in {
    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
              (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
              (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
              (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;

    def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
              (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
    def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
              (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
    def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
              (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
  }
}

// AVX1 broadcast patterns
// Integer broadcast loads on AVX1-only targets reuse the FP broadcast forms.
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
          (VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
          (VBROADCASTSDYrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
          (VBROADCASTSSrm addr:$src)>;
}

// FP broadcast loads are available on all AVX targets.
let Predicates = [HasAVX] in {
def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
          (VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
          (VBROADCASTSDYrm addr:$src)>;
def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
          (VBROADCASTSSrm addr:$src)>;

  // Provide fallback in case the load node that is used in the patterns above
  // is used by additional users, which prevents the pattern selection.
  let AddedComplexity = 20 in {
  // 128bit broadcasts:
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
  // 256-bit: shuffle in the low lane, then VINSERTF128 duplicates it high.
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;

  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
  }
}
7875
7876//===----------------------------------------------------------------------===//
7877// VPERM - Permute instructions
7878//
7879
// AVX2 variable permute (VPERMD/VPERMPS): 256-bit cross-lane permute whose
// per-element selectors come from a second vector operand.  Only Y (ymm)
// forms exist for these instructions, hence only Yrr/Yrm variants.
7880multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
7881                     ValueType OpVT> {
  // Register-register form, matched via the X86VPermv DAG node.
7882  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7883                   (ins VR256:$src1, VR256:$src2),
7884                   !strconcat(OpcodeStr,
7885                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7886                   [(set VR256:$dst,
7887                     (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
7888                   VEX_4V, VEX_L;
  // Memory form: folds a 256-bit load of $src2; bitconvert adapts the
  // mem_frag's native type to OpVT.
7889  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
7890                   (ins VR256:$src1, i256mem:$src2),
7891                   !strconcat(OpcodeStr,
7892                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7893                   [(set VR256:$dst,
7894                     (OpVT (X86VPermv VR256:$src1,
7895                            (bitconvert (mem_frag addr:$src2)))))]>,
7896                   VEX_4V, VEX_L;
7897}
7898
7899defm VPERMD : avx2_perm<0x36, "vpermd", memopv4i64, v8i32>;
// VPERMPS executes in the FP-single domain.
7900let ExeDomain = SSEPackedSingle in
7901defm VPERMPS : avx2_perm<0x16, "vpermps", memopv8f32, v8f32>;
7902
// AVX2 immediate permute (VPERMQ/VPERMPD): permutes the four 64-bit elements
// of a ymm according to an 8-bit immediate, matched via X86VPermi.
7903multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
7904                         ValueType OpVT> {
7905  def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
7906                     (ins VR256:$src1, i8imm:$src2),
7907                     !strconcat(OpcodeStr,
7908                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7909                     [(set VR256:$dst,
7910                       (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
7911                     VEX, VEX_L;
  // Memory form: the single vector operand itself comes from memory.
7912  def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
7913                     (ins i256mem:$src1, i8imm:$src2),
7914                     !strconcat(OpcodeStr,
7915                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7916                     [(set VR256:$dst,
7917                       (OpVT (X86VPermi (mem_frag addr:$src1),
7918                              (i8 imm:$src2))))]>, VEX, VEX_L;
7919}
7920
// VEX_W selects the 64-bit-element opcode space for these.
7921defm VPERMQ : avx2_perm_imm<0x00, "vpermq", memopv4i64, v4i64>, VEX_W;
7922let ExeDomain = SSEPackedDouble in
7923defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, v4f64>, VEX_W;
7924
7925//===----------------------------------------------------------------------===//
7926// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
7927//
// VPERM2I128: selects two 128-bit lanes from the concatenation of
// $src1/$src2 under control of an 8-bit immediate.  The canonical pattern
// type is v4i64; the Pats below funnel the other 256-bit integer types onto
// the same instruction.
7928def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
7929          (ins VR256:$src1, VR256:$src2, i8imm:$src3),
7930          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7931          [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
7932                            (i8 imm:$src3))))]>, VEX_4V, VEX_L;
7933def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
7934          (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
7935          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7936          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2),
7937                             (i8 imm:$src3)))]>, VEX_4V, VEX_L;
7938
7939let Predicates = [HasAVX2] in {
// Register-register forms for the remaining 256-bit integer element types.
7940def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
7941          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
7942def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
7943          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
7944def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
7945          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
7946
// Load-folding forms: the memory operand is loaded as v4i64 and bitcast
// (bc_*) to the element type of the surrounding node.
7947def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (memopv4i64 addr:$src2)),
7948                  (i8 imm:$imm))),
7949          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
7950def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
7951                   (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
7952          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
7953def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)),
7954                  (i8 imm:$imm))),
7955          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
7956}
7957
7958
7959//===----------------------------------------------------------------------===//
7960// VINSERTI128 - Insert packed integer values
7961//
// VINSERTI128: insert an xmm into the low or high 128-bit lane of a ymm
// (lane chosen by $src3).  The defs carry no ISel patterns; selection is
// done by the vinsertf128_insert Pats that follow in this file.
7962let neverHasSideEffects = 1 in {
7963def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
7964          (ins VR256:$src1, VR128:$src2, i8imm:$src3),
7965          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7966          []>, VEX_4V, VEX_L;
// Memory form: the 128-bit source lane is loaded from $src2.
7967let mayLoad = 1 in
7968def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
7969          (ins VR256:$src1, i128mem:$src2, i8imm:$src3),
7970          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7971          []>, VEX_4V, VEX_L;
7972}
7973
// Selection patterns for VINSERTI128.  vinsertf128_insert matches an
// insert_subvector at a 128-bit boundary, and INSERT_get_vinsertf128_imm
// converts the matched element offset into the instruction's lane immediate.
7974let Predicates = [HasAVX2] in {
// Register source, one pattern per 256-bit integer element type.
7975def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
7976                                   (iPTR imm)),
7977          (VINSERTI128rr VR256:$src1, VR128:$src2,
7978                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
7979def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
7980                                   (iPTR imm)),
7981          (VINSERTI128rr VR256:$src1, VR128:$src2,
7982                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
7983def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
7984                                   (iPTR imm)),
7985          (VINSERTI128rr VR256:$src1, VR128:$src2,
7986                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
7987def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
7988                                   (iPTR imm)),
7989          (VINSERTI128rr VR256:$src1, VR128:$src2,
7990                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
7991
// Memory source: the inserted 128 bits are loaded as v2i64 and, for the
// narrower element types, bitcast (bc_*) to the node's element type.
7992def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
7993                                   (iPTR imm)),
7994          (VINSERTI128rm VR256:$src1, addr:$src2,
7995                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
7996def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
7997                                   (bc_v4i32 (memopv2i64 addr:$src2)),
7998                                   (iPTR imm)),
7999          (VINSERTI128rm VR256:$src1, addr:$src2,
8000                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
8001def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
8002                                   (bc_v16i8 (memopv2i64 addr:$src2)),
8003                                   (iPTR imm)),
8004          (VINSERTI128rm VR256:$src1, addr:$src2,
8005                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
8006def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
8007                                   (bc_v8i16 (memopv2i64 addr:$src2)),
8008                                   (iPTR imm)),
8009          (VINSERTI128rm VR256:$src1, addr:$src2,
8010                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
8011}
8012
8013//===----------------------------------------------------------------------===//
8014// VEXTRACTI128 - Extract packed integer values
8015//
// VEXTRACTI128: extract the 128-bit lane selected by $src2 from a ymm.
// The register form is selected through the int_x86_avx2_vextracti128
// intrinsic; the store form carries no pattern here and is selected by the
// alignedstore Pats that follow in this file.
8016def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
8017          (ins VR256:$src1, i8imm:$src2),
8018          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
8019          [(set VR128:$dst,
8020            (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>,
8021          VEX, VEX_L;
8022let neverHasSideEffects = 1, mayStore = 1 in
8023def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
8024          (ins i128mem:$dst, VR256:$src1, i8imm:$src2),
8025          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
8026          VEX, VEX_L;
8027
// Selection patterns for VEXTRACTI128.  vextractf128_extract matches an
// extract_subvector at a 128-bit boundary; EXTRACT_get_vextractf128_imm
// converts the element offset into the lane immediate.
8028let Predicates = [HasAVX2] in {
// Register destination, one pattern per integer element type.
8029def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
8030          (v2i64 (VEXTRACTI128rr
8031                    (v4i64 VR256:$src1),
8032                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
8033def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
8034          (v4i32 (VEXTRACTI128rr
8035                    (v8i32 VR256:$src1),
8036                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
8037def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
8038          (v8i16 (VEXTRACTI128rr
8039                    (v16i16 VR256:$src1),
8040                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
8041def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
8042          (v16i8 (VEXTRACTI128rr
8043                    (v32i8 VR256:$src1),
8044                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
8045
// Extract-then-store folded into the memory form.  Note these only match
// alignedstore, so unaligned stores of an extracted lane fall back to the
// register form plus a separate store.
8046def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
8047                                (iPTR imm))), addr:$dst),
8048          (VEXTRACTI128mr addr:$dst, VR256:$src1,
8049           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
8050def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
8051                                (iPTR imm))), addr:$dst),
8052          (VEXTRACTI128mr addr:$dst, VR256:$src1,
8053           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
8054def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
8055                                (iPTR imm))), addr:$dst),
8056          (VEXTRACTI128mr addr:$dst, VR256:$src1,
8057           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
8058def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
8059                                (iPTR imm))), addr:$dst),
8060          (VEXTRACTI128mr addr:$dst, VR256:$src1,
8061           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
8062}
8063
8064//===----------------------------------------------------------------------===//
8065// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
8066//
// VPMASKMOV: masked integer loads and stores, selected via intrinsics.
// The register operand $src1 is the mask; per the load patterns the load
// intrinsics take (address, mask) and the store intrinsics take
// (address, mask, data).
8067multiclass avx2_pmovmask<string OpcodeStr,
8068                         Intrinsic IntLd128, Intrinsic IntLd256,
8069                         Intrinsic IntSt128, Intrinsic IntSt256> {
  // Masked loads (opcode 0x8c), xmm and ymm widths.
8070  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
8071             (ins VR128:$src1, i128mem:$src2),
8072             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8073             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
8074  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
8075             (ins VR256:$src1, i256mem:$src2),
8076             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8077             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
8078             VEX_4V, VEX_L;
  // Masked stores (opcode 0x8e): $src1 is the mask, $src2 the data.
8079  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
8080             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
8081             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8082             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
8083  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
8084             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
8085             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8086             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
8087}
8088
8089defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
8090                                int_x86_avx2_maskload_d,
8091                                int_x86_avx2_maskload_d_256,
8092                                int_x86_avx2_maskstore_d,
8093                                int_x86_avx2_maskstore_d_256>;
// VEX_W selects the 64-bit-element (q) forms.
8094defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
8095                                int_x86_avx2_maskload_q,
8096                                int_x86_avx2_maskload_q_256,
8097                                int_x86_avx2_maskstore_q,
8098                                int_x86_avx2_maskstore_q_256>, VEX_W;
8099
8100
8101//===----------------------------------------------------------------------===//
8102// Variable Bit Shifts
8103//
// AVX2 per-element variable shifts (VPSLLV/VPSRLV/VPSRAV): each element of
// $src1 is shifted by the count in the corresponding element of $src2.
// Matched directly on the generic shl/srl/sra nodes with vector operands.
8104multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
8105                          ValueType vt128, ValueType vt256> {
  // 128-bit register and load-folding forms.
8106  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
8107             (ins VR128:$src1, VR128:$src2),
8108             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8109             [(set VR128:$dst,
8110               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
8111             VEX_4V;
8112  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
8113             (ins VR128:$src1, i128mem:$src2),
8114             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8115             [(set VR128:$dst,
8116               (vt128 (OpNode VR128:$src1,
8117                       (vt128 (bitconvert (memopv2i64 addr:$src2))))))]>,
8118             VEX_4V;
  // 256-bit register and load-folding forms.
8119  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
8120             (ins VR256:$src1, VR256:$src2),
8121             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8122             [(set VR256:$dst,
8123               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
8124             VEX_4V, VEX_L;
8125  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
8126             (ins VR256:$src1, i256mem:$src2),
8127             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8128             [(set VR256:$dst,
8129               (vt256 (OpNode VR256:$src1,
8130                       (vt256 (bitconvert (memopv4i64 addr:$src2))))))]>,
8131             VEX_4V, VEX_L;
8132}
8133
8134defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
8135defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
8136defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
8137defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
// No 64-bit arithmetic variant is defined here (AVX2 provides no vpsravq).
8138defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
8139
8140//===----------------------------------------------------------------------===//
8141// VGATHER - GATHER Operations
// AVX2 gathers.  Each instruction has two results: the gathered data ($dst)
// and the written-back mask ($mask_wb); the Constraints below tie them to
// the corresponding input operands, matching the hardware's read-modify
// behavior.  No ISel patterns are given here — these are selected elsewhere.
// RC256 is the data/mask register class of the 256-bit variant: VR256 for
// most forms, but VR128 for the QPS/QD forms (see the defms below), whose
// four 32-bit results from four qword indices fit in an xmm.
8142multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
8143                       X86MemOperand memop128, X86MemOperand memop256> {
8144  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb),
8145            (ins VR128:$src1, memop128:$src2, VR128:$mask),
8146            !strconcat(OpcodeStr,
8147              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
8148            []>, VEX_4VOp3;
8149  def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb),
8150            (ins RC256:$src1, memop256:$src2, RC256:$mask),
8151            !strconcat(OpcodeStr,
8152              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
8153            []>, VEX_4VOp3, VEX_L;
8154}
8155
// mayLoad is set explicitly since the defs carry no patterns to infer it.
8156let mayLoad = 1, Constraints = "$src1 = $dst, $mask = $mask_wb" in {
8157  defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
8158  defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
8159  defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
8160  defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
8161  defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W;
8162  defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W;
8163  defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>;
8164  defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>;
8165}
8166