1/* Copyright (C) 2003-2015 Free Software Foundation, Inc.
2
3   This file is part of GCC.
4
5   GCC is free software; you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 3, or (at your option)
8   any later version.
9
10   GCC is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   Under Section 7 of GPL version 3, you are granted additional
16   permissions described in the GCC Runtime Library Exception, version
17   3.1, as published by the Free Software Foundation.
18
19   You should have received a copy of the GNU General Public License and
20   a copy of the GCC Runtime Library Exception along with this program;
21   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22   <http://www.gnu.org/licenses/>.  */
23
24/* Implemented from the specification included in the Intel C++ Compiler
25   User Guide and Reference, version 9.0.  */
26
27#ifndef _EMMINTRIN_H_INCLUDED
28#define _EMMINTRIN_H_INCLUDED
29
/* We need definitions from the SSE header files.  */
31#include <xmmintrin.h>
32
33#ifndef __SSE2__
34#pragma GCC push_options
35#pragma GCC target("sse2")
36#define __DISABLE_SSE2__
37#endif /* __SSE2__ */
38
/* SSE2 */

/* Internal vector types used to implement the intrinsics.  The suffix
   encodes element count and type: df = double float, di/du =
   signed/unsigned 64-bit int, si/su = 32-bit int, hi/hu = 16-bit int,
   qi/qu = 8-bit int.  */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
typedef int __v4si __attribute__ ((__vector_size__ (16)));
typedef unsigned int __v4su __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__ ((__vector_size__ (16)));
typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16)));
typedef char __v16qi __attribute__ ((__vector_size__ (16)));
typedef unsigned char __v16qu __attribute__ ((__vector_size__ (16)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Create a selector for use with the SHUFPD instruction.  FP1 chooses
   the source of result element 1 and FP0 of element 0; each must be
   0 or 1.  */
#define _MM_SHUFFLE2(fp1,fp0) \
 (((fp1) << 1) | (fp0))
58
/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_sd (double __F)
{
  return __extension__ (__m128d){ __F, 0.0 };
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pd (double __F)
{
  return __extension__ (__m128d){ __F, __F };
}

/* Alias of _mm_set1_pd, kept for Intel API compatibility.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd1 (double __F)
{
  return _mm_set1_pd (__F);
}

/* Create a vector with the lower value X and upper value W.
   Note the Intel argument order: the highest element comes first.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __X, __W };
}

/* Create a vector with the lower value W and upper value X
   (arguments in memory order, the reverse of _mm_set_pd).  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __W, __X };
}
92
93/* Create an undefined vector.  */
94extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
95_mm_undefined_pd (void)
96{
97  __m128d __Y = __Y;
98  return __Y;
99}
100
/* Create a vector of zeros.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_pd (void)
{
  return __extension__ (__m128d){ 0.0, 0.0 };
}

/* Sets the low DPFP value of A from the low value of B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
}

/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd (double const *__P)
{
  return *(__m128d *)__P;
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_pd (double const *__P)
{
  return __builtin_ia32_loadupd (__P);
}

/* Create a vector with all two elements equal to *P.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_pd (double const *__P)
{
  return _mm_set1_pd (*__P);
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_sd (double const *__P)
{
  return _mm_set_sd (*__P);
}

/* Alias of _mm_load1_pd, kept for Intel API compatibility.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd1 (double const *__P)
{
  return _mm_load1_pd (__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_pd (double const *__P)
{
  __m128d __tmp = _mm_load_pd (__P);
  /* Swap the two elements of the loaded vector.  */
  return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
}
156
/* Store two DPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd (double *__P, __m128d __A)
{
  *(__m128d *)__P = __A;
}

/* Store two DPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd (double *__P, __m128d __A)
{
  __builtin_ia32_storeupd (__P, __A);
}

/* Stores the lower DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[0];
}

/* Return the lower DPFP value of A as a scalar double.  */
extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64 (__m128d __A)
{
  return ((__v2df)__A)[0];
}

/* Stores the lower DPFP value (alias of _mm_store_sd).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pd (double *__P, __m128d __A)
{
  _mm_store_sd (__P, __A);
}

/* Stores the upper DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[1];
}

/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
}

/* Alias of _mm_store1_pd, kept for Intel API compatibility.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd1 (double *__P, __m128d __A)
{
  _mm_store1_pd (__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1)));
}

/* Return the low 32-bit integer element of A.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si32 (__m128i __A)
{
  return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
}
223
#ifdef __x86_64__
/* Intel intrinsic.  Return the low 64-bit integer element of A.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64 (__m128i __A)
{
  return ((__v2di)__A)[0];
}

/* Microsoft intrinsic.  Identical to _mm_cvtsi128_si64.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64x (__m128i __A)
{
  return ((__v2di)__A)[0];
}
#endif
239
/* Add the two DPFP values of A and B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A + (__v2df)__B);
}

/* Add the lower DPFP values of A and B; the upper element of the
   result is copied from A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
}

/* Subtract the two DPFP values of B from A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A - (__v2df)__B);
}

/* Subtract the lower DPFP value of B from A; upper element from A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
}

/* Multiply the two DPFP values of A and B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A * (__v2df)__B);
}

/* Multiply the lower DPFP values of A and B; upper element from A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
}

/* Divide the two DPFP values of A by B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A / (__v2df)__B);
}

/* Divide the lower DPFP value of A by B; upper element from A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
}

/* Return the square roots of both DPFP values of A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_pd (__m128d __A)
{
  return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
}

/* Return pair {sqrt (B[0]), A[1]}.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
  /* Merge B[0] into A first, then take the square root of element 0.  */
  __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
  return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
}
301
/* Return the element-wise minima of A and B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
}

/* Return {min (A[0], B[0]), A[1]}.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
}

/* Return the element-wise maxima of A and B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
}

/* Return {max (A[0], B[0]), A[1]}.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
}

/* Bitwise AND of the 128-bit values of A and B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
}

/* Bitwise (~A) & B of the 128-bit values.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
}

/* Bitwise OR of the 128-bit values of A and B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
}

/* Bitwise XOR of the 128-bit values of A and B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
}
349
/* Compare the packed DPFP values of A and B.  Each element of the
   result is all ones when the predicate holds and all zeros when it
   does not.  The negated forms (_mm_cmpn*_pd) are true when the
   operands are unordered (either is a NaN).  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
}

/* True (all ones) where neither element of the pair is a NaN.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
}

/* True (all ones) where at least one element of the pair is a NaN.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
}
421
/* Compare the lower DPFP values of A and B.  Element 0 of the result
   is the all-ones/all-zeros comparison mask; element 1 is copied from
   A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
}

/* There is no direct gt/ge scalar compare builtin, so compute
   B < A (resp. B <= A) and merge the mask back into A's upper
   element with MOVSD.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmpltsd ((__v2df) __B,
								 (__v2df)
								 __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmplesd ((__v2df) __B,
								 (__v2df)
								 __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
}

/* As for gt/ge above: use the negated lt/le compare with swapped
   operands, then merge via MOVSD.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmpnltsd ((__v2df) __B,
								  (__v2df)
								  __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmpnlesd ((__v2df) __B,
								  (__v2df)
								  __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
}
509
/* Compare the lower DPFP values of A and B and return 0 or 1.
   The _mm_comi* forms use COMISD, the _mm_ucomi* forms UCOMISD;
   they differ only in their floating-point exception behavior on
   quiet NaN operands.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
}

/* Non-signaling (UCOMISD) variants of the comparisons above.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
}
581
/* Create a vector of Qi, where i is the element number.
   Element 0 is the lowest (least significant) element; note that the
   arguments are given highest element first, per the Intel API.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64x (long long __q1, long long __q0)
{
  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64 (__m64 __q1,  __m64 __q0)
{
  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
{
  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
	       short __q3, short __q2, short __q1, short __q0)
{
  return __extension__ (__m128i)(__v8hi){
    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
	      char __q11, char __q10, char __q09, char __q08,
	      char __q07, char __q06, char __q05, char __q04,
	      char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m128i)(__v16qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

/* Set all of the elements of the vector to A.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64x (long long __A)
{
  return _mm_set_epi64x (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64 (__m64 __A)
{
  return _mm_set_epi64 (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi32 (int __A)
{
  return _mm_set_epi32 (__A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi16 (short __A)
{
  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi8 (char __A)
{
  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
		       __A, __A, __A, __A, __A, __A, __A, __A);
}

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions
   (arguments are in memory order, lowest element first).  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi64 (__m64 __q0, __m64 __q1)
{
  return _mm_set_epi64 (__q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
{
  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
	        short __q4, short __q5, short __q6, short __q7)
{
  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
	       char __q04, char __q05, char __q06, char __q07,
	       char __q08, char __q09, char __q10, char __q11,
	       char __q12, char __q13, char __q14, char __q15)
{
  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}
686
/* Load 128 bits from P.  The address must be 16-byte aligned.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_si128 (__m128i const *__P)
{
  return *__P;
}

/* Load 128 bits from P.  The address need not be aligned.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si128 (__m128i const *__P)
{
  return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
}

/* Load 64 bits from P into the lower half; the upper half is zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_epi64 (__m128i const *__P)
{
  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
}

/* Store 128 bits to P.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
  *__P = __B;
}

/* Store 128 bits to P.  The address need not be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si128 (__m128i *__P, __m128i __B)
{
  __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
}

/* Store the lower 64 bits of B to P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_epi64 (__m128i *__P, __m128i __B)
{
  *(long long *)__P = ((__v2di)__B)[0];
}

/* Return the lower 64 bits of B as an __m64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi64_pi64 (__m128i __B)
{
  return (__m64) ((__v2di)__B)[0];
}

/* Zero-extend the 64-bit value A into a 128-bit vector.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movpi64_epi64 (__m64 __A)
{
  return _mm_set_epi64 ((__m64)0LL, __A);
}

/* Return A with the upper 64 bits cleared (MOVQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_epi64 (__m128i __A)
{
  return (__m128i)__builtin_ia32_movq128 ((__v2di) __A);
}
742
743/* Create an undefined vector.  */
744extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
745_mm_undefined_si128 (void)
746{
747  __m128i __Y = __Y;
748  return __Y;
749}
750
/* Create a vector of zeros.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si128 (void)
{
  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
}

/* Convert the two low signed 32-bit ints of A to DPFP values.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_pd (__m128i __A)
{
  return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
}

/* Convert the four signed 32-bit ints of A to SPFP values.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_ps (__m128i __A)
{
  return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
}

/* Convert the two DPFP values of A to signed 32-bit ints (current
   rounding mode); the upper two result elements are zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_epi32 (__m128d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
}

/* Convert the two DPFP values of A to a pair of 32-bit ints in an
   __m64 (current rounding mode).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_pi32 (__m128d __A)
{
  return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
}

/* Convert the two DPFP values of A to SPFP values in the lower half
   of the result.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_ps (__m128d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
}

/* As _mm_cvtpd_epi32, but truncating toward zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_epi32 (__m128d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
}

/* As _mm_cvtpd_pi32, but truncating toward zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_pi32 (__m128d __A)
{
  return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
}

/* Convert the two 32-bit ints in the __m64 A to DPFP values.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_pd (__m64 __A)
{
  return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
}

/* Convert the four SPFP values of A to signed 32-bit ints (current
   rounding mode).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_epi32 (__m128 __A)
{
  return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
}

/* As _mm_cvtps_epi32, but truncating toward zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_epi32 (__m128 __A)
{
  return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
}

/* Convert the two low SPFP values of A to DPFP values.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pd (__m128 __A)
{
  return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
}

/* Convert the lower DPFP value of A to a 32-bit int (current rounding
   mode).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si32 (__m128d __A)
{
  return __builtin_ia32_cvtsd2si ((__v2df) __A);
}
829
#ifdef __x86_64__
/* Intel intrinsic.  Convert the lower DPFP value of A to a 64-bit int
   (current rounding mode).  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64 (__m128d __A)
{
  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
}

/* Microsoft intrinsic.  Identical to _mm_cvtsd_si64.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64x (__m128d __A)
{
  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
}
#endif

/* Convert the lower DPFP value of A to a 32-bit int, truncating
   toward zero.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si32 (__m128d __A)
{
  return __builtin_ia32_cvttsd2si ((__v2df) __A);
}

#ifdef __x86_64__
/* Intel intrinsic.  As _mm_cvtsd_si64, but truncating toward zero.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64 (__m128d __A)
{
  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
}

/* Microsoft intrinsic.  Identical to _mm_cvttsd_si64.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64x (__m128d __A)
{
  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
}
#endif
867
/* Convert the lower DPFP value of B to SPFP and insert it as element 0
   of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_ss (__m128 __A, __m128d __B)
{
  return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
}

/* Convert the 32-bit int B to DPFP and insert it as element 0 of A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_sd (__m128d __A, int __B)
{
  return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
}

#ifdef __x86_64__
/* Intel intrinsic.  Convert the 64-bit int B to DPFP and insert it as
   element 0 of A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_sd (__m128d __A, long long __B)
{
  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
}

/* Microsoft intrinsic.  Identical to _mm_cvtsi64_sd.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_sd (__m128d __A, long long __B)
{
  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
}
#endif

/* Convert the lower SPFP value of B to DPFP and insert it as element 0
   of A.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_sd (__m128d __A, __m128 __B)
{
  return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
}

/* Shuffle: result element 0 comes from A, element 1 from B, each
   selected by a bit of MASK (see _MM_SHUFFLE2).  The builtin requires
   an immediate operand, so a macro form is used when not optimizing
   (the inline function's argument would not fold to a constant).  */
#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
{
  return (__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, __mask);
}
#else
#define _mm_shuffle_pd(A, B, N)						\
  ((__m128d)__builtin_ia32_shufpd ((__v2df)(__m128d)(A),		\
				   (__v2df)(__m128d)(B), (int)(N)))
#endif
913
/* Interleave the upper doubles: result = { __A[1], __B[1] } (UNPCKHPD).  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
}

/* Interleave the lower doubles: result = { __A[0], __B[0] } (UNPCKLPD).  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
}

/* Load *__B into the upper double; the lower double comes from __A
   (MOVHPD).  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pd (__m128d __A, double const *__B)
{
  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
}

/* Load *__B into the lower double; the upper double comes from __A
   (MOVLPD).  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pd (__m128d __A, double const *__B)
{
  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
}

/* Gather the sign bits of the two doubles into bits 0-1 of the result
   (MOVMSKPD).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d __A)
{
  return __builtin_ia32_movmskpd ((__v2df)__A);
}
943
/* Narrow the sixteen 16-bit elements of __A and __B to 8-bit with
   signed saturation; __A supplies the low half of the result (PACKSSWB).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
}

/* Narrow the eight 32-bit elements of __A and __B to 16-bit with
   signed saturation (PACKSSDW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
}

/* Narrow the signed 16-bit elements of __A and __B to 8-bit with
   unsigned saturation (PACKUSWB).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
}
961
/* Interleave the high-half bytes of __A and __B:
   { __A[8], __B[8], __A[9], __B[9], ... } (PUNPCKHBW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
}

/* Interleave the high-half 16-bit words of __A and __B (PUNPCKHWD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
}

/* Interleave the high-half 32-bit dwords of __A and __B (PUNPCKHDQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
}

/* Combine the upper 64-bit elements: result = { __A[1], __B[1] }
   (PUNPCKHQDQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B);
}

/* Interleave the low-half bytes of __A and __B:
   { __A[0], __B[0], __A[1], __B[1], ... } (PUNPCKLBW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
}

/* Interleave the low-half 16-bit words of __A and __B (PUNPCKLWD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
}

/* Interleave the low-half 32-bit dwords of __A and __B (PUNPCKLDQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
}

/* Combine the lower 64-bit elements: result = { __A[0], __B[0] }
   (PUNPCKLQDQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
}
1009
/* Element-wise 8-bit addition (PADDB).  The unsigned vector casts make
   overflow wrap modulo 2^8 without signed-overflow undefined behavior.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A + (__v16qu)__B);
}

/* Element-wise 16-bit addition, wrapping (PADDW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A + (__v8hu)__B);
}

/* Element-wise 32-bit addition, wrapping (PADDD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A + (__v4su)__B);
}

/* Element-wise 64-bit addition, wrapping (PADDQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A + (__v2du)__B);
}

/* Element-wise 8-bit addition with signed saturation (PADDSB).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Element-wise 16-bit addition with signed saturation (PADDSW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Element-wise 8-bit addition with unsigned saturation (PADDUSB).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Element-wise 16-bit addition with unsigned saturation (PADDUSW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
}
1057
/* Element-wise 8-bit subtraction (PSUBB).  The unsigned vector casts
   make wraparound well defined, as in the add family above.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A - (__v16qu)__B);
}

/* Element-wise 16-bit subtraction, wrapping (PSUBW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A - (__v8hu)__B);
}

/* Element-wise 32-bit subtraction, wrapping (PSUBD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A - (__v4su)__B);
}

/* Element-wise 64-bit subtraction, wrapping (PSUBQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A - (__v2du)__B);
}

/* Element-wise 8-bit subtraction with signed saturation (PSUBSB).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Element-wise 16-bit subtraction with signed saturation (PSUBSW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Element-wise 8-bit subtraction with unsigned saturation (PSUBUSB).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Element-wise 16-bit subtraction with unsigned saturation (PSUBUSW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
}
1105
/* Multiply corresponding signed 16-bit elements and add each adjacent
   pair of 32-bit products, yielding four 32-bit sums (PMADDWD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
}

/* High 16 bits of the signed 16x16 products (PMULHW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Low 16 bits of the 16x16 products (PMULLW); unsigned vector multiply
   so truncation is well defined.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A * (__v8hu)__B);
}

/* Unsigned 32x32->64 multiply of the low dwords of the two MMX
   operands (PMULUDQ, MMX form).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
}

/* Unsigned 32x32->64 multiply of dword elements 0 and 2, giving two
   64-bit products (PMULUDQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
}
1135
/* Shift each 16-bit element left by the immediate count __B (PSLLW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
}

/* Shift each 32-bit element left by __B (PSLLD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
}

/* Shift each 64-bit element left by __B (PSLLQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi64 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
}

/* Arithmetic (sign-propagating) right shift of each 16-bit element
   by __B (PSRAW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
}

/* Arithmetic right shift of each 32-bit element by __B (PSRAD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
}
1165
#ifdef __OPTIMIZE__
/* Shift the whole 128-bit register right by __N BYTES (PSRLDQ).  The
   builtin takes a bit count, hence the __N * 8 scaling.  The count must
   be a compile-time constant, so macro forms are used when not
   optimizing (see #else below).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bsrli_si128 (__m128i __A, const int __N)
{
  return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8);
}

/* Shift the whole 128-bit register left by __N bytes (PSLLDQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bslli_si128 (__m128i __A, const int __N)
{
  return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8);
}

/* Older name for _mm_bsrli_si128; identical behavior.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si128 (__m128i __A, const int __N)
{
  return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8);
}

/* Older name for _mm_bslli_si128; identical behavior.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si128 (__m128i __A, const int __N)
{
  return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8);
}
#else
#define _mm_bsrli_si128(A, N) \
  ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8))
#define _mm_bslli_si128(A, N) \
  ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8))
#define _mm_srli_si128(A, N) \
  ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8))
#define _mm_slli_si128(A, N) \
  ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8))
#endif
1200
/* Logical (zero-filling) right shift of each 16-bit element by the
   immediate count __B (PSRLW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
}

/* Logical right shift of each 32-bit element by __B (PSRLD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
}

/* Logical right shift of each 64-bit element by __B (PSRLQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi64 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
}
1218
/* Shift each 16-bit element of __A left by the count held in the low
   64 bits of __B (PSLLW, register-count form).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
}

/* Shift each 32-bit element left by the count in __B (PSLLD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
}

/* Shift each 64-bit element left by the count in __B (PSLLQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
}

/* Arithmetic right shift of each 16-bit element by the count in __B
   (PSRAW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Arithmetic right shift of each 32-bit element by the count in __B
   (PSRAD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
}

/* Logical right shift of each 16-bit element by the count in __B
   (PSRLW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Logical right shift of each 32-bit element by the count in __B
   (PSRLD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
}

/* Logical right shift of each 64-bit element by the count in __B
   (PSRLQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
}
1266
/* Bitwise AND of the full 128 bits (PAND).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A & (__v2du)__B);
}

/* Bitwise AND-NOT: (~__A) & __B -- note the first operand is the one
   complemented (PANDN).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
}

/* Bitwise OR of the full 128 bits (POR).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A | (__v2du)__B);
}

/* Bitwise XOR of the full 128 bits (PXOR).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A ^ (__v2du)__B);
}
1290
/* Element-wise comparisons.  GCC vector comparisons yield all-ones
   (-1) for true and 0 for false per element, matching the PCMPEQ/PCMPGT
   instruction semantics.  */

/* 8-bit equality (PCMPEQB).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qi)__A == (__v16qi)__B);
}

/* 16-bit equality (PCMPEQW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A == (__v8hi)__B);
}

/* 32-bit equality (PCMPEQD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4si)__A == (__v4si)__B);
}

/* Signed 8-bit less-than (PCMPGTB with operands swapped).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qi)__A < (__v16qi)__B);
}

/* Signed 16-bit less-than.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A < (__v8hi)__B);
}

/* Signed 32-bit less-than.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4si)__A < (__v4si)__B);
}

/* Signed 8-bit greater-than (PCMPGTB).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qi)__A > (__v16qi)__B);
}

/* Signed 16-bit greater-than (PCMPGTW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A > (__v8hi)__B);
}

/* Signed 32-bit greater-than (PCMPGTD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4si)__A > (__v4si)__B);
}
1344
#ifdef __OPTIMIZE__
/* Extract 16-bit element __N of __A, zero-extended to int (PEXTRW).
   __N must be a compile-time constant; the macro form below handles
   the non-optimizing case.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N);
}

/* Replace 16-bit element __N of __A with the low word of __D (PINSRW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N);
}
#else
#define _mm_extract_epi16(A, N) \
  ((int) (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)(__m128i)(A), (int)(N)))
#define _mm_insert_epi16(A, D, N)				\
  ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(__m128i)(A),	\
					  (int)(D), (int)(N)))
#endif
1364
/* Element-wise signed 16-bit maximum (PMAXSW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Element-wise unsigned 8-bit maximum (PMAXUB).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
}

/* Element-wise signed 16-bit minimum (PMINSW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Element-wise unsigned 8-bit minimum (PMINUB).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
}
1388
/* Gather the sign bits of the sixteen bytes of __A into the low 16
   bits of the result (PMOVMSKB).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
}

/* High 16 bits of the unsigned 16x16 products (PMULHUW).  The builtin
   uses the signed __v8hi vector type; the instruction itself is
   unsigned.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
}
1400
#ifdef __OPTIMIZE__
/* Permute the four 16-bit elements of the HIGH 64 bits of __A by the
   2-bit fields of __mask; the low half passes through (PSHUFHW).
   __mask must be a compile-time constant; macro forms below cover the
   non-optimizing case.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  return (__m128i)__builtin_ia32_pshufhw ((__v8hi)__A, __mask);
}

/* Permute the four 16-bit elements of the LOW 64 bits of __A; the
   high half passes through (PSHUFLW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  return (__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __mask);
}

/* Permute the four 32-bit elements of __A by the 2-bit fields of
   __mask (PSHUFD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  return (__m128i)__builtin_ia32_pshufd ((__v4si)__A, __mask);
}
#else
#define _mm_shufflehi_epi16(A, N) \
  ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__m128i)(A), (int)(N)))
#define _mm_shufflelo_epi16(A, N) \
  ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__m128i)(A), (int)(N)))
#define _mm_shuffle_epi32(A, N) \
  ((__m128i)__builtin_ia32_pshufd ((__v4si)(__m128i)(A), (int)(N)))
#endif
1427
/* Conditionally store the bytes of __A to __C: a byte is written only
   where the corresponding mask byte of __B has its high bit set
   (MASKMOVDQU).  The store is non-temporal.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
}

/* Element-wise unsigned 8-bit rounded average: (a + b + 1) >> 1 (PAVGB).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Element-wise unsigned 16-bit rounded average (PAVGW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Sum of absolute byte differences, one 16-bit sum per 64-bit half,
   zero-extended into the two quadword elements (PSADBW).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
}
1451
/* Non-temporal (cache-bypassing) store of the 32-bit value __B to *__A
   (MOVNTI).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  __builtin_ia32_movnti (__A, __B);
}

#ifdef __x86_64__
/* Non-temporal store of a 64-bit value (MOVNTI, REX.W form).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  __builtin_ia32_movnti64 (__A, __B);
}
#endif

/* Non-temporal store of the 128-bit integer vector __B (MOVNTDQ).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
}

/* Non-temporal store of the two doubles in __B (MOVNTPD).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  __builtin_ia32_movntpd (__A, (__v2df)__B);
}
1477
/* Flush the cache line containing __A from every cache level (CLFLUSH).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  __builtin_ia32_clflush (__A);
}

/* Load fence: order all prior loads before subsequent ones (LFENCE).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  __builtin_ia32_lfence ();
}

/* Full memory fence: order all prior loads and stores before
   subsequent ones (MFENCE).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  __builtin_ia32_mfence ();
}
1495
/* Place the 32-bit integer __A in element 0 of a vector whose
   remaining elements are zero (MOVD).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

#ifdef __x86_64__
/* Intel intrinsic.  */
/* Place the 64-bit integer __A in the low quadword, upper quadword
   zero (MOVQ).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return _mm_set_epi64x (0, __A);
}

/* Microsoft intrinsic.  */
/* Alternate spelling of _mm_cvtsi64_si128; identical behavior.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return _mm_set_epi64x (0, __A);
}
#endif
1517
/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  Each is a pure
   reinterpretation of the 128-bit pattern and generates no code.  */

/* Reinterpret two doubles as four floats.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

/* Reinterpret two doubles as a 128-bit integer vector.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

/* Reinterpret four floats as two doubles.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

/* Reinterpret four floats as a 128-bit integer vector.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

/* Reinterpret a 128-bit integer vector as four floats.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

/* Reinterpret a 128-bit integer vector as two doubles.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}
1555
1556#ifdef __DISABLE_SSE2__
1557#undef __DISABLE_SSE2__
1558#pragma GCC pop_options
1559#endif /* __DISABLE_SSE2__ */
1560
1561#endif /* _EMMINTRIN_H_INCLUDED */
1562