emmintrin.h revision 169689
1/* Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
2
3   This file is part of GCC.
4
5   GCC is free software; you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 2, or (at your option)
8   any later version.
9
10   GCC is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with GCC; see the file COPYING.  If not, write to
17   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
18   Boston, MA 02110-1301, USA.  */
19
20/* As a special exception, if you include this header file into source
21   files compiled by GCC, this header file does not by itself cause
22   the resulting executable to be covered by the GNU General Public
23   License.  This exception does not however invalidate any other
24   reasons why the executable file might be covered by the GNU General
25   Public License.  */
26
27/* Implemented from the specification included in the Intel C++ Compiler
28   User Guide and Reference, version 9.0.  */
29
30#ifndef _EMMINTRIN_H_INCLUDED
31#define _EMMINTRIN_H_INCLUDED
32
33#ifdef __SSE2__
34#include <xmmintrin.h>
35
/* SSE2 */
/* Internal vector types used to implement the SSE2 intrinsics.
   These carry the element type; __m128i/__m128d are the public types.  */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef int __v4si __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__ ((__vector_size__ (16)));
typedef char __v16qi __attribute__ ((__vector_size__ (16)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Create a selector for use with the SHUFPD instruction.
   fp1 selects the upper result element, fp0 the lower; each is 0 or 1.  */
#define _MM_SHUFFLE2(fp1,fp0) \
 (((fp1) << 1) | (fp0))
51
/* Create a vector with element 0 as F and the rest zero.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_set_sd (double __F)
{
  return __extension__ (__m128d){ __F, 0 };
}

/* Create a vector with both elements equal to F.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_set1_pd (double __F)
{
  return __extension__ (__m128d){ __F, __F };
}

/* Alias of _mm_set1_pd, kept for Intel API compatibility.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_set_pd1 (double __F)
{
  return _mm_set1_pd (__F);
}

/* Create a vector with the lower value X and upper value W.
   Note the argument order: high element first, matching the Intel API.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_set_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __X, __W };
}

/* Create a vector with the lower value W and upper value X.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_setr_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __W, __X };
}

/* Create a vector of zeros.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_setzero_pd (void)
{
  return __extension__ (__m128d){ 0.0, 0.0 };
}
92
/* Sets the low DPFP value of A from the low value of B.
   Result: { B[0], A[1] } (MOVSD instruction).  */
static __inline __m128d __attribute__((__always_inline__))
_mm_move_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
}
99
/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_load_pd (double const *__P)
{
  return *(__m128d *)__P;
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_loadu_pd (double const *__P)
{
  return __builtin_ia32_loadupd (__P);
}

/* Create a vector with all two elements equal to *P.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_load1_pd (double const *__P)
{
  return _mm_set1_pd (*__P);
}

/* Create a vector with element 0 as *P and the rest zero.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_load_sd (double const *__P)
{
  return _mm_set_sd (*__P);
}

/* Alias of _mm_load1_pd, kept for Intel API compatibility.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_load_pd1 (double const *__P)
{
  return _mm_load1_pd (__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_loadr_pd (double const *__P)
{
  __m128d __tmp = _mm_load_pd (__P);
  /* Swap the two elements with SHUFPD.  */
  return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
}
141
/* Store two DPFP values.  The address must be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_store_pd (double *__P, __m128d __A)
{
  *(__m128d *)__P = __A;
}

/* Store two DPFP values.  The address need not be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_storeu_pd (double *__P, __m128d __A)
{
  __builtin_ia32_storeupd (__P, __A);
}

/* Stores the lower DPFP value.  */
static __inline void __attribute__((__always_inline__))
_mm_store_sd (double *__P, __m128d __A)
{
  *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
}

/* Return the lower DPFP value of A as a scalar double.  */
static __inline double __attribute__((__always_inline__))
_mm_cvtsd_f64 (__m128d __A)
{
  return __builtin_ia32_vec_ext_v2df (__A, 0);
}

/* Stores the lower DPFP value; alias of _mm_store_sd.  */
static __inline void __attribute__((__always_inline__))
_mm_storel_pd (double *__P, __m128d __A)
{
  _mm_store_sd (__P, __A);
}

/* Stores the upper DPFP value.  */
static __inline void __attribute__((__always_inline__))
_mm_storeh_pd (double *__P, __m128d __A)
{
  *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
}

/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_store1_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
}

/* Alias of _mm_store1_pd, kept for Intel API compatibility.  */
static __inline void __attribute__((__always_inline__))
_mm_store_pd1 (double *__P, __m128d __A)
{
  _mm_store1_pd (__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_storer_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1)));
}
202
/* Return the low 32-bit integer element of A.  */
static __inline int __attribute__((__always_inline__))
_mm_cvtsi128_si32 (__m128i __A)
{
  return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
}

#ifdef __x86_64__
/* Intel intrinsic.  Return the low 64-bit integer element of A.  */
static __inline long long __attribute__((__always_inline__))
_mm_cvtsi128_si64 (__m128i __A)
{
  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
}

/* Microsoft intrinsic.  Same operation under the Microsoft name.  */
static __inline long long __attribute__((__always_inline__))
_mm_cvtsi128_si64x (__m128i __A)
{
  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
}
#endif
224
/* Add the two DPFP values of A and B element-wise.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_add_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
}

/* Add the lower DPFP values of A and B; upper element copied from A.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_add_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
}

/* Subtract the two DPFP values of B from A element-wise.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
}

/* Subtract the lower DPFP value of B from A; upper element copied from A.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_sub_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
}

/* Multiply the two DPFP values of A and B element-wise.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
}

/* Multiply the lower DPFP values of A and B; upper element copied from A.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_mul_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
}

/* Divide the two DPFP values of A by B element-wise.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_div_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
}

/* Divide the lower DPFP value of A by B; upper element copied from A.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_div_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
}

/* Square root of both DPFP values of A.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_sqrt_pd (__m128d __A)
{
  return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
}

/* Return pair {sqrt (B[0]), A[1]}.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
  /* movsd yields { B[0], A[1] }; sqrtsd then replaces element 0
     with its square root, leaving the upper element untouched.  */
  __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
  return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
}
286
/* Element-wise minimum of A and B.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_min_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
}

/* Minimum of the lower DPFP values; upper element copied from A.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_min_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
}

/* Element-wise maximum of A and B.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_max_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
}

/* Maximum of the lower DPFP values; upper element copied from A.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_max_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
}

/* Bitwise AND of the 128-bit values.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
}

/* Bitwise AND of (NOT A) and B.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
}

/* Bitwise OR of the 128-bit values.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
}

/* Bitwise XOR of the 128-bit values.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
}
334
/* Packed DPFP comparisons.  Each result element is all-ones if the
   predicate holds for that element pair, all-zeros otherwise.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmple_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpgt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpge_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpneq_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
}

/* "Not less than" and friends differ from gt/ge/... when a NaN is
   involved: the negated predicates are true for unordered operands.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnlt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnle_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpngt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnge_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
}

/* True where neither element of the pair is a NaN.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
}

/* True where at least one element of the pair is a NaN.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
}
406
/* Scalar DPFP comparisons.  Element 0 of the result is the all-ones /
   all-zeros comparison mask; element 1 is copied from A.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cmpeq_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmplt_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmple_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
}

/* There is no CMPGTSD instruction; synthesize A > B as B < A with the
   operands swapped, then merge so the upper element still comes from A.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmpltsd ((__v2df) __B,
								 (__v2df)
								 __A));
}

/* Likewise, A >= B is synthesized as B <= A.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmplesd ((__v2df) __B,
								 (__v2df)
								 __A));
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpneq_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnlt_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnle_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
}

/* not(A > B) synthesized as not(B < A), merged as above.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmpnltsd ((__v2df) __B,
								  (__v2df)
								  __A));
}

/* not(A >= B) synthesized as not(B <= A), merged as above.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmpnlesd ((__v2df) __B,
								  (__v2df)
								  __A));
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpord_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
}

static __inline __m128d __attribute__((__always_inline__))
_mm_cmpunord_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
}
494
/* Ordered (COMISD) comparisons of the lower DPFP values; return 1 if
   the predicate holds, else 0.  */
static __inline int __attribute__((__always_inline__))
_mm_comieq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comile_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comigt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comige_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comineq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
}

/* Unordered (UCOMISD) variants of the above.  */
static __inline int __attribute__((__always_inline__))
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
}
566
/* Create a vector of Qi, where i is the element number.
   Arguments are given high element first, matching the Intel API.  */

static __inline __m128i __attribute__((__always_inline__))
_mm_set_epi64x (long long __q1, long long __q0)
{
  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set_epi64 (__m64 __q1,  __m64 __q0)
{
  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
{
  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
	       short __q3, short __q2, short __q1, short __q0)
{
  return __extension__ (__m128i)(__v8hi){
    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
	      char __q11, char __q10, char __q09, char __q08,
	      char __q07, char __q06, char __q05, char __q04,
	      char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m128i)(__v16qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

/* Set all of the elements of the vector to A.  */

static __inline __m128i __attribute__((__always_inline__))
_mm_set1_epi64x (long long __A)
{
  return _mm_set_epi64x (__A, __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set1_epi64 (__m64 __A)
{
  return _mm_set_epi64 (__A, __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set1_epi32 (int __A)
{
  return _mm_set_epi32 (__A, __A, __A, __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set1_epi16 (short __A)
{
  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_set1_epi8 (char __A)
{
  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
		       __A, __A, __A, __A, __A, __A, __A, __A);
}
639
/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.  */

static __inline __m128i __attribute__((__always_inline__))
_mm_setr_epi64 (__m64 __q0, __m64 __q1)
{
  return _mm_set_epi64 (__q1, __q0);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
{
  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
	        short __q4, short __q5, short __q6, short __q7)
{
  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
	       char __q04, char __q05, char __q06, char __q07,
	       char __q08, char __q09, char __q10, char __q11,
	       char __q12, char __q13, char __q14, char __q15)
{
  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}
671
/* Load a 128-bit integer vector.  The address must be 16-byte aligned.  */

static __inline __m128i __attribute__((__always_inline__))
_mm_load_si128 (__m128i const *__P)
{
  return *__P;
}

/* Load a 128-bit integer vector.  The address need not be aligned.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_loadu_si128 (__m128i const *__P)
{
  return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
}

/* Load the 64-bit value at P into the lower half; upper half zeroed.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_loadl_epi64 (__m128i const *__P)
{
  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
}

/* Store a 128-bit integer vector.  The address must be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
  *__P = __B;
}

/* Store a 128-bit integer vector.  The address need not be aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_storeu_si128 (__m128i *__P, __m128i __B)
{
  __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
}

/* Store the lower 64 bits of B at P.  */
static __inline void __attribute__((__always_inline__))
_mm_storel_epi64 (__m128i *__P, __m128i __B)
{
  *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
}

/* Return the lower 64 bits of B as an __m64.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_movepi64_pi64 (__m128i __B)
{
  return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
}

/* Widen A into the lower half of a 128-bit vector; upper half zeroed.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_movpi64_epi64 (__m64 __A)
{
  return _mm_set_epi64 ((__m64)0LL, __A);
}

/* Keep the lower 64 bits of A; zero the upper 64 bits.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_move_epi64 (__m128i __A)
{
  return _mm_set_epi64 ((__m64)0LL, _mm_movepi64_pi64 (__A));
}

/* Create a vector of zeros.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_setzero_si128 (void)
{
  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
}
734
/* Convert the lower two 32-bit integers of A to two DPFP values.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cvtepi32_pd (__m128i __A)
{
  return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
}

/* Convert the four 32-bit integers of A to four SPFP values.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtepi32_ps (__m128i __A)
{
  return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
}

/* Convert the two DPFP values of A to 32-bit integers (rounded).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cvtpd_epi32 (__m128d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
}

/* Convert the two DPFP values of A to 32-bit integers in an __m64.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cvtpd_pi32 (__m128d __A)
{
  return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
}

/* Convert the two DPFP values of A to SPFP values in the lower half.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpd_ps (__m128d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
}

/* As _mm_cvtpd_epi32, but truncate toward zero.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cvttpd_epi32 (__m128d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
}

/* As _mm_cvtpd_pi32, but truncate toward zero.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cvttpd_pi32 (__m128d __A)
{
  return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
}

/* Convert the two 32-bit integers of A to two DPFP values.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cvtpi32_pd (__m64 __A)
{
  return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
}

/* Convert the four SPFP values of A to 32-bit integers (rounded).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cvtps_epi32 (__m128 __A)
{
  return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
}

/* As _mm_cvtps_epi32, but truncate toward zero.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cvttps_epi32 (__m128 __A)
{
  return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
}

/* Convert the lower two SPFP values of A to DPFP values.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cvtps_pd (__m128 __A)
{
  return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
}

/* Convert the lower DPFP value of A to a 32-bit integer (rounded).  */
static __inline int __attribute__((__always_inline__))
_mm_cvtsd_si32 (__m128d __A)
{
  return __builtin_ia32_cvtsd2si ((__v2df) __A);
}

#ifdef __x86_64__
/* Intel intrinsic.  Convert the lower DPFP value to a 64-bit integer.  */
static __inline long long __attribute__((__always_inline__))
_mm_cvtsd_si64 (__m128d __A)
{
  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
}

/* Microsoft intrinsic.  Same operation under the Microsoft name.  */
static __inline long long __attribute__((__always_inline__))
_mm_cvtsd_si64x (__m128d __A)
{
  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
}
#endif

/* As _mm_cvtsd_si32, but truncate toward zero.  */
static __inline int __attribute__((__always_inline__))
_mm_cvttsd_si32 (__m128d __A)
{
  return __builtin_ia32_cvttsd2si ((__v2df) __A);
}

#ifdef __x86_64__
/* Intel intrinsic.  As _mm_cvtsd_si64, but truncate toward zero.  */
static __inline long long __attribute__((__always_inline__))
_mm_cvttsd_si64 (__m128d __A)
{
  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
}

/* Microsoft intrinsic.  Same operation under the Microsoft name.  */
static __inline long long __attribute__((__always_inline__))
_mm_cvttsd_si64x (__m128d __A)
{
  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
}
#endif
844
/* Convert the lower DPFP value of B to SPFP in element 0 of A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtsd_ss (__m128 __A, __m128d __B)
{
  return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
}

/* Convert the 32-bit integer B to DPFP in element 0 of A.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cvtsi32_sd (__m128d __A, int __B)
{
  return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
}

#ifdef __x86_64__
/* Intel intrinsic.  Convert the 64-bit integer B to DPFP in element 0.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cvtsi64_sd (__m128d __A, long long __B)
{
  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
}

/* Microsoft intrinsic.  Same operation under the Microsoft name.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cvtsi64x_sd (__m128d __A, long long __B)
{
  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
}
#endif

/* Convert the lower SPFP value of B to DPFP in element 0 of A.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cvtss_sd (__m128d __A, __m128 __B)
{
  return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
}
878
/* Select elements from A and B according to the immediate mask C
   (see _MM_SHUFFLE2).  Must be a macro so C stays a compile-time
   constant for the SHUFPD instruction.  Arguments are parenthesized
   so expression arguments (e.g. a + b) bind correctly under the
   casts — the cast would otherwise apply to only the first operand.  */
#define _mm_shuffle_pd(__A, __B, __C) ((__m128d)__builtin_ia32_shufpd ((__v2df)(__A), (__v2df)(__B), (__C)))
880
/* Interleave the upper DPFP values of A and B: { A[1], B[1] }.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_unpackhi_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
}

/* Interleave the lower DPFP values of A and B: { A[0], B[0] }.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
}

/* Load *B into the upper DPFP element; lower element copied from A.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_loadh_pd (__m128d __A, double const *__B)
{
  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
}

/* Load *B into the lower DPFP element; upper element copied from A.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_loadl_pd (__m128d __A, double const *__B)
{
  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
}

/* Return a 2-bit mask formed from the sign bits of the two DPFP values.  */
static __inline int __attribute__((__always_inline__))
_mm_movemask_pd (__m128d __A)
{
  return __builtin_ia32_movmskpd ((__v2df)__A);
}
910
/* Pack the 16 shorts of A and B into 16 chars with signed saturation.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_packs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
}

/* Pack the 8 ints of A and B into 8 shorts with signed saturation.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_packs_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
}

/* Pack the 16 shorts of A and B into 16 chars with unsigned saturation.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_packus_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
}

/* Interleave the upper halves of A and B, by element size.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B);
}

/* Interleave the lower halves of A and B, by element size.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
}
976
977static __inline __m128i __attribute__((__always_inline__))
978_mm_add_epi8 (__m128i __A, __m128i __B)
979{
980  return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
981}
982
983static __inline __m128i __attribute__((__always_inline__))
984_mm_add_epi16 (__m128i __A, __m128i __B)
985{
986  return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
987}
988
989static __inline __m128i __attribute__((__always_inline__))
990_mm_add_epi32 (__m128i __A, __m128i __B)
991{
992  return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
993}
994
995static __inline __m128i __attribute__((__always_inline__))
996_mm_add_epi64 (__m128i __A, __m128i __B)
997{
998  return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
999}
1000
/* Elementwise addition with signed saturation, 8-bit (paddsb).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Elementwise addition with signed saturation, 16-bit (paddsw).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Elementwise addition with unsigned saturation, 8-bit (paddusb).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_adds_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Elementwise addition with unsigned saturation, 16-bit (paddusw).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
}
1024
/* Elementwise wrapping subtraction __A - __B, 8-bit (psubb).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Elementwise wrapping subtraction __A - __B, 16-bit (psubw).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Elementwise wrapping subtraction __A - __B, 32-bit (psubd).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
}

/* Elementwise wrapping subtraction __A - __B, 64-bit (psubq).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
}
1048
/* Elementwise subtraction with signed saturation, 8-bit (psubsb).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Elementwise subtraction with signed saturation, 16-bit (psubsw).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_subs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Elementwise subtraction with unsigned saturation, 8-bit (psubusb).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_subs_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Elementwise subtraction with unsigned saturation, 16-bit (psubusw).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_subs_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
}
1072
/* Multiply 16-bit pairs and add adjacent 32-bit products (pmaddwd).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_madd_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
}

/* High 16 bits of the signed 16x16 -> 32-bit products (pmulhw).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Low 16 bits of the 16x16 -> 32-bit products (pmullw).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
}
1090
/* Unsigned 32x32 -> 64-bit multiply of the low elements of the MMX
   operands (pmuludq, 64-bit form).  */
static __inline __m64 __attribute__((__always_inline__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
}

/* Unsigned 32x32 -> 64-bit multiplies of elements 0 and 2 of each
   operand (pmuludq, 128-bit form).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
}
1102
/* Shift each element left by __B bits, shifting in zeros (psllw/pslld/
   psllq, immediate forms).  The function versions are disabled; the
   macro forms are used instead so __B expands where the builtin expects
   its count operand.  */
#if 0
static __inline __m128i __attribute__((__always_inline__))
_mm_slli_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_slli_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_slli_epi64 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
}
#else
/* Each cast must match the builtin's operand type (__v8hi for psllwi,
   __v4si for pslldi, __v2di for psllqi); the original macros wrongly
   cast all three to __v8hi.  Arguments are fully parenthesized for
   macro hygiene.  */
#define _mm_slli_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_psllwi128 ((__v8hi)(__A), (__B)))
#define _mm_slli_epi32(__A, __B) \
  ((__m128i)__builtin_ia32_pslldi128 ((__v4si)(__A), (__B)))
#define _mm_slli_epi64(__A, __B) \
  ((__m128i)__builtin_ia32_psllqi128 ((__v2di)(__A), (__B)))
#endif
1129
/* Arithmetic (sign-propagating) right shift of each element by __B
   bits (psraw/psrad, immediate forms).  Function versions disabled in
   favor of the macros, as above.  */
#if 0
static __inline __m128i __attribute__((__always_inline__))
_mm_srai_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srai_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
}
#else
/* psradi operates on 32-bit elements, so its operand is cast to
   __v4si (the original macro wrongly used __v8hi).  */
#define _mm_srai_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_psrawi128 ((__v8hi)(__A), (__B)))
#define _mm_srai_epi32(__A, __B) \
  ((__m128i)__builtin_ia32_psradi128 ((__v4si)(__A), (__B)))
#endif
1148
/* Shift the whole 128-bit value right/left by __B *bytes* (psrldq/
   pslldq); the builtins take a bit count, hence the * 8.  Function
   versions disabled in favor of the macros.  */
#if 0
static __m128i __attribute__((__always_inline__))
_mm_srli_si128 (__m128i __A, int __B)
{
  return ((__m128i)__builtin_ia32_psrldqi128 (__A, __B * 8));
}

/* Was misnamed _mm_srli_si128 (a duplicate); this reference version
   implements the left shift.  */
static __m128i __attribute__((__always_inline__))
_mm_slli_si128 (__m128i __A, int __B)
{
  return ((__m128i)__builtin_ia32_pslldqi128 (__A, __B * 8));
}
#else
#define _mm_srli_si128(__A, __B) \
  ((__m128i)__builtin_ia32_psrldqi128 ((__A), (__B) * 8))
#define _mm_slli_si128(__A, __B) \
  ((__m128i)__builtin_ia32_pslldqi128 ((__A), (__B) * 8))
#endif
1167
/* Logical (zero-filling) right shift of each element by __B bits
   (psrlw/psrld/psrlq, immediate forms).  Function versions disabled in
   favor of the macros.  */
#if 0
static __inline __m128i __attribute__((__always_inline__))
_mm_srli_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srli_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srli_epi64 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
}
#else
/* psrlqi operates on 64-bit elements, so its operand is cast to
   __v2di (the original macro wrongly used __v4si).  */
#define _mm_srli_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_psrlwi128 ((__v8hi)(__A), (__B)))
#define _mm_srli_epi32(__A, __B) \
  ((__m128i)__builtin_ia32_psrldi128 ((__v4si)(__A), (__B)))
#define _mm_srli_epi64(__A, __B) \
  ((__m128i)__builtin_ia32_psrlqi128 ((__v2di)(__A), (__B)))
#endif
1194
/* Shift each 16-bit element left by the count in __B (psllw).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
}

/* Shift each 32-bit element left by the count in __B (pslld).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
}

/* Shift each 64-bit element left by the count in __B (psllq).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
}
1212
/* Arithmetic right shift of 16-bit elements by the count in __B
   (psraw).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Arithmetic right shift of 32-bit elements by the count in __B
   (psrad).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
}
1224
/* Logical right shift of 16-bit elements by the count in __B (psrlw).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Logical right shift of 32-bit elements by the count in __B (psrld).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
}

/* Logical right shift of 64-bit elements by the count in __B (psrlq).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
}
1242
/* Bitwise AND of the full 128 bits (pand).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
}

/* Bitwise (~__A) & __B (pandn) -- note __A is the complemented
   operand.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
}

/* Bitwise OR of the full 128 bits (por).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
}

/* Bitwise XOR of the full 128 bits (pxor).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B);
}
1266
/* Elementwise equality compare; each 8-bit element becomes all-ones on
   equality, zero otherwise (pcmpeqb).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Elementwise 16-bit equality compare (pcmpeqw).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Elementwise 32-bit equality compare (pcmpeqd).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B);
}
1284
/* Signed less-than compares.  There is no pcmplt instruction, so
   __A < __B is computed as __B > __A with the operands swapped.  */

/* Elementwise signed 8-bit __A < __B.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A);
}

/* Elementwise signed 16-bit __A < __B.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A);
}

/* Elementwise signed 32-bit __A < __B.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A);
}
1302
/* Elementwise signed 8-bit __A > __B (pcmpgtb).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Elementwise signed 16-bit __A > __B (pcmpgtw).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Elementwise signed 32-bit __A > __B (pcmpgtd).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B);
}
1320
/* Extract / insert a 16-bit element at index __N (pextrw/pinsrw).
   Function versions disabled; the macros keep the index usable as an
   instruction immediate.  */
#if 0
static __inline int __attribute__((__always_inline__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N);
}
#else
#define _mm_extract_epi16(A, N) \
  ((int) __builtin_ia32_vec_ext_v8hi ((__v8hi)(A), (N)))
#define _mm_insert_epi16(A, D, N) \
  ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(A), (D), (N)))
#endif
1339
/* Elementwise signed 16-bit maximum (pmaxsw).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Elementwise unsigned 8-bit maximum (pmaxub).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
}

/* Elementwise signed 16-bit minimum (pminsw).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Elementwise unsigned 8-bit minimum (pminub).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
}
1363
/* Gather the sign bit of each of the 16 bytes into the low 16 bits of
   an int (pmovmskb).  */
static __inline int __attribute__((__always_inline__))
_mm_movemask_epi8 (__m128i __A)
{
  return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
}

/* High 16 bits of the unsigned 16x16 -> 32-bit products (pmulhuw).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
}
1375
/* Shuffle elements of __A per the 8-bit immediate control __B
   (pshufhw: high four words; pshuflw: low four words; pshufd: the four
   dwords).  Arguments are parenthesized so expression arguments expand
   safely inside the macro.  */
#define _mm_shufflehi_epi16(__A, __B) ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__A), (__B)))
#define _mm_shufflelo_epi16(__A, __B) ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__A), (__B)))
#define _mm_shuffle_epi32(__A, __B) ((__m128i)__builtin_ia32_pshufd ((__v4si)(__A), (__B)))
1379
/* Conditionally store bytes of __A to __C: a byte is written only where
   the corresponding byte of mask __B has its high bit set (maskmovdqu,
   a non-temporal store).  */
static __inline void __attribute__((__always_inline__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
}
1385
/* Rounded unsigned 8-bit average, (a + b + 1) >> 1 (pavgb).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Rounded unsigned 16-bit average (pavgw).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Sums of absolute byte differences, one 16-bit sum per 64-bit half
   (psadbw).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
}
1403
/* Non-temporal (cache-bypassing) store of the int __B to *__A
   (movnti).  */
static __inline void __attribute__((__always_inline__))
_mm_stream_si32 (int *__A, int __B)
{
  __builtin_ia32_movnti (__A, __B);
}

/* Non-temporal store of the 128-bit integer __B to *__A (movntdq);
   __A must be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
}

/* Non-temporal store of two doubles to __A (movntpd); __A must be
   16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_stream_pd (double *__A, __m128d __B)
{
  __builtin_ia32_movntpd (__A, (__v2df)__B);
}
1421
/* Flush the cache line containing __A from every cache level
   (clflush).  */
static __inline void __attribute__((__always_inline__))
_mm_clflush (void const *__A)
{
  __builtin_ia32_clflush (__A);
}

/* Load fence: order all preceding loads before subsequent ones
   (lfence).  */
static __inline void __attribute__((__always_inline__))
_mm_lfence (void)
{
  __builtin_ia32_lfence ();
}

/* Full memory fence: order all preceding loads and stores before
   subsequent ones (mfence).  */
static __inline void __attribute__((__always_inline__))
_mm_mfence (void)
{
  __builtin_ia32_mfence ();
}
1439
/* Widen the int __A into the low 32 bits of an __m128i, zeroing the
   upper 96 bits.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}
1445
#ifdef __x86_64__
/* Intel intrinsic.  Widen __A into the low 64 bits of an __m128i,
   zeroing the upper 64 bits.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cvtsi64_si128 (long long __A)
{
  return _mm_set_epi64x (0, __A);
}

/* Microsoft intrinsic.  Identical to _mm_cvtsi64_si128 under the name
   used by MSVC.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cvtsi64x_si128 (long long __A)
{
  return _mm_set_epi64x (0, __A);
}
#endif
1461
/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type: the 128 bits are
   reinterpreted unchanged, so every cast compiles to nothing.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

static __inline __m128i __attribute__((__always_inline__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

static __inline __m128d __attribute__((__always_inline__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

static __inline __m128i __attribute__((__always_inline__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

static __inline __m128 __attribute__((__always_inline__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

static __inline __m128d __attribute__((__always_inline__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}
1499
1500#endif /* __SSE2__  */
1501
1502#endif /* _EMMINTRIN_H_INCLUDED */
1503