1/* Copyright (C) 2003, 2004, 2005, 2007 Free Software Foundation, Inc.
2
3   This file is part of GCC.
4
5   GCC is free software; you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 2, or (at your option)
8   any later version.
9
10   GCC is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with GCC; see the file COPYING.  If not, write to
17   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
18   Boston, MA 02110-1301, USA.  */
19
20/* As a special exception, if you include this header file into source
21   files compiled by GCC, this header file does not by itself cause
22   the resulting executable to be covered by the GNU General Public
23   License.  This exception does not however invalidate any other
24   reasons why the executable file might be covered by the GNU General
25   Public License.  */
26
27/* Implemented from the specification included in the Intel C++ Compiler
28   User Guide and Reference, version 8.0.  */
29
30#ifndef _EMMINTRIN_H_INCLUDED
31#define _EMMINTRIN_H_INCLUDED
32
33#ifdef __SSE2__
34#include <xmmintrin.h>
35
36/* SSE2 */
37typedef double __v2df __attribute__ ((__vector_size__ (16)));
38typedef long long __v2di __attribute__ ((__vector_size__ (16)));
39typedef int __v4si __attribute__ ((__vector_size__ (16)));
40typedef short __v8hi __attribute__ ((__vector_size__ (16)));
41typedef char __v16qi __attribute__ ((__vector_size__ (16)));
42
43/* The Intel API is flexible enough that we must allow aliasing with other
44   vector types, and their scalar components.  */
45typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
46typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
47
48/* Create a selector for use with the SHUFPD instruction.  */
49#define _MM_SHUFFLE2(fp1,fp0) \
50 (((fp1) << 1) | (fp0))
51
52/* Create a vector with element 0 as F and the rest zero.  */
53static __inline __m128d __attribute__((__always_inline__))
54_mm_set_sd (double __F)
55{
56  return __extension__ (__m128d){ __F, 0 };
57}
58
59/* Create a vector with both elements equal to F.  */
60static __inline __m128d __attribute__((__always_inline__))
61_mm_set1_pd (double __F)
62{
63  return __extension__ (__m128d){ __F, __F };
64}
65
66static __inline __m128d __attribute__((__always_inline__))
67_mm_set_pd1 (double __F)
68{
69  return _mm_set1_pd (__F);
70}
71
72/* Create a vector with the lower value X and upper value W.  */
73static __inline __m128d __attribute__((__always_inline__))
74_mm_set_pd (double __W, double __X)
75{
76  return __extension__ (__m128d){ __X, __W };
77}
78
79/* Create a vector with the lower value W and upper value X.  */
80static __inline __m128d __attribute__((__always_inline__))
81_mm_setr_pd (double __W, double __X)
82{
83  return __extension__ (__m128d){ __W, __X };
84}
85
86/* Create a vector of zeros.  */
87static __inline __m128d __attribute__((__always_inline__))
88_mm_setzero_pd (void)
89{
90  return __extension__ (__m128d){ 0.0, 0.0 };
91}
92
93/* Sets the low DPFP value of A from the low value of B.  */
94static __inline __m128d __attribute__((__always_inline__))
95_mm_move_sd (__m128d __A, __m128d __B)
96{
97  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
98}
99
100/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
101static __inline __m128d __attribute__((__always_inline__))
102_mm_load_pd (double const *__P)
103{
104  return *(__m128d *)__P;
105}
106
107/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
108static __inline __m128d __attribute__((__always_inline__))
109_mm_loadu_pd (double const *__P)
110{
111  return __builtin_ia32_loadupd (__P);
112}
113
/* Create a vector with both elements equal to *P.  */
115static __inline __m128d __attribute__((__always_inline__))
116_mm_load1_pd (double const *__P)
117{
118  return _mm_set1_pd (*__P);
119}
120
121/* Create a vector with element 0 as *P and the rest zero.  */
122static __inline __m128d __attribute__((__always_inline__))
123_mm_load_sd (double const *__P)
124{
125  return _mm_set_sd (*__P);
126}
127
128static __inline __m128d __attribute__((__always_inline__))
129_mm_load_pd1 (double const *__P)
130{
131  return _mm_load1_pd (__P);
132}
133
134/* Load two DPFP values in reverse order.  The address must be aligned.  */
135static __inline __m128d __attribute__((__always_inline__))
136_mm_loadr_pd (double const *__P)
137{
138  __m128d __tmp = _mm_load_pd (__P);
139  return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
140}
141
142/* Store two DPFP values.  The address must be 16-byte aligned.  */
143static __inline void __attribute__((__always_inline__))
144_mm_store_pd (double *__P, __m128d __A)
145{
146  *(__m128d *)__P = __A;
147}
148
149/* Store two DPFP values.  The address need not be 16-byte aligned.  */
150static __inline void __attribute__((__always_inline__))
151_mm_storeu_pd (double *__P, __m128d __A)
152{
153  __builtin_ia32_storeupd (__P, __A);
154}
155
156/* Stores the lower DPFP value.  */
157static __inline void __attribute__((__always_inline__))
158_mm_store_sd (double *__P, __m128d __A)
159{
160  *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
161}
162
163static __inline void __attribute__((__always_inline__))
164_mm_storel_pd (double *__P, __m128d __A)
165{
166  _mm_store_sd (__P, __A);
167}
168
169/* Stores the upper DPFP value.  */
170static __inline void __attribute__((__always_inline__))
171_mm_storeh_pd (double *__P, __m128d __A)
172{
173  *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
174}
175
176/* Store the lower DPFP value across two words.
177   The address must be 16-byte aligned.  */
178static __inline void __attribute__((__always_inline__))
179_mm_store1_pd (double *__P, __m128d __A)
180{
181  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
182}
183
184static __inline void __attribute__((__always_inline__))
185_mm_store_pd1 (double *__P, __m128d __A)
186{
187  _mm_store1_pd (__P, __A);
188}
189
190/* Store two DPFP values in reverse order.  The address must be aligned.  */
191static __inline void __attribute__((__always_inline__))
192_mm_storer_pd (double *__P, __m128d __A)
193{
194  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1)));
195}
196
/* Return the low 32-bit element of A as a scalar int.  */
static __inline int __attribute__((__always_inline__))
_mm_cvtsi128_si32 (__m128i __A)
{
  return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
}
202
203#ifdef __x86_64__
/* Return the low 64-bit element of A as a scalar (x86-64 only).  */
static __inline long long __attribute__((__always_inline__))
_mm_cvtsi128_si64x (__m128i __A)
{
  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
}
209#endif
210
211static __inline __m128d __attribute__((__always_inline__))
212_mm_add_pd (__m128d __A, __m128d __B)
213{
214  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
215}
216
217static __inline __m128d __attribute__((__always_inline__))
218_mm_add_sd (__m128d __A, __m128d __B)
219{
220  return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
221}
222
223static __inline __m128d __attribute__((__always_inline__))
224_mm_sub_pd (__m128d __A, __m128d __B)
225{
226  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
227}
228
229static __inline __m128d __attribute__((__always_inline__))
230_mm_sub_sd (__m128d __A, __m128d __B)
231{
232  return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
233}
234
235static __inline __m128d __attribute__((__always_inline__))
236_mm_mul_pd (__m128d __A, __m128d __B)
237{
238  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
239}
240
241static __inline __m128d __attribute__((__always_inline__))
242_mm_mul_sd (__m128d __A, __m128d __B)
243{
244  return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
245}
246
247static __inline __m128d __attribute__((__always_inline__))
248_mm_div_pd (__m128d __A, __m128d __B)
249{
250  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
251}
252
253static __inline __m128d __attribute__((__always_inline__))
254_mm_div_sd (__m128d __A, __m128d __B)
255{
256  return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
257}
258
259static __inline __m128d __attribute__((__always_inline__))
260_mm_sqrt_pd (__m128d __A)
261{
262  return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
263}
264
/* Return the pair {sqrt (B[0]), A[1]}: merge B's low element into A
   with MOVSD, then take the square root of the low element (the upper
   element passes through unchanged).  */
static __inline __m128d __attribute__((__always_inline__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
  __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
  return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
}
272
273static __inline __m128d __attribute__((__always_inline__))
274_mm_min_pd (__m128d __A, __m128d __B)
275{
276  return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
277}
278
279static __inline __m128d __attribute__((__always_inline__))
280_mm_min_sd (__m128d __A, __m128d __B)
281{
282  return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
283}
284
285static __inline __m128d __attribute__((__always_inline__))
286_mm_max_pd (__m128d __A, __m128d __B)
287{
288  return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
289}
290
291static __inline __m128d __attribute__((__always_inline__))
292_mm_max_sd (__m128d __A, __m128d __B)
293{
294  return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
295}
296
297static __inline __m128d __attribute__((__always_inline__))
298_mm_and_pd (__m128d __A, __m128d __B)
299{
300  return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
301}
302
303static __inline __m128d __attribute__((__always_inline__))
304_mm_andnot_pd (__m128d __A, __m128d __B)
305{
306  return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
307}
308
309static __inline __m128d __attribute__((__always_inline__))
310_mm_or_pd (__m128d __A, __m128d __B)
311{
312  return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
313}
314
315static __inline __m128d __attribute__((__always_inline__))
316_mm_xor_pd (__m128d __A, __m128d __B)
317{
318  return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
319}
320
321static __inline __m128d __attribute__((__always_inline__))
322_mm_cmpeq_pd (__m128d __A, __m128d __B)
323{
324  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
325}
326
327static __inline __m128d __attribute__((__always_inline__))
328_mm_cmplt_pd (__m128d __A, __m128d __B)
329{
330  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
331}
332
333static __inline __m128d __attribute__((__always_inline__))
334_mm_cmple_pd (__m128d __A, __m128d __B)
335{
336  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
337}
338
339static __inline __m128d __attribute__((__always_inline__))
340_mm_cmpgt_pd (__m128d __A, __m128d __B)
341{
342  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
343}
344
345static __inline __m128d __attribute__((__always_inline__))
346_mm_cmpge_pd (__m128d __A, __m128d __B)
347{
348  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
349}
350
351static __inline __m128d __attribute__((__always_inline__))
352_mm_cmpneq_pd (__m128d __A, __m128d __B)
353{
354  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
355}
356
357static __inline __m128d __attribute__((__always_inline__))
358_mm_cmpnlt_pd (__m128d __A, __m128d __B)
359{
360  return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
361}
362
363static __inline __m128d __attribute__((__always_inline__))
364_mm_cmpnle_pd (__m128d __A, __m128d __B)
365{
366  return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
367}
368
369static __inline __m128d __attribute__((__always_inline__))
370_mm_cmpngt_pd (__m128d __A, __m128d __B)
371{
372  return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
373}
374
375static __inline __m128d __attribute__((__always_inline__))
376_mm_cmpnge_pd (__m128d __A, __m128d __B)
377{
378  return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
379}
380
381static __inline __m128d __attribute__((__always_inline__))
382_mm_cmpord_pd (__m128d __A, __m128d __B)
383{
384  return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
385}
386
387static __inline __m128d __attribute__((__always_inline__))
388_mm_cmpunord_pd (__m128d __A, __m128d __B)
389{
390  return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
391}
392
393static __inline __m128d __attribute__((__always_inline__))
394_mm_cmpeq_sd (__m128d __A, __m128d __B)
395{
396  return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
397}
398
399static __inline __m128d __attribute__((__always_inline__))
400_mm_cmplt_sd (__m128d __A, __m128d __B)
401{
402  return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
403}
404
405static __inline __m128d __attribute__((__always_inline__))
406_mm_cmple_sd (__m128d __A, __m128d __B)
407{
408  return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
409}
410
/* Scalar A > B: computed as B < A with the operands swapped; the outer
   MOVSD restores A's upper element into the high half of the result.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmpltsd ((__v2df) __B,
								 (__v2df)
								 __A));
}
420
/* Scalar A >= B: computed as B <= A with the operands swapped; the outer
   MOVSD restores A's upper element into the high half of the result.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmplesd ((__v2df) __B,
								 (__v2df)
								 __A));
}
430
431static __inline __m128d __attribute__((__always_inline__))
432_mm_cmpneq_sd (__m128d __A, __m128d __B)
433{
434  return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
435}
436
437static __inline __m128d __attribute__((__always_inline__))
438_mm_cmpnlt_sd (__m128d __A, __m128d __B)
439{
440  return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
441}
442
443static __inline __m128d __attribute__((__always_inline__))
444_mm_cmpnle_sd (__m128d __A, __m128d __B)
445{
446  return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
447}
448
/* Scalar !(A > B): computed as !(B < A) with the operands swapped; the
   outer MOVSD restores A's upper element into the high half.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmpnltsd ((__v2df) __B,
								  (__v2df)
								  __A));
}
458
/* Scalar !(A >= B): computed as !(B <= A) with the operands swapped; the
   outer MOVSD restores A's upper element into the high half.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmpnlesd ((__v2df) __B,
								  (__v2df)
								  __A));
}
468
469static __inline __m128d __attribute__((__always_inline__))
470_mm_cmpord_sd (__m128d __A, __m128d __B)
471{
472  return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
473}
474
475static __inline __m128d __attribute__((__always_inline__))
476_mm_cmpunord_sd (__m128d __A, __m128d __B)
477{
478  return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
479}
480
481static __inline int __attribute__((__always_inline__))
482_mm_comieq_sd (__m128d __A, __m128d __B)
483{
484  return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
485}
486
487static __inline int __attribute__((__always_inline__))
488_mm_comilt_sd (__m128d __A, __m128d __B)
489{
490  return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
491}
492
493static __inline int __attribute__((__always_inline__))
494_mm_comile_sd (__m128d __A, __m128d __B)
495{
496  return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
497}
498
499static __inline int __attribute__((__always_inline__))
500_mm_comigt_sd (__m128d __A, __m128d __B)
501{
502  return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
503}
504
505static __inline int __attribute__((__always_inline__))
506_mm_comige_sd (__m128d __A, __m128d __B)
507{
508  return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
509}
510
511static __inline int __attribute__((__always_inline__))
512_mm_comineq_sd (__m128d __A, __m128d __B)
513{
514  return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
515}
516
517static __inline int __attribute__((__always_inline__))
518_mm_ucomieq_sd (__m128d __A, __m128d __B)
519{
520  return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
521}
522
523static __inline int __attribute__((__always_inline__))
524_mm_ucomilt_sd (__m128d __A, __m128d __B)
525{
526  return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
527}
528
529static __inline int __attribute__((__always_inline__))
530_mm_ucomile_sd (__m128d __A, __m128d __B)
531{
532  return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
533}
534
535static __inline int __attribute__((__always_inline__))
536_mm_ucomigt_sd (__m128d __A, __m128d __B)
537{
538  return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
539}
540
541static __inline int __attribute__((__always_inline__))
542_mm_ucomige_sd (__m128d __A, __m128d __B)
543{
544  return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
545}
546
547static __inline int __attribute__((__always_inline__))
548_mm_ucomineq_sd (__m128d __A, __m128d __B)
549{
550  return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
551}
552
553/* Create a vector of Qi, where i is the element number.  */
554
555static __inline __m128i __attribute__((__always_inline__))
556_mm_set_epi64x (long long __q1, long long __q0)
557{
558  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
559}
560
561static __inline __m128i __attribute__((__always_inline__))
562_mm_set_epi64 (__m64 __q1,  __m64 __q0)
563{
564  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
565}
566
567static __inline __m128i __attribute__((__always_inline__))
568_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
569{
570  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
571}
572
573static __inline __m128i __attribute__((__always_inline__))
574_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
575	       short __q3, short __q2, short __q1, short __q0)
576{
577  return __extension__ (__m128i)(__v8hi){
578    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
579}
580
581static __inline __m128i __attribute__((__always_inline__))
582_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
583	      char __q11, char __q10, char __q09, char __q08,
584	      char __q07, char __q06, char __q05, char __q04,
585	      char __q03, char __q02, char __q01, char __q00)
586{
587  return __extension__ (__m128i)(__v16qi){
588    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
589    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
590  };
591}
592
593/* Set all of the elements of the vector to A.  */
594
595static __inline __m128i __attribute__((__always_inline__))
596_mm_set1_epi64x (long long __A)
597{
598  return _mm_set_epi64x (__A, __A);
599}
600
601static __inline __m128i __attribute__((__always_inline__))
602_mm_set1_epi64 (__m64 __A)
603{
604  return _mm_set_epi64 (__A, __A);
605}
606
607static __inline __m128i __attribute__((__always_inline__))
608_mm_set1_epi32 (int __A)
609{
610  return _mm_set_epi32 (__A, __A, __A, __A);
611}
612
613static __inline __m128i __attribute__((__always_inline__))
614_mm_set1_epi16 (short __A)
615{
616  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
617}
618
619static __inline __m128i __attribute__((__always_inline__))
620_mm_set1_epi8 (char __A)
621{
622  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
623		       __A, __A, __A, __A, __A, __A, __A, __A);
624}
625
626/* Create a vector of Qi, where i is the element number.
627   The parameter order is reversed from the _mm_set_epi* functions.  */
628
629static __inline __m128i __attribute__((__always_inline__))
630_mm_setr_epi64 (__m64 __q0, __m64 __q1)
631{
632  return _mm_set_epi64 (__q1, __q0);
633}
634
635static __inline __m128i __attribute__((__always_inline__))
636_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
637{
638  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
639}
640
641static __inline __m128i __attribute__((__always_inline__))
642_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
643	        short __q4, short __q5, short __q6, short __q7)
644{
645  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
646}
647
648static __inline __m128i __attribute__((__always_inline__))
649_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
650	       char __q04, char __q05, char __q06, char __q07,
651	       char __q08, char __q09, char __q10, char __q11,
652	       char __q12, char __q13, char __q14, char __q15)
653{
654  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
655		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
656}
657
/* Load 128 bits of integer data from *P.  */
659
660static __inline __m128i __attribute__((__always_inline__))
661_mm_load_si128 (__m128i const *__P)
662{
663  return *__P;
664}
665
666static __inline __m128i __attribute__((__always_inline__))
667_mm_loadu_si128 (__m128i const *__P)
668{
669  return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
670}
671
/* Load a 64-bit integer from *P into the low half of the result and
   zero the upper half.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_loadl_epi64 (__m128i const *__P)
{
  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
}
677
678static __inline void __attribute__((__always_inline__))
679_mm_store_si128 (__m128i *__P, __m128i __B)
680{
681  *__P = __B;
682}
683
684static __inline void __attribute__((__always_inline__))
685_mm_storeu_si128 (__m128i *__P, __m128i __B)
686{
687  __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
688}
689
/* Store the low 64 bits of B to *P.  */
static __inline void __attribute__((__always_inline__))
_mm_storel_epi64 (__m128i *__P, __m128i __B)
{
  *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
}
695
/* Return the low 64 bits of B as an __m64.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_movepi64_pi64 (__m128i __B)
{
  return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
}
701
/* Zero-extend the 64-bit value A into a 128-bit vector.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_movpi64_epi64 (__m64 __A)
{
  return _mm_set_epi64 ((__m64)0LL, __A);
}
707
/* Return A with its upper 64 bits cleared.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_move_epi64 (__m128i __A)
{
  return _mm_set_epi64 ((__m64)0LL, _mm_movepi64_pi64 (__A));
}
713
714/* Create a vector of zeros.  */
715static __inline __m128i __attribute__((__always_inline__))
716_mm_setzero_si128 (void)
717{
718  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
719}
720
721static __inline __m128d __attribute__((__always_inline__))
722_mm_cvtepi32_pd (__m128i __A)
723{
724  return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
725}
726
727static __inline __m128 __attribute__((__always_inline__))
728_mm_cvtepi32_ps (__m128i __A)
729{
730  return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
731}
732
733static __inline __m128i __attribute__((__always_inline__))
734_mm_cvtpd_epi32 (__m128d __A)
735{
736  return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
737}
738
739static __inline __m64 __attribute__((__always_inline__))
740_mm_cvtpd_pi32 (__m128d __A)
741{
742  return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
743}
744
745static __inline __m128 __attribute__((__always_inline__))
746_mm_cvtpd_ps (__m128d __A)
747{
748  return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
749}
750
751static __inline __m128i __attribute__((__always_inline__))
752_mm_cvttpd_epi32 (__m128d __A)
753{
754  return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
755}
756
757static __inline __m64 __attribute__((__always_inline__))
758_mm_cvttpd_pi32 (__m128d __A)
759{
760  return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
761}
762
763static __inline __m128d __attribute__((__always_inline__))
764_mm_cvtpi32_pd (__m64 __A)
765{
766  return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
767}
768
769static __inline __m128i __attribute__((__always_inline__))
770_mm_cvtps_epi32 (__m128 __A)
771{
772  return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
773}
774
775static __inline __m128i __attribute__((__always_inline__))
776_mm_cvttps_epi32 (__m128 __A)
777{
778  return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
779}
780
781static __inline __m128d __attribute__((__always_inline__))
782_mm_cvtps_pd (__m128 __A)
783{
784  return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
785}
786
787static __inline int __attribute__((__always_inline__))
788_mm_cvtsd_si32 (__m128d __A)
789{
790  return __builtin_ia32_cvtsd2si ((__v2df) __A);
791}
792
793#ifdef __x86_64__
794static __inline long long __attribute__((__always_inline__))
795_mm_cvtsd_si64x (__m128d __A)
796{
797  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
798}
799#endif
800
801static __inline int __attribute__((__always_inline__))
802_mm_cvttsd_si32 (__m128d __A)
803{
804  return __builtin_ia32_cvttsd2si ((__v2df) __A);
805}
806
807#ifdef __x86_64__
808static __inline long long __attribute__((__always_inline__))
809_mm_cvttsd_si64x (__m128d __A)
810{
811  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
812}
813#endif
814
815static __inline __m128 __attribute__((__always_inline__))
816_mm_cvtsd_ss (__m128 __A, __m128d __B)
817{
818  return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
819}
820
821static __inline __m128d __attribute__((__always_inline__))
822_mm_cvtsi32_sd (__m128d __A, int __B)
823{
824  return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
825}
826
827#ifdef __x86_64__
828static __inline __m128d __attribute__((__always_inline__))
829_mm_cvtsi64x_sd (__m128d __A, long long __B)
830{
831  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
832}
833#endif
834
835static __inline __m128d __attribute__((__always_inline__))
836_mm_cvtss_sd (__m128d __A, __m128 __B)
837{
838  return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
839}
840
841#define _mm_shuffle_pd(__A, __B, __C) ((__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, (__C)))
842
843static __inline __m128d __attribute__((__always_inline__))
844_mm_unpackhi_pd (__m128d __A, __m128d __B)
845{
846  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
847}
848
849static __inline __m128d __attribute__((__always_inline__))
850_mm_unpacklo_pd (__m128d __A, __m128d __B)
851{
852  return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
853}
854
855static __inline __m128d __attribute__((__always_inline__))
856_mm_loadh_pd (__m128d __A, double const *__B)
857{
858  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
859}
860
861static __inline __m128d __attribute__((__always_inline__))
862_mm_loadl_pd (__m128d __A, double const *__B)
863{
864  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
865}
866
867static __inline int __attribute__((__always_inline__))
868_mm_movemask_pd (__m128d __A)
869{
870  return __builtin_ia32_movmskpd ((__v2df)__A);
871}
872
873static __inline __m128i __attribute__((__always_inline__))
874_mm_packs_epi16 (__m128i __A, __m128i __B)
875{
876  return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
877}
878
879static __inline __m128i __attribute__((__always_inline__))
880_mm_packs_epi32 (__m128i __A, __m128i __B)
881{
882  return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
883}
884
885static __inline __m128i __attribute__((__always_inline__))
886_mm_packus_epi16 (__m128i __A, __m128i __B)
887{
888  return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
889}
890
891static __inline __m128i __attribute__((__always_inline__))
892_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
893{
894  return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
895}
896
897static __inline __m128i __attribute__((__always_inline__))
898_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
899{
900  return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
901}
902
903static __inline __m128i __attribute__((__always_inline__))
904_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
905{
906  return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
907}
908
909static __inline __m128i __attribute__((__always_inline__))
910_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
911{
912  return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B);
913}
914
915static __inline __m128i __attribute__((__always_inline__))
916_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
917{
918  return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
919}
920
921static __inline __m128i __attribute__((__always_inline__))
922_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
923{
924  return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
925}
926
927static __inline __m128i __attribute__((__always_inline__))
928_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
929{
930  return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
931}
932
933static __inline __m128i __attribute__((__always_inline__))
934_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
935{
936  return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
937}
938
939static __inline __m128i __attribute__((__always_inline__))
940_mm_add_epi8 (__m128i __A, __m128i __B)
941{
942  return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
943}
944
945static __inline __m128i __attribute__((__always_inline__))
946_mm_add_epi16 (__m128i __A, __m128i __B)
947{
948  return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
949}
950
951static __inline __m128i __attribute__((__always_inline__))
952_mm_add_epi32 (__m128i __A, __m128i __B)
953{
954  return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
955}
956
/* Add packed 64-bit integers in __A and __B.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_add_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
}

/* Add packed 8-bit integers with signed saturation.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Add packed 16-bit integers with signed saturation.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Add packed 8-bit integers with unsigned saturation.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_adds_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Add packed 16-bit integers with unsigned saturation.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Subtract packed 8-bit integers in __B from __A.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Subtract packed 16-bit integers in __B from __A.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Subtract packed 32-bit integers in __B from __A.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
}

/* Subtract packed 64-bit integers in __B from __A.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
}

/* Subtract packed 8-bit integers with signed saturation.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Subtract packed 16-bit integers with signed saturation.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_subs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Subtract packed 8-bit integers with unsigned saturation.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_subs_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Subtract packed 16-bit integers with unsigned saturation.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_subs_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Multiply packed 16-bit integers pairwise and add the adjacent
   32-bit intermediate products (PMADDWD).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_madd_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
}

/* Multiply packed signed 16-bit integers, returning the high 16 bits
   of each 32-bit product.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Multiply packed 16-bit integers, returning the low 16 bits of each
   product.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Multiply the low unsigned 32-bit elements of the two 64-bit MMX
   operands, producing a full 64-bit product.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
}

/* Multiply the even-indexed unsigned 32-bit elements of __A and __B,
   producing two 64-bit products.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
}
1064
/* Shift packed 16/32/64-bit elements of __A left by __B bits,
   shifting in zeros.  */
#if 0
static __inline __m128i __attribute__((__always_inline__))
_mm_slli_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_slli_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_slli_epi64 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
}
#else
/* The casts must match the builtin signatures: pslldi128 takes
   __v4si and psllqi128 takes __v2di (the previous macros wrongly
   cast both operands to __v8hi).  Arguments are parenthesized to
   avoid precedence surprises on expansion.  */
#define _mm_slli_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_psllwi128 ((__v8hi)(__A), (__B)))
#define _mm_slli_epi32(__A, __B) \
  ((__m128i)__builtin_ia32_pslldi128 ((__v4si)(__A), (__B)))
#define _mm_slli_epi64(__A, __B) \
  ((__m128i)__builtin_ia32_psllqi128 ((__v2di)(__A), (__B)))
#endif
1091
/* Arithmetic (sign-propagating) right shift of packed 16/32-bit
   elements of __A by __B bits.  */
#if 0
static __inline __m128i __attribute__((__always_inline__))
_mm_srai_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srai_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
}
#else
/* psradi128 takes __v4si; the previous _mm_srai_epi32 macro wrongly
   cast its operand to __v8hi.  Arguments are parenthesized to avoid
   precedence surprises on expansion.  */
#define _mm_srai_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_psrawi128 ((__v8hi)(__A), (__B)))
#define _mm_srai_epi32(__A, __B) \
  ((__m128i)__builtin_ia32_psradi128 ((__v4si)(__A), (__B)))
#endif
1110
/* Shift the whole 128-bit value of __A right/left by __B *bytes*
   (the builtins take the count in bits, hence the * 8).  */
#if 0
static __inline __m128i __attribute__((__always_inline__))
_mm_srli_si128 (__m128i __A, int __B)
{
  return ((__m128i)__builtin_ia32_psrldqi128 (__A, __B * 8));
}

/* The second function here was misnamed _mm_srli_si128 (a duplicate
   definition); it wraps pslldqi128 and is the left shift.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_slli_si128 (__m128i __A, int __B)
{
  return ((__m128i)__builtin_ia32_pslldqi128 (__A, __B * 8));
}
#else
#define _mm_srli_si128(__A, __B) \
  ((__m128i)__builtin_ia32_psrldqi128 ((__A), (__B) * 8))
#define _mm_slli_si128(__A, __B) \
  ((__m128i)__builtin_ia32_pslldqi128 ((__A), (__B) * 8))
#endif
1129
/* Logical (zero-filling) right shift of packed 16/32/64-bit elements
   of __A by __B bits.  */
#if 0
static __inline __m128i __attribute__((__always_inline__))
_mm_srli_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srli_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srli_epi64 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
}
#else
/* psrlqi128 takes __v2di; the previous _mm_srli_epi64 macro wrongly
   cast its operand to __v4si.  Arguments are parenthesized to avoid
   precedence surprises on expansion.  */
#define _mm_srli_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_psrlwi128 ((__v8hi)(__A), (__B)))
#define _mm_srli_epi32(__A, __B) \
  ((__m128i)__builtin_ia32_psrldi128 ((__v4si)(__A), (__B)))
#define _mm_srli_epi64(__A, __B) \
  ((__m128i)__builtin_ia32_psrlqi128 ((__v2di)(__A), (__B)))
#endif
1156
/* Shift packed 16-bit elements of __A left by the count in __B.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
}

/* Shift packed 32-bit elements of __A left by the count in __B.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
}

/* Shift packed 64-bit elements of __A left by the count in __B.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
}

/* Arithmetic right shift of packed 16-bit elements of __A by the
   count in __B.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Arithmetic right shift of packed 32-bit elements of __A by the
   count in __B.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
}

/* Logical right shift of packed 16-bit elements of __A by the count
   in __B.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Logical right shift of packed 32-bit elements of __A by the count
   in __B.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
}

/* Logical right shift of packed 64-bit elements of __A by the count
   in __B.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
}

/* Bitwise AND of the 128-bit values __A and __B.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
}

/* Bitwise AND of __B with the complement of __A (~__A & __B).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
}

/* Bitwise OR of the 128-bit values __A and __B.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
}

/* Bitwise XOR of the 128-bit values __A and __B.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B);
}
1228
/* Compare packed 8-bit elements for equality; each result element is
   all-ones where equal, zero otherwise.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Compare packed 16-bit elements for equality.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Compare packed 32-bit elements for equality.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B);
}

/* Signed less-than compares: there is no pcmplt instruction, so these
   swap the operands of pcmpgt (__A < __B  <=>  __B > __A).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A);
}

/* Signed greater-than compares on packed 8/16/32-bit elements.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B);
}
1282
/* Extract (zero-extended) the 16-bit element of __A selected by __N,
   and insert __D into the slot selected by __N.  __N must be a
   compile-time constant, hence the macro forms.  */
#if 0
static __inline int __attribute__((__always_inline__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N);
}
#else
#define _mm_extract_epi16(A, N) \
  ((int) __builtin_ia32_vec_ext_v8hi ((__v8hi)(A), (N)))
#define _mm_insert_epi16(A, D, N) \
  ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(A), (D), (N)))
#endif
1301
/* Per-element maximum of packed signed 16-bit integers.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Per-element maximum of packed unsigned 8-bit integers.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
}

/* Per-element minimum of packed signed 16-bit integers.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Per-element minimum of packed unsigned 8-bit integers.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
}

/* Gather the sign bits of the 16 bytes of __A into a 16-bit mask.  */
static __inline int __attribute__((__always_inline__))
_mm_movemask_epi8 (__m128i __A)
{
  return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
}

/* Multiply packed unsigned 16-bit integers, returning the high 16
   bits of each 32-bit product.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
}
1337
/* Shuffle the high/low four 16-bit elements, or all four 32-bit
   elements, of __A according to the immediate selector __B.  Macro
   arguments are parenthesized so that expression arguments expand
   safely.  */
#define _mm_shufflehi_epi16(__A, __B) ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__A), (__B)))
#define _mm_shufflelo_epi16(__A, __B) ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__A), (__B)))
#define _mm_shuffle_epi32(__A, __B) ((__m128i)__builtin_ia32_pshufd ((__v4si)(__A), (__B)))
1341
/* Conditionally store bytes of __A to __C: only bytes whose
   corresponding mask byte in __B has its high bit set are written.  */
static __inline void __attribute__((__always_inline__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
}

/* Rounded average of packed unsigned 8-bit integers.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
}

/* Rounded average of packed unsigned 16-bit integers.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Sum of absolute differences of the unsigned bytes of __A and __B
   (PSADBW).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
}

/* Non-temporal (cache-bypassing) store of a 32-bit integer.  */
static __inline void __attribute__((__always_inline__))
_mm_stream_si32 (int *__A, int __B)
{
  __builtin_ia32_movnti (__A, __B);
}

/* Non-temporal store of a 128-bit integer value.  */
static __inline void __attribute__((__always_inline__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
}

/* Non-temporal store of two packed doubles.  */
static __inline void __attribute__((__always_inline__))
_mm_stream_pd (double *__A, __m128d __B)
{
  __builtin_ia32_movntpd (__A, (__v2df)__B);
}

/* Flush the cache line containing __A from all cache levels.  */
static __inline void __attribute__((__always_inline__))
_mm_clflush (void const *__A)
{
  __builtin_ia32_clflush (__A);
}

/* Load fence: serialize all preceding load instructions.  */
static __inline void __attribute__((__always_inline__))
_mm_lfence (void)
{
  __builtin_ia32_lfence ();
}

/* Memory fence: serialize all preceding loads and stores.  */
static __inline void __attribute__((__always_inline__))
_mm_mfence (void)
{
  __builtin_ia32_mfence ();
}

/* Place __A in the low 32-bit element, zeroing the upper elements.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}
1407
#ifdef __x86_64__
/* Place __A in the low 64-bit element, zeroing the upper element
   (x86-64 only).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cvtsi64x_si128 (long long __A)
{
  return _mm_set_epi64x (0, __A);
}
#endif
1415
/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */

/* Reinterpret two packed doubles as four packed floats.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

/* Reinterpret two packed doubles as a 128-bit integer vector.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

/* Reinterpret four packed floats as two packed doubles.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

/* Reinterpret four packed floats as a 128-bit integer vector.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

/* Reinterpret a 128-bit integer vector as four packed floats.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

/* Reinterpret a 128-bit integer vector as two packed doubles.  */
static __inline __m128d __attribute__((__always_inline__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}
1453
1454#endif /* __SSE2__  */
1455
1456#endif /* _EMMINTRIN_H_INCLUDED */
1457