emmintrin.h revision 258749
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
23
24#ifndef __EMMINTRIN_H
25#define __EMMINTRIN_H
26
27#ifndef __SSE2__
28#error "SSE2 instruction set not enabled"
29#else
30
31#include <xmmintrin.h>
32
/* Public 16-byte vector types: two doubles, and two long longs. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Element-typed 16-byte vectors used for casts inside the intrinsics. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
41
42static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
43_mm_add_sd(__m128d __a, __m128d __b)
44{
45  __a[0] += __b[0];
46  return __a;
47}
48
49static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
50_mm_add_pd(__m128d __a, __m128d __b)
51{
52  return __a + __b;
53}
54
55static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
56_mm_sub_sd(__m128d __a, __m128d __b)
57{
58  __a[0] -= __b[0];
59  return __a;
60}
61
62static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
63_mm_sub_pd(__m128d __a, __m128d __b)
64{
65  return __a - __b;
66}
67
68static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
69_mm_mul_sd(__m128d __a, __m128d __b)
70{
71  __a[0] *= __b[0];
72  return __a;
73}
74
75static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
76_mm_mul_pd(__m128d __a, __m128d __b)
77{
78  return __a * __b;
79}
80
81static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
82_mm_div_sd(__m128d __a, __m128d __b)
83{
84  __a[0] /= __b[0];
85  return __a;
86}
87
88static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
89_mm_div_pd(__m128d __a, __m128d __b)
90{
91  return __a / __b;
92}
93
94static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
95_mm_sqrt_sd(__m128d __a, __m128d __b)
96{
97  __m128d __c = __builtin_ia32_sqrtsd(__b);
98  return (__m128d) { __c[0], __a[1] };
99}
100
101static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
102_mm_sqrt_pd(__m128d __a)
103{
104  return __builtin_ia32_sqrtpd(__a);
105}
106
107static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
108_mm_min_sd(__m128d __a, __m128d __b)
109{
110  return __builtin_ia32_minsd(__a, __b);
111}
112
113static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
114_mm_min_pd(__m128d __a, __m128d __b)
115{
116  return __builtin_ia32_minpd(__a, __b);
117}
118
119static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
120_mm_max_sd(__m128d __a, __m128d __b)
121{
122  return __builtin_ia32_maxsd(__a, __b);
123}
124
125static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
126_mm_max_pd(__m128d __a, __m128d __b)
127{
128  return __builtin_ia32_maxpd(__a, __b);
129}
130
131static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
132_mm_and_pd(__m128d __a, __m128d __b)
133{
134  return (__m128d)((__v4si)__a & (__v4si)__b);
135}
136
137static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
138_mm_andnot_pd(__m128d __a, __m128d __b)
139{
140  return (__m128d)(~(__v4si)__a & (__v4si)__b);
141}
142
143static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
144_mm_or_pd(__m128d __a, __m128d __b)
145{
146  return (__m128d)((__v4si)__a | (__v4si)__b);
147}
148
149static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
150_mm_xor_pd(__m128d __a, __m128d __b)
151{
152  return (__m128d)((__v4si)__a ^ (__v4si)__b);
153}
154
155static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
156_mm_cmpeq_pd(__m128d __a, __m128d __b)
157{
158  return (__m128d)__builtin_ia32_cmppd(__a, __b, 0);
159}
160
161static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
162_mm_cmplt_pd(__m128d __a, __m128d __b)
163{
164  return (__m128d)__builtin_ia32_cmppd(__a, __b, 1);
165}
166
167static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
168_mm_cmple_pd(__m128d __a, __m128d __b)
169{
170  return (__m128d)__builtin_ia32_cmppd(__a, __b, 2);
171}
172
173static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
174_mm_cmpgt_pd(__m128d __a, __m128d __b)
175{
176  return (__m128d)__builtin_ia32_cmppd(__b, __a, 1);
177}
178
179static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
180_mm_cmpge_pd(__m128d __a, __m128d __b)
181{
182  return (__m128d)__builtin_ia32_cmppd(__b, __a, 2);
183}
184
185static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
186_mm_cmpord_pd(__m128d __a, __m128d __b)
187{
188  return (__m128d)__builtin_ia32_cmppd(__a, __b, 7);
189}
190
191static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
192_mm_cmpunord_pd(__m128d __a, __m128d __b)
193{
194  return (__m128d)__builtin_ia32_cmppd(__a, __b, 3);
195}
196
197static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
198_mm_cmpneq_pd(__m128d __a, __m128d __b)
199{
200  return (__m128d)__builtin_ia32_cmppd(__a, __b, 4);
201}
202
203static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
204_mm_cmpnlt_pd(__m128d __a, __m128d __b)
205{
206  return (__m128d)__builtin_ia32_cmppd(__a, __b, 5);
207}
208
209static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
210_mm_cmpnle_pd(__m128d __a, __m128d __b)
211{
212  return (__m128d)__builtin_ia32_cmppd(__a, __b, 6);
213}
214
215static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
216_mm_cmpngt_pd(__m128d __a, __m128d __b)
217{
218  return (__m128d)__builtin_ia32_cmppd(__b, __a, 5);
219}
220
221static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
222_mm_cmpnge_pd(__m128d __a, __m128d __b)
223{
224  return (__m128d)__builtin_ia32_cmppd(__b, __a, 6);
225}
226
227static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
228_mm_cmpeq_sd(__m128d __a, __m128d __b)
229{
230  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 0);
231}
232
233static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
234_mm_cmplt_sd(__m128d __a, __m128d __b)
235{
236  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 1);
237}
238
239static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
240_mm_cmple_sd(__m128d __a, __m128d __b)
241{
242  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 2);
243}
244
245static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
246_mm_cmpgt_sd(__m128d __a, __m128d __b)
247{
248  return (__m128d)__builtin_ia32_cmpsd(__b, __a, 1);
249}
250
251static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
252_mm_cmpge_sd(__m128d __a, __m128d __b)
253{
254  return (__m128d)__builtin_ia32_cmpsd(__b, __a, 2);
255}
256
257static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
258_mm_cmpord_sd(__m128d __a, __m128d __b)
259{
260  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 7);
261}
262
263static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
264_mm_cmpunord_sd(__m128d __a, __m128d __b)
265{
266  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 3);
267}
268
269static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
270_mm_cmpneq_sd(__m128d __a, __m128d __b)
271{
272  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 4);
273}
274
275static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
276_mm_cmpnlt_sd(__m128d __a, __m128d __b)
277{
278  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 5);
279}
280
281static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
282_mm_cmpnle_sd(__m128d __a, __m128d __b)
283{
284  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 6);
285}
286
287static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
288_mm_cmpngt_sd(__m128d __a, __m128d __b)
289{
290  return (__m128d)__builtin_ia32_cmpsd(__b, __a, 5);
291}
292
293static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
294_mm_cmpnge_sd(__m128d __a, __m128d __b)
295{
296  return (__m128d)__builtin_ia32_cmpsd(__b, __a, 6);
297}
298
299static __inline__ int __attribute__((__always_inline__, __nodebug__))
300_mm_comieq_sd(__m128d __a, __m128d __b)
301{
302  return __builtin_ia32_comisdeq(__a, __b);
303}
304
305static __inline__ int __attribute__((__always_inline__, __nodebug__))
306_mm_comilt_sd(__m128d __a, __m128d __b)
307{
308  return __builtin_ia32_comisdlt(__a, __b);
309}
310
311static __inline__ int __attribute__((__always_inline__, __nodebug__))
312_mm_comile_sd(__m128d __a, __m128d __b)
313{
314  return __builtin_ia32_comisdle(__a, __b);
315}
316
317static __inline__ int __attribute__((__always_inline__, __nodebug__))
318_mm_comigt_sd(__m128d __a, __m128d __b)
319{
320  return __builtin_ia32_comisdgt(__a, __b);
321}
322
323static __inline__ int __attribute__((__always_inline__, __nodebug__))
324_mm_comige_sd(__m128d __a, __m128d __b)
325{
326  return __builtin_ia32_comisdge(__a, __b);
327}
328
329static __inline__ int __attribute__((__always_inline__, __nodebug__))
330_mm_comineq_sd(__m128d __a, __m128d __b)
331{
332  return __builtin_ia32_comisdneq(__a, __b);
333}
334
335static __inline__ int __attribute__((__always_inline__, __nodebug__))
336_mm_ucomieq_sd(__m128d __a, __m128d __b)
337{
338  return __builtin_ia32_ucomisdeq(__a, __b);
339}
340
341static __inline__ int __attribute__((__always_inline__, __nodebug__))
342_mm_ucomilt_sd(__m128d __a, __m128d __b)
343{
344  return __builtin_ia32_ucomisdlt(__a, __b);
345}
346
347static __inline__ int __attribute__((__always_inline__, __nodebug__))
348_mm_ucomile_sd(__m128d __a, __m128d __b)
349{
350  return __builtin_ia32_ucomisdle(__a, __b);
351}
352
353static __inline__ int __attribute__((__always_inline__, __nodebug__))
354_mm_ucomigt_sd(__m128d __a, __m128d __b)
355{
356  return __builtin_ia32_ucomisdgt(__a, __b);
357}
358
359static __inline__ int __attribute__((__always_inline__, __nodebug__))
360_mm_ucomige_sd(__m128d __a, __m128d __b)
361{
362  return __builtin_ia32_ucomisdge(__a, __b);
363}
364
365static __inline__ int __attribute__((__always_inline__, __nodebug__))
366_mm_ucomineq_sd(__m128d __a, __m128d __b)
367{
368  return __builtin_ia32_ucomisdneq(__a, __b);
369}
370
371static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
372_mm_cvtpd_ps(__m128d __a)
373{
374  return __builtin_ia32_cvtpd2ps(__a);
375}
376
377static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
378_mm_cvtps_pd(__m128 __a)
379{
380  return __builtin_ia32_cvtps2pd(__a);
381}
382
383static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
384_mm_cvtepi32_pd(__m128i __a)
385{
386  return __builtin_ia32_cvtdq2pd((__v4si)__a);
387}
388
389static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
390_mm_cvtpd_epi32(__m128d __a)
391{
392  return __builtin_ia32_cvtpd2dq(__a);
393}
394
395static __inline__ int __attribute__((__always_inline__, __nodebug__))
396_mm_cvtsd_si32(__m128d __a)
397{
398  return __builtin_ia32_cvtsd2si(__a);
399}
400
401static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
402_mm_cvtsd_ss(__m128 __a, __m128d __b)
403{
404  __a[0] = __b[0];
405  return __a;
406}
407
408static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
409_mm_cvtsi32_sd(__m128d __a, int __b)
410{
411  __a[0] = __b;
412  return __a;
413}
414
415static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
416_mm_cvtss_sd(__m128d __a, __m128 __b)
417{
418  __a[0] = __b[0];
419  return __a;
420}
421
422static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
423_mm_cvttpd_epi32(__m128d __a)
424{
425  return (__m128i)__builtin_ia32_cvttpd2dq(__a);
426}
427
428static __inline__ int __attribute__((__always_inline__, __nodebug__))
429_mm_cvttsd_si32(__m128d __a)
430{
431  return __a[0];
432}
433
434static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
435_mm_cvtpd_pi32(__m128d __a)
436{
437  return (__m64)__builtin_ia32_cvtpd2pi(__a);
438}
439
440static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
441_mm_cvttpd_pi32(__m128d __a)
442{
443  return (__m64)__builtin_ia32_cvttpd2pi(__a);
444}
445
446static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
447_mm_cvtpi32_pd(__m64 __a)
448{
449  return __builtin_ia32_cvtpi2pd((__v2si)__a);
450}
451
452static __inline__ double __attribute__((__always_inline__, __nodebug__))
453_mm_cvtsd_f64(__m128d __a)
454{
455  return __a[0];
456}
457
458static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
459_mm_load_pd(double const *__dp)
460{
461  return *(__m128d*)__dp;
462}
463
464static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
465_mm_load1_pd(double const *__dp)
466{
467  struct __mm_load1_pd_struct {
468    double __u;
469  } __attribute__((__packed__, __may_alias__));
470  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
471  return (__m128d){ __u, __u };
472}
473
/* Alternate spelling that expands to _mm_load1_pd. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
475
476static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
477_mm_loadr_pd(double const *__dp)
478{
479  __m128d __u = *(__m128d*)__dp;
480  return __builtin_shufflevector(__u, __u, 1, 0);
481}
482
483static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
484_mm_loadu_pd(double const *__dp)
485{
486  struct __loadu_pd {
487    __m128d __v;
488  } __attribute__((packed, may_alias));
489  return ((struct __loadu_pd*)__dp)->__v;
490}
491
492static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
493_mm_load_sd(double const *__dp)
494{
495  struct __mm_load_sd_struct {
496    double __u;
497  } __attribute__((__packed__, __may_alias__));
498  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
499  return (__m128d){ __u, 0 };
500}
501
502static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
503_mm_loadh_pd(__m128d __a, double const *__dp)
504{
505  struct __mm_loadh_pd_struct {
506    double __u;
507  } __attribute__((__packed__, __may_alias__));
508  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
509  return (__m128d){ __a[0], __u };
510}
511
512static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
513_mm_loadl_pd(__m128d __a, double const *__dp)
514{
515  struct __mm_loadl_pd_struct {
516    double __u;
517  } __attribute__((__packed__, __may_alias__));
518  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
519  return (__m128d){ __u, __a[1] };
520}
521
522static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
523_mm_set_sd(double __w)
524{
525  return (__m128d){ __w, 0 };
526}
527
528static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
529_mm_set1_pd(double __w)
530{
531  return (__m128d){ __w, __w };
532}
533
534static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
535_mm_set_pd(double __w, double __x)
536{
537  return (__m128d){ __x, __w };
538}
539
540static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
541_mm_setr_pd(double __w, double __x)
542{
543  return (__m128d){ __w, __x };
544}
545
546static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
547_mm_setzero_pd(void)
548{
549  return (__m128d){ 0, 0 };
550}
551
552static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
553_mm_move_sd(__m128d __a, __m128d __b)
554{
555  return (__m128d){ __b[0], __a[1] };
556}
557
558static __inline__ void __attribute__((__always_inline__, __nodebug__))
559_mm_store_sd(double *__dp, __m128d __a)
560{
561  struct __mm_store_sd_struct {
562    double __u;
563  } __attribute__((__packed__, __may_alias__));
564  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
565}
566
567static __inline__ void __attribute__((__always_inline__, __nodebug__))
568_mm_store1_pd(double *__dp, __m128d __a)
569{
570  struct __mm_store1_pd_struct {
571    double __u[2];
572  } __attribute__((__packed__, __may_alias__));
573  ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
574  ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
575}
576
577static __inline__ void __attribute__((__always_inline__, __nodebug__))
578_mm_store_pd(double *__dp, __m128d __a)
579{
580  *(__m128d *)__dp = __a;
581}
582
583static __inline__ void __attribute__((__always_inline__, __nodebug__))
584_mm_storeu_pd(double *__dp, __m128d __a)
585{
586  __builtin_ia32_storeupd(__dp, __a);
587}
588
589static __inline__ void __attribute__((__always_inline__, __nodebug__))
590_mm_storer_pd(double *__dp, __m128d __a)
591{
592  __a = __builtin_shufflevector(__a, __a, 1, 0);
593  *(__m128d *)__dp = __a;
594}
595
596static __inline__ void __attribute__((__always_inline__, __nodebug__))
597_mm_storeh_pd(double *__dp, __m128d __a)
598{
599  struct __mm_storeh_pd_struct {
600    double __u;
601  } __attribute__((__packed__, __may_alias__));
602  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
603}
604
605static __inline__ void __attribute__((__always_inline__, __nodebug__))
606_mm_storel_pd(double *__dp, __m128d __a)
607{
608  struct __mm_storeh_pd_struct {
609    double __u;
610  } __attribute__((__packed__, __may_alias__));
611  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
612}
613
614static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
615_mm_add_epi8(__m128i __a, __m128i __b)
616{
617  return (__m128i)((__v16qi)__a + (__v16qi)__b);
618}
619
620static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
621_mm_add_epi16(__m128i __a, __m128i __b)
622{
623  return (__m128i)((__v8hi)__a + (__v8hi)__b);
624}
625
626static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
627_mm_add_epi32(__m128i __a, __m128i __b)
628{
629  return (__m128i)((__v4si)__a + (__v4si)__b);
630}
631
632static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
633_mm_add_si64(__m64 __a, __m64 __b)
634{
635  return __a + __b;
636}
637
638static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
639_mm_add_epi64(__m128i __a, __m128i __b)
640{
641  return __a + __b;
642}
643
644static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
645_mm_adds_epi8(__m128i __a, __m128i __b)
646{
647  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
648}
649
650static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
651_mm_adds_epi16(__m128i __a, __m128i __b)
652{
653  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
654}
655
656static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
657_mm_adds_epu8(__m128i __a, __m128i __b)
658{
659  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
660}
661
662static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
663_mm_adds_epu16(__m128i __a, __m128i __b)
664{
665  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
666}
667
668static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
669_mm_avg_epu8(__m128i __a, __m128i __b)
670{
671  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
672}
673
674static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
675_mm_avg_epu16(__m128i __a, __m128i __b)
676{
677  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
678}
679
680static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
681_mm_madd_epi16(__m128i __a, __m128i __b)
682{
683  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
684}
685
686static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
687_mm_max_epi16(__m128i __a, __m128i __b)
688{
689  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
690}
691
692static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
693_mm_max_epu8(__m128i __a, __m128i __b)
694{
695  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
696}
697
698static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
699_mm_min_epi16(__m128i __a, __m128i __b)
700{
701  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
702}
703
704static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
705_mm_min_epu8(__m128i __a, __m128i __b)
706{
707  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
708}
709
710static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
711_mm_mulhi_epi16(__m128i __a, __m128i __b)
712{
713  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
714}
715
716static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
717_mm_mulhi_epu16(__m128i __a, __m128i __b)
718{
719  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
720}
721
722static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
723_mm_mullo_epi16(__m128i __a, __m128i __b)
724{
725  return (__m128i)((__v8hi)__a * (__v8hi)__b);
726}
727
728static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
729_mm_mul_su32(__m64 __a, __m64 __b)
730{
731  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
732}
733
734static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
735_mm_mul_epu32(__m128i __a, __m128i __b)
736{
737  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
738}
739
740static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
741_mm_sad_epu8(__m128i __a, __m128i __b)
742{
743  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
744}
745
746static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
747_mm_sub_epi8(__m128i __a, __m128i __b)
748{
749  return (__m128i)((__v16qi)__a - (__v16qi)__b);
750}
751
752static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
753_mm_sub_epi16(__m128i __a, __m128i __b)
754{
755  return (__m128i)((__v8hi)__a - (__v8hi)__b);
756}
757
758static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
759_mm_sub_epi32(__m128i __a, __m128i __b)
760{
761  return (__m128i)((__v4si)__a - (__v4si)__b);
762}
763
764static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
765_mm_sub_si64(__m64 __a, __m64 __b)
766{
767  return __a - __b;
768}
769
770static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
771_mm_sub_epi64(__m128i __a, __m128i __b)
772{
773  return __a - __b;
774}
775
776static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
777_mm_subs_epi8(__m128i __a, __m128i __b)
778{
779  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
780}
781
782static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
783_mm_subs_epi16(__m128i __a, __m128i __b)
784{
785  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
786}
787
788static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
789_mm_subs_epu8(__m128i __a, __m128i __b)
790{
791  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
792}
793
794static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
795_mm_subs_epu16(__m128i __a, __m128i __b)
796{
797  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
798}
799
800static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
801_mm_and_si128(__m128i __a, __m128i __b)
802{
803  return __a & __b;
804}
805
806static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
807_mm_andnot_si128(__m128i __a, __m128i __b)
808{
809  return ~__a & __b;
810}
811
812static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
813_mm_or_si128(__m128i __a, __m128i __b)
814{
815  return __a | __b;
816}
817
818static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
819_mm_xor_si128(__m128i __a, __m128i __b)
820{
821  return __a ^ __b;
822}
823
/* Evaluates `a` once into an __m128i temporary and passes it to
   __builtin_ia32_pslldqi128 with `count` multiplied by 8. The temporary has
   a macro-specific name: a temporary called __a would be initialised from
   itself whenever the caller's argument expression also mentions a
   variable named __a. */
#define _mm_slli_si128(a, count) __extension__ ({ \
  __m128i __mm_slli_si128_a = (a); \
  (__m128i)__builtin_ia32_pslldqi128(__mm_slli_si128_a, (count)*8); })
827
828static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
829_mm_slli_epi16(__m128i __a, int __count)
830{
831  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
832}
833
834static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
835_mm_sll_epi16(__m128i __a, __m128i __count)
836{
837  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
838}
839
840static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
841_mm_slli_epi32(__m128i __a, int __count)
842{
843  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
844}
845
846static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
847_mm_sll_epi32(__m128i __a, __m128i __count)
848{
849  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
850}
851
852static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
853_mm_slli_epi64(__m128i __a, int __count)
854{
855  return __builtin_ia32_psllqi128(__a, __count);
856}
857
858static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
859_mm_sll_epi64(__m128i __a, __m128i __count)
860{
861  return __builtin_ia32_psllq128(__a, __count);
862}
863
864static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
865_mm_srai_epi16(__m128i __a, int __count)
866{
867  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
868}
869
870static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
871_mm_sra_epi16(__m128i __a, __m128i __count)
872{
873  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
874}
875
876static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
877_mm_srai_epi32(__m128i __a, int __count)
878{
879  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
880}
881
882static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
883_mm_sra_epi32(__m128i __a, __m128i __count)
884{
885  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
886}
887
888
/* Evaluates `a` once into an __m128i temporary and passes it to
   __builtin_ia32_psrldqi128 with `count` multiplied by 8. The temporary has
   a macro-specific name: a temporary called __a would be initialised from
   itself whenever the caller's argument expression also mentions a
   variable named __a. */
#define _mm_srli_si128(a, count) __extension__ ({ \
  __m128i __mm_srli_si128_a = (a); \
  (__m128i)__builtin_ia32_psrldqi128(__mm_srli_si128_a, (count)*8); })
892
893static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
894_mm_srli_epi16(__m128i __a, int __count)
895{
896  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
897}
898
899static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
900_mm_srl_epi16(__m128i __a, __m128i __count)
901{
902  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
903}
904
905static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
906_mm_srli_epi32(__m128i __a, int __count)
907{
908  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
909}
910
911static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
912_mm_srl_epi32(__m128i __a, __m128i __count)
913{
914  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
915}
916
917static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
918_mm_srli_epi64(__m128i __a, int __count)
919{
920  return __builtin_ia32_psrlqi128(__a, __count);
921}
922
923static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
924_mm_srl_epi64(__m128i __a, __m128i __count)
925{
926  return __builtin_ia32_psrlq128(__a, __count);
927}
928
929static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
930_mm_cmpeq_epi8(__m128i __a, __m128i __b)
931{
932  return (__m128i)((__v16qi)__a == (__v16qi)__b);
933}
934
935static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
936_mm_cmpeq_epi16(__m128i __a, __m128i __b)
937{
938  return (__m128i)((__v8hi)__a == (__v8hi)__b);
939}
940
941static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
942_mm_cmpeq_epi32(__m128i __a, __m128i __b)
943{
944  return (__m128i)((__v4si)__a == (__v4si)__b);
945}
946
947static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
948_mm_cmpgt_epi8(__m128i __a, __m128i __b)
949{
950  /* This function always performs a signed comparison, but __v16qi is a char
951     which may be signed or unsigned. */
952  typedef signed char __v16qs __attribute__((__vector_size__(16)));
953  return (__m128i)((__v16qs)__a > (__v16qs)__b);
954}
955
956static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
957_mm_cmpgt_epi16(__m128i __a, __m128i __b)
958{
959  return (__m128i)((__v8hi)__a > (__v8hi)__b);
960}
961
962static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
963_mm_cmpgt_epi32(__m128i __a, __m128i __b)
964{
965  return (__m128i)((__v4si)__a > (__v4si)__b);
966}
967
968static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
969_mm_cmplt_epi8(__m128i __a, __m128i __b)
970{
971  return _mm_cmpgt_epi8(__b, __a);
972}
973
974static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
975_mm_cmplt_epi16(__m128i __a, __m128i __b)
976{
977  return _mm_cmpgt_epi16(__b, __a);
978}
979
980static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
981_mm_cmplt_epi32(__m128i __a, __m128i __b)
982{
983  return _mm_cmpgt_epi32(__b, __a);
984}
985
986#ifdef __x86_64__
987static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
988_mm_cvtsi64_sd(__m128d __a, long long __b)
989{
990  __a[0] = __b;
991  return __a;
992}
993
994static __inline__ long long __attribute__((__always_inline__, __nodebug__))
995_mm_cvtsd_si64(__m128d __a)
996{
997  return __builtin_ia32_cvtsd2si64(__a);
998}
999
1000static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1001_mm_cvttsd_si64(__m128d __a)
1002{
1003  return __a[0];
1004}
1005#endif
1006
1007static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1008_mm_cvtepi32_ps(__m128i __a)
1009{
1010  return __builtin_ia32_cvtdq2ps((__v4si)__a);
1011}
1012
1013static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1014_mm_cvtps_epi32(__m128 __a)
1015{
1016  return (__m128i)__builtin_ia32_cvtps2dq(__a);
1017}
1018
1019static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1020_mm_cvttps_epi32(__m128 __a)
1021{
1022  return (__m128i)__builtin_ia32_cvttps2dq(__a);
1023}
1024
1025static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1026_mm_cvtsi32_si128(int __a)
1027{
1028  return (__m128i)(__v4si){ __a, 0, 0, 0 };
1029}
1030
1031#ifdef __x86_64__
1032static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1033_mm_cvtsi64_si128(long long __a)
1034{
1035  return (__m128i){ __a, 0 };
1036}
1037#endif
1038
1039static __inline__ int __attribute__((__always_inline__, __nodebug__))
1040_mm_cvtsi128_si32(__m128i __a)
1041{
1042  __v4si __b = (__v4si)__a;
1043  return __b[0];
1044}
1045
1046#ifdef __x86_64__
1047static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1048_mm_cvtsi128_si64(__m128i __a)
1049{
1050  return __a[0];
1051}
1052#endif
1053
1054static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1055_mm_load_si128(__m128i const *__p)
1056{
1057  return *__p;
1058}
1059
1060static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1061_mm_loadu_si128(__m128i const *__p)
1062{
1063  struct __loadu_si128 {
1064    __m128i __v;
1065  } __attribute__((packed, may_alias));
1066  return ((struct __loadu_si128*)__p)->__v;
1067}
1068
1069static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1070_mm_loadl_epi64(__m128i const *__p)
1071{
1072  struct __mm_loadl_epi64_struct {
1073    long long __u;
1074  } __attribute__((__packed__, __may_alias__));
1075  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
1076}
1077
1078static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1079_mm_set_epi64x(long long q1, long long q0)
1080{
1081  return (__m128i){ q0, q1 };
1082}
1083
1084static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1085_mm_set_epi64(__m64 q1, __m64 q0)
1086{
1087  return (__m128i){ (long long)q0, (long long)q1 };
1088}
1089
1090static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1091_mm_set_epi32(int i3, int i2, int i1, int i0)
1092{
1093  return (__m128i)(__v4si){ i0, i1, i2, i3};
1094}
1095
1096static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1097_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1098{
1099  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1100}
1101
1102static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1103_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1104{
1105  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1106}
1107
1108static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1109_mm_set1_epi64x(long long __q)
1110{
1111  return (__m128i){ __q, __q };
1112}
1113
1114static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1115_mm_set1_epi64(__m64 __q)
1116{
1117  return (__m128i){ (long long)__q, (long long)__q };
1118}
1119
1120static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1121_mm_set1_epi32(int __i)
1122{
1123  return (__m128i)(__v4si){ __i, __i, __i, __i };
1124}
1125
1126static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1127_mm_set1_epi16(short __w)
1128{
1129  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
1130}
1131
1132static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1133_mm_set1_epi8(char __b)
1134{
1135  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
1136}
1137
1138static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1139_mm_setr_epi64(__m64 q0, __m64 q1)
1140{
1141  return (__m128i){ (long long)q0, (long long)q1 };
1142}
1143
1144static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1145_mm_setr_epi32(int i0, int i1, int i2, int i3)
1146{
1147  return (__m128i)(__v4si){ i0, i1, i2, i3};
1148}
1149
1150static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1151_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1152{
1153  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1154}
1155
1156static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1157_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1158{
1159  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1160}
1161
1162static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1163_mm_setzero_si128(void)
1164{
1165  return (__m128i){ 0LL, 0LL };
1166}
1167
1168static __inline__ void __attribute__((__always_inline__, __nodebug__))
1169_mm_store_si128(__m128i *__p, __m128i __b)
1170{
1171  *__p = __b;
1172}
1173
1174static __inline__ void __attribute__((__always_inline__, __nodebug__))
1175_mm_storeu_si128(__m128i *__p, __m128i __b)
1176{
1177  __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
1178}
1179
1180static __inline__ void __attribute__((__always_inline__, __nodebug__))
1181_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
1182{
1183  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
1184}
1185
1186static __inline__ void __attribute__((__always_inline__, __nodebug__))
1187_mm_storel_epi64(__m128i *__p, __m128i __a)
1188{
1189  struct __mm_storel_epi64_struct {
1190    long long __u;
1191  } __attribute__((__packed__, __may_alias__));
1192  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
1193}
1194
1195static __inline__ void __attribute__((__always_inline__, __nodebug__))
1196_mm_stream_pd(double *__p, __m128d __a)
1197{
1198  __builtin_ia32_movntpd(__p, __a);
1199}
1200
1201static __inline__ void __attribute__((__always_inline__, __nodebug__))
1202_mm_stream_si128(__m128i *__p, __m128i __a)
1203{
1204  __builtin_ia32_movntdq(__p, __a);
1205}
1206
/* Stores the int __a at __p by forwarding to the __builtin_ia32_movnti
 * builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *__p, int __a)
{
  int *__dst = __p;
  __builtin_ia32_movnti(__dst, __a);
}
1212
/* Passes the address __p to the __builtin_ia32_clflush builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *__p)
{
  void const *__addr = __p;
  __builtin_ia32_clflush(__addr);
}
1218
/* Emits the __builtin_ia32_lfence builtin; takes and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  (void)0;
  __builtin_ia32_lfence();
}
1224
/* Emits the __builtin_ia32_mfence builtin; takes and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  (void)0;
  __builtin_ia32_mfence();
}
1230
1231static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1232_mm_packs_epi16(__m128i __a, __m128i __b)
1233{
1234  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
1235}
1236
1237static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1238_mm_packs_epi32(__m128i __a, __m128i __b)
1239{
1240  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
1241}
1242
1243static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1244_mm_packus_epi16(__m128i __a, __m128i __b)
1245{
1246  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
1247}
1248
1249static __inline__ int __attribute__((__always_inline__, __nodebug__))
1250_mm_extract_epi16(__m128i __a, int __imm)
1251{
1252  __v8hi __b = (__v8hi)__a;
1253  return (unsigned short)__b[__imm];
1254}
1255
1256static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1257_mm_insert_epi16(__m128i __a, int __b, int __imm)
1258{
1259  __v8hi __c = (__v8hi)__a;
1260  __c[__imm & 7] = __b;
1261  return (__m128i)__c;
1262}
1263
1264static __inline__ int __attribute__((__always_inline__, __nodebug__))
1265_mm_movemask_epi8(__m128i __a)
1266{
1267  return __builtin_ia32_pmovmskb128((__v16qi)__a);
1268}
1269
/* Permutes the four 32-bit elements of `a`: result element k is the source
 * element selected by bits [2k+1:2k] of `imm`. The second shuffle operand is
 * a zero vector that the indices (all 0..3) never select.
 *
 * `a` is cast in place rather than copied into a local named __a: with a
 * local, an argument that is itself spelled `__a` would expand to
 * `__m128i __a = (__a);` and read the uninitialised temporary. */
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
                                   (__v4si) _mm_set1_epi32(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
1275
/* Permutes 16-bit elements 0..3 of `a` (result element k is the source
 * element selected by bits [2k+1:2k] of `imm`) and passes elements 4..7
 * through unchanged. The second shuffle operand is a zero vector that the
 * indices never select.
 *
 * `a` is cast in place rather than copied into a local named __a: with a
 * local, an argument that is itself spelled `__a` would expand to
 * `__m128i __a = (__a);` and read the uninitialised temporary. */
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi) _mm_set1_epi16(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                   4, 5, 6, 7); })
1282
/* Passes 16-bit elements 0..3 of `a` through unchanged and permutes elements
 * 4..7: result element 4+k is source element 4 + (bits [2k+1:2k] of `imm`).
 * The second shuffle operand is a zero vector that the indices never select.
 *
 * `a` is cast in place rather than copied into a local named __a: with a
 * local, an argument that is itself spelled `__a` would expand to
 * `__m128i __a = (__a);` and read the uninitialised temporary. */
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi) _mm_set1_epi16(0), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) & 0x03) >> 0), \
                                   4 + (((imm) & 0x0c) >> 2), \
                                   4 + (((imm) & 0x30) >> 4), \
                                   4 + (((imm) & 0xc0) >> 6)); })
1291
1292static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1293_mm_unpackhi_epi8(__m128i __a, __m128i __b)
1294{
1295  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1296}
1297
1298static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1299_mm_unpackhi_epi16(__m128i __a, __m128i __b)
1300{
1301  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1302}
1303
1304static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1305_mm_unpackhi_epi32(__m128i __a, __m128i __b)
1306{
1307  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
1308}
1309
1310static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1311_mm_unpackhi_epi64(__m128i __a, __m128i __b)
1312{
1313  return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
1314}
1315
1316static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1317_mm_unpacklo_epi8(__m128i __a, __m128i __b)
1318{
1319  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1320}
1321
1322static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1323_mm_unpacklo_epi16(__m128i __a, __m128i __b)
1324{
1325  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1326}
1327
1328static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1329_mm_unpacklo_epi32(__m128i __a, __m128i __b)
1330{
1331  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
1332}
1333
1334static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1335_mm_unpacklo_epi64(__m128i __a, __m128i __b)
1336{
1337  return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
1338}
1339
1340static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
1341_mm_movepi64_pi64(__m128i __a)
1342{
1343  return (__m64)__a[0];
1344}
1345
1346static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1347_mm_movpi64_epi64(__m64 __a)
1348{
1349  return (__m128i){ (long long)__a, 0 };
1350}
1351
1352static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1353_mm_move_epi64(__m128i __a)
1354{
1355  return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
1356}
1357
1358static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1359_mm_unpackhi_pd(__m128d __a, __m128d __b)
1360{
1361  return __builtin_shufflevector(__a, __b, 1, 2+1);
1362}
1363
1364static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1365_mm_unpacklo_pd(__m128d __a, __m128d __b)
1366{
1367  return __builtin_shufflevector(__a, __b, 0, 2+0);
1368}
1369
1370static __inline__ int __attribute__((__always_inline__, __nodebug__))
1371_mm_movemask_pd(__m128d __a)
1372{
1373  return __builtin_ia32_movmskpd(__a);
1374}
1375
/* Result element 0 is element (i & 1) of `a`; result element 1 is element
 * ((i >> 1) & 1) of `b` (the `+ 2` addresses the second shuffle operand).
 *
 * `a` and `b` are cast in place rather than copied into locals named __a and
 * __b: with locals, an argument that is itself spelled `__a` or `__b` would
 * be captured by the temporary of the same name (e.g. `__m128d __a = (__a);`
 * reads the uninitialised temporary). */
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  __builtin_shufflevector((__m128d)(a), (__m128d)(b), \
                          (i) & 1, (((i) & 2) >> 1) + 2); })
1380
1381static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1382_mm_castpd_ps(__m128d __a)
1383{
1384  return (__m128)__a;
1385}
1386
1387static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1388_mm_castpd_si128(__m128d __a)
1389{
1390  return (__m128i)__a;
1391}
1392
1393static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1394_mm_castps_pd(__m128 __a)
1395{
1396  return (__m128d)__a;
1397}
1398
1399static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1400_mm_castps_si128(__m128 __a)
1401{
1402  return (__m128i)__a;
1403}
1404
1405static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1406_mm_castsi128_ps(__m128i __a)
1407{
1408  return (__m128)__a;
1409}
1410
1411static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1412_mm_castsi128_pd(__m128i __a)
1413{
1414  return (__m128d)__a;
1415}
1416
/* Emits a single `pause` instruction through a volatile inline-asm
 * statement with no operands. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1422
/* Packs two selectors into one value: x goes to bit 1, y is OR-ed in as-is. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
1424
1425#endif /* __SSE2__ */
1426
1427#endif /* __EMMINTRIN_H */
1428