emmintrin.h revision 226890
1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __EMMINTRIN_H
25#define __EMMINTRIN_H
26
27#ifndef __SSE2__
28#error "SSE2 instruction set not enabled"
29#else
30
31#include <xmmintrin.h>
32
/* Public 128-bit vector types: two doubles / two 64-bit integers. */
typedef double    __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Lane-typed 16-byte views used for casts inside this header. */
typedef double    __v2df  __attribute__((__vector_size__(16)));
typedef long long __v2di  __attribute__((__vector_size__(16)));
typedef short     __v8hi  __attribute__((__vector_size__(16)));
typedef char      __v16qi __attribute__((__vector_size__(16)));
41
42static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
43_mm_add_sd(__m128d a, __m128d b)
44{
45  a[0] += b[0];
46  return a;
47}
48
49static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
50_mm_add_pd(__m128d a, __m128d b)
51{
52  return a + b;
53}
54
55static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
56_mm_sub_sd(__m128d a, __m128d b)
57{
58  a[0] -= b[0];
59  return a;
60}
61
62static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
63_mm_sub_pd(__m128d a, __m128d b)
64{
65  return a - b;
66}
67
68static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
69_mm_mul_sd(__m128d a, __m128d b)
70{
71  a[0] *= b[0];
72  return a;
73}
74
75static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
76_mm_mul_pd(__m128d a, __m128d b)
77{
78  return a * b;
79}
80
81static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
82_mm_div_sd(__m128d a, __m128d b)
83{
84  a[0] /= b[0];
85  return a;
86}
87
88static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
89_mm_div_pd(__m128d a, __m128d b)
90{
91  return a / b;
92}
93
94static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
95_mm_sqrt_sd(__m128d a, __m128d b)
96{
97  __m128d c = __builtin_ia32_sqrtsd(b);
98  return (__m128d) { c[0], a[1] };
99}
100
101static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
102_mm_sqrt_pd(__m128d a)
103{
104  return __builtin_ia32_sqrtpd(a);
105}
106
107static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
108_mm_min_sd(__m128d a, __m128d b)
109{
110  return __builtin_ia32_minsd(a, b);
111}
112
113static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
114_mm_min_pd(__m128d a, __m128d b)
115{
116  return __builtin_ia32_minpd(a, b);
117}
118
119static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
120_mm_max_sd(__m128d a, __m128d b)
121{
122  return __builtin_ia32_maxsd(a, b);
123}
124
125static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
126_mm_max_pd(__m128d a, __m128d b)
127{
128  return __builtin_ia32_maxpd(a, b);
129}
130
131static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
132_mm_and_pd(__m128d a, __m128d b)
133{
134  return (__m128d)((__v4si)a & (__v4si)b);
135}
136
137static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
138_mm_andnot_pd(__m128d a, __m128d b)
139{
140  return (__m128d)(~(__v4si)a & (__v4si)b);
141}
142
143static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
144_mm_or_pd(__m128d a, __m128d b)
145{
146  return (__m128d)((__v4si)a | (__v4si)b);
147}
148
149static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
150_mm_xor_pd(__m128d a, __m128d b)
151{
152  return (__m128d)((__v4si)a ^ (__v4si)b);
153}
154
155static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
156_mm_cmpeq_pd(__m128d a, __m128d b)
157{
158  return (__m128d)__builtin_ia32_cmppd(a, b, 0);
159}
160
161static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
162_mm_cmplt_pd(__m128d a, __m128d b)
163{
164  return (__m128d)__builtin_ia32_cmppd(a, b, 1);
165}
166
167static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
168_mm_cmple_pd(__m128d a, __m128d b)
169{
170  return (__m128d)__builtin_ia32_cmppd(a, b, 2);
171}
172
173static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
174_mm_cmpgt_pd(__m128d a, __m128d b)
175{
176  return (__m128d)__builtin_ia32_cmppd(b, a, 1);
177}
178
179static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
180_mm_cmpge_pd(__m128d a, __m128d b)
181{
182  return (__m128d)__builtin_ia32_cmppd(b, a, 2);
183}
184
185static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
186_mm_cmpord_pd(__m128d a, __m128d b)
187{
188  return (__m128d)__builtin_ia32_cmppd(a, b, 7);
189}
190
191static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
192_mm_cmpunord_pd(__m128d a, __m128d b)
193{
194  return (__m128d)__builtin_ia32_cmppd(a, b, 3);
195}
196
197static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
198_mm_cmpneq_pd(__m128d a, __m128d b)
199{
200  return (__m128d)__builtin_ia32_cmppd(a, b, 4);
201}
202
203static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
204_mm_cmpnlt_pd(__m128d a, __m128d b)
205{
206  return (__m128d)__builtin_ia32_cmppd(a, b, 5);
207}
208
209static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
210_mm_cmpnle_pd(__m128d a, __m128d b)
211{
212  return (__m128d)__builtin_ia32_cmppd(a, b, 6);
213}
214
215static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
216_mm_cmpngt_pd(__m128d a, __m128d b)
217{
218  return (__m128d)__builtin_ia32_cmppd(b, a, 5);
219}
220
221static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
222_mm_cmpnge_pd(__m128d a, __m128d b)
223{
224  return (__m128d)__builtin_ia32_cmppd(b, a, 6);
225}
226
227static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
228_mm_cmpeq_sd(__m128d a, __m128d b)
229{
230  return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
231}
232
233static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
234_mm_cmplt_sd(__m128d a, __m128d b)
235{
236  return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
237}
238
239static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
240_mm_cmple_sd(__m128d a, __m128d b)
241{
242  return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
243}
244
245static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
246_mm_cmpgt_sd(__m128d a, __m128d b)
247{
248  return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
249}
250
251static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
252_mm_cmpge_sd(__m128d a, __m128d b)
253{
254  return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
255}
256
257static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
258_mm_cmpord_sd(__m128d a, __m128d b)
259{
260  return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
261}
262
263static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
264_mm_cmpunord_sd(__m128d a, __m128d b)
265{
266  return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
267}
268
269static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
270_mm_cmpneq_sd(__m128d a, __m128d b)
271{
272  return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
273}
274
275static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
276_mm_cmpnlt_sd(__m128d a, __m128d b)
277{
278  return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
279}
280
281static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
282_mm_cmpnle_sd(__m128d a, __m128d b)
283{
284  return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
285}
286
287static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
288_mm_cmpngt_sd(__m128d a, __m128d b)
289{
290  return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
291}
292
293static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
294_mm_cmpnge_sd(__m128d a, __m128d b)
295{
296  return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
297}
298
299static __inline__ int __attribute__((__always_inline__, __nodebug__))
300_mm_comieq_sd(__m128d a, __m128d b)
301{
302  return __builtin_ia32_comisdeq(a, b);
303}
304
305static __inline__ int __attribute__((__always_inline__, __nodebug__))
306_mm_comilt_sd(__m128d a, __m128d b)
307{
308  return __builtin_ia32_comisdlt(a, b);
309}
310
311static __inline__ int __attribute__((__always_inline__, __nodebug__))
312_mm_comile_sd(__m128d a, __m128d b)
313{
314  return __builtin_ia32_comisdle(a, b);
315}
316
317static __inline__ int __attribute__((__always_inline__, __nodebug__))
318_mm_comigt_sd(__m128d a, __m128d b)
319{
320  return __builtin_ia32_comisdgt(a, b);
321}
322
323static __inline__ int __attribute__((__always_inline__, __nodebug__))
324_mm_comige_sd(__m128d a, __m128d b)
325{
326  return __builtin_ia32_comisdge(a, b);
327}
328
329static __inline__ int __attribute__((__always_inline__, __nodebug__))
330_mm_comineq_sd(__m128d a, __m128d b)
331{
332  return __builtin_ia32_comisdneq(a, b);
333}
334
335static __inline__ int __attribute__((__always_inline__, __nodebug__))
336_mm_ucomieq_sd(__m128d a, __m128d b)
337{
338  return __builtin_ia32_ucomisdeq(a, b);
339}
340
341static __inline__ int __attribute__((__always_inline__, __nodebug__))
342_mm_ucomilt_sd(__m128d a, __m128d b)
343{
344  return __builtin_ia32_ucomisdlt(a, b);
345}
346
347static __inline__ int __attribute__((__always_inline__, __nodebug__))
348_mm_ucomile_sd(__m128d a, __m128d b)
349{
350  return __builtin_ia32_ucomisdle(a, b);
351}
352
353static __inline__ int __attribute__((__always_inline__, __nodebug__))
354_mm_ucomigt_sd(__m128d a, __m128d b)
355{
356  return __builtin_ia32_ucomisdgt(a, b);
357}
358
359static __inline__ int __attribute__((__always_inline__, __nodebug__))
360_mm_ucomige_sd(__m128d a, __m128d b)
361{
362  return __builtin_ia32_ucomisdge(a, b);
363}
364
365static __inline__ int __attribute__((__always_inline__, __nodebug__))
366_mm_ucomineq_sd(__m128d a, __m128d b)
367{
368  return __builtin_ia32_ucomisdneq(a, b);
369}
370
371static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
372_mm_cvtpd_ps(__m128d a)
373{
374  return __builtin_ia32_cvtpd2ps(a);
375}
376
377static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
378_mm_cvtps_pd(__m128 a)
379{
380  return __builtin_ia32_cvtps2pd(a);
381}
382
383static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
384_mm_cvtepi32_pd(__m128i a)
385{
386  return __builtin_ia32_cvtdq2pd((__v4si)a);
387}
388
389static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
390_mm_cvtpd_epi32(__m128d a)
391{
392  return __builtin_ia32_cvtpd2dq(a);
393}
394
395static __inline__ int __attribute__((__always_inline__, __nodebug__))
396_mm_cvtsd_si32(__m128d a)
397{
398  return __builtin_ia32_cvtsd2si(a);
399}
400
401static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
402_mm_cvtsd_ss(__m128 a, __m128d b)
403{
404  a[0] = b[0];
405  return a;
406}
407
408static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
409_mm_cvtsi32_sd(__m128d a, int b)
410{
411  a[0] = b;
412  return a;
413}
414
415static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
416_mm_cvtss_sd(__m128d a, __m128 b)
417{
418  a[0] = b[0];
419  return a;
420}
421
422static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
423_mm_cvttpd_epi32(__m128d a)
424{
425  return (__m128i)__builtin_ia32_cvttpd2dq(a);
426}
427
428static __inline__ int __attribute__((__always_inline__, __nodebug__))
429_mm_cvttsd_si32(__m128d a)
430{
431  return a[0];
432}
433
434static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
435_mm_cvtpd_pi32(__m128d a)
436{
437  return (__m64)__builtin_ia32_cvtpd2pi(a);
438}
439
440static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
441_mm_cvttpd_pi32(__m128d a)
442{
443  return (__m64)__builtin_ia32_cvttpd2pi(a);
444}
445
446static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
447_mm_cvtpi32_pd(__m64 a)
448{
449  return __builtin_ia32_cvtpi2pd((__v2si)a);
450}
451
452static __inline__ double __attribute__((__always_inline__, __nodebug__))
453_mm_cvtsd_f64(__m128d a)
454{
455  return a[0];
456}
457
458static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
459_mm_load_pd(double const *dp)
460{
461  return *(__m128d*)dp;
462}
463
464static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
465_mm_load1_pd(double const *dp)
466{
467  struct __mm_load1_pd_struct {
468    double u;
469  } __attribute__((__packed__, __may_alias__));
470  double u = ((struct __mm_load1_pd_struct*)dp)->u;
471  return (__m128d){ u, u };
472}
473
/* Alternate spelling of _mm_load1_pd. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
475
476static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
477_mm_loadr_pd(double const *dp)
478{
479  __m128d u = *(__m128d*)dp;
480  return __builtin_shufflevector(u, u, 1, 0);
481}
482
483static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
484_mm_loadu_pd(double const *dp)
485{
486  struct __loadu_pd {
487    __m128d v;
488  } __attribute__((packed, may_alias));
489  return ((struct __loadu_pd*)dp)->v;
490}
491
492static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
493_mm_load_sd(double const *dp)
494{
495  struct __mm_load_sd_struct {
496    double u;
497  } __attribute__((__packed__, __may_alias__));
498  double u = ((struct __mm_load_sd_struct*)dp)->u;
499  return (__m128d){ u, 0 };
500}
501
502static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
503_mm_loadh_pd(__m128d a, double const *dp)
504{
505  struct __mm_loadh_pd_struct {
506    double u;
507  } __attribute__((__packed__, __may_alias__));
508  double u = ((struct __mm_loadh_pd_struct*)dp)->u;
509  return (__m128d){ a[0], u };
510}
511
512static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
513_mm_loadl_pd(__m128d a, double const *dp)
514{
515  struct __mm_loadl_pd_struct {
516    double u;
517  } __attribute__((__packed__, __may_alias__));
518  double u = ((struct __mm_loadl_pd_struct*)dp)->u;
519  return (__m128d){ u, a[1] };
520}
521
522static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
523_mm_set_sd(double w)
524{
525  return (__m128d){ w, 0 };
526}
527
528static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
529_mm_set1_pd(double w)
530{
531  return (__m128d){ w, w };
532}
533
534static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
535_mm_set_pd(double w, double x)
536{
537  return (__m128d){ x, w };
538}
539
540static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
541_mm_setr_pd(double w, double x)
542{
543  return (__m128d){ w, x };
544}
545
546static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
547_mm_setzero_pd(void)
548{
549  return (__m128d){ 0, 0 };
550}
551
552static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
553_mm_move_sd(__m128d a, __m128d b)
554{
555  return (__m128d){ b[0], a[1] };
556}
557
558static __inline__ void __attribute__((__always_inline__, __nodebug__))
559_mm_store_sd(double *dp, __m128d a)
560{
561  struct __mm_store_sd_struct {
562    double u;
563  } __attribute__((__packed__, __may_alias__));
564  ((struct __mm_store_sd_struct*)dp)->u = a[0];
565}
566
567static __inline__ void __attribute__((__always_inline__, __nodebug__))
568_mm_store1_pd(double *dp, __m128d a)
569{
570  struct __mm_store1_pd_struct {
571    double u[2];
572  } __attribute__((__packed__, __may_alias__));
573  ((struct __mm_store1_pd_struct*)dp)->u[0] = a[0];
574  ((struct __mm_store1_pd_struct*)dp)->u[1] = a[0];
575}
576
577static __inline__ void __attribute__((__always_inline__, __nodebug__))
578_mm_store_pd(double *dp, __m128d a)
579{
580  *(__m128d *)dp = a;
581}
582
583static __inline__ void __attribute__((__always_inline__, __nodebug__))
584_mm_storeu_pd(double *dp, __m128d a)
585{
586  __builtin_ia32_storeupd(dp, a);
587}
588
589static __inline__ void __attribute__((__always_inline__, __nodebug__))
590_mm_storer_pd(double *dp, __m128d a)
591{
592  a = __builtin_shufflevector(a, a, 1, 0);
593  *(__m128d *)dp = a;
594}
595
596static __inline__ void __attribute__((__always_inline__, __nodebug__))
597_mm_storeh_pd(double *dp, __m128d a)
598{
599  struct __mm_storeh_pd_struct {
600    double u;
601  } __attribute__((__packed__, __may_alias__));
602  ((struct __mm_storeh_pd_struct*)dp)->u = a[1];
603}
604
605static __inline__ void __attribute__((__always_inline__, __nodebug__))
606_mm_storel_pd(double *dp, __m128d a)
607{
608  struct __mm_storeh_pd_struct {
609    double u;
610  } __attribute__((__packed__, __may_alias__));
611  ((struct __mm_storeh_pd_struct*)dp)->u = a[0];
612}
613
614static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
615_mm_add_epi8(__m128i a, __m128i b)
616{
617  return (__m128i)((__v16qi)a + (__v16qi)b);
618}
619
620static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
621_mm_add_epi16(__m128i a, __m128i b)
622{
623  return (__m128i)((__v8hi)a + (__v8hi)b);
624}
625
626static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
627_mm_add_epi32(__m128i a, __m128i b)
628{
629  return (__m128i)((__v4si)a + (__v4si)b);
630}
631
632static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
633_mm_add_si64(__m64 a, __m64 b)
634{
635  return a + b;
636}
637
638static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
639_mm_add_epi64(__m128i a, __m128i b)
640{
641  return a + b;
642}
643
644static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
645_mm_adds_epi8(__m128i a, __m128i b)
646{
647  return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
648}
649
650static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
651_mm_adds_epi16(__m128i a, __m128i b)
652{
653  return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
654}
655
656static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
657_mm_adds_epu8(__m128i a, __m128i b)
658{
659  return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
660}
661
662static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
663_mm_adds_epu16(__m128i a, __m128i b)
664{
665  return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
666}
667
668static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
669_mm_avg_epu8(__m128i a, __m128i b)
670{
671  return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
672}
673
674static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
675_mm_avg_epu16(__m128i a, __m128i b)
676{
677  return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
678}
679
680static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
681_mm_madd_epi16(__m128i a, __m128i b)
682{
683  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
684}
685
686static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
687_mm_max_epi16(__m128i a, __m128i b)
688{
689  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
690}
691
692static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
693_mm_max_epu8(__m128i a, __m128i b)
694{
695  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
696}
697
698static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
699_mm_min_epi16(__m128i a, __m128i b)
700{
701  return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
702}
703
704static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
705_mm_min_epu8(__m128i a, __m128i b)
706{
707  return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
708}
709
710static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
711_mm_mulhi_epi16(__m128i a, __m128i b)
712{
713  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
714}
715
716static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
717_mm_mulhi_epu16(__m128i a, __m128i b)
718{
719  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
720}
721
722static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
723_mm_mullo_epi16(__m128i a, __m128i b)
724{
725  return (__m128i)((__v8hi)a * (__v8hi)b);
726}
727
728static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
729_mm_mul_su32(__m64 a, __m64 b)
730{
731  return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
732}
733
734static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
735_mm_mul_epu32(__m128i a, __m128i b)
736{
737  return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
738}
739
740static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
741_mm_sad_epu8(__m128i a, __m128i b)
742{
743  return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
744}
745
746static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
747_mm_sub_epi8(__m128i a, __m128i b)
748{
749  return (__m128i)((__v16qi)a - (__v16qi)b);
750}
751
752static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
753_mm_sub_epi16(__m128i a, __m128i b)
754{
755  return (__m128i)((__v8hi)a - (__v8hi)b);
756}
757
758static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
759_mm_sub_epi32(__m128i a, __m128i b)
760{
761  return (__m128i)((__v4si)a - (__v4si)b);
762}
763
764static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
765_mm_sub_si64(__m64 a, __m64 b)
766{
767  return a - b;
768}
769
770static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
771_mm_sub_epi64(__m128i a, __m128i b)
772{
773  return a - b;
774}
775
776static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
777_mm_subs_epi8(__m128i a, __m128i b)
778{
779  return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
780}
781
782static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
783_mm_subs_epi16(__m128i a, __m128i b)
784{
785  return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
786}
787
788static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
789_mm_subs_epu8(__m128i a, __m128i b)
790{
791  return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
792}
793
794static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
795_mm_subs_epu16(__m128i a, __m128i b)
796{
797  return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
798}
799
800static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
801_mm_and_si128(__m128i a, __m128i b)
802{
803  return a & b;
804}
805
806static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
807_mm_andnot_si128(__m128i a, __m128i b)
808{
809  return ~a & b;
810}
811
812static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
813_mm_or_si128(__m128i a, __m128i b)
814{
815  return a | b;
816}
817
818static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
819_mm_xor_si128(__m128i a, __m128i b)
820{
821  return a ^ b;
822}
823
/* Forwards to __builtin_ia32_pslldqi128 with the count multiplied by 8
   (presumably converting a byte count into the bit count the builtin takes). */
#define _mm_slli_si128(a, imm) \
  ((__m128i)__builtin_ia32_pslldqi128((__m128i)(a), (imm)*8))
826
827static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
828_mm_slli_epi16(__m128i a, int count)
829{
830  return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
831}
832
833static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
834_mm_sll_epi16(__m128i a, __m128i count)
835{
836  return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
837}
838
839static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
840_mm_slli_epi32(__m128i a, int count)
841{
842  return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
843}
844
845static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
846_mm_sll_epi32(__m128i a, __m128i count)
847{
848  return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
849}
850
851static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
852_mm_slli_epi64(__m128i a, int count)
853{
854  return __builtin_ia32_psllqi128(a, count);
855}
856
857static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
858_mm_sll_epi64(__m128i a, __m128i count)
859{
860  return __builtin_ia32_psllq128(a, count);
861}
862
863static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
864_mm_srai_epi16(__m128i a, int count)
865{
866  return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
867}
868
869static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
870_mm_sra_epi16(__m128i a, __m128i count)
871{
872  return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
873}
874
875static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
876_mm_srai_epi32(__m128i a, int count)
877{
878  return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
879}
880
881static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
882_mm_sra_epi32(__m128i a, __m128i count)
883{
884  return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
885}
886
887
/* Forwards to __builtin_ia32_psrldqi128 with the count multiplied by 8
   (presumably converting a byte count into the bit count the builtin takes). */
#define _mm_srli_si128(a, imm) \
  ((__m128i)__builtin_ia32_psrldqi128((__m128i)(a), (imm)*8))
890
891static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
892_mm_srli_epi16(__m128i a, int count)
893{
894  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
895}
896
897static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
898_mm_srl_epi16(__m128i a, __m128i count)
899{
900  return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
901}
902
903static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
904_mm_srli_epi32(__m128i a, int count)
905{
906  return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
907}
908
909static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
910_mm_srl_epi32(__m128i a, __m128i count)
911{
912  return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
913}
914
915static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
916_mm_srli_epi64(__m128i a, int count)
917{
918  return __builtin_ia32_psrlqi128(a, count);
919}
920
921static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
922_mm_srl_epi64(__m128i a, __m128i count)
923{
924  return __builtin_ia32_psrlq128(a, count);
925}
926
927static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
928_mm_cmpeq_epi8(__m128i a, __m128i b)
929{
930  return (__m128i)((__v16qi)a == (__v16qi)b);
931}
932
933static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
934_mm_cmpeq_epi16(__m128i a, __m128i b)
935{
936  return (__m128i)((__v8hi)a == (__v8hi)b);
937}
938
939static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
940_mm_cmpeq_epi32(__m128i a, __m128i b)
941{
942  return (__m128i)((__v4si)a == (__v4si)b);
943}
944
945static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
946_mm_cmpgt_epi8(__m128i a, __m128i b)
947{
948  return (__m128i)((__v16qi)a > (__v16qi)b);
949}
950
951static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
952_mm_cmpgt_epi16(__m128i a, __m128i b)
953{
954  return (__m128i)((__v8hi)a > (__v8hi)b);
955}
956
957static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
958_mm_cmpgt_epi32(__m128i a, __m128i b)
959{
960  return (__m128i)((__v4si)a > (__v4si)b);
961}
962
963static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
964_mm_cmplt_epi8(__m128i a, __m128i b)
965{
966  return _mm_cmpgt_epi8(b,a);
967}
968
969static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
970_mm_cmplt_epi16(__m128i a, __m128i b)
971{
972  return _mm_cmpgt_epi16(b,a);
973}
974
975static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
976_mm_cmplt_epi32(__m128i a, __m128i b)
977{
978  return _mm_cmpgt_epi32(b,a);
979}
980
981#ifdef __x86_64__
982static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
983_mm_cvtsi64_sd(__m128d a, long long b)
984{
985  a[0] = b;
986  return a;
987}
988
989static __inline__ long long __attribute__((__always_inline__, __nodebug__))
990_mm_cvtsd_si64(__m128d a)
991{
992  return __builtin_ia32_cvtsd2si64(a);
993}
994
995static __inline__ long long __attribute__((__always_inline__, __nodebug__))
996_mm_cvttsd_si64(__m128d a)
997{
998  return a[0];
999}
1000#endif
1001
1002static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1003_mm_cvtepi32_ps(__m128i a)
1004{
1005  return __builtin_ia32_cvtdq2ps((__v4si)a);
1006}
1007
1008static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1009_mm_cvtps_epi32(__m128 a)
1010{
1011  return (__m128i)__builtin_ia32_cvtps2dq(a);
1012}
1013
1014static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1015_mm_cvttps_epi32(__m128 a)
1016{
1017  return (__m128i)__builtin_ia32_cvttps2dq(a);
1018}
1019
1020static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1021_mm_cvtsi32_si128(int a)
1022{
1023  return (__m128i)(__v4si){ a, 0, 0, 0 };
1024}
1025
1026#ifdef __x86_64__
1027static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1028_mm_cvtsi64_si128(long long a)
1029{
1030  return (__m128i){ a, 0 };
1031}
1032#endif
1033
1034static __inline__ int __attribute__((__always_inline__, __nodebug__))
1035_mm_cvtsi128_si32(__m128i a)
1036{
1037  __v4si b = (__v4si)a;
1038  return b[0];
1039}
1040
1041#ifdef __x86_64__
1042static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1043_mm_cvtsi128_si64(__m128i a)
1044{
1045  return a[0];
1046}
1047#endif
1048
1049static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1050_mm_load_si128(__m128i const *p)
1051{
1052  return *p;
1053}
1054
1055static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1056_mm_loadu_si128(__m128i const *p)
1057{
1058  struct __loadu_si128 {
1059    __m128i v;
1060  } __attribute__((packed, may_alias));
1061  return ((struct __loadu_si128*)p)->v;
1062}
1063
1064static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1065_mm_loadl_epi64(__m128i const *p)
1066{
1067  struct __mm_loadl_epi64_struct {
1068    long long u;
1069  } __attribute__((__packed__, __may_alias__));
1070  return (__m128i) { ((struct __mm_loadl_epi64_struct*)p)->u, 0};
1071}
1072
1073static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1074_mm_set_epi64x(long long q1, long long q0)
1075{
1076  return (__m128i){ q0, q1 };
1077}
1078
1079static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1080_mm_set_epi64(__m64 q1, __m64 q0)
1081{
1082  return (__m128i){ (long long)q0, (long long)q1 };
1083}
1084
1085static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1086_mm_set_epi32(int i3, int i2, int i1, int i0)
1087{
1088  return (__m128i)(__v4si){ i0, i1, i2, i3};
1089}
1090
1091static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1092_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1093{
1094  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1095}
1096
1097static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1098_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1099{
1100  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1101}
1102
1103static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1104_mm_set1_epi64x(long long q)
1105{
1106  return (__m128i){ q, q };
1107}
1108
1109static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1110_mm_set1_epi64(__m64 q)
1111{
1112  return (__m128i){ (long long)q, (long long)q };
1113}
1114
1115static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1116_mm_set1_epi32(int i)
1117{
1118  return (__m128i)(__v4si){ i, i, i, i };
1119}
1120
1121static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1122_mm_set1_epi16(short w)
1123{
1124  return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
1125}
1126
1127static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1128_mm_set1_epi8(char b)
1129{
1130  return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1131}
1132
1133static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1134_mm_setr_epi64(__m64 q0, __m64 q1)
1135{
1136  return (__m128i){ (long long)q0, (long long)q1 };
1137}
1138
1139static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1140_mm_setr_epi32(int i0, int i1, int i2, int i3)
1141{
1142  return (__m128i)(__v4si){ i0, i1, i2, i3};
1143}
1144
1145static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1146_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1147{
1148  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1149}
1150
1151static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1152_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1153{
1154  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1155}
1156
1157static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1158_mm_setzero_si128(void)
1159{
1160  return (__m128i){ 0LL, 0LL };
1161}
1162
1163static __inline__ void __attribute__((__always_inline__, __nodebug__))
1164_mm_store_si128(__m128i *p, __m128i b)
1165{
1166  *p = b;
1167}
1168
1169static __inline__ void __attribute__((__always_inline__, __nodebug__))
1170_mm_storeu_si128(__m128i *p, __m128i b)
1171{
1172  __builtin_ia32_storedqu((char *)p, (__v16qi)b);
1173}
1174
1175static __inline__ void __attribute__((__always_inline__, __nodebug__))
1176_mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
1177{
1178  __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
1179}
1180
1181static __inline__ void __attribute__((__always_inline__, __nodebug__))
1182_mm_storel_epi64(__m128i *p, __m128i a)
1183{
1184  __builtin_ia32_storelv4si((__v2si *)p, a);
1185}
1186
1187static __inline__ void __attribute__((__always_inline__, __nodebug__))
1188_mm_stream_pd(double *p, __m128d a)
1189{
1190  __builtin_ia32_movntpd(p, a);
1191}
1192
1193static __inline__ void __attribute__((__always_inline__, __nodebug__))
1194_mm_stream_si128(__m128i *p, __m128i a)
1195{
1196  __builtin_ia32_movntdq(p, a);
1197}
1198
/* Forwards the destination p and the value a to the movnti builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  int *__dst = p;
  int __val = a;
  __builtin_ia32_movnti(__dst, __val);
}
1204
/* Forwards the address p to the clflush builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  void const *__addr = p;
  __builtin_ia32_clflush(__addr);
}
1210
/* Invokes the lfence builtin; takes no arguments and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __builtin_ia32_lfence();
  return;
}
1216
/* Invokes the mfence builtin; takes no arguments and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __builtin_ia32_mfence();
  return;
}
1222
1223static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1224_mm_packs_epi16(__m128i a, __m128i b)
1225{
1226  return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
1227}
1228
1229static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1230_mm_packs_epi32(__m128i a, __m128i b)
1231{
1232  return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
1233}
1234
1235static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1236_mm_packus_epi16(__m128i a, __m128i b)
1237{
1238  return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
1239}
1240
1241static __inline__ int __attribute__((__always_inline__, __nodebug__))
1242_mm_extract_epi16(__m128i a, int imm)
1243{
1244  __v8hi b = (__v8hi)a;
1245  return (unsigned short)b[imm];
1246}
1247
1248static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1249_mm_insert_epi16(__m128i a, int b, int imm)
1250{
1251  __v8hi c = (__v8hi)a;
1252  c[imm & 7] = b;
1253  return (__m128i)c;
1254}
1255
1256static __inline__ int __attribute__((__always_inline__, __nodebug__))
1257_mm_movemask_epi8(__m128i a)
1258{
1259  return __builtin_ia32_pmovmskb128((__v16qi)a);
1260}
1261
/* Permutes the four 32-bit elements of a.  Result element k is the element of
   a selected by bits [2k+1:2k] of imm.  A macro because the indices given to
   __builtin_shufflevector must be constant expressions. */
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_shufflevector((__v4si)(a), \
                                    (__v4si)_mm_set1_epi32(0), \
                                    ((imm) & 0x03) >> 0, \
                                    ((imm) & 0x0c) >> 2, \
                                    ((imm) & 0x30) >> 4, \
                                    ((imm) & 0xc0) >> 6))
1266
1267
/* Permutes 16-bit elements 0..3 of a: result element k (k < 4) is the element
   of a selected by bits [2k+1:2k] of imm.  Elements 4..7 are copied through
   unchanged. */
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_shufflevector((__v8hi)(a), \
                                    (__v8hi)_mm_set1_epi16(0), \
                                    ((imm) & 0x03) >> 0, \
                                    ((imm) & 0x0c) >> 2, \
                                    ((imm) & 0x30) >> 4, \
                                    ((imm) & 0xc0) >> 6, \
                                    4, 5, 6, 7))
/* Permutes 16-bit elements 4..7 of a: result element 4+k is element
   4 + (bits [2k+1:2k] of imm) of a.  Elements 0..3 are copied through
   unchanged. */
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_shufflevector((__v8hi)(a), \
                                    (__v8hi)_mm_set1_epi16(0), \
                                    0, 1, 2, 3, \
                                    (((imm) & 0x03) >> 0) + 4, \
                                    (((imm) & 0x0c) >> 2) + 4, \
                                    (((imm) & 0x30) >> 4) + 4, \
                                    (((imm) & 0xc0) >> 6) + 4))
1279
1280static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1281_mm_unpackhi_epi8(__m128i a, __m128i b)
1282{
1283  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1284}
1285
1286static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1287_mm_unpackhi_epi16(__m128i a, __m128i b)
1288{
1289  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1290}
1291
1292static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1293_mm_unpackhi_epi32(__m128i a, __m128i b)
1294{
1295  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
1296}
1297
1298static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1299_mm_unpackhi_epi64(__m128i a, __m128i b)
1300{
1301  return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
1302}
1303
1304static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1305_mm_unpacklo_epi8(__m128i a, __m128i b)
1306{
1307  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1308}
1309
1310static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1311_mm_unpacklo_epi16(__m128i a, __m128i b)
1312{
1313  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1314}
1315
1316static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1317_mm_unpacklo_epi32(__m128i a, __m128i b)
1318{
1319  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
1320}
1321
1322static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1323_mm_unpacklo_epi64(__m128i a, __m128i b)
1324{
1325  return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
1326}
1327
1328static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
1329_mm_movepi64_pi64(__m128i a)
1330{
1331  return (__m64)a[0];
1332}
1333
1334static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1335_mm_movpi64_pi64(__m64 a)
1336{
1337  return (__m128i){ (long long)a, 0 };
1338}
1339
1340static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1341_mm_move_epi64(__m128i a)
1342{
1343  return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
1344}
1345
1346static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1347_mm_unpackhi_pd(__m128d a, __m128d b)
1348{
1349  return __builtin_shufflevector(a, b, 1, 2+1);
1350}
1351
1352static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1353_mm_unpacklo_pd(__m128d a, __m128d b)
1354{
1355  return __builtin_shufflevector(a, b, 0, 2+0);
1356}
1357
1358static __inline__ int __attribute__((__always_inline__, __nodebug__))
1359_mm_movemask_pd(__m128d a)
1360{
1361  return __builtin_ia32_movmskpd(a);
1362}
1363
/* Result element 0 is a[bit 0 of i]; result element 1 is b[bit 1 of i]
   (shuffle indices 2 and 3 refer to the second operand).  A macro because the
   indices given to __builtin_shufflevector must be constant expressions. */
#define _mm_shuffle_pd(a, b, i) \
  (__builtin_shufflevector((__m128d)(a), (__m128d)(b), \
                           ((i) & 0x1) >> 0, \
                           2 + (((i) & 0x2) >> 1)))
1367
1368static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1369_mm_castpd_ps(__m128d in)
1370{
1371  return (__m128)in;
1372}
1373
1374static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1375_mm_castpd_si128(__m128d in)
1376{
1377  return (__m128i)in;
1378}
1379
1380static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1381_mm_castps_pd(__m128 in)
1382{
1383  return (__m128d)in;
1384}
1385
1386static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1387_mm_castps_si128(__m128 in)
1388{
1389  return (__m128i)in;
1390}
1391
1392static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1393_mm_castsi128_ps(__m128i in)
1394{
1395  return (__m128)in;
1396}
1397
1398static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1399_mm_castsi128_pd(__m128i in)
1400{
1401  return (__m128d)in;
1402}
1403
/* Emits a single "pause" instruction through a volatile inline-asm statement
   with no operands and no clobbers. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1409
/* Packs two selector values into one immediate: y occupies bit 0 and x is
   shifted up into bit 1. */
#define _MM_SHUFFLE2(x, y) \
  ((y) | ((x) << 1))
1411
1412#endif /* __SSE2__ */
1413
1414#endif /* __EMMINTRIN_H */
1415