emmintrin.h revision 212904
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
23
24#ifndef __EMMINTRIN_H
25#define __EMMINTRIN_H
26
27#ifndef __SSE2__
28#error "SSE2 instruction set not enabled"
29#else
30
31#include <xmmintrin.h>
32
/* Public SSE2 vector types: 16-byte vectors of double and of long long. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Internal 16-byte vector views, named by element type; used for casts
 * inside the intrinsics of this header. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
41
42static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
43_mm_add_sd(__m128d a, __m128d b)
44{
45  a[0] += b[0];
46  return a;
47}
48
49static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
50_mm_add_pd(__m128d a, __m128d b)
51{
52  return a + b;
53}
54
55static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
56_mm_sub_sd(__m128d a, __m128d b)
57{
58  a[0] -= b[0];
59  return a;
60}
61
62static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
63_mm_sub_pd(__m128d a, __m128d b)
64{
65  return a - b;
66}
67
68static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
69_mm_mul_sd(__m128d a, __m128d b)
70{
71  a[0] *= b[0];
72  return a;
73}
74
75static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
76_mm_mul_pd(__m128d a, __m128d b)
77{
78  return a * b;
79}
80
81static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
82_mm_div_sd(__m128d a, __m128d b)
83{
84  a[0] /= b[0];
85  return a;
86}
87
88static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
89_mm_div_pd(__m128d a, __m128d b)
90{
91  return a / b;
92}
93
94static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
95_mm_sqrt_sd(__m128d a, __m128d b)
96{
97  __m128d c = __builtin_ia32_sqrtsd(b);
98  return (__m128d) { c[0], a[1] };
99}
100
101static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
102_mm_sqrt_pd(__m128d a)
103{
104  return __builtin_ia32_sqrtpd(a);
105}
106
107static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
108_mm_min_sd(__m128d a, __m128d b)
109{
110  return __builtin_ia32_minsd(a, b);
111}
112
113static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
114_mm_min_pd(__m128d a, __m128d b)
115{
116  return __builtin_ia32_minpd(a, b);
117}
118
119static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
120_mm_max_sd(__m128d a, __m128d b)
121{
122  return __builtin_ia32_maxsd(a, b);
123}
124
125static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
126_mm_max_pd(__m128d a, __m128d b)
127{
128  return __builtin_ia32_maxpd(a, b);
129}
130
131static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
132_mm_and_pd(__m128d a, __m128d b)
133{
134  return (__m128d)((__v4si)a & (__v4si)b);
135}
136
137static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
138_mm_andnot_pd(__m128d a, __m128d b)
139{
140  return (__m128d)(~(__v4si)a & (__v4si)b);
141}
142
143static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
144_mm_or_pd(__m128d a, __m128d b)
145{
146  return (__m128d)((__v4si)a | (__v4si)b);
147}
148
149static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
150_mm_xor_pd(__m128d a, __m128d b)
151{
152  return (__m128d)((__v4si)a ^ (__v4si)b);
153}
154
155static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
156_mm_cmpeq_pd(__m128d a, __m128d b)
157{
158  return (__m128d)__builtin_ia32_cmppd(a, b, 0);
159}
160
161static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
162_mm_cmplt_pd(__m128d a, __m128d b)
163{
164  return (__m128d)__builtin_ia32_cmppd(a, b, 1);
165}
166
167static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
168_mm_cmple_pd(__m128d a, __m128d b)
169{
170  return (__m128d)__builtin_ia32_cmppd(a, b, 2);
171}
172
173static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
174_mm_cmpgt_pd(__m128d a, __m128d b)
175{
176  return (__m128d)__builtin_ia32_cmppd(b, a, 1);
177}
178
179static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
180_mm_cmpge_pd(__m128d a, __m128d b)
181{
182  return (__m128d)__builtin_ia32_cmppd(b, a, 2);
183}
184
185static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
186_mm_cmpord_pd(__m128d a, __m128d b)
187{
188  return (__m128d)__builtin_ia32_cmppd(a, b, 7);
189}
190
191static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
192_mm_cmpunord_pd(__m128d a, __m128d b)
193{
194  return (__m128d)__builtin_ia32_cmppd(a, b, 3);
195}
196
197static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
198_mm_cmpneq_pd(__m128d a, __m128d b)
199{
200  return (__m128d)__builtin_ia32_cmppd(a, b, 4);
201}
202
203static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
204_mm_cmpnlt_pd(__m128d a, __m128d b)
205{
206  return (__m128d)__builtin_ia32_cmppd(a, b, 5);
207}
208
209static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
210_mm_cmpnle_pd(__m128d a, __m128d b)
211{
212  return (__m128d)__builtin_ia32_cmppd(a, b, 6);
213}
214
215static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
216_mm_cmpngt_pd(__m128d a, __m128d b)
217{
218  return (__m128d)__builtin_ia32_cmppd(b, a, 5);
219}
220
221static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
222_mm_cmpnge_pd(__m128d a, __m128d b)
223{
224  return (__m128d)__builtin_ia32_cmppd(b, a, 6);
225}
226
227static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
228_mm_cmpeq_sd(__m128d a, __m128d b)
229{
230  return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
231}
232
233static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
234_mm_cmplt_sd(__m128d a, __m128d b)
235{
236  return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
237}
238
239static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
240_mm_cmple_sd(__m128d a, __m128d b)
241{
242  return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
243}
244
245static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
246_mm_cmpgt_sd(__m128d a, __m128d b)
247{
248  return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
249}
250
251static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
252_mm_cmpge_sd(__m128d a, __m128d b)
253{
254  return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
255}
256
257static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
258_mm_cmpord_sd(__m128d a, __m128d b)
259{
260  return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
261}
262
263static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
264_mm_cmpunord_sd(__m128d a, __m128d b)
265{
266  return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
267}
268
269static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
270_mm_cmpneq_sd(__m128d a, __m128d b)
271{
272  return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
273}
274
275static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
276_mm_cmpnlt_sd(__m128d a, __m128d b)
277{
278  return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
279}
280
281static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
282_mm_cmpnle_sd(__m128d a, __m128d b)
283{
284  return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
285}
286
287static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
288_mm_cmpngt_sd(__m128d a, __m128d b)
289{
290  return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
291}
292
293static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
294_mm_cmpnge_sd(__m128d a, __m128d b)
295{
296  return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
297}
298
299static __inline__ int __attribute__((__always_inline__, __nodebug__))
300_mm_comieq_sd(__m128d a, __m128d b)
301{
302  return __builtin_ia32_comisdeq(a, b);
303}
304
305static __inline__ int __attribute__((__always_inline__, __nodebug__))
306_mm_comilt_sd(__m128d a, __m128d b)
307{
308  return __builtin_ia32_comisdlt(a, b);
309}
310
311static __inline__ int __attribute__((__always_inline__, __nodebug__))
312_mm_comile_sd(__m128d a, __m128d b)
313{
314  return __builtin_ia32_comisdle(a, b);
315}
316
317static __inline__ int __attribute__((__always_inline__, __nodebug__))
318_mm_comigt_sd(__m128d a, __m128d b)
319{
320  return __builtin_ia32_comisdgt(a, b);
321}
322
323static __inline__ int __attribute__((__always_inline__, __nodebug__))
324_mm_comineq_sd(__m128d a, __m128d b)
325{
326  return __builtin_ia32_comisdneq(a, b);
327}
328
329static __inline__ int __attribute__((__always_inline__, __nodebug__))
330_mm_ucomieq_sd(__m128d a, __m128d b)
331{
332  return __builtin_ia32_ucomisdeq(a, b);
333}
334
335static __inline__ int __attribute__((__always_inline__, __nodebug__))
336_mm_ucomilt_sd(__m128d a, __m128d b)
337{
338  return __builtin_ia32_ucomisdlt(a, b);
339}
340
341static __inline__ int __attribute__((__always_inline__, __nodebug__))
342_mm_ucomile_sd(__m128d a, __m128d b)
343{
344  return __builtin_ia32_ucomisdle(a, b);
345}
346
347static __inline__ int __attribute__((__always_inline__, __nodebug__))
348_mm_ucomigt_sd(__m128d a, __m128d b)
349{
350  return __builtin_ia32_ucomisdgt(a, b);
351}
352
353static __inline__ int __attribute__((__always_inline__, __nodebug__))
354_mm_ucomineq_sd(__m128d a, __m128d b)
355{
356  return __builtin_ia32_ucomisdneq(a, b);
357}
358
359static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
360_mm_cvtpd_ps(__m128d a)
361{
362  return __builtin_ia32_cvtpd2ps(a);
363}
364
365static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
366_mm_cvtps_pd(__m128 a)
367{
368  return __builtin_ia32_cvtps2pd(a);
369}
370
371static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
372_mm_cvtepi32_pd(__m128i a)
373{
374  return __builtin_ia32_cvtdq2pd((__v4si)a);
375}
376
377static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
378_mm_cvtpd_epi32(__m128d a)
379{
380  return __builtin_ia32_cvtpd2dq(a);
381}
382
383static __inline__ int __attribute__((__always_inline__, __nodebug__))
384_mm_cvtsd_si32(__m128d a)
385{
386  return __builtin_ia32_cvtsd2si(a);
387}
388
389static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
390_mm_cvtsd_ss(__m128 a, __m128d b)
391{
392  a[0] = b[0];
393  return a;
394}
395
396static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
397_mm_cvtsi32_sd(__m128d a, int b)
398{
399  a[0] = b;
400  return a;
401}
402
403static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
404_mm_cvtss_sd(__m128d a, __m128 b)
405{
406  a[0] = b[0];
407  return a;
408}
409
410static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
411_mm_cvttpd_epi32(__m128d a)
412{
413  return (__m128i)__builtin_ia32_cvttpd2dq(a);
414}
415
416static __inline__ int __attribute__((__always_inline__, __nodebug__))
417_mm_cvttsd_si32(__m128d a)
418{
419  return a[0];
420}
421
422static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
423_mm_cvtpd_pi32(__m128d a)
424{
425  return (__m64)__builtin_ia32_cvtpd2pi(a);
426}
427
428static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
429_mm_cvttpd_pi32(__m128d a)
430{
431  return (__m64)__builtin_ia32_cvttpd2pi(a);
432}
433
434static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
435_mm_cvtpi32_pd(__m64 a)
436{
437  return __builtin_ia32_cvtpi2pd((__v2si)a);
438}
439
440static __inline__ double __attribute__((__always_inline__, __nodebug__))
441_mm_cvtsd_f64(__m128d a)
442{
443  return a[0];
444}
445
446static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
447_mm_load_pd(double const *dp)
448{
449  return *(__m128d*)dp;
450}
451
452static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
453_mm_load1_pd(double const *dp)
454{
455  return (__m128d){ dp[0], dp[0] };
456}
457
458#define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
459
460static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
461_mm_loadr_pd(double const *dp)
462{
463  return (__m128d){ dp[1], dp[0] };
464}
465
466static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
467_mm_loadu_pd(double const *dp)
468{
469  return __builtin_ia32_loadupd(dp);
470}
471
472static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
473_mm_load_sd(double const *dp)
474{
475  return (__m128d){ *dp, 0.0 };
476}
477
478static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
479_mm_loadh_pd(__m128d a, double const *dp)
480{
481  return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2);
482}
483
484static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
485_mm_loadl_pd(__m128d a, double const *dp)
486{
487  return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1);
488}
489
490static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
491_mm_set_sd(double w)
492{
493  return (__m128d){ w, 0 };
494}
495
496static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
497_mm_set1_pd(double w)
498{
499  return (__m128d){ w, w };
500}
501
502static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
503_mm_set_pd(double w, double x)
504{
505  return (__m128d){ x, w };
506}
507
508static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
509_mm_setr_pd(double w, double x)
510{
511  return (__m128d){ w, x };
512}
513
514static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
515_mm_setzero_pd(void)
516{
517  return (__m128d){ 0, 0 };
518}
519
520static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
521_mm_move_sd(__m128d a, __m128d b)
522{
523  return (__m128d){ b[0], a[1] };
524}
525
526static __inline__ void __attribute__((__always_inline__, __nodebug__))
527_mm_store_sd(double *dp, __m128d a)
528{
529  dp[0] = a[0];
530}
531
532static __inline__ void __attribute__((__always_inline__, __nodebug__))
533_mm_store1_pd(double *dp, __m128d a)
534{
535  dp[0] = a[0];
536  dp[1] = a[0];
537}
538
539static __inline__ void __attribute__((__always_inline__, __nodebug__))
540_mm_store_pd(double *dp, __m128d a)
541{
542  *(__m128d *)dp = a;
543}
544
545static __inline__ void __attribute__((__always_inline__, __nodebug__))
546_mm_storeu_pd(double *dp, __m128d a)
547{
548  __builtin_ia32_storeupd(dp, a);
549}
550
551static __inline__ void __attribute__((__always_inline__, __nodebug__))
552_mm_storer_pd(double *dp, __m128d a)
553{
554  dp[0] = a[1];
555  dp[1] = a[0];
556}
557
558static __inline__ void __attribute__((__always_inline__, __nodebug__))
559_mm_storeh_pd(double *dp, __m128d a)
560{
561  dp[0] = a[1];
562}
563
564static __inline__ void __attribute__((__always_inline__, __nodebug__))
565_mm_storel_pd(double *dp, __m128d a)
566{
567  dp[0] = a[0];
568}
569
570static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
571_mm_add_epi8(__m128i a, __m128i b)
572{
573  return (__m128i)((__v16qi)a + (__v16qi)b);
574}
575
576static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
577_mm_add_epi16(__m128i a, __m128i b)
578{
579  return (__m128i)((__v8hi)a + (__v8hi)b);
580}
581
582static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
583_mm_add_epi32(__m128i a, __m128i b)
584{
585  return (__m128i)((__v4si)a + (__v4si)b);
586}
587
588static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
589_mm_add_si64(__m64 a, __m64 b)
590{
591  return a + b;
592}
593
594static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
595_mm_add_epi64(__m128i a, __m128i b)
596{
597  return a + b;
598}
599
600static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
601_mm_adds_epi8(__m128i a, __m128i b)
602{
603  return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
604}
605
606static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
607_mm_adds_epi16(__m128i a, __m128i b)
608{
609  return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
610}
611
612static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
613_mm_adds_epu8(__m128i a, __m128i b)
614{
615  return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
616}
617
618static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
619_mm_adds_epu16(__m128i a, __m128i b)
620{
621  return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
622}
623
624static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
625_mm_avg_epu8(__m128i a, __m128i b)
626{
627  return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
628}
629
630static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
631_mm_avg_epu16(__m128i a, __m128i b)
632{
633  return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
634}
635
636static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
637_mm_madd_epi16(__m128i a, __m128i b)
638{
639  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
640}
641
642static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
643_mm_max_epi16(__m128i a, __m128i b)
644{
645  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
646}
647
648static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
649_mm_max_epu8(__m128i a, __m128i b)
650{
651  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
652}
653
654static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
655_mm_min_epi16(__m128i a, __m128i b)
656{
657  return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
658}
659
660static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
661_mm_min_epu8(__m128i a, __m128i b)
662{
663  return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
664}
665
666static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
667_mm_mulhi_epi16(__m128i a, __m128i b)
668{
669  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
670}
671
672static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
673_mm_mulhi_epu16(__m128i a, __m128i b)
674{
675  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
676}
677
678static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
679_mm_mullo_epi16(__m128i a, __m128i b)
680{
681  return (__m128i)((__v8hi)a * (__v8hi)b);
682}
683
684static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
685_mm_mul_su32(__m64 a, __m64 b)
686{
687  return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
688}
689
690static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
691_mm_mul_epu32(__m128i a, __m128i b)
692{
693  return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
694}
695
696static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
697_mm_sad_epu8(__m128i a, __m128i b)
698{
699  return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
700}
701
702static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
703_mm_sub_epi8(__m128i a, __m128i b)
704{
705  return (__m128i)((__v16qi)a - (__v16qi)b);
706}
707
708static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
709_mm_sub_epi16(__m128i a, __m128i b)
710{
711  return (__m128i)((__v8hi)a - (__v8hi)b);
712}
713
714static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
715_mm_sub_epi32(__m128i a, __m128i b)
716{
717  return (__m128i)((__v4si)a - (__v4si)b);
718}
719
720static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
721_mm_sub_si64(__m64 a, __m64 b)
722{
723  return a - b;
724}
725
726static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
727_mm_sub_epi64(__m128i a, __m128i b)
728{
729  return a - b;
730}
731
732static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
733_mm_subs_epi8(__m128i a, __m128i b)
734{
735  return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
736}
737
738static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
739_mm_subs_epi16(__m128i a, __m128i b)
740{
741  return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
742}
743
744static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
745_mm_subs_epu8(__m128i a, __m128i b)
746{
747  return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
748}
749
750static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
751_mm_subs_epu16(__m128i a, __m128i b)
752{
753  return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
754}
755
756static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
757_mm_and_si128(__m128i a, __m128i b)
758{
759  return a & b;
760}
761
762static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
763_mm_andnot_si128(__m128i a, __m128i b)
764{
765  return ~a & b;
766}
767
768static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
769_mm_or_si128(__m128i a, __m128i b)
770{
771  return a | b;
772}
773
774static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
775_mm_xor_si128(__m128i a, __m128i b)
776{
777  return a ^ b;
778}
779
780static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
781_mm_slli_si128(__m128i a, int imm)
782{
783  return __builtin_ia32_pslldqi128(a, imm * 8);
784}
785
786static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
787_mm_slli_epi16(__m128i a, int count)
788{
789  return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
790}
791
792static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
793_mm_sll_epi16(__m128i a, __m128i count)
794{
795  return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
796}
797
798static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
799_mm_slli_epi32(__m128i a, int count)
800{
801  return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
802}
803
804static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
805_mm_sll_epi32(__m128i a, __m128i count)
806{
807  return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
808}
809
810static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
811_mm_slli_epi64(__m128i a, int count)
812{
813  return __builtin_ia32_psllqi128(a, count);
814}
815
816static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
817_mm_sll_epi64(__m128i a, __m128i count)
818{
819  return __builtin_ia32_psllq128(a, count);
820}
821
822static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
823_mm_srai_epi16(__m128i a, int count)
824{
825  return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
826}
827
828static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
829_mm_sra_epi16(__m128i a, __m128i count)
830{
831  return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
832}
833
834static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
835_mm_srai_epi32(__m128i a, int count)
836{
837  return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
838}
839
840static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
841_mm_sra_epi32(__m128i a, __m128i count)
842{
843  return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
844}
845
846static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
847_mm_srli_si128(__m128i a, int imm)
848{
849  return __builtin_ia32_psrldqi128(a, imm * 8);
850}
851
852static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
853_mm_srli_epi16(__m128i a, int count)
854{
855  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
856}
857
858static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
859_mm_srl_epi16(__m128i a, __m128i count)
860{
861  return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
862}
863
864static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
865_mm_srli_epi32(__m128i a, int count)
866{
867  return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
868}
869
870static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
871_mm_srl_epi32(__m128i a, __m128i count)
872{
873  return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
874}
875
876static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
877_mm_srli_epi64(__m128i a, int count)
878{
879  return __builtin_ia32_psrlqi128(a, count);
880}
881
882static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
883_mm_srl_epi64(__m128i a, __m128i count)
884{
885  return __builtin_ia32_psrlq128(a, count);
886}
887
888static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
889_mm_cmpeq_epi8(__m128i a, __m128i b)
890{
891  return (__m128i)((__v16qi)a == (__v16qi)b);
892}
893
894static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
895_mm_cmpeq_epi16(__m128i a, __m128i b)
896{
897  return (__m128i)((__v8hi)a == (__v8hi)b);
898}
899
900static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
901_mm_cmpeq_epi32(__m128i a, __m128i b)
902{
903  return (__m128i)((__v4si)a == (__v4si)b);
904}
905
906static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
907_mm_cmpgt_epi8(__m128i a, __m128i b)
908{
909  return (__m128i)((__v16qi)a > (__v16qi)b);
910}
911
912static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
913_mm_cmpgt_epi16(__m128i a, __m128i b)
914{
915  return (__m128i)((__v8hi)a > (__v8hi)b);
916}
917
918static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
919_mm_cmpgt_epi32(__m128i a, __m128i b)
920{
921  return (__m128i)((__v4si)a > (__v4si)b);
922}
923
924static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
925_mm_cmplt_epi8(__m128i a, __m128i b)
926{
927  return _mm_cmpgt_epi8(b,a);
928}
929
930static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
931_mm_cmplt_epi16(__m128i a, __m128i b)
932{
933  return _mm_cmpgt_epi16(b,a);
934}
935
936static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
937_mm_cmplt_epi32(__m128i a, __m128i b)
938{
939  return _mm_cmpgt_epi32(b,a);
940}
941
942#ifdef __x86_64__
943static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
944_mm_cvtsi64_sd(__m128d a, long long b)
945{
946  a[0] = b;
947  return a;
948}
949
950static __inline__ long long __attribute__((__always_inline__, __nodebug__))
951_mm_cvtsd_si64(__m128d a)
952{
953  return __builtin_ia32_cvtsd2si64(a);
954}
955
956static __inline__ long long __attribute__((__always_inline__, __nodebug__))
957_mm_cvttsd_si64(__m128d a)
958{
959  return a[0];
960}
961#endif
962
963static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
964_mm_cvtepi32_ps(__m128i a)
965{
966  return __builtin_ia32_cvtdq2ps((__v4si)a);
967}
968
969static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
970_mm_cvtps_epi32(__m128 a)
971{
972  return (__m128i)__builtin_ia32_cvtps2dq(a);
973}
974
975static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
976_mm_cvttps_epi32(__m128 a)
977{
978  return (__m128i)__builtin_ia32_cvttps2dq(a);
979}
980
981static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
982_mm_cvtsi32_si128(int a)
983{
984  return (__m128i)(__v4si){ a, 0, 0, 0 };
985}
986
987#ifdef __x86_64__
988static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
989_mm_cvtsi64_si128(long long a)
990{
991  return (__m128i){ a, 0 };
992}
993#endif
994
995static __inline__ int __attribute__((__always_inline__, __nodebug__))
996_mm_cvtsi128_si32(__m128i a)
997{
998  __v4si b = (__v4si)a;
999  return b[0];
1000}
1001
1002#ifdef __x86_64__
1003static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1004_mm_cvtsi128_si64(__m128i a)
1005{
1006  return a[0];
1007}
1008#endif
1009
1010static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1011_mm_load_si128(__m128i const *p)
1012{
1013  return *p;
1014}
1015
1016static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1017_mm_loadu_si128(__m128i const *p)
1018{
1019  return (__m128i)__builtin_ia32_loaddqu((char const *)p);
1020}
1021
1022static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1023_mm_loadl_epi64(__m128i const *p)
1024{
1025  return (__m128i) { *(long long*)p, 0};
1026}
1027
1028static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1029_mm_set_epi64x(long long q1, long long q0)
1030{
1031  return (__m128i){ q0, q1 };
1032}
1033
1034static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1035_mm_set_epi64(__m64 q1, __m64 q0)
1036{
1037  return (__m128i){ (long long)q0, (long long)q1 };
1038}
1039
1040static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1041_mm_set_epi32(int i3, int i2, int i1, int i0)
1042{
1043  return (__m128i)(__v4si){ i0, i1, i2, i3};
1044}
1045
1046static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1047_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1048{
1049  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1050}
1051
1052static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1053_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1054{
1055  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1056}
1057
1058static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1059_mm_set1_epi64x(long long q)
1060{
1061  return (__m128i){ q, q };
1062}
1063
1064static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1065_mm_set1_epi64(__m64 q)
1066{
1067  return (__m128i){ (long long)q, (long long)q };
1068}
1069
1070static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1071_mm_set1_epi32(int i)
1072{
1073  return (__m128i)(__v4si){ i, i, i, i };
1074}
1075
1076static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1077_mm_set1_epi16(short w)
1078{
1079  return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
1080}
1081
1082static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1083_mm_set1_epi8(char b)
1084{
1085  return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1086}
1087
1088static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1089_mm_setr_epi64(__m64 q0, __m64 q1)
1090{
1091  return (__m128i){ (long long)q0, (long long)q1 };
1092}
1093
1094static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1095_mm_setr_epi32(int i0, int i1, int i2, int i3)
1096{
1097  return (__m128i)(__v4si){ i0, i1, i2, i3};
1098}
1099
1100static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1101_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1102{
1103  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1104}
1105
1106static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1107_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1108{
1109  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1110}
1111
1112static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1113_mm_setzero_si128(void)
1114{
1115  return (__m128i){ 0LL, 0LL };
1116}
1117
1118static __inline__ void __attribute__((__always_inline__, __nodebug__))
1119_mm_store_si128(__m128i *p, __m128i b)
1120{
1121  *p = b;
1122}
1123
1124static __inline__ void __attribute__((__always_inline__, __nodebug__))
1125_mm_storeu_si128(__m128i *p, __m128i b)
1126{
1127  __builtin_ia32_storedqu((char *)p, (__v16qi)b);
1128}
1129
1130static __inline__ void __attribute__((__always_inline__, __nodebug__))
1131_mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
1132{
1133  __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
1134}
1135
1136static __inline__ void __attribute__((__always_inline__, __nodebug__))
1137_mm_storel_epi64(__m128i *p, __m128i a)
1138{
1139  __builtin_ia32_storelv4si((__v2si *)p, a);
1140}
1141
1142static __inline__ void __attribute__((__always_inline__, __nodebug__))
1143_mm_stream_pd(double *p, __m128d a)
1144{
1145  __builtin_ia32_movntpd(p, a);
1146}
1147
1148static __inline__ void __attribute__((__always_inline__, __nodebug__))
1149_mm_stream_si128(__m128i *p, __m128i a)
1150{
1151  __builtin_ia32_movntdq(p, a);
1152}
1153
/* Stores the 32-bit integer a to *p through __builtin_ia32_movnti.
   NOTE(review): presumably a non-temporal (cache-bypassing) store —
   confirm. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  int *dst = p;
  int value = a;
  __builtin_ia32_movnti(dst, value);
}
1159
/* Passes the address p to __builtin_ia32_clflush.  The pointee is never
   read or written by this wrapper itself. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  void const *line = p;
  __builtin_ia32_clflush(line);
}
1165
/* Emits the load-fence builtin (__builtin_ia32_lfence).  Takes no
   arguments and produces no value. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __builtin_ia32_lfence();
  return;
}
1171
/* Emits the memory-fence builtin (__builtin_ia32_mfence).  Takes no
   arguments and produces no value. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __builtin_ia32_mfence();
  return;
}
1177
1178static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1179_mm_packs_epi16(__m128i a, __m128i b)
1180{
1181  return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
1182}
1183
1184static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1185_mm_packs_epi32(__m128i a, __m128i b)
1186{
1187  return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
1188}
1189
1190static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1191_mm_packus_epi16(__m128i a, __m128i b)
1192{
1193  return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
1194}
1195
1196static __inline__ int __attribute__((__always_inline__, __nodebug__))
1197_mm_extract_epi16(__m128i a, int imm)
1198{
1199  __v8hi b = (__v8hi)a;
1200  return (unsigned short)b[imm];
1201}
1202
1203static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1204_mm_insert_epi16(__m128i a, int b, int imm)
1205{
1206  __v8hi c = (__v8hi)a;
1207  c[imm & 7] = b;
1208  return (__m128i)c;
1209}
1210
1211static __inline__ int __attribute__((__always_inline__, __nodebug__))
1212_mm_movemask_epi8(__m128i a)
1213{
1214  return __builtin_ia32_pmovmskb128((__v16qi)a);
1215}
1216
/* Permutes the four 32-bit lanes of a.  Result lane k takes the source lane
   named by bits [2k+1:2k] of imm.  The second shufflevector operand is a
   dummy and is never selected, since every index is in 0..3. */
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_shufflevector((__v4si)(a), (__v4si) {0}, \
                                    ((imm) >> 0) & 0x3, \
                                    ((imm) >> 2) & 0x3, \
                                    ((imm) >> 4) & 0x3, \
                                    ((imm) >> 6) & 0x3))
/* Permutes the low four 16-bit lanes of a and copies lanes 4..7 through
   unchanged.  Result lane k (k < 4) takes the source lane named by bits
   [2k+1:2k] of imm.  The second shufflevector operand is never selected. */
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) {0}, \
                                    ((imm) >> 0) & 0x3, \
                                    ((imm) >> 2) & 0x3, \
                                    ((imm) >> 4) & 0x3, \
                                    ((imm) >> 6) & 0x3, \
                                    4, 5, 6, 7))
/* Permutes the high four 16-bit lanes of a and copies lanes 0..3 through
   unchanged.  Result lane 4+k takes source lane 4 + (bits [2k+1:2k] of
   imm).  The second shufflevector operand is never selected. */
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) {0}, \
                                    0, 1, 2, 3, \
                                    4 + (((imm) >> 0) & 0x3), \
                                    4 + (((imm) >> 2) & 0x3), \
                                    4 + (((imm) >> 4) & 0x3), \
                                    4 + (((imm) >> 6) & 0x3)))
1232
1233static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1234_mm_unpackhi_epi8(__m128i a, __m128i b)
1235{
1236  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1237}
1238
1239static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1240_mm_unpackhi_epi16(__m128i a, __m128i b)
1241{
1242  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1243}
1244
1245static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1246_mm_unpackhi_epi32(__m128i a, __m128i b)
1247{
1248  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
1249}
1250
1251static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1252_mm_unpackhi_epi64(__m128i a, __m128i b)
1253{
1254  return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
1255}
1256
1257static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1258_mm_unpacklo_epi8(__m128i a, __m128i b)
1259{
1260  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1261}
1262
1263static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1264_mm_unpacklo_epi16(__m128i a, __m128i b)
1265{
1266  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1267}
1268
1269static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1270_mm_unpacklo_epi32(__m128i a, __m128i b)
1271{
1272  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
1273}
1274
1275static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1276_mm_unpacklo_epi64(__m128i a, __m128i b)
1277{
1278  return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
1279}
1280
1281static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
1282_mm_movepi64_pi64(__m128i a)
1283{
1284  return (__m64)a[0];
1285}
1286
1287static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1288_mm_movpi64_pi64(__m64 a)
1289{
1290  return (__m128i){ (long long)a, 0 };
1291}
1292
1293static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1294_mm_move_epi64(__m128i a)
1295{
1296  return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
1297}
1298
1299static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1300_mm_unpackhi_pd(__m128d a, __m128d b)
1301{
1302  return __builtin_shufflevector(a, b, 1, 2+1);
1303}
1304
1305static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1306_mm_unpacklo_pd(__m128d a, __m128d b)
1307{
1308  return __builtin_shufflevector(a, b, 0, 2+0);
1309}
1310
1311static __inline__ int __attribute__((__always_inline__, __nodebug__))
1312_mm_movemask_pd(__m128d a)
1313{
1314  return __builtin_ia32_movmskpd(a);
1315}
1316
/* Builds a two-double vector: element 0 is a[bit 0 of i] and element 1 is
   b[bit 1 of i].  Shuffle indices 2 and 3 address the second operand. */
#define _mm_shuffle_pd(a, b, i) \
  (__builtin_shufflevector((__m128d)(a), (__m128d)(b), \
                           ((i) >> 0) & 1, \
                           2 + (((i) >> 1) & 1)))
1320
1321static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1322_mm_castpd_ps(__m128d in)
1323{
1324  return (__m128)in;
1325}
1326
1327static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1328_mm_castpd_si128(__m128d in)
1329{
1330  return (__m128i)in;
1331}
1332
1333static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1334_mm_castps_pd(__m128 in)
1335{
1336  return (__m128d)in;
1337}
1338
1339static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1340_mm_castps_si128(__m128 in)
1341{
1342  return (__m128i)in;
1343}
1344
1345static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1346_mm_castsi128_ps(__m128i in)
1347{
1348  return (__m128)in;
1349}
1350
1351static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1352_mm_castsi128_pd(__m128i in)
1353{
1354  return (__m128d)in;
1355}
1356
/* Emits a single "pause" instruction via inline assembly.  The asm is
   marked volatile so the compiler does not drop it, even though it has no
   operands. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1362
/* Packs two one-bit selectors into a 2-bit immediate: x goes to bit 1 and
   y to bit 0.  The (i) & 1 / ((i) & 2) >> 1 decoding in _mm_shuffle_pd reads
   this layout. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
1364
1365#endif /* __SSE2__ */
1366
1367#endif /* __EMMINTRIN_H */
1368