emmintrin.h revision 221345
1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __EMMINTRIN_H
25#define __EMMINTRIN_H
26
27#ifndef __SSE2__
28#error "SSE2 instruction set not enabled"
29#else
30
31#include <xmmintrin.h>
32
/* Public 16-byte vector types: __m128d holds doubles, __m128i holds long longs. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Type defines.  */
/* Internal 16-byte element views (double, long long, short, char) used for
   casts inside the intrinsics in this header. */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
41
42static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
43_mm_add_sd(__m128d a, __m128d b)
44{
45  a[0] += b[0];
46  return a;
47}
48
49static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
50_mm_add_pd(__m128d a, __m128d b)
51{
52  return a + b;
53}
54
55static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
56_mm_sub_sd(__m128d a, __m128d b)
57{
58  a[0] -= b[0];
59  return a;
60}
61
62static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
63_mm_sub_pd(__m128d a, __m128d b)
64{
65  return a - b;
66}
67
68static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
69_mm_mul_sd(__m128d a, __m128d b)
70{
71  a[0] *= b[0];
72  return a;
73}
74
75static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
76_mm_mul_pd(__m128d a, __m128d b)
77{
78  return a * b;
79}
80
81static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
82_mm_div_sd(__m128d a, __m128d b)
83{
84  a[0] /= b[0];
85  return a;
86}
87
88static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
89_mm_div_pd(__m128d a, __m128d b)
90{
91  return a / b;
92}
93
94static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
95_mm_sqrt_sd(__m128d a, __m128d b)
96{
97  __m128d c = __builtin_ia32_sqrtsd(b);
98  return (__m128d) { c[0], a[1] };
99}
100
101static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
102_mm_sqrt_pd(__m128d a)
103{
104  return __builtin_ia32_sqrtpd(a);
105}
106
107static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
108_mm_min_sd(__m128d a, __m128d b)
109{
110  return __builtin_ia32_minsd(a, b);
111}
112
113static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
114_mm_min_pd(__m128d a, __m128d b)
115{
116  return __builtin_ia32_minpd(a, b);
117}
118
119static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
120_mm_max_sd(__m128d a, __m128d b)
121{
122  return __builtin_ia32_maxsd(a, b);
123}
124
125static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
126_mm_max_pd(__m128d a, __m128d b)
127{
128  return __builtin_ia32_maxpd(a, b);
129}
130
131static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
132_mm_and_pd(__m128d a, __m128d b)
133{
134  return (__m128d)((__v4si)a & (__v4si)b);
135}
136
137static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
138_mm_andnot_pd(__m128d a, __m128d b)
139{
140  return (__m128d)(~(__v4si)a & (__v4si)b);
141}
142
143static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
144_mm_or_pd(__m128d a, __m128d b)
145{
146  return (__m128d)((__v4si)a | (__v4si)b);
147}
148
149static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
150_mm_xor_pd(__m128d a, __m128d b)
151{
152  return (__m128d)((__v4si)a ^ (__v4si)b);
153}
154
155static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
156_mm_cmpeq_pd(__m128d a, __m128d b)
157{
158  return (__m128d)__builtin_ia32_cmppd(a, b, 0);
159}
160
161static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
162_mm_cmplt_pd(__m128d a, __m128d b)
163{
164  return (__m128d)__builtin_ia32_cmppd(a, b, 1);
165}
166
167static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
168_mm_cmple_pd(__m128d a, __m128d b)
169{
170  return (__m128d)__builtin_ia32_cmppd(a, b, 2);
171}
172
173static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
174_mm_cmpgt_pd(__m128d a, __m128d b)
175{
176  return (__m128d)__builtin_ia32_cmppd(b, a, 1);
177}
178
179static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
180_mm_cmpge_pd(__m128d a, __m128d b)
181{
182  return (__m128d)__builtin_ia32_cmppd(b, a, 2);
183}
184
185static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
186_mm_cmpord_pd(__m128d a, __m128d b)
187{
188  return (__m128d)__builtin_ia32_cmppd(a, b, 7);
189}
190
191static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
192_mm_cmpunord_pd(__m128d a, __m128d b)
193{
194  return (__m128d)__builtin_ia32_cmppd(a, b, 3);
195}
196
197static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
198_mm_cmpneq_pd(__m128d a, __m128d b)
199{
200  return (__m128d)__builtin_ia32_cmppd(a, b, 4);
201}
202
203static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
204_mm_cmpnlt_pd(__m128d a, __m128d b)
205{
206  return (__m128d)__builtin_ia32_cmppd(a, b, 5);
207}
208
209static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
210_mm_cmpnle_pd(__m128d a, __m128d b)
211{
212  return (__m128d)__builtin_ia32_cmppd(a, b, 6);
213}
214
215static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
216_mm_cmpngt_pd(__m128d a, __m128d b)
217{
218  return (__m128d)__builtin_ia32_cmppd(b, a, 5);
219}
220
221static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
222_mm_cmpnge_pd(__m128d a, __m128d b)
223{
224  return (__m128d)__builtin_ia32_cmppd(b, a, 6);
225}
226
227static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
228_mm_cmpeq_sd(__m128d a, __m128d b)
229{
230  return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
231}
232
233static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
234_mm_cmplt_sd(__m128d a, __m128d b)
235{
236  return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
237}
238
239static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
240_mm_cmple_sd(__m128d a, __m128d b)
241{
242  return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
243}
244
245static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
246_mm_cmpgt_sd(__m128d a, __m128d b)
247{
248  return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
249}
250
251static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
252_mm_cmpge_sd(__m128d a, __m128d b)
253{
254  return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
255}
256
257static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
258_mm_cmpord_sd(__m128d a, __m128d b)
259{
260  return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
261}
262
263static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
264_mm_cmpunord_sd(__m128d a, __m128d b)
265{
266  return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
267}
268
269static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
270_mm_cmpneq_sd(__m128d a, __m128d b)
271{
272  return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
273}
274
275static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
276_mm_cmpnlt_sd(__m128d a, __m128d b)
277{
278  return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
279}
280
281static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
282_mm_cmpnle_sd(__m128d a, __m128d b)
283{
284  return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
285}
286
287static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
288_mm_cmpngt_sd(__m128d a, __m128d b)
289{
290  return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
291}
292
293static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
294_mm_cmpnge_sd(__m128d a, __m128d b)
295{
296  return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
297}
298
299static __inline__ int __attribute__((__always_inline__, __nodebug__))
300_mm_comieq_sd(__m128d a, __m128d b)
301{
302  return __builtin_ia32_comisdeq(a, b);
303}
304
305static __inline__ int __attribute__((__always_inline__, __nodebug__))
306_mm_comilt_sd(__m128d a, __m128d b)
307{
308  return __builtin_ia32_comisdlt(a, b);
309}
310
311static __inline__ int __attribute__((__always_inline__, __nodebug__))
312_mm_comile_sd(__m128d a, __m128d b)
313{
314  return __builtin_ia32_comisdle(a, b);
315}
316
317static __inline__ int __attribute__((__always_inline__, __nodebug__))
318_mm_comigt_sd(__m128d a, __m128d b)
319{
320  return __builtin_ia32_comisdgt(a, b);
321}
322
323static __inline__ int __attribute__((__always_inline__, __nodebug__))
324_mm_comineq_sd(__m128d a, __m128d b)
325{
326  return __builtin_ia32_comisdneq(a, b);
327}
328
329static __inline__ int __attribute__((__always_inline__, __nodebug__))
330_mm_ucomieq_sd(__m128d a, __m128d b)
331{
332  return __builtin_ia32_ucomisdeq(a, b);
333}
334
335static __inline__ int __attribute__((__always_inline__, __nodebug__))
336_mm_ucomilt_sd(__m128d a, __m128d b)
337{
338  return __builtin_ia32_ucomisdlt(a, b);
339}
340
341static __inline__ int __attribute__((__always_inline__, __nodebug__))
342_mm_ucomile_sd(__m128d a, __m128d b)
343{
344  return __builtin_ia32_ucomisdle(a, b);
345}
346
347static __inline__ int __attribute__((__always_inline__, __nodebug__))
348_mm_ucomigt_sd(__m128d a, __m128d b)
349{
350  return __builtin_ia32_ucomisdgt(a, b);
351}
352
353static __inline__ int __attribute__((__always_inline__, __nodebug__))
354_mm_ucomineq_sd(__m128d a, __m128d b)
355{
356  return __builtin_ia32_ucomisdneq(a, b);
357}
358
359static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
360_mm_cvtpd_ps(__m128d a)
361{
362  return __builtin_ia32_cvtpd2ps(a);
363}
364
365static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
366_mm_cvtps_pd(__m128 a)
367{
368  return __builtin_ia32_cvtps2pd(a);
369}
370
371static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
372_mm_cvtepi32_pd(__m128i a)
373{
374  return __builtin_ia32_cvtdq2pd((__v4si)a);
375}
376
377static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
378_mm_cvtpd_epi32(__m128d a)
379{
380  return __builtin_ia32_cvtpd2dq(a);
381}
382
383static __inline__ int __attribute__((__always_inline__, __nodebug__))
384_mm_cvtsd_si32(__m128d a)
385{
386  return __builtin_ia32_cvtsd2si(a);
387}
388
389static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
390_mm_cvtsd_ss(__m128 a, __m128d b)
391{
392  a[0] = b[0];
393  return a;
394}
395
396static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
397_mm_cvtsi32_sd(__m128d a, int b)
398{
399  a[0] = b;
400  return a;
401}
402
403static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
404_mm_cvtss_sd(__m128d a, __m128 b)
405{
406  a[0] = b[0];
407  return a;
408}
409
410static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
411_mm_cvttpd_epi32(__m128d a)
412{
413  return (__m128i)__builtin_ia32_cvttpd2dq(a);
414}
415
416static __inline__ int __attribute__((__always_inline__, __nodebug__))
417_mm_cvttsd_si32(__m128d a)
418{
419  return a[0];
420}
421
422static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
423_mm_cvtpd_pi32(__m128d a)
424{
425  return (__m64)__builtin_ia32_cvtpd2pi(a);
426}
427
428static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
429_mm_cvttpd_pi32(__m128d a)
430{
431  return (__m64)__builtin_ia32_cvttpd2pi(a);
432}
433
434static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
435_mm_cvtpi32_pd(__m64 a)
436{
437  return __builtin_ia32_cvtpi2pd((__v2si)a);
438}
439
440static __inline__ double __attribute__((__always_inline__, __nodebug__))
441_mm_cvtsd_f64(__m128d a)
442{
443  return a[0];
444}
445
446static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
447_mm_load_pd(double const *dp)
448{
449  return *(__m128d*)dp;
450}
451
452static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
453_mm_load1_pd(double const *dp)
454{
455  return (__m128d){ dp[0], dp[0] };
456}
457
/* Alias: _mm_load_pd1 expands to a call of _mm_load1_pd. */
#define _mm_load_pd1(p) _mm_load1_pd(p)
459
460static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
461_mm_loadr_pd(double const *dp)
462{
463  return (__m128d){ dp[1], dp[0] };
464}
465
466static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
467_mm_loadu_pd(double const *dp)
468{
469  return (__m128d){ dp[0], dp[1] };
470}
471
472static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
473_mm_load_sd(double const *dp)
474{
475  return (__m128d){ *dp, 0.0 };
476}
477
478static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
479_mm_loadh_pd(__m128d a, double const *dp)
480{
481  return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2);
482}
483
484static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
485_mm_loadl_pd(__m128d a, double const *dp)
486{
487  return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1);
488}
489
490static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
491_mm_set_sd(double w)
492{
493  return (__m128d){ w, 0 };
494}
495
496static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
497_mm_set1_pd(double w)
498{
499  return (__m128d){ w, w };
500}
501
502static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
503_mm_set_pd(double w, double x)
504{
505  return (__m128d){ x, w };
506}
507
508static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
509_mm_setr_pd(double w, double x)
510{
511  return (__m128d){ w, x };
512}
513
514static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
515_mm_setzero_pd(void)
516{
517  return (__m128d){ 0, 0 };
518}
519
520static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
521_mm_move_sd(__m128d a, __m128d b)
522{
523  return (__m128d){ b[0], a[1] };
524}
525
526static __inline__ void __attribute__((__always_inline__, __nodebug__))
527_mm_store_sd(double *dp, __m128d a)
528{
529  dp[0] = a[0];
530}
531
532static __inline__ void __attribute__((__always_inline__, __nodebug__))
533_mm_store1_pd(double *dp, __m128d a)
534{
535  dp[0] = a[0];
536  dp[1] = a[0];
537}
538
539static __inline__ void __attribute__((__always_inline__, __nodebug__))
540_mm_store_pd(double *dp, __m128d a)
541{
542  *(__m128d *)dp = a;
543}
544
545static __inline__ void __attribute__((__always_inline__, __nodebug__))
546_mm_storeu_pd(double *dp, __m128d a)
547{
548  __builtin_ia32_storeupd(dp, a);
549}
550
551static __inline__ void __attribute__((__always_inline__, __nodebug__))
552_mm_storer_pd(double *dp, __m128d a)
553{
554  dp[0] = a[1];
555  dp[1] = a[0];
556}
557
558static __inline__ void __attribute__((__always_inline__, __nodebug__))
559_mm_storeh_pd(double *dp, __m128d a)
560{
561  dp[0] = a[1];
562}
563
564static __inline__ void __attribute__((__always_inline__, __nodebug__))
565_mm_storel_pd(double *dp, __m128d a)
566{
567  dp[0] = a[0];
568}
569
570static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
571_mm_add_epi8(__m128i a, __m128i b)
572{
573  return (__m128i)((__v16qi)a + (__v16qi)b);
574}
575
576static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
577_mm_add_epi16(__m128i a, __m128i b)
578{
579  return (__m128i)((__v8hi)a + (__v8hi)b);
580}
581
582static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
583_mm_add_epi32(__m128i a, __m128i b)
584{
585  return (__m128i)((__v4si)a + (__v4si)b);
586}
587
588static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
589_mm_add_si64(__m64 a, __m64 b)
590{
591  return a + b;
592}
593
594static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
595_mm_add_epi64(__m128i a, __m128i b)
596{
597  return a + b;
598}
599
600static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
601_mm_adds_epi8(__m128i a, __m128i b)
602{
603  return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
604}
605
606static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
607_mm_adds_epi16(__m128i a, __m128i b)
608{
609  return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
610}
611
612static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
613_mm_adds_epu8(__m128i a, __m128i b)
614{
615  return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
616}
617
618static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
619_mm_adds_epu16(__m128i a, __m128i b)
620{
621  return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
622}
623
624static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
625_mm_avg_epu8(__m128i a, __m128i b)
626{
627  return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
628}
629
630static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
631_mm_avg_epu16(__m128i a, __m128i b)
632{
633  return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
634}
635
636static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
637_mm_madd_epi16(__m128i a, __m128i b)
638{
639  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
640}
641
642static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
643_mm_max_epi16(__m128i a, __m128i b)
644{
645  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
646}
647
648static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
649_mm_max_epu8(__m128i a, __m128i b)
650{
651  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
652}
653
654static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
655_mm_min_epi16(__m128i a, __m128i b)
656{
657  return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
658}
659
660static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
661_mm_min_epu8(__m128i a, __m128i b)
662{
663  return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
664}
665
666static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
667_mm_mulhi_epi16(__m128i a, __m128i b)
668{
669  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
670}
671
672static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
673_mm_mulhi_epu16(__m128i a, __m128i b)
674{
675  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
676}
677
678static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
679_mm_mullo_epi16(__m128i a, __m128i b)
680{
681  return (__m128i)((__v8hi)a * (__v8hi)b);
682}
683
684static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
685_mm_mul_su32(__m64 a, __m64 b)
686{
687  return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
688}
689
690static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
691_mm_mul_epu32(__m128i a, __m128i b)
692{
693  return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
694}
695
696static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
697_mm_sad_epu8(__m128i a, __m128i b)
698{
699  return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
700}
701
702static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
703_mm_sub_epi8(__m128i a, __m128i b)
704{
705  return (__m128i)((__v16qi)a - (__v16qi)b);
706}
707
708static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
709_mm_sub_epi16(__m128i a, __m128i b)
710{
711  return (__m128i)((__v8hi)a - (__v8hi)b);
712}
713
714static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
715_mm_sub_epi32(__m128i a, __m128i b)
716{
717  return (__m128i)((__v4si)a - (__v4si)b);
718}
719
720static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
721_mm_sub_si64(__m64 a, __m64 b)
722{
723  return a - b;
724}
725
726static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
727_mm_sub_epi64(__m128i a, __m128i b)
728{
729  return a - b;
730}
731
732static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
733_mm_subs_epi8(__m128i a, __m128i b)
734{
735  return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
736}
737
738static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
739_mm_subs_epi16(__m128i a, __m128i b)
740{
741  return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
742}
743
744static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
745_mm_subs_epu8(__m128i a, __m128i b)
746{
747  return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
748}
749
750static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
751_mm_subs_epu16(__m128i a, __m128i b)
752{
753  return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
754}
755
756static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
757_mm_and_si128(__m128i a, __m128i b)
758{
759  return a & b;
760}
761
762static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
763_mm_andnot_si128(__m128i a, __m128i b)
764{
765  return ~a & b;
766}
767
768static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
769_mm_or_si128(__m128i a, __m128i b)
770{
771  return a | b;
772}
773
774static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
775_mm_xor_si128(__m128i a, __m128i b)
776{
777  return a ^ b;
778}
779
/* Expands to the pslldqi128 builtin; the second argument is multiplied by 8
   before being handed to the builtin. Kept as a macro, not a function. */
#define _mm_slli_si128(V, N) \
  ((__m128i)__builtin_ia32_pslldqi128((__m128i)(V), (N)*8))
782
783static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
784_mm_slli_epi16(__m128i a, int count)
785{
786  return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
787}
788
789static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
790_mm_sll_epi16(__m128i a, __m128i count)
791{
792  return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
793}
794
795static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
796_mm_slli_epi32(__m128i a, int count)
797{
798  return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
799}
800
801static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
802_mm_sll_epi32(__m128i a, __m128i count)
803{
804  return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
805}
806
807static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
808_mm_slli_epi64(__m128i a, int count)
809{
810  return __builtin_ia32_psllqi128(a, count);
811}
812
813static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
814_mm_sll_epi64(__m128i a, __m128i count)
815{
816  return __builtin_ia32_psllq128(a, count);
817}
818
819static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
820_mm_srai_epi16(__m128i a, int count)
821{
822  return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
823}
824
825static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
826_mm_sra_epi16(__m128i a, __m128i count)
827{
828  return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
829}
830
831static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
832_mm_srai_epi32(__m128i a, int count)
833{
834  return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
835}
836
837static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
838_mm_sra_epi32(__m128i a, __m128i count)
839{
840  return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
841}
842
843
/* Expands to the psrldqi128 builtin; the second argument is multiplied by 8
   before being handed to the builtin. Kept as a macro, not a function. */
#define _mm_srli_si128(V, N) \
  ((__m128i)__builtin_ia32_psrldqi128((__m128i)(V), (N)*8))
846
847static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
848_mm_srli_epi16(__m128i a, int count)
849{
850  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
851}
852
853static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
854_mm_srl_epi16(__m128i a, __m128i count)
855{
856  return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
857}
858
859static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
860_mm_srli_epi32(__m128i a, int count)
861{
862  return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
863}
864
865static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
866_mm_srl_epi32(__m128i a, __m128i count)
867{
868  return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
869}
870
871static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
872_mm_srli_epi64(__m128i a, int count)
873{
874  return __builtin_ia32_psrlqi128(a, count);
875}
876
877static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
878_mm_srl_epi64(__m128i a, __m128i count)
879{
880  return __builtin_ia32_psrlq128(a, count);
881}
882
883static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
884_mm_cmpeq_epi8(__m128i a, __m128i b)
885{
886  return (__m128i)((__v16qi)a == (__v16qi)b);
887}
888
889static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
890_mm_cmpeq_epi16(__m128i a, __m128i b)
891{
892  return (__m128i)((__v8hi)a == (__v8hi)b);
893}
894
895static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
896_mm_cmpeq_epi32(__m128i a, __m128i b)
897{
898  return (__m128i)((__v4si)a == (__v4si)b);
899}
900
901static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
902_mm_cmpgt_epi8(__m128i a, __m128i b)
903{
904  return (__m128i)((__v16qi)a > (__v16qi)b);
905}
906
907static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
908_mm_cmpgt_epi16(__m128i a, __m128i b)
909{
910  return (__m128i)((__v8hi)a > (__v8hi)b);
911}
912
913static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
914_mm_cmpgt_epi32(__m128i a, __m128i b)
915{
916  return (__m128i)((__v4si)a > (__v4si)b);
917}
918
919static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
920_mm_cmplt_epi8(__m128i a, __m128i b)
921{
922  return _mm_cmpgt_epi8(b,a);
923}
924
925static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
926_mm_cmplt_epi16(__m128i a, __m128i b)
927{
928  return _mm_cmpgt_epi16(b,a);
929}
930
931static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
932_mm_cmplt_epi32(__m128i a, __m128i b)
933{
934  return _mm_cmpgt_epi32(b,a);
935}
936
937#ifdef __x86_64__
938static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
939_mm_cvtsi64_sd(__m128d a, long long b)
940{
941  a[0] = b;
942  return a;
943}
944
945static __inline__ long long __attribute__((__always_inline__, __nodebug__))
946_mm_cvtsd_si64(__m128d a)
947{
948  return __builtin_ia32_cvtsd2si64(a);
949}
950
951static __inline__ long long __attribute__((__always_inline__, __nodebug__))
952_mm_cvttsd_si64(__m128d a)
953{
954  return a[0];
955}
956#endif
957
958static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
959_mm_cvtepi32_ps(__m128i a)
960{
961  return __builtin_ia32_cvtdq2ps((__v4si)a);
962}
963
964static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
965_mm_cvtps_epi32(__m128 a)
966{
967  return (__m128i)__builtin_ia32_cvtps2dq(a);
968}
969
970static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
971_mm_cvttps_epi32(__m128 a)
972{
973  return (__m128i)__builtin_ia32_cvttps2dq(a);
974}
975
976static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
977_mm_cvtsi32_si128(int a)
978{
979  return (__m128i)(__v4si){ a, 0, 0, 0 };
980}
981
982#ifdef __x86_64__
983static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
984_mm_cvtsi64_si128(long long a)
985{
986  return (__m128i){ a, 0 };
987}
988#endif
989
990static __inline__ int __attribute__((__always_inline__, __nodebug__))
991_mm_cvtsi128_si32(__m128i a)
992{
993  __v4si b = (__v4si)a;
994  return b[0];
995}
996
997#ifdef __x86_64__
998static __inline__ long long __attribute__((__always_inline__, __nodebug__))
999_mm_cvtsi128_si64(__m128i a)
1000{
1001  return a[0];
1002}
1003#endif
1004
1005static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1006_mm_load_si128(__m128i const *p)
1007{
1008  return *p;
1009}
1010
1011static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1012_mm_loadu_si128(__m128i const *p)
1013{
1014  return (__m128i)__builtin_ia32_loaddqu((char const *)p);
1015}
1016
1017static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1018_mm_loadl_epi64(__m128i const *p)
1019{
1020  return (__m128i) { *(long long*)p, 0};
1021}
1022
1023static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1024_mm_set_epi64x(long long q1, long long q0)
1025{
1026  return (__m128i){ q0, q1 };
1027}
1028
1029static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1030_mm_set_epi64(__m64 q1, __m64 q0)
1031{
1032  return (__m128i){ (long long)q0, (long long)q1 };
1033}
1034
1035static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1036_mm_set_epi32(int i3, int i2, int i1, int i0)
1037{
1038  return (__m128i)(__v4si){ i0, i1, i2, i3};
1039}
1040
1041static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1042_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1043{
1044  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1045}
1046
1047static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1048_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1049{
1050  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1051}
1052
1053static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1054_mm_set1_epi64x(long long q)
1055{
1056  return (__m128i){ q, q };
1057}
1058
1059static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1060_mm_set1_epi64(__m64 q)
1061{
1062  return (__m128i){ (long long)q, (long long)q };
1063}
1064
1065static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1066_mm_set1_epi32(int i)
1067{
1068  return (__m128i)(__v4si){ i, i, i, i };
1069}
1070
1071static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1072_mm_set1_epi16(short w)
1073{
1074  return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
1075}
1076
1077static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1078_mm_set1_epi8(char b)
1079{
1080  return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1081}
1082
1083static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1084_mm_setr_epi64(__m64 q0, __m64 q1)
1085{
1086  return (__m128i){ (long long)q0, (long long)q1 };
1087}
1088
1089static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1090_mm_setr_epi32(int i0, int i1, int i2, int i3)
1091{
1092  return (__m128i)(__v4si){ i0, i1, i2, i3};
1093}
1094
1095static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1096_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1097{
1098  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1099}
1100
1101static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1102_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1103{
1104  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1105}
1106
1107static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1108_mm_setzero_si128(void)
1109{
1110  return (__m128i){ 0LL, 0LL };
1111}
1112
1113static __inline__ void __attribute__((__always_inline__, __nodebug__))
1114_mm_store_si128(__m128i *p, __m128i b)
1115{
1116  *p = b;
1117}
1118
1119static __inline__ void __attribute__((__always_inline__, __nodebug__))
1120_mm_storeu_si128(__m128i *p, __m128i b)
1121{
1122  __builtin_ia32_storedqu((char *)p, (__v16qi)b);
1123}
1124
1125static __inline__ void __attribute__((__always_inline__, __nodebug__))
1126_mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
1127{
1128  __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
1129}
1130
1131static __inline__ void __attribute__((__always_inline__, __nodebug__))
1132_mm_storel_epi64(__m128i *p, __m128i a)
1133{
1134  __builtin_ia32_storelv4si((__v2si *)p, a);
1135}
1136
1137static __inline__ void __attribute__((__always_inline__, __nodebug__))
1138_mm_stream_pd(double *p, __m128d a)
1139{
1140  __builtin_ia32_movntpd(p, a);
1141}
1142
1143static __inline__ void __attribute__((__always_inline__, __nodebug__))
1144_mm_stream_si128(__m128i *p, __m128i a)
1145{
1146  __builtin_ia32_movntdq(p, a);
1147}
1148
/* Store the 32-bit integer a at p via the movnti builtin.
   NOTE(review): the builtin name suggests a non-temporal store -- confirm
   ordering requirements in the instruction reference.  */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  int *const dst = p;
  const int value = a;
  __builtin_ia32_movnti(dst, value);
}
1154
/* Pass the address p to the clflush builtin.  Nothing is read or written
   through p by this wrapper itself.  */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  void const *const line = p;
  __builtin_ia32_clflush(line);
}
1160
/* Invoke the lfence builtin.  Takes no arguments and returns nothing.  */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __builtin_ia32_lfence();
}
1166
/* Invoke the mfence builtin.  Takes no arguments and returns nothing.  */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __builtin_ia32_mfence();
}
1172
1173static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1174_mm_packs_epi16(__m128i a, __m128i b)
1175{
1176  return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
1177}
1178
1179static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1180_mm_packs_epi32(__m128i a, __m128i b)
1181{
1182  return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
1183}
1184
1185static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1186_mm_packus_epi16(__m128i a, __m128i b)
1187{
1188  return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
1189}
1190
1191static __inline__ int __attribute__((__always_inline__, __nodebug__))
1192_mm_extract_epi16(__m128i a, int imm)
1193{
1194  __v8hi b = (__v8hi)a;
1195  return (unsigned short)b[imm];
1196}
1197
1198static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1199_mm_insert_epi16(__m128i a, int b, int imm)
1200{
1201  __v8hi c = (__v8hi)a;
1202  c[imm & 7] = b;
1203  return (__m128i)c;
1204}
1205
1206static __inline__ int __attribute__((__always_inline__, __nodebug__))
1207_mm_movemask_epi8(__m128i a)
1208{
1209  return __builtin_ia32_pmovmskb128((__v16qi)a);
1210}
1211
/* Permute the four 32-bit lanes of a.  Each 2-bit field of imm, lowest
   field first, selects the source lane for the corresponding result lane.
   imm is expanded four times and feeds the index arguments of
   __builtin_shufflevector.  Every index is below 4, so the zero vector
   passed as second operand never contributes a lane.  */
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_shufflevector((__v4si)(a), (__v4si) _mm_set1_epi32(0), \
                                    ((imm) & (0x3 << 0)) >> 0, \
                                    ((imm) & (0x3 << 2)) >> 2, \
                                    ((imm) & (0x3 << 4)) >> 4, \
                                    ((imm) & (0x3 << 6)) >> 6))
1216
1217
/* Permute the four low 16-bit lanes of a and copy lanes 4..7 unchanged.
   Each 2-bit field of imm, lowest field first, selects which of lanes
   0..3 supplies the corresponding low result lane.  imm is expanded four
   times.  Every index is below 8, so the zero vector passed as second
   operand never contributes a lane.  */
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) _mm_set1_epi16(0), \
                                    ((imm) & (0x3 << 0)) >> 0, \
                                    ((imm) & (0x3 << 2)) >> 2, \
                                    ((imm) & (0x3 << 4)) >> 4, \
                                    ((imm) & (0x3 << 6)) >> 6, \
                                    4, 5, 6, 7))
/* Copy lanes 0..3 of a unchanged and permute the four high 16-bit lanes.
   Each 2-bit field of imm, lowest field first, selects which of lanes
   4..7 supplies the corresponding high result lane.  imm is expanded four
   times.  Every index is below 8, so the zero vector passed as second
   operand never contributes a lane.  */
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) _mm_set1_epi16(0), \
                                    0, 1, 2, 3, \
                                    (((imm) & (0x3 << 0)) >> 0) + 4, \
                                    (((imm) & (0x3 << 2)) >> 2) + 4, \
                                    (((imm) & (0x3 << 4)) >> 4) + 4, \
                                    (((imm) & (0x3 << 6)) >> 6) + 4))
1229
1230static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1231_mm_unpackhi_epi8(__m128i a, __m128i b)
1232{
1233  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1234}
1235
1236static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1237_mm_unpackhi_epi16(__m128i a, __m128i b)
1238{
1239  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1240}
1241
1242static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1243_mm_unpackhi_epi32(__m128i a, __m128i b)
1244{
1245  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
1246}
1247
1248static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1249_mm_unpackhi_epi64(__m128i a, __m128i b)
1250{
1251  return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
1252}
1253
1254static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1255_mm_unpacklo_epi8(__m128i a, __m128i b)
1256{
1257  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1258}
1259
1260static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1261_mm_unpacklo_epi16(__m128i a, __m128i b)
1262{
1263  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1264}
1265
1266static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1267_mm_unpacklo_epi32(__m128i a, __m128i b)
1268{
1269  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
1270}
1271
1272static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1273_mm_unpacklo_epi64(__m128i a, __m128i b)
1274{
1275  return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
1276}
1277
1278static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
1279_mm_movepi64_pi64(__m128i a)
1280{
1281  return (__m64)a[0];
1282}
1283
1284static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1285_mm_movpi64_pi64(__m64 a)
1286{
1287  return (__m128i){ (long long)a, 0 };
1288}
1289
1290static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1291_mm_move_epi64(__m128i a)
1292{
1293  return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
1294}
1295
1296static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1297_mm_unpackhi_pd(__m128d a, __m128d b)
1298{
1299  return __builtin_shufflevector(a, b, 1, 2+1);
1300}
1301
1302static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1303_mm_unpacklo_pd(__m128d a, __m128d b)
1304{
1305  return __builtin_shufflevector(a, b, 0, 2+0);
1306}
1307
1308static __inline__ int __attribute__((__always_inline__, __nodebug__))
1309_mm_movemask_pd(__m128d a)
1310{
1311  return __builtin_ia32_movmskpd(a);
1312}
1313
/* Select one double from each operand: bit 0 of i picks element 0 or 1
   of a for result element 0, and bit 1 of i picks element 0 or 1 of b
   (shuffle indices 2 and 3) for result element 1.  i is expanded twice
   and feeds the index arguments of __builtin_shufflevector.  */
#define _mm_shuffle_pd(a, b, i) \
  (__builtin_shufflevector((__m128d)(a), (__m128d)(b), \
                           ((i) & 1) >> 0, \
                           2 + (((i) & 2) >> 1)))
1317
1318static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1319_mm_castpd_ps(__m128d in)
1320{
1321  return (__m128)in;
1322}
1323
1324static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1325_mm_castpd_si128(__m128d in)
1326{
1327  return (__m128i)in;
1328}
1329
1330static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1331_mm_castps_pd(__m128 in)
1332{
1333  return (__m128d)in;
1334}
1335
1336static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1337_mm_castps_si128(__m128 in)
1338{
1339  return (__m128i)in;
1340}
1341
1342static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1343_mm_castsi128_ps(__m128i in)
1344{
1345  return (__m128)in;
1346}
1347
1348static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1349_mm_castsi128_pd(__m128i in)
1350{
1351  return (__m128d)in;
1352}
1353
/* Emit a single "pause" instruction through inline assembly.  The asm is
   marked volatile so the compiler does not discard it.  */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1359
/* Compose a 2-bit selector: x becomes bit 1 and y is OR-ed into bit 0.
   The result has the layout _mm_shuffle_pd reads from its third
   argument.  */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
1361
1362#endif /* __SSE2__ */
1363
1364#endif /* __EMMINTRIN_H */
1365