emmintrin.h revision 193725
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __EMMINTRIN_H
25#define __EMMINTRIN_H
26
27#ifndef __SSE2__
28#error "SSE2 instruction set not enabled"
29#else
30
31#include <xmmintrin.h>
32
/* Public 16-byte vector types: __m128d holds doubles, __m128i holds long longs. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Internal 16-byte lane views used when casting __m128i/__m128d operands:
 * int lanes, short lanes and char lanes respectively. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
39
40static inline __m128d __attribute__((__always_inline__, __nodebug__))
41_mm_add_sd(__m128d a, __m128d b)
42{
43  a[0] += b[0];
44  return a;
45}
46
47static inline __m128d __attribute__((__always_inline__, __nodebug__))
48_mm_add_pd(__m128d a, __m128d b)
49{
50  return a + b;
51}
52
53static inline __m128d __attribute__((__always_inline__, __nodebug__))
54_mm_sub_sd(__m128d a, __m128d b)
55{
56  a[0] -= b[0];
57  return a;
58}
59
60static inline __m128d __attribute__((__always_inline__, __nodebug__))
61_mm_sub_pd(__m128d a, __m128d b)
62{
63  return a - b;
64}
65
66static inline __m128d __attribute__((__always_inline__, __nodebug__))
67_mm_mul_sd(__m128d a, __m128d b)
68{
69  a[0] *= b[0];
70  return a;
71}
72
73static inline __m128d __attribute__((__always_inline__, __nodebug__))
74_mm_mul_pd(__m128d a, __m128d b)
75{
76  return a * b;
77}
78
79static inline __m128d __attribute__((__always_inline__, __nodebug__))
80_mm_div_sd(__m128d a, __m128d b)
81{
82  a[0] /= b[0];
83  return a;
84}
85
86static inline __m128d __attribute__((__always_inline__, __nodebug__))
87_mm_div_pd(__m128d a, __m128d b)
88{
89  return a / b;
90}
91
92static inline __m128d __attribute__((__always_inline__, __nodebug__))
93_mm_sqrt_sd(__m128d a, __m128d b)
94{
95  __m128d c = __builtin_ia32_sqrtsd(b);
96  return (__m128d) { c[0], a[1] };
97}
98
99static inline __m128d __attribute__((__always_inline__, __nodebug__))
100_mm_sqrt_pd(__m128d a)
101{
102  return __builtin_ia32_sqrtpd(a);
103}
104
105static inline __m128d __attribute__((__always_inline__, __nodebug__))
106_mm_min_sd(__m128d a, __m128d b)
107{
108  return __builtin_ia32_minsd(a, b);
109}
110
111static inline __m128d __attribute__((__always_inline__, __nodebug__))
112_mm_min_pd(__m128d a, __m128d b)
113{
114  return __builtin_ia32_minpd(a, b);
115}
116
117static inline __m128d __attribute__((__always_inline__, __nodebug__))
118_mm_max_sd(__m128d a, __m128d b)
119{
120  return __builtin_ia32_maxsd(a, b);
121}
122
123static inline __m128d __attribute__((__always_inline__, __nodebug__))
124_mm_max_pd(__m128d a, __m128d b)
125{
126  return __builtin_ia32_maxpd(a, b);
127}
128
129static inline __m128d __attribute__((__always_inline__, __nodebug__))
130_mm_and_pd(__m128d a, __m128d b)
131{
132  return (__m128d)((__v4si)a & (__v4si)b);
133}
134
135static inline __m128d __attribute__((__always_inline__, __nodebug__))
136_mm_andnot_pd(__m128d a, __m128d b)
137{
138  return (__m128d)(~(__v4si)a & (__v4si)b);
139}
140
141static inline __m128d __attribute__((__always_inline__, __nodebug__))
142_mm_or_pd(__m128d a, __m128d b)
143{
144  return (__m128d)((__v4si)a | (__v4si)b);
145}
146
147static inline __m128d __attribute__((__always_inline__, __nodebug__))
148_mm_xor_pd(__m128d a, __m128d b)
149{
150  return (__m128d)((__v4si)a ^ (__v4si)b);
151}
152
153static inline __m128d __attribute__((__always_inline__, __nodebug__))
154_mm_cmpeq_pd(__m128d a, __m128d b)
155{
156  return (__m128d)__builtin_ia32_cmppd(a, b, 0);
157}
158
159static inline __m128d __attribute__((__always_inline__, __nodebug__))
160_mm_cmplt_pd(__m128d a, __m128d b)
161{
162  return (__m128d)__builtin_ia32_cmppd(a, b, 1);
163}
164
165static inline __m128d __attribute__((__always_inline__, __nodebug__))
166_mm_cmple_pd(__m128d a, __m128d b)
167{
168  return (__m128d)__builtin_ia32_cmppd(a, b, 2);
169}
170
171static inline __m128d __attribute__((__always_inline__, __nodebug__))
172_mm_cmpgt_pd(__m128d a, __m128d b)
173{
174  return (__m128d)__builtin_ia32_cmppd(b, a, 1);
175}
176
177static inline __m128d __attribute__((__always_inline__, __nodebug__))
178_mm_cmpge_pd(__m128d a, __m128d b)
179{
180  return (__m128d)__builtin_ia32_cmppd(b, a, 2);
181}
182
183static inline __m128d __attribute__((__always_inline__, __nodebug__))
184_mm_cmpord_pd(__m128d a, __m128d b)
185{
186  return (__m128d)__builtin_ia32_cmppd(a, b, 7);
187}
188
189static inline __m128d __attribute__((__always_inline__, __nodebug__))
190_mm_cmpunord_pd(__m128d a, __m128d b)
191{
192  return (__m128d)__builtin_ia32_cmppd(a, b, 3);
193}
194
195static inline __m128d __attribute__((__always_inline__, __nodebug__))
196_mm_cmpneq_pd(__m128d a, __m128d b)
197{
198  return (__m128d)__builtin_ia32_cmppd(a, b, 4);
199}
200
201static inline __m128d __attribute__((__always_inline__, __nodebug__))
202_mm_cmpnlt_pd(__m128d a, __m128d b)
203{
204  return (__m128d)__builtin_ia32_cmppd(a, b, 5);
205}
206
207static inline __m128d __attribute__((__always_inline__, __nodebug__))
208_mm_cmpnle_pd(__m128d a, __m128d b)
209{
210  return (__m128d)__builtin_ia32_cmppd(a, b, 6);
211}
212
213static inline __m128d __attribute__((__always_inline__, __nodebug__))
214_mm_cmpngt_pd(__m128d a, __m128d b)
215{
216  return (__m128d)__builtin_ia32_cmppd(b, a, 5);
217}
218
219static inline __m128d __attribute__((__always_inline__, __nodebug__))
220_mm_cmpnge_pd(__m128d a, __m128d b)
221{
222  return (__m128d)__builtin_ia32_cmppd(b, a, 6);
223}
224
225static inline __m128d __attribute__((__always_inline__, __nodebug__))
226_mm_cmpeq_sd(__m128d a, __m128d b)
227{
228  return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
229}
230
231static inline __m128d __attribute__((__always_inline__, __nodebug__))
232_mm_cmplt_sd(__m128d a, __m128d b)
233{
234  return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
235}
236
237static inline __m128d __attribute__((__always_inline__, __nodebug__))
238_mm_cmple_sd(__m128d a, __m128d b)
239{
240  return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
241}
242
243static inline __m128d __attribute__((__always_inline__, __nodebug__))
244_mm_cmpgt_sd(__m128d a, __m128d b)
245{
246  return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
247}
248
249static inline __m128d __attribute__((__always_inline__, __nodebug__))
250_mm_cmpge_sd(__m128d a, __m128d b)
251{
252  return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
253}
254
255static inline __m128d __attribute__((__always_inline__, __nodebug__))
256_mm_cmpord_sd(__m128d a, __m128d b)
257{
258  return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
259}
260
261static inline __m128d __attribute__((__always_inline__, __nodebug__))
262_mm_cmpunord_sd(__m128d a, __m128d b)
263{
264  return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
265}
266
267static inline __m128d __attribute__((__always_inline__, __nodebug__))
268_mm_cmpneq_sd(__m128d a, __m128d b)
269{
270  return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
271}
272
273static inline __m128d __attribute__((__always_inline__, __nodebug__))
274_mm_cmpnlt_sd(__m128d a, __m128d b)
275{
276  return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
277}
278
279static inline __m128d __attribute__((__always_inline__, __nodebug__))
280_mm_cmpnle_sd(__m128d a, __m128d b)
281{
282  return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
283}
284
285static inline __m128d __attribute__((__always_inline__, __nodebug__))
286_mm_cmpngt_sd(__m128d a, __m128d b)
287{
288  return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
289}
290
291static inline __m128d __attribute__((__always_inline__, __nodebug__))
292_mm_cmpnge_sd(__m128d a, __m128d b)
293{
294  return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
295}
296
297static inline int __attribute__((__always_inline__, __nodebug__))
298_mm_comieq_sd(__m128d a, __m128d b)
299{
300  return __builtin_ia32_comisdeq(a, b);
301}
302
303static inline int __attribute__((__always_inline__, __nodebug__))
304_mm_comilt_sd(__m128d a, __m128d b)
305{
306  return __builtin_ia32_comisdlt(a, b);
307}
308
309static inline int __attribute__((__always_inline__, __nodebug__))
310_mm_comile_sd(__m128d a, __m128d b)
311{
312  return __builtin_ia32_comisdle(a, b);
313}
314
315static inline int __attribute__((__always_inline__, __nodebug__))
316_mm_comigt_sd(__m128d a, __m128d b)
317{
318  return __builtin_ia32_comisdgt(a, b);
319}
320
321static inline int __attribute__((__always_inline__, __nodebug__))
322_mm_comineq_sd(__m128d a, __m128d b)
323{
324  return __builtin_ia32_comisdneq(a, b);
325}
326
327static inline int __attribute__((__always_inline__, __nodebug__))
328_mm_ucomieq_sd(__m128d a, __m128d b)
329{
330  return __builtin_ia32_ucomisdeq(a, b);
331}
332
333static inline int __attribute__((__always_inline__, __nodebug__))
334_mm_ucomilt_sd(__m128d a, __m128d b)
335{
336  return __builtin_ia32_ucomisdlt(a, b);
337}
338
339static inline int __attribute__((__always_inline__, __nodebug__))
340_mm_ucomile_sd(__m128d a, __m128d b)
341{
342  return __builtin_ia32_ucomisdle(a, b);
343}
344
345static inline int __attribute__((__always_inline__, __nodebug__))
346_mm_ucomigt_sd(__m128d a, __m128d b)
347{
348  return __builtin_ia32_ucomisdgt(a, b);
349}
350
351static inline int __attribute__((__always_inline__, __nodebug__))
352_mm_ucomineq_sd(__m128d a, __m128d b)
353{
354  return __builtin_ia32_ucomisdneq(a, b);
355}
356
357static inline __m128 __attribute__((__always_inline__, __nodebug__))
358_mm_cvtpd_ps(__m128d a)
359{
360  return __builtin_ia32_cvtpd2ps(a);
361}
362
363static inline __m128d __attribute__((__always_inline__, __nodebug__))
364_mm_cvtps_pd(__m128 a)
365{
366  return __builtin_ia32_cvtps2pd(a);
367}
368
369static inline __m128d __attribute__((__always_inline__, __nodebug__))
370_mm_cvtepi32_pd(__m128i a)
371{
372  return __builtin_ia32_cvtdq2pd((__v4si)a);
373}
374
375static inline __m128i __attribute__((__always_inline__, __nodebug__))
376_mm_cvtpd_epi32(__m128d a)
377{
378  return __builtin_ia32_cvtpd2dq(a);
379}
380
381static inline int __attribute__((__always_inline__, __nodebug__))
382_mm_cvtsd_si32(__m128d a)
383{
384  return __builtin_ia32_cvtsd2si(a);
385}
386
387static inline __m128 __attribute__((__always_inline__, __nodebug__))
388_mm_cvtsd_ss(__m128 a, __m128d b)
389{
390  a[0] = b[0];
391  return a;
392}
393
394static inline __m128d __attribute__((__always_inline__, __nodebug__))
395_mm_cvtsi32_sd(__m128d a, int b)
396{
397  a[0] = b;
398  return a;
399}
400
401static inline __m128d __attribute__((__always_inline__, __nodebug__))
402_mm_cvtss_sd(__m128d a, __m128 b)
403{
404  a[0] = b[0];
405  return a;
406}
407
408static inline __m128i __attribute__((__always_inline__, __nodebug__))
409_mm_cvttpd_epi32(__m128d a)
410{
411  return (__m128i)__builtin_ia32_cvttpd2dq(a);
412}
413
414static inline int __attribute__((__always_inline__, __nodebug__))
415_mm_cvttsd_si32(__m128d a)
416{
417  return a[0];
418}
419
420static inline __m64 __attribute__((__always_inline__, __nodebug__))
421_mm_cvtpd_pi32(__m128d a)
422{
423  return (__m64)__builtin_ia32_cvtpd2pi(a);
424}
425
426static inline __m64 __attribute__((__always_inline__, __nodebug__))
427_mm_cvttpd_pi32(__m128d a)
428{
429  return (__m64)__builtin_ia32_cvttpd2pi(a);
430}
431
432static inline __m128d __attribute__((__always_inline__, __nodebug__))
433_mm_cvtpi32_pd(__m64 a)
434{
435  return __builtin_ia32_cvtpi2pd((__v2si)a);
436}
437
438static inline double __attribute__((__always_inline__, __nodebug__))
439_mm_cvtsd_f64(__m128d a)
440{
441  return a[0];
442}
443
444static inline __m128d __attribute__((__always_inline__, __nodebug__))
445_mm_load_pd(double const *dp)
446{
447  return *(__m128d*)dp;
448}
449
450static inline __m128d __attribute__((__always_inline__, __nodebug__))
451_mm_load1_pd(double const *dp)
452{
453  return (__m128d){ dp[0], dp[0] };
454}
455
/* Alternate spelling: _mm_load_pd1(dp) expands to _mm_load1_pd(dp). */
#define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
457
458static inline __m128d __attribute__((__always_inline__, __nodebug__))
459_mm_loadr_pd(double const *dp)
460{
461  return (__m128d){ dp[1], dp[0] };
462}
463
464static inline __m128d __attribute__((__always_inline__, __nodebug__))
465_mm_loadu_pd(double const *dp)
466{
467  return __builtin_ia32_loadupd(dp);
468}
469
470static inline __m128d __attribute__((__always_inline__, __nodebug__))
471_mm_load_sd(double const *dp)
472{
473  return (__m128d){ *dp, 0.0 };
474}
475
476static inline __m128d __attribute__((__always_inline__, __nodebug__))
477_mm_loadh_pd(__m128d a, double const *dp)
478{
479  return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2);
480}
481
482static inline __m128d __attribute__((__always_inline__, __nodebug__))
483_mm_loadl_pd(__m128d a, double const *dp)
484{
485  return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1);
486}
487
488static inline __m128d __attribute__((__always_inline__, __nodebug__))
489_mm_set_sd(double w)
490{
491  return (__m128d){ w, 0 };
492}
493
494static inline __m128d __attribute__((__always_inline__, __nodebug__))
495_mm_set1_pd(double w)
496{
497  return (__m128d){ w, w };
498}
499
500static inline __m128d __attribute__((__always_inline__, __nodebug__))
501_mm_set_pd(double w, double x)
502{
503  return (__m128d){ w, x };
504}
505
506static inline __m128d __attribute__((__always_inline__, __nodebug__))
507_mm_setr_pd(double w, double x)
508{
509  return (__m128d){ x, w };
510}
511
512static inline __m128d __attribute__((__always_inline__, __nodebug__))
513_mm_setzero_pd(void)
514{
515  return (__m128d){ 0, 0 };
516}
517
518static inline __m128d __attribute__((__always_inline__, __nodebug__))
519_mm_move_sd(__m128d a, __m128d b)
520{
521  return (__m128d){ b[0], a[1] };
522}
523
524static inline void __attribute__((__always_inline__, __nodebug__))
525_mm_store_sd(double *dp, __m128d a)
526{
527  dp[0] = a[0];
528}
529
530static inline void __attribute__((__always_inline__, __nodebug__))
531_mm_store1_pd(double *dp, __m128d a)
532{
533  dp[0] = a[0];
534  dp[1] = a[0];
535}
536
537static inline void __attribute__((__always_inline__, __nodebug__))
538_mm_store_pd(double *dp, __m128d a)
539{
540  *(__m128d *)dp = a;
541}
542
543static inline void __attribute__((__always_inline__, __nodebug__))
544_mm_storeu_pd(double *dp, __m128d a)
545{
546  __builtin_ia32_storeupd(dp, a);
547}
548
549static inline void __attribute__((__always_inline__, __nodebug__))
550_mm_storer_pd(double *dp, __m128d a)
551{
552  dp[0] = a[1];
553  dp[1] = a[0];
554}
555
556static inline void __attribute__((__always_inline__, __nodebug__))
557_mm_storeh_pd(double *dp, __m128d a)
558{
559  dp[0] = a[1];
560}
561
562static inline void __attribute__((__always_inline__, __nodebug__))
563_mm_storel_pd(double *dp, __m128d a)
564{
565  dp[0] = a[0];
566}
567
568static inline __m128i __attribute__((__always_inline__, __nodebug__))
569_mm_add_epi8(__m128i a, __m128i b)
570{
571  return (__m128i)((__v16qi)a + (__v16qi)b);
572}
573
574static inline __m128i __attribute__((__always_inline__, __nodebug__))
575_mm_add_epi16(__m128i a, __m128i b)
576{
577  return (__m128i)((__v8hi)a + (__v8hi)b);
578}
579
580static inline __m128i __attribute__((__always_inline__, __nodebug__))
581_mm_add_epi32(__m128i a, __m128i b)
582{
583  return (__m128i)((__v4si)a + (__v4si)b);
584}
585
586static inline __m64 __attribute__((__always_inline__, __nodebug__))
587_mm_add_si64(__m64 a, __m64 b)
588{
589  return a + b;
590}
591
592static inline __m128i __attribute__((__always_inline__, __nodebug__))
593_mm_add_epi64(__m128i a, __m128i b)
594{
595  return a + b;
596}
597
598static inline __m128i __attribute__((__always_inline__, __nodebug__))
599_mm_adds_epi8(__m128i a, __m128i b)
600{
601  return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
602}
603
604static inline __m128i __attribute__((__always_inline__, __nodebug__))
605_mm_adds_epi16(__m128i a, __m128i b)
606{
607  return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
608}
609
610static inline __m128i __attribute__((__always_inline__, __nodebug__))
611_mm_adds_epu8(__m128i a, __m128i b)
612{
613  return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
614}
615
616static inline __m128i __attribute__((__always_inline__, __nodebug__))
617_mm_adds_epu16(__m128i a, __m128i b)
618{
619  return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
620}
621
622static inline __m128i __attribute__((__always_inline__, __nodebug__))
623_mm_avg_epu8(__m128i a, __m128i b)
624{
625  return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
626}
627
628static inline __m128i __attribute__((__always_inline__, __nodebug__))
629_mm_avg_epu16(__m128i a, __m128i b)
630{
631  return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
632}
633
634static inline __m128i __attribute__((__always_inline__, __nodebug__))
635_mm_madd_epi16(__m128i a, __m128i b)
636{
637  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
638}
639
640static inline __m128i __attribute__((__always_inline__, __nodebug__))
641_mm_max_epi16(__m128i a, __m128i b)
642{
643  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
644}
645
646static inline __m128i __attribute__((__always_inline__, __nodebug__))
647_mm_max_epu8(__m128i a, __m128i b)
648{
649  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
650}
651
652static inline __m128i __attribute__((__always_inline__, __nodebug__))
653_mm_min_epi16(__m128i a, __m128i b)
654{
655  return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
656}
657
658static inline __m128i __attribute__((__always_inline__, __nodebug__))
659_mm_min_epu8(__m128i a, __m128i b)
660{
661  return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
662}
663
664static inline __m128i __attribute__((__always_inline__, __nodebug__))
665_mm_mulhi_epi16(__m128i a, __m128i b)
666{
667  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
668}
669
670static inline __m128i __attribute__((__always_inline__, __nodebug__))
671_mm_mulhi_epu16(__m128i a, __m128i b)
672{
673  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
674}
675
676static inline __m128i __attribute__((__always_inline__, __nodebug__))
677_mm_mullo_epi16(__m128i a, __m128i b)
678{
679  return (__m128i)((__v8hi)a * (__v8hi)b);
680}
681
682static inline __m64 __attribute__((__always_inline__, __nodebug__))
683_mm_mul_su32(__m64 a, __m64 b)
684{
685  return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
686}
687
688static inline __m128i __attribute__((__always_inline__, __nodebug__))
689_mm_mul_epu32(__m128i a, __m128i b)
690{
691  return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
692}
693
694static inline __m128i __attribute__((__always_inline__, __nodebug__))
695_mm_sad_epu8(__m128i a, __m128i b)
696{
697  return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
698}
699
700static inline __m128i __attribute__((__always_inline__, __nodebug__))
701_mm_sub_epi8(__m128i a, __m128i b)
702{
703  return (__m128i)((__v16qi)a - (__v16qi)b);
704}
705
706static inline __m128i __attribute__((__always_inline__, __nodebug__))
707_mm_sub_epi16(__m128i a, __m128i b)
708{
709  return (__m128i)((__v8hi)a - (__v8hi)b);
710}
711
712static inline __m128i __attribute__((__always_inline__, __nodebug__))
713_mm_sub_epi32(__m128i a, __m128i b)
714{
715  return (__m128i)((__v4si)a - (__v4si)b);
716}
717
718static inline __m64 __attribute__((__always_inline__, __nodebug__))
719_mm_sub_si64(__m64 a, __m64 b)
720{
721  return a - b;
722}
723
724static inline __m128i __attribute__((__always_inline__, __nodebug__))
725_mm_sub_epi64(__m128i a, __m128i b)
726{
727  return a - b;
728}
729
730static inline __m128i __attribute__((__always_inline__, __nodebug__))
731_mm_subs_epi8(__m128i a, __m128i b)
732{
733  return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
734}
735
736static inline __m128i __attribute__((__always_inline__, __nodebug__))
737_mm_subs_epi16(__m128i a, __m128i b)
738{
739  return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
740}
741
742static inline __m128i __attribute__((__always_inline__, __nodebug__))
743_mm_subs_epu8(__m128i a, __m128i b)
744{
745  return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
746}
747
748static inline __m128i __attribute__((__always_inline__, __nodebug__))
749_mm_subs_epu16(__m128i a, __m128i b)
750{
751  return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
752}
753
754static inline __m128i __attribute__((__always_inline__, __nodebug__))
755_mm_and_si128(__m128i a, __m128i b)
756{
757  return a & b;
758}
759
760static inline __m128i __attribute__((__always_inline__, __nodebug__))
761_mm_andnot_si128(__m128i a, __m128i b)
762{
763  return ~a & b;
764}
765
766static inline __m128i __attribute__((__always_inline__, __nodebug__))
767_mm_or_si128(__m128i a, __m128i b)
768{
769  return a | b;
770}
771
772static inline __m128i __attribute__((__always_inline__, __nodebug__))
773_mm_xor_si128(__m128i a, __m128i b)
774{
775  return a ^ b;
776}
777
778static inline __m128i __attribute__((__always_inline__, __nodebug__))
779_mm_slli_si128(__m128i a, int imm)
780{
781  return __builtin_ia32_pslldqi128(a, imm * 8);
782}
783
784static inline __m128i __attribute__((__always_inline__, __nodebug__))
785_mm_slli_epi16(__m128i a, int count)
786{
787  return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
788}
789
790static inline __m128i __attribute__((__always_inline__, __nodebug__))
791_mm_sll_epi16(__m128i a, __m128i count)
792{
793  return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
794}
795
796static inline __m128i __attribute__((__always_inline__, __nodebug__))
797_mm_slli_epi32(__m128i a, int count)
798{
799  return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
800}
801
802static inline __m128i __attribute__((__always_inline__, __nodebug__))
803_mm_sll_epi32(__m128i a, __m128i count)
804{
805  return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
806}
807
808static inline __m128i __attribute__((__always_inline__, __nodebug__))
809_mm_slli_epi64(__m128i a, int count)
810{
811  return __builtin_ia32_psllqi128(a, count);
812}
813
814static inline __m128i __attribute__((__always_inline__, __nodebug__))
815_mm_sll_epi64(__m128i a, __m128i count)
816{
817  return __builtin_ia32_psllq128(a, count);
818}
819
820static inline __m128i __attribute__((__always_inline__, __nodebug__))
821_mm_srai_epi16(__m128i a, int count)
822{
823  return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
824}
825
826static inline __m128i __attribute__((__always_inline__, __nodebug__))
827_mm_sra_epi16(__m128i a, __m128i count)
828{
829  return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
830}
831
832static inline __m128i __attribute__((__always_inline__, __nodebug__))
833_mm_srai_epi32(__m128i a, int count)
834{
835  return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
836}
837
838static inline __m128i __attribute__((__always_inline__, __nodebug__))
839_mm_sra_epi32(__m128i a, __m128i count)
840{
841  return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
842}
843
844static inline __m128i __attribute__((__always_inline__, __nodebug__))
845_mm_srli_si128(__m128i a, int imm)
846{
847  return __builtin_ia32_psrldqi128(a, imm * 8);
848}
849
850static inline __m128i __attribute__((__always_inline__, __nodebug__))
851_mm_srli_epi16(__m128i a, int count)
852{
853  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
854}
855
856static inline __m128i __attribute__((__always_inline__, __nodebug__))
857_mm_srl_epi16(__m128i a, __m128i count)
858{
859  return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
860}
861
862static inline __m128i __attribute__((__always_inline__, __nodebug__))
863_mm_srli_epi32(__m128i a, int count)
864{
865  return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
866}
867
868static inline __m128i __attribute__((__always_inline__, __nodebug__))
869_mm_srl_epi32(__m128i a, __m128i count)
870{
871  return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
872}
873
874static inline __m128i __attribute__((__always_inline__, __nodebug__))
875_mm_srli_epi64(__m128i a, int count)
876{
877  return __builtin_ia32_psrlqi128(a, count);
878}
879
880static inline __m128i __attribute__((__always_inline__, __nodebug__))
881_mm_srl_epi64(__m128i a, __m128i count)
882{
883  return __builtin_ia32_psrlq128(a, count);
884}
885
886static inline __m128i __attribute__((__always_inline__, __nodebug__))
887_mm_cmpeq_epi8(__m128i a, __m128i b)
888{
889  return (__m128i)__builtin_ia32_pcmpeqb128((__v16qi)a, (__v16qi)b);
890}
891
892static inline __m128i __attribute__((__always_inline__, __nodebug__))
893_mm_cmpeq_epi16(__m128i a, __m128i b)
894{
895  return (__m128i)__builtin_ia32_pcmpeqw128((__v8hi)a, (__v8hi)b);
896}
897
898static inline __m128i __attribute__((__always_inline__, __nodebug__))
899_mm_cmpeq_epi32(__m128i a, __m128i b)
900{
901  return (__m128i)__builtin_ia32_pcmpeqd128((__v4si)a, (__v4si)b);
902}
903
904static inline __m128i __attribute__((__always_inline__, __nodebug__))
905_mm_cmpgt_epi8(__m128i a, __m128i b)
906{
907  return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)a, (__v16qi)b);
908}
909
910static inline __m128i __attribute__((__always_inline__, __nodebug__))
911_mm_cmpgt_epi16(__m128i a, __m128i b)
912{
913  return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)a, (__v8hi)b);
914}
915
916static inline __m128i __attribute__((__always_inline__, __nodebug__))
917_mm_cmpgt_epi32(__m128i a, __m128i b)
918{
919  return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)a, (__v4si)b);
920}
921
922static inline __m128i __attribute__((__always_inline__, __nodebug__))
923_mm_cmplt_epi8(__m128i a, __m128i b)
924{
925  return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)b, (__v16qi)a);
926}
927
928static inline __m128i __attribute__((__always_inline__, __nodebug__))
929_mm_cmplt_epi16(__m128i a, __m128i b)
930{
931  return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)b, (__v8hi)a);
932}
933
934static inline __m128i __attribute__((__always_inline__, __nodebug__))
935_mm_cmplt_epi32(__m128i a, __m128i b)
936{
937  return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)b, (__v4si)a);
938}
939
940#ifdef __x86_64__
941static inline __m128d __attribute__((__always_inline__, __nodebug__))
942_mm_cvtsi64_sd(__m128d a, long long b)
943{
944  a[0] = b;
945  return a;
946}
947
948static inline long long __attribute__((__always_inline__, __nodebug__))
949_mm_cvtsd_si64(__m128d a)
950{
951  return __builtin_ia32_cvtsd2si64(a);
952}
953
954static inline long long __attribute__((__always_inline__, __nodebug__))
955_mm_cvttsd_si64(__m128d a)
956{
957  return a[0];
958}
959#endif
960
961static inline __m128 __attribute__((__always_inline__, __nodebug__))
962_mm_cvtepi32_ps(__m128i a)
963{
964  return __builtin_ia32_cvtdq2ps((__v4si)a);
965}
966
967static inline __m128i __attribute__((__always_inline__, __nodebug__))
968_mm_cvtps_epi32(__m128 a)
969{
970  return (__m128i)__builtin_ia32_cvtps2dq(a);
971}
972
973static inline __m128i __attribute__((__always_inline__, __nodebug__))
974_mm_cvttps_epi32(__m128 a)
975{
976  return (__m128i)__builtin_ia32_cvttps2dq(a);
977}
978
979static inline __m128i __attribute__((__always_inline__, __nodebug__))
980_mm_cvtsi32_si128(int a)
981{
982  return (__m128i)(__v4si){ a, 0, 0, 0 };
983}
984
985#ifdef __x86_64__
986static inline __m128i __attribute__((__always_inline__, __nodebug__))
987_mm_cvtsi64_si128(long long a)
988{
989  return (__m128i){ a, 0 };
990}
991#endif
992
993static inline int __attribute__((__always_inline__, __nodebug__))
994_mm_cvtsi128_si32(__m128i a)
995{
996  __v4si b = (__v4si)a;
997  return b[0];
998}
999
1000#ifdef __x86_64__
1001static inline long long __attribute__((__always_inline__, __nodebug__))
1002_mm_cvtsi128_si64(__m128i a)
1003{
1004  return a[0];
1005}
1006#endif
1007
1008static inline __m128i __attribute__((__always_inline__, __nodebug__))
1009_mm_load_si128(__m128i const *p)
1010{
1011  return *p;
1012}
1013
1014static inline __m128i __attribute__((__always_inline__, __nodebug__))
1015_mm_loadu_si128(__m128i const *p)
1016{
1017  return (__m128i)__builtin_ia32_loaddqu((char const *)p);
1018}
1019
1020static inline __m128i __attribute__((__always_inline__, __nodebug__))
1021_mm_loadl_epi64(__m128i const *p)
1022{
1023  return (__m128i) { *(long long*)p, 0};
1024}
1025
1026static inline __m128i __attribute__((__always_inline__, __nodebug__))
1027_mm_set_epi64(__m64 q1, __m64 q0)
1028{
1029  return (__m128i){ (long long)q0, (long long)q1 };
1030}
1031
1032static inline __m128i __attribute__((__always_inline__, __nodebug__))
1033_mm_set_epi32(int i3, int i2, int i1, int i0)
1034{
1035  return (__m128i)(__v4si){ i0, i1, i2, i3};
1036}
1037
1038static inline __m128i __attribute__((__always_inline__, __nodebug__))
1039_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1040{
1041  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1042}
1043
1044static inline __m128i __attribute__((__always_inline__, __nodebug__))
1045_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1046{
1047  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1048}
1049
1050static inline __m128i __attribute__((__always_inline__, __nodebug__))
1051_mm_set1_epi64(__m64 q)
1052{
1053  return (__m128i){ (long long)q, (long long)q };
1054}
1055
1056static inline __m128i __attribute__((__always_inline__, __nodebug__))
1057_mm_set1_epi32(int i)
1058{
1059  return (__m128i)(__v4si){ i, i, i, i };
1060}
1061
1062static inline __m128i __attribute__((__always_inline__, __nodebug__))
1063_mm_set1_epi16(short w)
1064{
1065  return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
1066}
1067
1068static inline __m128i __attribute__((__always_inline__, __nodebug__))
1069_mm_set1_epi8(char b)
1070{
1071  return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1072}
1073
1074static inline __m128i __attribute__((__always_inline__, __nodebug__))
1075_mm_setr_epi64(__m64 q0, __m64 q1)
1076{
1077  return (__m128i){ (long long)q0, (long long)q1 };
1078}
1079
1080static inline __m128i __attribute__((__always_inline__, __nodebug__))
1081_mm_setr_epi32(int i0, int i1, int i2, int i3)
1082{
1083  return (__m128i)(__v4si){ i0, i1, i2, i3};
1084}
1085
1086static inline __m128i __attribute__((__always_inline__, __nodebug__))
1087_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1088{
1089  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1090}
1091
1092static inline __m128i __attribute__((__always_inline__, __nodebug__))
1093_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1094{
1095  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1096}
1097
1098static inline __m128i __attribute__((__always_inline__, __nodebug__))
1099_mm_setzero_si128(void)
1100{
1101  return (__m128i){ 0LL, 0LL };
1102}
1103
1104static inline void __attribute__((__always_inline__, __nodebug__))
1105_mm_store_si128(__m128i *p, __m128i b)
1106{
1107  *p = b;
1108}
1109
1110static inline void __attribute__((__always_inline__, __nodebug__))
1111_mm_storeu_si128(__m128i *p, __m128i b)
1112{
1113  __builtin_ia32_storedqu((char *)p, (__v16qi)b);
1114}
1115
1116static inline void __attribute__((__always_inline__, __nodebug__))
1117_mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
1118{
1119  __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
1120}
1121
1122static inline void __attribute__((__always_inline__, __nodebug__))
1123_mm_storel_epi64(__m128i *p, __m128i a)
1124{
1125  __builtin_ia32_storelv4si((__v2si *)p, a);
1126}
1127
1128static inline void __attribute__((__always_inline__, __nodebug__))
1129_mm_stream_pd(double *p, __m128d a)
1130{
1131  __builtin_ia32_movntpd(p, a);
1132}
1133
1134static inline void __attribute__((__always_inline__, __nodebug__))
1135_mm_stream_si128(__m128i *p, __m128i a)
1136{
1137  __builtin_ia32_movntdq(p, a);
1138}
1139
/*
 * Stores the int a to p through the movnti builtin.
 * NOTE(review): presumably a non-temporal (cache-bypassing) store -- confirm.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  int *dst = p;
  __builtin_ia32_movnti(dst, a);
}
1145
/*
 * Passes the address p to the clflush builtin.
 * NOTE(review): presumably flushes the cache line containing p -- confirm.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  void const *line = p;
  __builtin_ia32_clflush(line);
}
1151
/*
 * Issues the lfence builtin; takes no arguments and returns nothing.
 * NOTE(review): presumably a load fence (LFENCE) -- confirm.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __builtin_ia32_lfence();
}
1157
/*
 * Issues the mfence builtin; takes no arguments and returns nothing.
 * NOTE(review): presumably a full memory fence (MFENCE) -- confirm.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __builtin_ia32_mfence();
}
1163
1164static inline __m128i __attribute__((__always_inline__, __nodebug__))
1165_mm_packs_epi16(__m128i a, __m128i b)
1166{
1167  return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
1168}
1169
1170static inline __m128i __attribute__((__always_inline__, __nodebug__))
1171_mm_packs_epi32(__m128i a, __m128i b)
1172{
1173  return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
1174}
1175
1176static inline __m128i __attribute__((__always_inline__, __nodebug__))
1177_mm_packus_epi16(__m128i a, __m128i b)
1178{
1179  return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
1180}
1181
1182static inline int __attribute__((__always_inline__, __nodebug__))
1183_mm_extract_epi16(__m128i a, int imm)
1184{
1185  __v8hi b = (__v8hi)a;
1186  return b[imm];
1187}
1188
1189static inline __m128i __attribute__((__always_inline__, __nodebug__))
1190_mm_insert_epi16(__m128i a, int b, int imm)
1191{
1192  __v8hi c = (__v8hi)a;
1193  c[imm & 7] = b;
1194  return (__m128i)c;
1195}
1196
1197static inline int __attribute__((__always_inline__, __nodebug__))
1198_mm_movemask_epi8(__m128i a)
1199{
1200  return __builtin_ia32_pmovmskb128((__v16qi)a);
1201}
1202
1203#define _mm_shuffle_epi32(a, imm) \
1204  ((__m128i)__builtin_shufflevector((__v4si)(a), (__v4si) {0}, \
1205                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
1206                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6))
1207#define _mm_shufflelo_epi16(a, imm) \
1208  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) {0}, \
1209                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
1210                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
1211                                    4, 5, 6, 7))
1212#define _mm_shufflehi_epi16(a, imm) \
1213  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) {0}, 0, 1, 2, 3, \
1214                                    4 + ((imm) & 0x3), 4 + ((imm) & 0xc) >> 2, \
1215                                    4 + ((imm) & 0x30) >> 4, \
1216                                    4 + ((imm) & 0xc0) >> 6))
1217
1218static inline __m128i __attribute__((__always_inline__, __nodebug__))
1219_mm_unpackhi_epi8(__m128i a, __m128i b)
1220{
1221  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1222}
1223
1224static inline __m128i __attribute__((__always_inline__, __nodebug__))
1225_mm_unpackhi_epi16(__m128i a, __m128i b)
1226{
1227  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1228}
1229
1230static inline __m128i __attribute__((__always_inline__, __nodebug__))
1231_mm_unpackhi_epi32(__m128i a, __m128i b)
1232{
1233  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
1234}
1235
1236static inline __m128i __attribute__((__always_inline__, __nodebug__))
1237_mm_unpackhi_epi64(__m128i a, __m128i b)
1238{
1239  return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
1240}
1241
1242static inline __m128i __attribute__((__always_inline__, __nodebug__))
1243_mm_unpacklo_epi8(__m128i a, __m128i b)
1244{
1245  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1246}
1247
1248static inline __m128i __attribute__((__always_inline__, __nodebug__))
1249_mm_unpacklo_epi16(__m128i a, __m128i b)
1250{
1251  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1252}
1253
1254static inline __m128i __attribute__((__always_inline__, __nodebug__))
1255_mm_unpacklo_epi32(__m128i a, __m128i b)
1256{
1257  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
1258}
1259
1260static inline __m128i __attribute__((__always_inline__, __nodebug__))
1261_mm_unpacklo_epi64(__m128i a, __m128i b)
1262{
1263  return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
1264}
1265
1266static inline __m64 __attribute__((__always_inline__, __nodebug__))
1267_mm_movepi64_pi64(__m128i a)
1268{
1269  return (__m64)a[0];
1270}
1271
1272static inline __m128i __attribute__((__always_inline__, __nodebug__))
1273_mm_movpi64_pi64(__m64 a)
1274{
1275  return (__m128i){ (long long)a, 0 };
1276}
1277
1278static inline __m128i __attribute__((__always_inline__, __nodebug__))
1279_mm_move_epi64(__m128i a)
1280{
1281  return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
1282}
1283
1284static inline __m128d __attribute__((__always_inline__, __nodebug__))
1285_mm_unpackhi_pd(__m128d a, __m128d b)
1286{
1287  return __builtin_shufflevector(a, b, 1, 2+1);
1288}
1289
1290static inline __m128d __attribute__((__always_inline__, __nodebug__))
1291_mm_unpacklo_pd(__m128d a, __m128d b)
1292{
1293  return __builtin_shufflevector(a, b, 0, 2+0);
1294}
1295
1296static inline int __attribute__((__always_inline__, __nodebug__))
1297_mm_movemask_pd(__m128d a)
1298{
1299  return __builtin_ia32_movmskpd(a);
1300}
1301
1302#define _mm_shuffle_pd(a, b, i) (__builtin_shufflevector((a), (b), (i) & 1, \
1303                                                         (((i) & 2) >> 1) + 2))
1304
1305static inline __m128 __attribute__((__always_inline__, __nodebug__))
1306_mm_castpd_ps(__m128d in)
1307{
1308  return (__m128)in;
1309}
1310
1311static inline __m128i __attribute__((__always_inline__, __nodebug__))
1312_mm_castpd_si128(__m128d in)
1313{
1314  return (__m128i)in;
1315}
1316
1317static inline __m128d __attribute__((__always_inline__, __nodebug__))
1318_mm_castps_pd(__m128 in)
1319{
1320  return (__m128d)in;
1321}
1322
1323static inline __m128i __attribute__((__always_inline__, __nodebug__))
1324_mm_castps_si128(__m128 in)
1325{
1326  return (__m128i)in;
1327}
1328
1329static inline __m128 __attribute__((__always_inline__, __nodebug__))
1330_mm_castsi128_ps(__m128i in)
1331{
1332  return (__m128)in;
1333}
1334
1335static inline __m128d __attribute__((__always_inline__, __nodebug__))
1336_mm_castsi128_pd(__m128i in)
1337{
1338  return (__m128d)in;
1339}
1340
/*
 * Emits a single "pause" instruction through inline assembly.  The asm is
 * marked volatile so the compiler does not drop it.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1346
/* Packs two selector values into one immediate: x goes to bit 1, y to bit 0. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
1348
1349#endif /* __SSE2__ */
1350
1351#endif /* __EMMINTRIN_H */
1352