emmintrin.h revision 205408
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
23
24#ifndef __EMMINTRIN_H
25#define __EMMINTRIN_H
26
27#ifndef __SSE2__
28#error "SSE2 instruction set not enabled"
29#else
30
31#include <xmmintrin.h>
32
/* Public 16-byte vector types: two doubles, and two long longs. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Internal 16-byte lane views: eight shorts, and sixteen chars. */
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
38
39static inline __m128d __attribute__((__always_inline__, __nodebug__))
40_mm_add_sd(__m128d a, __m128d b)
41{
42  a[0] += b[0];
43  return a;
44}
45
46static inline __m128d __attribute__((__always_inline__, __nodebug__))
47_mm_add_pd(__m128d a, __m128d b)
48{
49  return a + b;
50}
51
52static inline __m128d __attribute__((__always_inline__, __nodebug__))
53_mm_sub_sd(__m128d a, __m128d b)
54{
55  a[0] -= b[0];
56  return a;
57}
58
59static inline __m128d __attribute__((__always_inline__, __nodebug__))
60_mm_sub_pd(__m128d a, __m128d b)
61{
62  return a - b;
63}
64
65static inline __m128d __attribute__((__always_inline__, __nodebug__))
66_mm_mul_sd(__m128d a, __m128d b)
67{
68  a[0] *= b[0];
69  return a;
70}
71
72static inline __m128d __attribute__((__always_inline__, __nodebug__))
73_mm_mul_pd(__m128d a, __m128d b)
74{
75  return a * b;
76}
77
78static inline __m128d __attribute__((__always_inline__, __nodebug__))
79_mm_div_sd(__m128d a, __m128d b)
80{
81  a[0] /= b[0];
82  return a;
83}
84
85static inline __m128d __attribute__((__always_inline__, __nodebug__))
86_mm_div_pd(__m128d a, __m128d b)
87{
88  return a / b;
89}
90
91static inline __m128d __attribute__((__always_inline__, __nodebug__))
92_mm_sqrt_sd(__m128d a, __m128d b)
93{
94  __m128d c = __builtin_ia32_sqrtsd(b);
95  return (__m128d) { c[0], a[1] };
96}
97
98static inline __m128d __attribute__((__always_inline__, __nodebug__))
99_mm_sqrt_pd(__m128d a)
100{
101  return __builtin_ia32_sqrtpd(a);
102}
103
104static inline __m128d __attribute__((__always_inline__, __nodebug__))
105_mm_min_sd(__m128d a, __m128d b)
106{
107  return __builtin_ia32_minsd(a, b);
108}
109
110static inline __m128d __attribute__((__always_inline__, __nodebug__))
111_mm_min_pd(__m128d a, __m128d b)
112{
113  return __builtin_ia32_minpd(a, b);
114}
115
116static inline __m128d __attribute__((__always_inline__, __nodebug__))
117_mm_max_sd(__m128d a, __m128d b)
118{
119  return __builtin_ia32_maxsd(a, b);
120}
121
122static inline __m128d __attribute__((__always_inline__, __nodebug__))
123_mm_max_pd(__m128d a, __m128d b)
124{
125  return __builtin_ia32_maxpd(a, b);
126}
127
128static inline __m128d __attribute__((__always_inline__, __nodebug__))
129_mm_and_pd(__m128d a, __m128d b)
130{
131  return (__m128d)((__v4si)a & (__v4si)b);
132}
133
134static inline __m128d __attribute__((__always_inline__, __nodebug__))
135_mm_andnot_pd(__m128d a, __m128d b)
136{
137  return (__m128d)(~(__v4si)a & (__v4si)b);
138}
139
140static inline __m128d __attribute__((__always_inline__, __nodebug__))
141_mm_or_pd(__m128d a, __m128d b)
142{
143  return (__m128d)((__v4si)a | (__v4si)b);
144}
145
146static inline __m128d __attribute__((__always_inline__, __nodebug__))
147_mm_xor_pd(__m128d a, __m128d b)
148{
149  return (__m128d)((__v4si)a ^ (__v4si)b);
150}
151
152static inline __m128d __attribute__((__always_inline__, __nodebug__))
153_mm_cmpeq_pd(__m128d a, __m128d b)
154{
155  return (__m128d)__builtin_ia32_cmppd(a, b, 0);
156}
157
158static inline __m128d __attribute__((__always_inline__, __nodebug__))
159_mm_cmplt_pd(__m128d a, __m128d b)
160{
161  return (__m128d)__builtin_ia32_cmppd(a, b, 1);
162}
163
164static inline __m128d __attribute__((__always_inline__, __nodebug__))
165_mm_cmple_pd(__m128d a, __m128d b)
166{
167  return (__m128d)__builtin_ia32_cmppd(a, b, 2);
168}
169
170static inline __m128d __attribute__((__always_inline__, __nodebug__))
171_mm_cmpgt_pd(__m128d a, __m128d b)
172{
173  return (__m128d)__builtin_ia32_cmppd(b, a, 1);
174}
175
176static inline __m128d __attribute__((__always_inline__, __nodebug__))
177_mm_cmpge_pd(__m128d a, __m128d b)
178{
179  return (__m128d)__builtin_ia32_cmppd(b, a, 2);
180}
181
182static inline __m128d __attribute__((__always_inline__, __nodebug__))
183_mm_cmpord_pd(__m128d a, __m128d b)
184{
185  return (__m128d)__builtin_ia32_cmppd(a, b, 7);
186}
187
188static inline __m128d __attribute__((__always_inline__, __nodebug__))
189_mm_cmpunord_pd(__m128d a, __m128d b)
190{
191  return (__m128d)__builtin_ia32_cmppd(a, b, 3);
192}
193
194static inline __m128d __attribute__((__always_inline__, __nodebug__))
195_mm_cmpneq_pd(__m128d a, __m128d b)
196{
197  return (__m128d)__builtin_ia32_cmppd(a, b, 4);
198}
199
200static inline __m128d __attribute__((__always_inline__, __nodebug__))
201_mm_cmpnlt_pd(__m128d a, __m128d b)
202{
203  return (__m128d)__builtin_ia32_cmppd(a, b, 5);
204}
205
206static inline __m128d __attribute__((__always_inline__, __nodebug__))
207_mm_cmpnle_pd(__m128d a, __m128d b)
208{
209  return (__m128d)__builtin_ia32_cmppd(a, b, 6);
210}
211
212static inline __m128d __attribute__((__always_inline__, __nodebug__))
213_mm_cmpngt_pd(__m128d a, __m128d b)
214{
215  return (__m128d)__builtin_ia32_cmppd(b, a, 5);
216}
217
218static inline __m128d __attribute__((__always_inline__, __nodebug__))
219_mm_cmpnge_pd(__m128d a, __m128d b)
220{
221  return (__m128d)__builtin_ia32_cmppd(b, a, 6);
222}
223
224static inline __m128d __attribute__((__always_inline__, __nodebug__))
225_mm_cmpeq_sd(__m128d a, __m128d b)
226{
227  return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
228}
229
230static inline __m128d __attribute__((__always_inline__, __nodebug__))
231_mm_cmplt_sd(__m128d a, __m128d b)
232{
233  return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
234}
235
236static inline __m128d __attribute__((__always_inline__, __nodebug__))
237_mm_cmple_sd(__m128d a, __m128d b)
238{
239  return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
240}
241
242static inline __m128d __attribute__((__always_inline__, __nodebug__))
243_mm_cmpgt_sd(__m128d a, __m128d b)
244{
245  return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
246}
247
248static inline __m128d __attribute__((__always_inline__, __nodebug__))
249_mm_cmpge_sd(__m128d a, __m128d b)
250{
251  return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
252}
253
254static inline __m128d __attribute__((__always_inline__, __nodebug__))
255_mm_cmpord_sd(__m128d a, __m128d b)
256{
257  return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
258}
259
260static inline __m128d __attribute__((__always_inline__, __nodebug__))
261_mm_cmpunord_sd(__m128d a, __m128d b)
262{
263  return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
264}
265
266static inline __m128d __attribute__((__always_inline__, __nodebug__))
267_mm_cmpneq_sd(__m128d a, __m128d b)
268{
269  return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
270}
271
272static inline __m128d __attribute__((__always_inline__, __nodebug__))
273_mm_cmpnlt_sd(__m128d a, __m128d b)
274{
275  return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
276}
277
278static inline __m128d __attribute__((__always_inline__, __nodebug__))
279_mm_cmpnle_sd(__m128d a, __m128d b)
280{
281  return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
282}
283
284static inline __m128d __attribute__((__always_inline__, __nodebug__))
285_mm_cmpngt_sd(__m128d a, __m128d b)
286{
287  return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
288}
289
290static inline __m128d __attribute__((__always_inline__, __nodebug__))
291_mm_cmpnge_sd(__m128d a, __m128d b)
292{
293  return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
294}
295
296static inline int __attribute__((__always_inline__, __nodebug__))
297_mm_comieq_sd(__m128d a, __m128d b)
298{
299  return __builtin_ia32_comisdeq(a, b);
300}
301
302static inline int __attribute__((__always_inline__, __nodebug__))
303_mm_comilt_sd(__m128d a, __m128d b)
304{
305  return __builtin_ia32_comisdlt(a, b);
306}
307
308static inline int __attribute__((__always_inline__, __nodebug__))
309_mm_comile_sd(__m128d a, __m128d b)
310{
311  return __builtin_ia32_comisdle(a, b);
312}
313
314static inline int __attribute__((__always_inline__, __nodebug__))
315_mm_comigt_sd(__m128d a, __m128d b)
316{
317  return __builtin_ia32_comisdgt(a, b);
318}
319
320static inline int __attribute__((__always_inline__, __nodebug__))
321_mm_comineq_sd(__m128d a, __m128d b)
322{
323  return __builtin_ia32_comisdneq(a, b);
324}
325
326static inline int __attribute__((__always_inline__, __nodebug__))
327_mm_ucomieq_sd(__m128d a, __m128d b)
328{
329  return __builtin_ia32_ucomisdeq(a, b);
330}
331
332static inline int __attribute__((__always_inline__, __nodebug__))
333_mm_ucomilt_sd(__m128d a, __m128d b)
334{
335  return __builtin_ia32_ucomisdlt(a, b);
336}
337
338static inline int __attribute__((__always_inline__, __nodebug__))
339_mm_ucomile_sd(__m128d a, __m128d b)
340{
341  return __builtin_ia32_ucomisdle(a, b);
342}
343
344static inline int __attribute__((__always_inline__, __nodebug__))
345_mm_ucomigt_sd(__m128d a, __m128d b)
346{
347  return __builtin_ia32_ucomisdgt(a, b);
348}
349
350static inline int __attribute__((__always_inline__, __nodebug__))
351_mm_ucomineq_sd(__m128d a, __m128d b)
352{
353  return __builtin_ia32_ucomisdneq(a, b);
354}
355
356static inline __m128 __attribute__((__always_inline__, __nodebug__))
357_mm_cvtpd_ps(__m128d a)
358{
359  return __builtin_ia32_cvtpd2ps(a);
360}
361
362static inline __m128d __attribute__((__always_inline__, __nodebug__))
363_mm_cvtps_pd(__m128 a)
364{
365  return __builtin_ia32_cvtps2pd(a);
366}
367
368static inline __m128d __attribute__((__always_inline__, __nodebug__))
369_mm_cvtepi32_pd(__m128i a)
370{
371  return __builtin_ia32_cvtdq2pd((__v4si)a);
372}
373
374static inline __m128i __attribute__((__always_inline__, __nodebug__))
375_mm_cvtpd_epi32(__m128d a)
376{
377  return __builtin_ia32_cvtpd2dq(a);
378}
379
380static inline int __attribute__((__always_inline__, __nodebug__))
381_mm_cvtsd_si32(__m128d a)
382{
383  return __builtin_ia32_cvtsd2si(a);
384}
385
386static inline __m128 __attribute__((__always_inline__, __nodebug__))
387_mm_cvtsd_ss(__m128 a, __m128d b)
388{
389  a[0] = b[0];
390  return a;
391}
392
393static inline __m128d __attribute__((__always_inline__, __nodebug__))
394_mm_cvtsi32_sd(__m128d a, int b)
395{
396  a[0] = b;
397  return a;
398}
399
400static inline __m128d __attribute__((__always_inline__, __nodebug__))
401_mm_cvtss_sd(__m128d a, __m128 b)
402{
403  a[0] = b[0];
404  return a;
405}
406
407static inline __m128i __attribute__((__always_inline__, __nodebug__))
408_mm_cvttpd_epi32(__m128d a)
409{
410  return (__m128i)__builtin_ia32_cvttpd2dq(a);
411}
412
413static inline int __attribute__((__always_inline__, __nodebug__))
414_mm_cvttsd_si32(__m128d a)
415{
416  return a[0];
417}
418
419static inline __m64 __attribute__((__always_inline__, __nodebug__))
420_mm_cvtpd_pi32(__m128d a)
421{
422  return (__m64)__builtin_ia32_cvtpd2pi(a);
423}
424
425static inline __m64 __attribute__((__always_inline__, __nodebug__))
426_mm_cvttpd_pi32(__m128d a)
427{
428  return (__m64)__builtin_ia32_cvttpd2pi(a);
429}
430
431static inline __m128d __attribute__((__always_inline__, __nodebug__))
432_mm_cvtpi32_pd(__m64 a)
433{
434  return __builtin_ia32_cvtpi2pd((__v2si)a);
435}
436
437static inline double __attribute__((__always_inline__, __nodebug__))
438_mm_cvtsd_f64(__m128d a)
439{
440  return a[0];
441}
442
443static inline __m128d __attribute__((__always_inline__, __nodebug__))
444_mm_load_pd(double const *dp)
445{
446  return *(__m128d*)dp;
447}
448
449static inline __m128d __attribute__((__always_inline__, __nodebug__))
450_mm_load1_pd(double const *dp)
451{
452  return (__m128d){ dp[0], dp[0] };
453}
454
/* Alternate spelling of _mm_load1_pd. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
456
457static inline __m128d __attribute__((__always_inline__, __nodebug__))
458_mm_loadr_pd(double const *dp)
459{
460  return (__m128d){ dp[1], dp[0] };
461}
462
463static inline __m128d __attribute__((__always_inline__, __nodebug__))
464_mm_loadu_pd(double const *dp)
465{
466  return __builtin_ia32_loadupd(dp);
467}
468
469static inline __m128d __attribute__((__always_inline__, __nodebug__))
470_mm_load_sd(double const *dp)
471{
472  return (__m128d){ *dp, 0.0 };
473}
474
475static inline __m128d __attribute__((__always_inline__, __nodebug__))
476_mm_loadh_pd(__m128d a, double const *dp)
477{
478  return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2);
479}
480
481static inline __m128d __attribute__((__always_inline__, __nodebug__))
482_mm_loadl_pd(__m128d a, double const *dp)
483{
484  return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1);
485}
486
487static inline __m128d __attribute__((__always_inline__, __nodebug__))
488_mm_set_sd(double w)
489{
490  return (__m128d){ w, 0 };
491}
492
493static inline __m128d __attribute__((__always_inline__, __nodebug__))
494_mm_set1_pd(double w)
495{
496  return (__m128d){ w, w };
497}
498
499static inline __m128d __attribute__((__always_inline__, __nodebug__))
500_mm_set_pd(double w, double x)
501{
502  return (__m128d){ x, w };
503}
504
505static inline __m128d __attribute__((__always_inline__, __nodebug__))
506_mm_setr_pd(double w, double x)
507{
508  return (__m128d){ w, x };
509}
510
511static inline __m128d __attribute__((__always_inline__, __nodebug__))
512_mm_setzero_pd(void)
513{
514  return (__m128d){ 0, 0 };
515}
516
517static inline __m128d __attribute__((__always_inline__, __nodebug__))
518_mm_move_sd(__m128d a, __m128d b)
519{
520  return (__m128d){ b[0], a[1] };
521}
522
523static inline void __attribute__((__always_inline__, __nodebug__))
524_mm_store_sd(double *dp, __m128d a)
525{
526  dp[0] = a[0];
527}
528
529static inline void __attribute__((__always_inline__, __nodebug__))
530_mm_store1_pd(double *dp, __m128d a)
531{
532  dp[0] = a[0];
533  dp[1] = a[0];
534}
535
536static inline void __attribute__((__always_inline__, __nodebug__))
537_mm_store_pd(double *dp, __m128d a)
538{
539  *(__m128d *)dp = a;
540}
541
542static inline void __attribute__((__always_inline__, __nodebug__))
543_mm_storeu_pd(double *dp, __m128d a)
544{
545  __builtin_ia32_storeupd(dp, a);
546}
547
548static inline void __attribute__((__always_inline__, __nodebug__))
549_mm_storer_pd(double *dp, __m128d a)
550{
551  dp[0] = a[1];
552  dp[1] = a[0];
553}
554
555static inline void __attribute__((__always_inline__, __nodebug__))
556_mm_storeh_pd(double *dp, __m128d a)
557{
558  dp[0] = a[1];
559}
560
561static inline void __attribute__((__always_inline__, __nodebug__))
562_mm_storel_pd(double *dp, __m128d a)
563{
564  dp[0] = a[0];
565}
566
567static inline __m128i __attribute__((__always_inline__, __nodebug__))
568_mm_add_epi8(__m128i a, __m128i b)
569{
570  return (__m128i)((__v16qi)a + (__v16qi)b);
571}
572
573static inline __m128i __attribute__((__always_inline__, __nodebug__))
574_mm_add_epi16(__m128i a, __m128i b)
575{
576  return (__m128i)((__v8hi)a + (__v8hi)b);
577}
578
579static inline __m128i __attribute__((__always_inline__, __nodebug__))
580_mm_add_epi32(__m128i a, __m128i b)
581{
582  return (__m128i)((__v4si)a + (__v4si)b);
583}
584
585static inline __m64 __attribute__((__always_inline__, __nodebug__))
586_mm_add_si64(__m64 a, __m64 b)
587{
588  return a + b;
589}
590
591static inline __m128i __attribute__((__always_inline__, __nodebug__))
592_mm_add_epi64(__m128i a, __m128i b)
593{
594  return a + b;
595}
596
597static inline __m128i __attribute__((__always_inline__, __nodebug__))
598_mm_adds_epi8(__m128i a, __m128i b)
599{
600  return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
601}
602
603static inline __m128i __attribute__((__always_inline__, __nodebug__))
604_mm_adds_epi16(__m128i a, __m128i b)
605{
606  return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
607}
608
609static inline __m128i __attribute__((__always_inline__, __nodebug__))
610_mm_adds_epu8(__m128i a, __m128i b)
611{
612  return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
613}
614
615static inline __m128i __attribute__((__always_inline__, __nodebug__))
616_mm_adds_epu16(__m128i a, __m128i b)
617{
618  return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
619}
620
621static inline __m128i __attribute__((__always_inline__, __nodebug__))
622_mm_avg_epu8(__m128i a, __m128i b)
623{
624  return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
625}
626
627static inline __m128i __attribute__((__always_inline__, __nodebug__))
628_mm_avg_epu16(__m128i a, __m128i b)
629{
630  return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
631}
632
633static inline __m128i __attribute__((__always_inline__, __nodebug__))
634_mm_madd_epi16(__m128i a, __m128i b)
635{
636  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
637}
638
639static inline __m128i __attribute__((__always_inline__, __nodebug__))
640_mm_max_epi16(__m128i a, __m128i b)
641{
642  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
643}
644
645static inline __m128i __attribute__((__always_inline__, __nodebug__))
646_mm_max_epu8(__m128i a, __m128i b)
647{
648  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
649}
650
651static inline __m128i __attribute__((__always_inline__, __nodebug__))
652_mm_min_epi16(__m128i a, __m128i b)
653{
654  return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
655}
656
657static inline __m128i __attribute__((__always_inline__, __nodebug__))
658_mm_min_epu8(__m128i a, __m128i b)
659{
660  return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
661}
662
663static inline __m128i __attribute__((__always_inline__, __nodebug__))
664_mm_mulhi_epi16(__m128i a, __m128i b)
665{
666  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
667}
668
669static inline __m128i __attribute__((__always_inline__, __nodebug__))
670_mm_mulhi_epu16(__m128i a, __m128i b)
671{
672  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
673}
674
675static inline __m128i __attribute__((__always_inline__, __nodebug__))
676_mm_mullo_epi16(__m128i a, __m128i b)
677{
678  return (__m128i)((__v8hi)a * (__v8hi)b);
679}
680
681static inline __m64 __attribute__((__always_inline__, __nodebug__))
682_mm_mul_su32(__m64 a, __m64 b)
683{
684  return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
685}
686
687static inline __m128i __attribute__((__always_inline__, __nodebug__))
688_mm_mul_epu32(__m128i a, __m128i b)
689{
690  return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
691}
692
693static inline __m128i __attribute__((__always_inline__, __nodebug__))
694_mm_sad_epu8(__m128i a, __m128i b)
695{
696  return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
697}
698
699static inline __m128i __attribute__((__always_inline__, __nodebug__))
700_mm_sub_epi8(__m128i a, __m128i b)
701{
702  return (__m128i)((__v16qi)a - (__v16qi)b);
703}
704
705static inline __m128i __attribute__((__always_inline__, __nodebug__))
706_mm_sub_epi16(__m128i a, __m128i b)
707{
708  return (__m128i)((__v8hi)a - (__v8hi)b);
709}
710
711static inline __m128i __attribute__((__always_inline__, __nodebug__))
712_mm_sub_epi32(__m128i a, __m128i b)
713{
714  return (__m128i)((__v4si)a - (__v4si)b);
715}
716
717static inline __m64 __attribute__((__always_inline__, __nodebug__))
718_mm_sub_si64(__m64 a, __m64 b)
719{
720  return a - b;
721}
722
723static inline __m128i __attribute__((__always_inline__, __nodebug__))
724_mm_sub_epi64(__m128i a, __m128i b)
725{
726  return a - b;
727}
728
729static inline __m128i __attribute__((__always_inline__, __nodebug__))
730_mm_subs_epi8(__m128i a, __m128i b)
731{
732  return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
733}
734
735static inline __m128i __attribute__((__always_inline__, __nodebug__))
736_mm_subs_epi16(__m128i a, __m128i b)
737{
738  return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
739}
740
741static inline __m128i __attribute__((__always_inline__, __nodebug__))
742_mm_subs_epu8(__m128i a, __m128i b)
743{
744  return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
745}
746
747static inline __m128i __attribute__((__always_inline__, __nodebug__))
748_mm_subs_epu16(__m128i a, __m128i b)
749{
750  return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
751}
752
753static inline __m128i __attribute__((__always_inline__, __nodebug__))
754_mm_and_si128(__m128i a, __m128i b)
755{
756  return a & b;
757}
758
759static inline __m128i __attribute__((__always_inline__, __nodebug__))
760_mm_andnot_si128(__m128i a, __m128i b)
761{
762  return ~a & b;
763}
764
765static inline __m128i __attribute__((__always_inline__, __nodebug__))
766_mm_or_si128(__m128i a, __m128i b)
767{
768  return a | b;
769}
770
771static inline __m128i __attribute__((__always_inline__, __nodebug__))
772_mm_xor_si128(__m128i a, __m128i b)
773{
774  return a ^ b;
775}
776
777static inline __m128i __attribute__((__always_inline__, __nodebug__))
778_mm_slli_si128(__m128i a, int imm)
779{
780  return __builtin_ia32_pslldqi128(a, imm * 8);
781}
782
783static inline __m128i __attribute__((__always_inline__, __nodebug__))
784_mm_slli_epi16(__m128i a, int count)
785{
786  return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
787}
788
789static inline __m128i __attribute__((__always_inline__, __nodebug__))
790_mm_sll_epi16(__m128i a, __m128i count)
791{
792  return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
793}
794
795static inline __m128i __attribute__((__always_inline__, __nodebug__))
796_mm_slli_epi32(__m128i a, int count)
797{
798  return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
799}
800
801static inline __m128i __attribute__((__always_inline__, __nodebug__))
802_mm_sll_epi32(__m128i a, __m128i count)
803{
804  return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
805}
806
807static inline __m128i __attribute__((__always_inline__, __nodebug__))
808_mm_slli_epi64(__m128i a, int count)
809{
810  return __builtin_ia32_psllqi128(a, count);
811}
812
813static inline __m128i __attribute__((__always_inline__, __nodebug__))
814_mm_sll_epi64(__m128i a, __m128i count)
815{
816  return __builtin_ia32_psllq128(a, count);
817}
818
819static inline __m128i __attribute__((__always_inline__, __nodebug__))
820_mm_srai_epi16(__m128i a, int count)
821{
822  return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
823}
824
825static inline __m128i __attribute__((__always_inline__, __nodebug__))
826_mm_sra_epi16(__m128i a, __m128i count)
827{
828  return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
829}
830
831static inline __m128i __attribute__((__always_inline__, __nodebug__))
832_mm_srai_epi32(__m128i a, int count)
833{
834  return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
835}
836
837static inline __m128i __attribute__((__always_inline__, __nodebug__))
838_mm_sra_epi32(__m128i a, __m128i count)
839{
840  return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
841}
842
843static inline __m128i __attribute__((__always_inline__, __nodebug__))
844_mm_srli_si128(__m128i a, int imm)
845{
846  return __builtin_ia32_psrldqi128(a, imm * 8);
847}
848
849static inline __m128i __attribute__((__always_inline__, __nodebug__))
850_mm_srli_epi16(__m128i a, int count)
851{
852  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
853}
854
855static inline __m128i __attribute__((__always_inline__, __nodebug__))
856_mm_srl_epi16(__m128i a, __m128i count)
857{
858  return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
859}
860
861static inline __m128i __attribute__((__always_inline__, __nodebug__))
862_mm_srli_epi32(__m128i a, int count)
863{
864  return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
865}
866
867static inline __m128i __attribute__((__always_inline__, __nodebug__))
868_mm_srl_epi32(__m128i a, __m128i count)
869{
870  return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
871}
872
873static inline __m128i __attribute__((__always_inline__, __nodebug__))
874_mm_srli_epi64(__m128i a, int count)
875{
876  return __builtin_ia32_psrlqi128(a, count);
877}
878
879static inline __m128i __attribute__((__always_inline__, __nodebug__))
880_mm_srl_epi64(__m128i a, __m128i count)
881{
882  return __builtin_ia32_psrlq128(a, count);
883}
884
885static inline __m128i __attribute__((__always_inline__, __nodebug__))
886_mm_cmpeq_epi8(__m128i a, __m128i b)
887{
888  return (__m128i)((__v16qi)a == (__v16qi)b);
889}
890
891static inline __m128i __attribute__((__always_inline__, __nodebug__))
892_mm_cmpeq_epi16(__m128i a, __m128i b)
893{
894  return (__m128i)((__v8hi)a == (__v8hi)b);
895}
896
897static inline __m128i __attribute__((__always_inline__, __nodebug__))
898_mm_cmpeq_epi32(__m128i a, __m128i b)
899{
900  return (__m128i)((__v4si)a == (__v4si)b);
901}
902
903static inline __m128i __attribute__((__always_inline__, __nodebug__))
904_mm_cmpgt_epi8(__m128i a, __m128i b)
905{
906  return (__m128i)((__v16qi)a > (__v16qi)b);
907}
908
909static inline __m128i __attribute__((__always_inline__, __nodebug__))
910_mm_cmpgt_epi16(__m128i a, __m128i b)
911{
912  return (__m128i)((__v8hi)a > (__v8hi)b);
913}
914
915static inline __m128i __attribute__((__always_inline__, __nodebug__))
916_mm_cmpgt_epi32(__m128i a, __m128i b)
917{
918  return (__m128i)((__v4si)a > (__v4si)b);
919}
920
921static inline __m128i __attribute__((__always_inline__, __nodebug__))
922_mm_cmplt_epi8(__m128i a, __m128i b)
923{
924  return _mm_cmpgt_epi8(b,a);
925}
926
927static inline __m128i __attribute__((__always_inline__, __nodebug__))
928_mm_cmplt_epi16(__m128i a, __m128i b)
929{
930  return _mm_cmpgt_epi16(b,a);
931}
932
933static inline __m128i __attribute__((__always_inline__, __nodebug__))
934_mm_cmplt_epi32(__m128i a, __m128i b)
935{
936  return _mm_cmpgt_epi32(b,a);
937}
938
939#ifdef __x86_64__
940static inline __m128d __attribute__((__always_inline__, __nodebug__))
941_mm_cvtsi64_sd(__m128d a, long long b)
942{
943  a[0] = b;
944  return a;
945}
946
947static inline long long __attribute__((__always_inline__, __nodebug__))
948_mm_cvtsd_si64(__m128d a)
949{
950  return __builtin_ia32_cvtsd2si64(a);
951}
952
953static inline long long __attribute__((__always_inline__, __nodebug__))
954_mm_cvttsd_si64(__m128d a)
955{
956  return a[0];
957}
958#endif
959
960static inline __m128 __attribute__((__always_inline__, __nodebug__))
961_mm_cvtepi32_ps(__m128i a)
962{
963  return __builtin_ia32_cvtdq2ps((__v4si)a);
964}
965
966static inline __m128i __attribute__((__always_inline__, __nodebug__))
967_mm_cvtps_epi32(__m128 a)
968{
969  return (__m128i)__builtin_ia32_cvtps2dq(a);
970}
971
972static inline __m128i __attribute__((__always_inline__, __nodebug__))
973_mm_cvttps_epi32(__m128 a)
974{
975  return (__m128i)__builtin_ia32_cvttps2dq(a);
976}
977
978static inline __m128i __attribute__((__always_inline__, __nodebug__))
979_mm_cvtsi32_si128(int a)
980{
981  return (__m128i)(__v4si){ a, 0, 0, 0 };
982}
983
984#ifdef __x86_64__
985static inline __m128i __attribute__((__always_inline__, __nodebug__))
986_mm_cvtsi64_si128(long long a)
987{
988  return (__m128i){ a, 0 };
989}
990#endif
991
992static inline int __attribute__((__always_inline__, __nodebug__))
993_mm_cvtsi128_si32(__m128i a)
994{
995  __v4si b = (__v4si)a;
996  return b[0];
997}
998
999#ifdef __x86_64__
1000static inline long long __attribute__((__always_inline__, __nodebug__))
1001_mm_cvtsi128_si64(__m128i a)
1002{
1003  return a[0];
1004}
1005#endif
1006
1007static inline __m128i __attribute__((__always_inline__, __nodebug__))
1008_mm_load_si128(__m128i const *p)
1009{
1010  return *p;
1011}
1012
1013static inline __m128i __attribute__((__always_inline__, __nodebug__))
1014_mm_loadu_si128(__m128i const *p)
1015{
1016  return (__m128i)__builtin_ia32_loaddqu((char const *)p);
1017}
1018
1019static inline __m128i __attribute__((__always_inline__, __nodebug__))
1020_mm_loadl_epi64(__m128i const *p)
1021{
1022  return (__m128i) { *(long long*)p, 0};
1023}
1024
1025static inline __m128i __attribute__((__always_inline__, __nodebug__))
1026_mm_set_epi64x(long long q1, long long q0)
1027{
1028  return (__m128i){ q0, q1 };
1029}
1030
1031static inline __m128i __attribute__((__always_inline__, __nodebug__))
1032_mm_set_epi64(__m64 q1, __m64 q0)
1033{
1034  return (__m128i){ (long long)q0, (long long)q1 };
1035}
1036
1037static inline __m128i __attribute__((__always_inline__, __nodebug__))
1038_mm_set_epi32(int i3, int i2, int i1, int i0)
1039{
1040  return (__m128i)(__v4si){ i0, i1, i2, i3};
1041}
1042
1043static inline __m128i __attribute__((__always_inline__, __nodebug__))
1044_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1045{
1046  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1047}
1048
1049static inline __m128i __attribute__((__always_inline__, __nodebug__))
1050_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1051{
1052  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1053}
1054
1055static inline __m128i __attribute__((__always_inline__, __nodebug__))
1056_mm_set1_epi64x(long long q)
1057{
1058  return (__m128i){ q, q };
1059}
1060
1061static inline __m128i __attribute__((__always_inline__, __nodebug__))
1062_mm_set1_epi64(__m64 q)
1063{
1064  return (__m128i){ (long long)q, (long long)q };
1065}
1066
1067static inline __m128i __attribute__((__always_inline__, __nodebug__))
1068_mm_set1_epi32(int i)
1069{
1070  return (__m128i)(__v4si){ i, i, i, i };
1071}
1072
1073static inline __m128i __attribute__((__always_inline__, __nodebug__))
1074_mm_set1_epi16(short w)
1075{
1076  return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
1077}
1078
1079static inline __m128i __attribute__((__always_inline__, __nodebug__))
1080_mm_set1_epi8(char b)
1081{
1082  return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1083}
1084
1085static inline __m128i __attribute__((__always_inline__, __nodebug__))
1086_mm_setr_epi64(__m64 q0, __m64 q1)
1087{
1088  return (__m128i){ (long long)q0, (long long)q1 };
1089}
1090
1091static inline __m128i __attribute__((__always_inline__, __nodebug__))
1092_mm_setr_epi32(int i0, int i1, int i2, int i3)
1093{
1094  return (__m128i)(__v4si){ i0, i1, i2, i3};
1095}
1096
1097static inline __m128i __attribute__((__always_inline__, __nodebug__))
1098_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1099{
1100  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1101}
1102
1103static inline __m128i __attribute__((__always_inline__, __nodebug__))
1104_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1105{
1106  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1107}
1108
1109static inline __m128i __attribute__((__always_inline__, __nodebug__))
1110_mm_setzero_si128(void)
1111{
1112  return (__m128i){ 0LL, 0LL };
1113}
1114
1115static inline void __attribute__((__always_inline__, __nodebug__))
1116_mm_store_si128(__m128i *p, __m128i b)
1117{
1118  *p = b;
1119}
1120
1121static inline void __attribute__((__always_inline__, __nodebug__))
1122_mm_storeu_si128(__m128i *p, __m128i b)
1123{
1124  __builtin_ia32_storedqu((char *)p, (__v16qi)b);
1125}
1126
1127static inline void __attribute__((__always_inline__, __nodebug__))
1128_mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
1129{
1130  __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
1131}
1132
1133static inline void __attribute__((__always_inline__, __nodebug__))
1134_mm_storel_epi64(__m128i *p, __m128i a)
1135{
1136  __builtin_ia32_storelv4si((__v2si *)p, a);
1137}
1138
1139static inline void __attribute__((__always_inline__, __nodebug__))
1140_mm_stream_pd(double *p, __m128d a)
1141{
1142  __builtin_ia32_movntpd(p, a);
1143}
1144
1145static inline void __attribute__((__always_inline__, __nodebug__))
1146_mm_stream_si128(__m128i *p, __m128i a)
1147{
1148  __builtin_ia32_movntdq(p, a);
1149}
1150
/*
 * Writes the int a to *p via __builtin_ia32_movnti.
 * NOTE(review): presumably the non-temporal 32-bit store; confirm.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  int *dst = p;

  __builtin_ia32_movnti(dst, a);
}
1156
/*
 * Passes the address p to __builtin_ia32_clflush.
 * NOTE(review): presumably flushes the cache line containing p; confirm.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  void const *line = p;

  __builtin_ia32_clflush(line);
}
1162
/*
 * Issues __builtin_ia32_lfence; takes no arguments and returns nothing.
 * NOTE(review): presumably the load fence (LFENCE); confirm.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  (void)0; /* no operands: the builtin call below is the whole effect */
  __builtin_ia32_lfence();
}
1168
/*
 * Issues __builtin_ia32_mfence; takes no arguments and returns nothing.
 * NOTE(review): presumably the full memory fence (MFENCE); confirm.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  (void)0; /* no operands: the builtin call below is the whole effect */
  __builtin_ia32_mfence();
}
1174
1175static inline __m128i __attribute__((__always_inline__, __nodebug__))
1176_mm_packs_epi16(__m128i a, __m128i b)
1177{
1178  return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
1179}
1180
1181static inline __m128i __attribute__((__always_inline__, __nodebug__))
1182_mm_packs_epi32(__m128i a, __m128i b)
1183{
1184  return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
1185}
1186
1187static inline __m128i __attribute__((__always_inline__, __nodebug__))
1188_mm_packus_epi16(__m128i a, __m128i b)
1189{
1190  return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
1191}
1192
1193static inline int __attribute__((__always_inline__, __nodebug__))
1194_mm_extract_epi16(__m128i a, int imm)
1195{
1196  __v8hi b = (__v8hi)a;
1197  return b[imm];
1198}
1199
1200static inline __m128i __attribute__((__always_inline__, __nodebug__))
1201_mm_insert_epi16(__m128i a, int b, int imm)
1202{
1203  __v8hi c = (__v8hi)a;
1204  c[imm & 7] = b;
1205  return (__m128i)c;
1206}
1207
1208static inline int __attribute__((__always_inline__, __nodebug__))
1209_mm_movemask_epi8(__m128i a)
1210{
1211  return __builtin_ia32_pmovmskb128((__v16qi)a);
1212}
1213
1214#define _mm_shuffle_epi32(a, imm) \
1215  ((__m128i)__builtin_shufflevector((__v4si)(a), (__v4si) {0}, \
1216                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
1217                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6))
1218#define _mm_shufflelo_epi16(a, imm) \
1219  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) {0}, \
1220                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
1221                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
1222                                    4, 5, 6, 7))
1223#define _mm_shufflehi_epi16(a, imm) \
1224  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) {0}, 0, 1, 2, 3, \
1225                                    4 + ((imm) & 0x3), 4 + ((imm) & 0xc) >> 2, \
1226                                    4 + ((imm) & 0x30) >> 4, \
1227                                    4 + ((imm) & 0xc0) >> 6))
1228
1229static inline __m128i __attribute__((__always_inline__, __nodebug__))
1230_mm_unpackhi_epi8(__m128i a, __m128i b)
1231{
1232  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1233}
1234
1235static inline __m128i __attribute__((__always_inline__, __nodebug__))
1236_mm_unpackhi_epi16(__m128i a, __m128i b)
1237{
1238  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1239}
1240
1241static inline __m128i __attribute__((__always_inline__, __nodebug__))
1242_mm_unpackhi_epi32(__m128i a, __m128i b)
1243{
1244  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
1245}
1246
1247static inline __m128i __attribute__((__always_inline__, __nodebug__))
1248_mm_unpackhi_epi64(__m128i a, __m128i b)
1249{
1250  return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
1251}
1252
1253static inline __m128i __attribute__((__always_inline__, __nodebug__))
1254_mm_unpacklo_epi8(__m128i a, __m128i b)
1255{
1256  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1257}
1258
1259static inline __m128i __attribute__((__always_inline__, __nodebug__))
1260_mm_unpacklo_epi16(__m128i a, __m128i b)
1261{
1262  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1263}
1264
1265static inline __m128i __attribute__((__always_inline__, __nodebug__))
1266_mm_unpacklo_epi32(__m128i a, __m128i b)
1267{
1268  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
1269}
1270
1271static inline __m128i __attribute__((__always_inline__, __nodebug__))
1272_mm_unpacklo_epi64(__m128i a, __m128i b)
1273{
1274  return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
1275}
1276
1277static inline __m64 __attribute__((__always_inline__, __nodebug__))
1278_mm_movepi64_pi64(__m128i a)
1279{
1280  return (__m64)a[0];
1281}
1282
1283static inline __m128i __attribute__((__always_inline__, __nodebug__))
1284_mm_movpi64_pi64(__m64 a)
1285{
1286  return (__m128i){ (long long)a, 0 };
1287}
1288
1289static inline __m128i __attribute__((__always_inline__, __nodebug__))
1290_mm_move_epi64(__m128i a)
1291{
1292  return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
1293}
1294
1295static inline __m128d __attribute__((__always_inline__, __nodebug__))
1296_mm_unpackhi_pd(__m128d a, __m128d b)
1297{
1298  return __builtin_shufflevector(a, b, 1, 2+1);
1299}
1300
1301static inline __m128d __attribute__((__always_inline__, __nodebug__))
1302_mm_unpacklo_pd(__m128d a, __m128d b)
1303{
1304  return __builtin_shufflevector(a, b, 0, 2+0);
1305}
1306
1307static inline int __attribute__((__always_inline__, __nodebug__))
1308_mm_movemask_pd(__m128d a)
1309{
1310  return __builtin_ia32_movmskpd(a);
1311}
1312
1313#define _mm_shuffle_pd(a, b, i) (__builtin_shufflevector((a), (b), (i) & 1, \
1314                                                         (((i) & 2) >> 1) + 2))
1315
1316static inline __m128 __attribute__((__always_inline__, __nodebug__))
1317_mm_castpd_ps(__m128d in)
1318{
1319  return (__m128)in;
1320}
1321
1322static inline __m128i __attribute__((__always_inline__, __nodebug__))
1323_mm_castpd_si128(__m128d in)
1324{
1325  return (__m128i)in;
1326}
1327
1328static inline __m128d __attribute__((__always_inline__, __nodebug__))
1329_mm_castps_pd(__m128 in)
1330{
1331  return (__m128d)in;
1332}
1333
1334static inline __m128i __attribute__((__always_inline__, __nodebug__))
1335_mm_castps_si128(__m128 in)
1336{
1337  return (__m128i)in;
1338}
1339
1340static inline __m128 __attribute__((__always_inline__, __nodebug__))
1341_mm_castsi128_ps(__m128i in)
1342{
1343  return (__m128)in;
1344}
1345
1346static inline __m128d __attribute__((__always_inline__, __nodebug__))
1347_mm_castsi128_pd(__m128i in)
1348{
1349  return (__m128d)in;
1350}
1351
/*
 * Emits a single "pause" instruction through inline assembly. The asm is
 * marked volatile so the compiler does not drop it for having no outputs.
 */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1357
/* Packs two selectors into one immediate: x goes to bit 1, y is OR-ed into bit 0. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
1359
1360#endif /* __SSE2__ */
1361
1362#endif /* __EMMINTRIN_H */
1363