/* emmintrin.h revision 223017 */
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
23254721Semaste
24254721Semaste#ifndef __EMMINTRIN_H
25254721Semaste#define __EMMINTRIN_H
26254721Semaste
27254721Semaste#ifndef __SSE2__
28254721Semaste#error "SSE2 instruction set not enabled"
29254721Semaste#else
30254721Semaste
31254721Semaste#include <xmmintrin.h>
32254721Semaste
/* Public 128-bit vector types: __m128d is declared with double elements and
   __m128i with long long elements. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Element-typed 16-byte vector aliases used for casts inside this header. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
41254721Semaste
42254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
43254721Semaste_mm_add_sd(__m128d a, __m128d b)
44254721Semaste{
45254721Semaste  a[0] += b[0];
46254721Semaste  return a;
47254721Semaste}
48254721Semaste
49254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
50254721Semaste_mm_add_pd(__m128d a, __m128d b)
51254721Semaste{
52254721Semaste  return a + b;
53254721Semaste}
54254721Semaste
55254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
56254721Semaste_mm_sub_sd(__m128d a, __m128d b)
57254721Semaste{
58254721Semaste  a[0] -= b[0];
59254721Semaste  return a;
60254721Semaste}
61254721Semaste
62254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
63254721Semaste_mm_sub_pd(__m128d a, __m128d b)
64254721Semaste{
65254721Semaste  return a - b;
66254721Semaste}
67254721Semaste
68254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
69254721Semaste_mm_mul_sd(__m128d a, __m128d b)
70254721Semaste{
71254721Semaste  a[0] *= b[0];
72254721Semaste  return a;
73254721Semaste}
74254721Semaste
75254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
76254721Semaste_mm_mul_pd(__m128d a, __m128d b)
77254721Semaste{
78254721Semaste  return a * b;
79254721Semaste}
80254721Semaste
81254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
82254721Semaste_mm_div_sd(__m128d a, __m128d b)
83254721Semaste{
84254721Semaste  a[0] /= b[0];
85254721Semaste  return a;
86254721Semaste}
87254721Semaste
88254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
89254721Semaste_mm_div_pd(__m128d a, __m128d b)
90254721Semaste{
91254721Semaste  return a / b;
92254721Semaste}
93254721Semaste
94254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
95254721Semaste_mm_sqrt_sd(__m128d a, __m128d b)
96254721Semaste{
97254721Semaste  __m128d c = __builtin_ia32_sqrtsd(b);
98254721Semaste  return (__m128d) { c[0], a[1] };
99254721Semaste}
100254721Semaste
101254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
102254721Semaste_mm_sqrt_pd(__m128d a)
103254721Semaste{
104254721Semaste  return __builtin_ia32_sqrtpd(a);
105254721Semaste}
106254721Semaste
107254721Semastestatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
108254721Semaste_mm_min_sd(__m128d a, __m128d b)
109254721Semaste{
110254721Semaste  return __builtin_ia32_minsd(a, b);
111254721Semaste}
112
113static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
114_mm_min_pd(__m128d a, __m128d b)
115{
116  return __builtin_ia32_minpd(a, b);
117}
118
119static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
120_mm_max_sd(__m128d a, __m128d b)
121{
122  return __builtin_ia32_maxsd(a, b);
123}
124
125static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
126_mm_max_pd(__m128d a, __m128d b)
127{
128  return __builtin_ia32_maxpd(a, b);
129}
130
131static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
132_mm_and_pd(__m128d a, __m128d b)
133{
134  return (__m128d)((__v4si)a & (__v4si)b);
135}
136
137static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
138_mm_andnot_pd(__m128d a, __m128d b)
139{
140  return (__m128d)(~(__v4si)a & (__v4si)b);
141}
142
143static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
144_mm_or_pd(__m128d a, __m128d b)
145{
146  return (__m128d)((__v4si)a | (__v4si)b);
147}
148
149static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
150_mm_xor_pd(__m128d a, __m128d b)
151{
152  return (__m128d)((__v4si)a ^ (__v4si)b);
153}
154
155static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
156_mm_cmpeq_pd(__m128d a, __m128d b)
157{
158  return (__m128d)__builtin_ia32_cmppd(a, b, 0);
159}
160
161static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
162_mm_cmplt_pd(__m128d a, __m128d b)
163{
164  return (__m128d)__builtin_ia32_cmppd(a, b, 1);
165}
166
167static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
168_mm_cmple_pd(__m128d a, __m128d b)
169{
170  return (__m128d)__builtin_ia32_cmppd(a, b, 2);
171}
172
173static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
174_mm_cmpgt_pd(__m128d a, __m128d b)
175{
176  return (__m128d)__builtin_ia32_cmppd(b, a, 1);
177}
178
179static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
180_mm_cmpge_pd(__m128d a, __m128d b)
181{
182  return (__m128d)__builtin_ia32_cmppd(b, a, 2);
183}
184
185static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
186_mm_cmpord_pd(__m128d a, __m128d b)
187{
188  return (__m128d)__builtin_ia32_cmppd(a, b, 7);
189}
190
191static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
192_mm_cmpunord_pd(__m128d a, __m128d b)
193{
194  return (__m128d)__builtin_ia32_cmppd(a, b, 3);
195}
196
197static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
198_mm_cmpneq_pd(__m128d a, __m128d b)
199{
200  return (__m128d)__builtin_ia32_cmppd(a, b, 4);
201}
202
203static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
204_mm_cmpnlt_pd(__m128d a, __m128d b)
205{
206  return (__m128d)__builtin_ia32_cmppd(a, b, 5);
207}
208
209static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
210_mm_cmpnle_pd(__m128d a, __m128d b)
211{
212  return (__m128d)__builtin_ia32_cmppd(a, b, 6);
213}
214
215static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
216_mm_cmpngt_pd(__m128d a, __m128d b)
217{
218  return (__m128d)__builtin_ia32_cmppd(b, a, 5);
219}
220
221static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
222_mm_cmpnge_pd(__m128d a, __m128d b)
223{
224  return (__m128d)__builtin_ia32_cmppd(b, a, 6);
225}
226
227static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
228_mm_cmpeq_sd(__m128d a, __m128d b)
229{
230  return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
231}
232
233static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
234_mm_cmplt_sd(__m128d a, __m128d b)
235{
236  return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
237}
238
239static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
240_mm_cmple_sd(__m128d a, __m128d b)
241{
242  return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
243}
244
245static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
246_mm_cmpgt_sd(__m128d a, __m128d b)
247{
248  return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
249}
250
251static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
252_mm_cmpge_sd(__m128d a, __m128d b)
253{
254  return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
255}
256
257static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
258_mm_cmpord_sd(__m128d a, __m128d b)
259{
260  return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
261}
262
263static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
264_mm_cmpunord_sd(__m128d a, __m128d b)
265{
266  return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
267}
268
269static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
270_mm_cmpneq_sd(__m128d a, __m128d b)
271{
272  return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
273}
274
275static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
276_mm_cmpnlt_sd(__m128d a, __m128d b)
277{
278  return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
279}
280
281static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
282_mm_cmpnle_sd(__m128d a, __m128d b)
283{
284  return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
285}
286
287static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
288_mm_cmpngt_sd(__m128d a, __m128d b)
289{
290  return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
291}
292
293static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
294_mm_cmpnge_sd(__m128d a, __m128d b)
295{
296  return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
297}
298
299static __inline__ int __attribute__((__always_inline__, __nodebug__))
300_mm_comieq_sd(__m128d a, __m128d b)
301{
302  return __builtin_ia32_comisdeq(a, b);
303}
304
305static __inline__ int __attribute__((__always_inline__, __nodebug__))
306_mm_comilt_sd(__m128d a, __m128d b)
307{
308  return __builtin_ia32_comisdlt(a, b);
309}
310
311static __inline__ int __attribute__((__always_inline__, __nodebug__))
312_mm_comile_sd(__m128d a, __m128d b)
313{
314  return __builtin_ia32_comisdle(a, b);
315}
316
317static __inline__ int __attribute__((__always_inline__, __nodebug__))
318_mm_comigt_sd(__m128d a, __m128d b)
319{
320  return __builtin_ia32_comisdgt(a, b);
321}
322
323static __inline__ int __attribute__((__always_inline__, __nodebug__))
324_mm_comineq_sd(__m128d a, __m128d b)
325{
326  return __builtin_ia32_comisdneq(a, b);
327}
328
329static __inline__ int __attribute__((__always_inline__, __nodebug__))
330_mm_ucomieq_sd(__m128d a, __m128d b)
331{
332  return __builtin_ia32_ucomisdeq(a, b);
333}
334
335static __inline__ int __attribute__((__always_inline__, __nodebug__))
336_mm_ucomilt_sd(__m128d a, __m128d b)
337{
338  return __builtin_ia32_ucomisdlt(a, b);
339}
340
341static __inline__ int __attribute__((__always_inline__, __nodebug__))
342_mm_ucomile_sd(__m128d a, __m128d b)
343{
344  return __builtin_ia32_ucomisdle(a, b);
345}
346
347static __inline__ int __attribute__((__always_inline__, __nodebug__))
348_mm_ucomigt_sd(__m128d a, __m128d b)
349{
350  return __builtin_ia32_ucomisdgt(a, b);
351}
352
353static __inline__ int __attribute__((__always_inline__, __nodebug__))
354_mm_ucomineq_sd(__m128d a, __m128d b)
355{
356  return __builtin_ia32_ucomisdneq(a, b);
357}
358
359static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
360_mm_cvtpd_ps(__m128d a)
361{
362  return __builtin_ia32_cvtpd2ps(a);
363}
364
365static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
366_mm_cvtps_pd(__m128 a)
367{
368  return __builtin_ia32_cvtps2pd(a);
369}
370
371static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
372_mm_cvtepi32_pd(__m128i a)
373{
374  return __builtin_ia32_cvtdq2pd((__v4si)a);
375}
376
377static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
378_mm_cvtpd_epi32(__m128d a)
379{
380  return __builtin_ia32_cvtpd2dq(a);
381}
382
383static __inline__ int __attribute__((__always_inline__, __nodebug__))
384_mm_cvtsd_si32(__m128d a)
385{
386  return __builtin_ia32_cvtsd2si(a);
387}
388
389static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
390_mm_cvtsd_ss(__m128 a, __m128d b)
391{
392  a[0] = b[0];
393  return a;
394}
395
396static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
397_mm_cvtsi32_sd(__m128d a, int b)
398{
399  a[0] = b;
400  return a;
401}
402
403static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
404_mm_cvtss_sd(__m128d a, __m128 b)
405{
406  a[0] = b[0];
407  return a;
408}
409
410static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
411_mm_cvttpd_epi32(__m128d a)
412{
413  return (__m128i)__builtin_ia32_cvttpd2dq(a);
414}
415
416static __inline__ int __attribute__((__always_inline__, __nodebug__))
417_mm_cvttsd_si32(__m128d a)
418{
419  return a[0];
420}
421
422static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
423_mm_cvtpd_pi32(__m128d a)
424{
425  return (__m64)__builtin_ia32_cvtpd2pi(a);
426}
427
428static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
429_mm_cvttpd_pi32(__m128d a)
430{
431  return (__m64)__builtin_ia32_cvttpd2pi(a);
432}
433
434static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
435_mm_cvtpi32_pd(__m64 a)
436{
437  return __builtin_ia32_cvtpi2pd((__v2si)a);
438}
439
440static __inline__ double __attribute__((__always_inline__, __nodebug__))
441_mm_cvtsd_f64(__m128d a)
442{
443  return a[0];
444}
445
446static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
447_mm_load_pd(double const *dp)
448{
449  return *(__m128d*)dp;
450}
451
452static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
453_mm_load1_pd(double const *dp)
454{
455  return (__m128d){ dp[0], dp[0] };
456}
457
/* Alternate spelling that expands to _mm_load1_pd. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
459
460static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
461_mm_loadr_pd(double const *dp)
462{
463  return (__m128d){ dp[1], dp[0] };
464}
465
466static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
467_mm_loadu_pd(double const *dp)
468{
469  struct __loadu_pd {
470    __m128d v;
471  } __attribute__((packed, may_alias));
472  return ((struct __loadu_pd*)dp)->v;
473}
474
475static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
476_mm_load_sd(double const *dp)
477{
478  return (__m128d){ *dp, 0.0 };
479}
480
481static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
482_mm_loadh_pd(__m128d a, double const *dp)
483{
484  return (__m128d){ a[0], *dp };
485}
486
487static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
488_mm_loadl_pd(__m128d a, double const *dp)
489{
490  return (__m128d){ *dp, a[1] };
491}
492
493static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
494_mm_set_sd(double w)
495{
496  return (__m128d){ w, 0 };
497}
498
499static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
500_mm_set1_pd(double w)
501{
502  return (__m128d){ w, w };
503}
504
505static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
506_mm_set_pd(double w, double x)
507{
508  return (__m128d){ x, w };
509}
510
511static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
512_mm_setr_pd(double w, double x)
513{
514  return (__m128d){ w, x };
515}
516
517static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
518_mm_setzero_pd(void)
519{
520  return (__m128d){ 0, 0 };
521}
522
523static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
524_mm_move_sd(__m128d a, __m128d b)
525{
526  return (__m128d){ b[0], a[1] };
527}
528
529static __inline__ void __attribute__((__always_inline__, __nodebug__))
530_mm_store_sd(double *dp, __m128d a)
531{
532  dp[0] = a[0];
533}
534
535static __inline__ void __attribute__((__always_inline__, __nodebug__))
536_mm_store1_pd(double *dp, __m128d a)
537{
538  dp[0] = a[0];
539  dp[1] = a[0];
540}
541
542static __inline__ void __attribute__((__always_inline__, __nodebug__))
543_mm_store_pd(double *dp, __m128d a)
544{
545  *(__m128d *)dp = a;
546}
547
548static __inline__ void __attribute__((__always_inline__, __nodebug__))
549_mm_storeu_pd(double *dp, __m128d a)
550{
551  __builtin_ia32_storeupd(dp, a);
552}
553
554static __inline__ void __attribute__((__always_inline__, __nodebug__))
555_mm_storer_pd(double *dp, __m128d a)
556{
557  dp[0] = a[1];
558  dp[1] = a[0];
559}
560
561static __inline__ void __attribute__((__always_inline__, __nodebug__))
562_mm_storeh_pd(double *dp, __m128d a)
563{
564  dp[0] = a[1];
565}
566
567static __inline__ void __attribute__((__always_inline__, __nodebug__))
568_mm_storel_pd(double *dp, __m128d a)
569{
570  dp[0] = a[0];
571}
572
573static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
574_mm_add_epi8(__m128i a, __m128i b)
575{
576  return (__m128i)((__v16qi)a + (__v16qi)b);
577}
578
579static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
580_mm_add_epi16(__m128i a, __m128i b)
581{
582  return (__m128i)((__v8hi)a + (__v8hi)b);
583}
584
585static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
586_mm_add_epi32(__m128i a, __m128i b)
587{
588  return (__m128i)((__v4si)a + (__v4si)b);
589}
590
591static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
592_mm_add_si64(__m64 a, __m64 b)
593{
594  return a + b;
595}
596
597static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
598_mm_add_epi64(__m128i a, __m128i b)
599{
600  return a + b;
601}
602
603static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
604_mm_adds_epi8(__m128i a, __m128i b)
605{
606  return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
607}
608
609static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
610_mm_adds_epi16(__m128i a, __m128i b)
611{
612  return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
613}
614
615static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
616_mm_adds_epu8(__m128i a, __m128i b)
617{
618  return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
619}
620
621static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
622_mm_adds_epu16(__m128i a, __m128i b)
623{
624  return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
625}
626
627static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
628_mm_avg_epu8(__m128i a, __m128i b)
629{
630  return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
631}
632
633static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
634_mm_avg_epu16(__m128i a, __m128i b)
635{
636  return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
637}
638
639static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
640_mm_madd_epi16(__m128i a, __m128i b)
641{
642  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
643}
644
645static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
646_mm_max_epi16(__m128i a, __m128i b)
647{
648  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
649}
650
651static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
652_mm_max_epu8(__m128i a, __m128i b)
653{
654  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
655}
656
657static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
658_mm_min_epi16(__m128i a, __m128i b)
659{
660  return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
661}
662
663static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
664_mm_min_epu8(__m128i a, __m128i b)
665{
666  return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
667}
668
669static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
670_mm_mulhi_epi16(__m128i a, __m128i b)
671{
672  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
673}
674
675static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
676_mm_mulhi_epu16(__m128i a, __m128i b)
677{
678  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
679}
680
681static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
682_mm_mullo_epi16(__m128i a, __m128i b)
683{
684  return (__m128i)((__v8hi)a * (__v8hi)b);
685}
686
687static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
688_mm_mul_su32(__m64 a, __m64 b)
689{
690  return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
691}
692
693static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
694_mm_mul_epu32(__m128i a, __m128i b)
695{
696  return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
697}
698
699static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
700_mm_sad_epu8(__m128i a, __m128i b)
701{
702  return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
703}
704
705static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
706_mm_sub_epi8(__m128i a, __m128i b)
707{
708  return (__m128i)((__v16qi)a - (__v16qi)b);
709}
710
711static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
712_mm_sub_epi16(__m128i a, __m128i b)
713{
714  return (__m128i)((__v8hi)a - (__v8hi)b);
715}
716
717static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
718_mm_sub_epi32(__m128i a, __m128i b)
719{
720  return (__m128i)((__v4si)a - (__v4si)b);
721}
722
723static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
724_mm_sub_si64(__m64 a, __m64 b)
725{
726  return a - b;
727}
728
729static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
730_mm_sub_epi64(__m128i a, __m128i b)
731{
732  return a - b;
733}
734
735static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
736_mm_subs_epi8(__m128i a, __m128i b)
737{
738  return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
739}
740
741static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
742_mm_subs_epi16(__m128i a, __m128i b)
743{
744  return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
745}
746
747static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
748_mm_subs_epu8(__m128i a, __m128i b)
749{
750  return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
751}
752
753static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
754_mm_subs_epu16(__m128i a, __m128i b)
755{
756  return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
757}
758
759static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
760_mm_and_si128(__m128i a, __m128i b)
761{
762  return a & b;
763}
764
765static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
766_mm_andnot_si128(__m128i a, __m128i b)
767{
768  return ~a & b;
769}
770
771static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
772_mm_or_si128(__m128i a, __m128i b)
773{
774  return a | b;
775}
776
777static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
778_mm_xor_si128(__m128i a, __m128i b)
779{
780  return a ^ b;
781}
782
/* Expands to __builtin_ia32_pslldqi128 on VEC with a count of IMM * 8: the
   macro argument is multiplied by 8 before it is handed to the builtin. */
#define _mm_slli_si128(VEC, IMM)                                              \
  ((__m128i)__builtin_ia32_pslldqi128((__m128i)(VEC), (IMM)*8))
785
786static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
787_mm_slli_epi16(__m128i a, int count)
788{
789  return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
790}
791
792static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
793_mm_sll_epi16(__m128i a, __m128i count)
794{
795  return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
796}
797
798static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
799_mm_slli_epi32(__m128i a, int count)
800{
801  return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
802}
803
804static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
805_mm_sll_epi32(__m128i a, __m128i count)
806{
807  return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
808}
809
810static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
811_mm_slli_epi64(__m128i a, int count)
812{
813  return __builtin_ia32_psllqi128(a, count);
814}
815
816static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
817_mm_sll_epi64(__m128i a, __m128i count)
818{
819  return __builtin_ia32_psllq128(a, count);
820}
821
822static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
823_mm_srai_epi16(__m128i a, int count)
824{
825  return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
826}
827
828static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
829_mm_sra_epi16(__m128i a, __m128i count)
830{
831  return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
832}
833
834static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
835_mm_srai_epi32(__m128i a, int count)
836{
837  return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
838}
839
840static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
841_mm_sra_epi32(__m128i a, __m128i count)
842{
843  return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
844}
845
846
/* Expands to __builtin_ia32_psrldqi128 on VEC with a count of IMM * 8: the
   macro argument is multiplied by 8 before it is handed to the builtin. */
#define _mm_srli_si128(VEC, IMM)                                              \
  ((__m128i)__builtin_ia32_psrldqi128((__m128i)(VEC), (IMM)*8))
849
850static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
851_mm_srli_epi16(__m128i a, int count)
852{
853  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
854}
855
856static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
857_mm_srl_epi16(__m128i a, __m128i count)
858{
859  return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
860}
861
862static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
863_mm_srli_epi32(__m128i a, int count)
864{
865  return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
866}
867
868static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
869_mm_srl_epi32(__m128i a, __m128i count)
870{
871  return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
872}
873
874static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
875_mm_srli_epi64(__m128i a, int count)
876{
877  return __builtin_ia32_psrlqi128(a, count);
878}
879
880static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
881_mm_srl_epi64(__m128i a, __m128i count)
882{
883  return __builtin_ia32_psrlq128(a, count);
884}
885
886static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
887_mm_cmpeq_epi8(__m128i a, __m128i b)
888{
889  return (__m128i)((__v16qi)a == (__v16qi)b);
890}
891
892static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
893_mm_cmpeq_epi16(__m128i a, __m128i b)
894{
895  return (__m128i)((__v8hi)a == (__v8hi)b);
896}
897
898static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
899_mm_cmpeq_epi32(__m128i a, __m128i b)
900{
901  return (__m128i)((__v4si)a == (__v4si)b);
902}
903
904static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
905_mm_cmpgt_epi8(__m128i a, __m128i b)
906{
907  return (__m128i)((__v16qi)a > (__v16qi)b);
908}
909
910static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
911_mm_cmpgt_epi16(__m128i a, __m128i b)
912{
913  return (__m128i)((__v8hi)a > (__v8hi)b);
914}
915
916static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
917_mm_cmpgt_epi32(__m128i a, __m128i b)
918{
919  return (__m128i)((__v4si)a > (__v4si)b);
920}
921
922static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
923_mm_cmplt_epi8(__m128i a, __m128i b)
924{
925  return _mm_cmpgt_epi8(b,a);
926}
927
928static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
929_mm_cmplt_epi16(__m128i a, __m128i b)
930{
931  return _mm_cmpgt_epi16(b,a);
932}
933
934static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
935_mm_cmplt_epi32(__m128i a, __m128i b)
936{
937  return _mm_cmpgt_epi32(b,a);
938}
939
940#ifdef __x86_64__
941static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
942_mm_cvtsi64_sd(__m128d a, long long b)
943{
944  a[0] = b;
945  return a;
946}
947
948static __inline__ long long __attribute__((__always_inline__, __nodebug__))
949_mm_cvtsd_si64(__m128d a)
950{
951  return __builtin_ia32_cvtsd2si64(a);
952}
953
954static __inline__ long long __attribute__((__always_inline__, __nodebug__))
955_mm_cvttsd_si64(__m128d a)
956{
957  return a[0];
958}
959#endif
960
961static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
962_mm_cvtepi32_ps(__m128i a)
963{
964  return __builtin_ia32_cvtdq2ps((__v4si)a);
965}
966
967static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
968_mm_cvtps_epi32(__m128 a)
969{
970  return (__m128i)__builtin_ia32_cvtps2dq(a);
971}
972
973static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
974_mm_cvttps_epi32(__m128 a)
975{
976  return (__m128i)__builtin_ia32_cvttps2dq(a);
977}
978
979static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
980_mm_cvtsi32_si128(int a)
981{
982  return (__m128i)(__v4si){ a, 0, 0, 0 };
983}
984
985#ifdef __x86_64__
986static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
987_mm_cvtsi64_si128(long long a)
988{
989  return (__m128i){ a, 0 };
990}
991#endif
992
993static __inline__ int __attribute__((__always_inline__, __nodebug__))
994_mm_cvtsi128_si32(__m128i a)
995{
996  __v4si b = (__v4si)a;
997  return b[0];
998}
999
1000#ifdef __x86_64__
1001static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1002_mm_cvtsi128_si64(__m128i a)
1003{
1004  return a[0];
1005}
1006#endif
1007
1008static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1009_mm_load_si128(__m128i const *p)
1010{
1011  return *p;
1012}
1013
1014static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1015_mm_loadu_si128(__m128i const *p)
1016{
1017  struct __loadu_si128 {
1018    __m128i v;
1019  } __attribute__((packed, may_alias));
1020  return ((struct __loadu_si128*)p)->v;
1021}
1022
1023static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1024_mm_loadl_epi64(__m128i const *p)
1025{
1026  return (__m128i) { *(long long*)p, 0};
1027}
1028
1029static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1030_mm_set_epi64x(long long q1, long long q0)
1031{
1032  return (__m128i){ q0, q1 };
1033}
1034
1035static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1036_mm_set_epi64(__m64 q1, __m64 q0)
1037{
1038  return (__m128i){ (long long)q0, (long long)q1 };
1039}
1040
1041static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1042_mm_set_epi32(int i3, int i2, int i1, int i0)
1043{
1044  return (__m128i)(__v4si){ i0, i1, i2, i3};
1045}
1046
1047static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1048_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1049{
1050  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1051}
1052
1053static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1054_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1055{
1056  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1057}
1058
1059static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1060_mm_set1_epi64x(long long q)
1061{
1062  return (__m128i){ q, q };
1063}
1064
1065static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1066_mm_set1_epi64(__m64 q)
1067{
1068  return (__m128i){ (long long)q, (long long)q };
1069}
1070
1071static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1072_mm_set1_epi32(int i)
1073{
1074  return (__m128i)(__v4si){ i, i, i, i };
1075}
1076
1077static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1078_mm_set1_epi16(short w)
1079{
1080  return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
1081}
1082
1083static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1084_mm_set1_epi8(char b)
1085{
1086  return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1087}
1088
1089static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1090_mm_setr_epi64(__m64 q0, __m64 q1)
1091{
1092  return (__m128i){ (long long)q0, (long long)q1 };
1093}
1094
1095static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1096_mm_setr_epi32(int i0, int i1, int i2, int i3)
1097{
1098  return (__m128i)(__v4si){ i0, i1, i2, i3};
1099}
1100
1101static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1102_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1103{
1104  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1105}
1106
1107static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1108_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1109{
1110  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1111}
1112
1113static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1114_mm_setzero_si128(void)
1115{
1116  return (__m128i){ 0LL, 0LL };
1117}
1118
1119static __inline__ void __attribute__((__always_inline__, __nodebug__))
1120_mm_store_si128(__m128i *p, __m128i b)
1121{
1122  *p = b;
1123}
1124
1125static __inline__ void __attribute__((__always_inline__, __nodebug__))
1126_mm_storeu_si128(__m128i *p, __m128i b)
1127{
1128  __builtin_ia32_storedqu((char *)p, (__v16qi)b);
1129}
1130
1131static __inline__ void __attribute__((__always_inline__, __nodebug__))
1132_mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
1133{
1134  __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
1135}
1136
1137static __inline__ void __attribute__((__always_inline__, __nodebug__))
1138_mm_storel_epi64(__m128i *p, __m128i a)
1139{
1140  __builtin_ia32_storelv4si((__v2si *)p, a);
1141}
1142
1143static __inline__ void __attribute__((__always_inline__, __nodebug__))
1144_mm_stream_pd(double *p, __m128d a)
1145{
1146  __builtin_ia32_movntpd(p, a);
1147}
1148
1149static __inline__ void __attribute__((__always_inline__, __nodebug__))
1150_mm_stream_si128(__m128i *p, __m128i a)
1151{
1152  __builtin_ia32_movntdq(p, a);
1153}
1154
/* Forwards p and a unchanged to the movnti builtin, which writes the int a
 * to *p.
 * NOTE(review): presumably the write is a non-temporal store (MOVNTI) --
 * confirm against the ISA reference. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  int *dst = p;
  __builtin_ia32_movnti(dst, a);
}
1160
/* Forwards p unchanged to the clflush builtin.
 * NOTE(review): presumably flushes the cache line containing p (CLFLUSH) --
 * confirm against the ISA reference. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  void const *line = p;
  __builtin_ia32_clflush(line);
}
1166
/* Invokes the lfence builtin; takes no arguments and returns nothing.
 * NOTE(review): presumably emits the LFENCE load-fence instruction --
 * confirm ordering guarantees against the ISA reference. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __builtin_ia32_lfence();
  return;
}
1172
/* Invokes the mfence builtin; takes no arguments and returns nothing.
 * NOTE(review): presumably emits the MFENCE full memory fence instruction --
 * confirm ordering guarantees against the ISA reference. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __builtin_ia32_mfence();
  return;
}
1178
1179static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1180_mm_packs_epi16(__m128i a, __m128i b)
1181{
1182  return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
1183}
1184
1185static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1186_mm_packs_epi32(__m128i a, __m128i b)
1187{
1188  return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
1189}
1190
1191static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1192_mm_packus_epi16(__m128i a, __m128i b)
1193{
1194  return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
1195}
1196
1197static __inline__ int __attribute__((__always_inline__, __nodebug__))
1198_mm_extract_epi16(__m128i a, int imm)
1199{
1200  __v8hi b = (__v8hi)a;
1201  return (unsigned short)b[imm];
1202}
1203
1204static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1205_mm_insert_epi16(__m128i a, int b, int imm)
1206{
1207  __v8hi c = (__v8hi)a;
1208  c[imm & 7] = b;
1209  return (__m128i)c;
1210}
1211
1212static __inline__ int __attribute__((__always_inline__, __nodebug__))
1213_mm_movemask_epi8(__m128i a)
1214{
1215  return __builtin_ia32_pmovmskb128((__v16qi)a);
1216}
1217
/* Shuffles the four __v4si elements of a: result element k is the element of
 * a whose index is held in bits [2k+1:2k] of imm.  The second shuffle operand
 * is an all-zero vector that is never selected, because every index computed
 * here lies in 0..3. */
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_shufflevector((__v4si)(a), \
                                    (__v4si) _mm_set1_epi32(0), \
                                    ((imm) & 0x03) >> 0, \
                                    ((imm) & 0x0c) >> 2, \
                                    ((imm) & 0x30) >> 4, \
                                    ((imm) & 0xc0) >> 6))
1222
1223
/* Shuffles elements 0..3 of a viewed as __v8hi: result element k (k in 0..3)
 * is the element of a whose index is held in bits [2k+1:2k] of imm.
 * Elements 4..7 are copied through unchanged.  The second shuffle operand is
 * an all-zero vector that is never selected (all indices are below 8). */
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_shufflevector((__v8hi)(a), \
                                    (__v8hi) _mm_set1_epi16(0), \
                                    ((imm) & 0x03) >> 0, \
                                    ((imm) & 0x0c) >> 2, \
                                    ((imm) & 0x30) >> 4, \
                                    ((imm) & 0xc0) >> 6, \
                                    4, 5, 6, 7))
/* Shuffles elements 4..7 of a viewed as __v8hi: result element 4+k (k in
 * 0..3) is element 4+j of a, where j is held in bits [2k+1:2k] of imm.
 * Elements 0..3 are copied through unchanged.  The second shuffle operand is
 * an all-zero vector that is never selected (all indices are below 8). */
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_shufflevector((__v8hi)(a), \
                                    (__v8hi) _mm_set1_epi16(0), \
                                    0, 1, 2, 3, \
                                    4 + ((imm) & 0x03), \
                                    4 + (((imm) & 0x0c) >> 2), \
                                    4 + (((imm) & 0x30) >> 4), \
                                    4 + (((imm) & 0xc0) >> 6)))
1235
1236static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1237_mm_unpackhi_epi8(__m128i a, __m128i b)
1238{
1239  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1240}
1241
1242static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1243_mm_unpackhi_epi16(__m128i a, __m128i b)
1244{
1245  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1246}
1247
1248static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1249_mm_unpackhi_epi32(__m128i a, __m128i b)
1250{
1251  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
1252}
1253
1254static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1255_mm_unpackhi_epi64(__m128i a, __m128i b)
1256{
1257  return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
1258}
1259
1260static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1261_mm_unpacklo_epi8(__m128i a, __m128i b)
1262{
1263  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1264}
1265
1266static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1267_mm_unpacklo_epi16(__m128i a, __m128i b)
1268{
1269  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1270}
1271
1272static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1273_mm_unpacklo_epi32(__m128i a, __m128i b)
1274{
1275  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
1276}
1277
1278static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1279_mm_unpacklo_epi64(__m128i a, __m128i b)
1280{
1281  return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
1282}
1283
1284static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
1285_mm_movepi64_pi64(__m128i a)
1286{
1287  return (__m64)a[0];
1288}
1289
1290static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1291_mm_movpi64_pi64(__m64 a)
1292{
1293  return (__m128i){ (long long)a, 0 };
1294}
1295
1296static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1297_mm_move_epi64(__m128i a)
1298{
1299  return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
1300}
1301
1302static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1303_mm_unpackhi_pd(__m128d a, __m128d b)
1304{
1305  return __builtin_shufflevector(a, b, 1, 2+1);
1306}
1307
1308static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1309_mm_unpacklo_pd(__m128d a, __m128d b)
1310{
1311  return __builtin_shufflevector(a, b, 0, 2+0);
1312}
1313
1314static __inline__ int __attribute__((__always_inline__, __nodebug__))
1315_mm_movemask_pd(__m128d a)
1316{
1317  return __builtin_ia32_movmskpd(a);
1318}
1319
/* Builds a two-element double vector: element 0 is a[i & 1], and element 1
 * is b[(i >> 1) & 1] (shuffle indices 2 and 3 address the elements of b). */
#define _mm_shuffle_pd(a, b, i) \
  (__builtin_shufflevector((__m128d)(a), (__m128d)(b), \
                           (i) & 1, \
                           2 + (((i) & 2) >> 1)))
1323
1324static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1325_mm_castpd_ps(__m128d in)
1326{
1327  return (__m128)in;
1328}
1329
1330static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1331_mm_castpd_si128(__m128d in)
1332{
1333  return (__m128i)in;
1334}
1335
1336static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1337_mm_castps_pd(__m128 in)
1338{
1339  return (__m128d)in;
1340}
1341
1342static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1343_mm_castps_si128(__m128 in)
1344{
1345  return (__m128i)in;
1346}
1347
1348static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1349_mm_castsi128_ps(__m128i in)
1350{
1351  return (__m128)in;
1352}
1353
1354static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1355_mm_castsi128_pd(__m128i in)
1356{
1357  return (__m128d)in;
1358}
1359
/* Emits a single `pause` instruction through a volatile inline-asm
 * statement with no operands or clobbers. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__("pause");
}
1365
/* Combines two selectors into one value: x shifted left by one bit, OR-ed
 * with y. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
1367
1368#endif /* __SSE2__ */
1369
1370#endif /* __EMMINTRIN_H */
1371