/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#ifndef __SSE__
#error "SSE instruction set not enabled"
#else

#include <mmintrin.h>

typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

// This header should only be included in a hosted environment as it depends on
// a standard library to provide allocation routines.
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif
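
/* Example (illustrative, not part of the original header): <mm_malloc.h>
 * supplies _mm_malloc/_mm_free for allocations aligned for __m128 access:
 *
 *   float *buf = _mm_malloc(4 * sizeof(float), 16);
 *   if (buf) {
 *     __m128 v = _mm_load_ps(buf);   // 16-byte alignment makes this valid
 *     _mm_free(buf);
 *   }
 */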

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b)
{
  a[0] += b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 a, __m128 b)
{
  return a + b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b)
{
  a[0] -= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 a, __m128 b)
{
  return a - b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b)
{
  a[0] *= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 a, __m128 b)
{
  return a * b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b)
{
  a[0] /= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 a, __m128 b)
{
  return a / b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 a)
{
  return __builtin_ia32_sqrtss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 a)
{
  return __builtin_ia32_sqrtps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 a)
{
  return __builtin_ia32_rcpss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 a)
{
  return __builtin_ia32_rcpps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 a)
{
  return __builtin_ia32_rsqrtss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 a)
{
  return __builtin_ia32_rsqrtps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_minss(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_minps(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_maxss(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_maxps(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a & (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b)
{
  return (__m128)(~(__v4si)a & (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a | (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a ^ (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 2);
}

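/* SSE encodes no greater-than compare predicates, so the gt/ge (and ngt/nge)
 * intrinsics below swap their operands and use the lt/le (nlt/nle) predicates
 * instead. For the scalar forms, a shuffle then merges the compare result
 * into the low element of a, preserving a's upper three elements. */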
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_shufflevector(a,
                                         __builtin_ia32_cmpss(b, a, 1),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_shufflevector(a,
                                         __builtin_ia32_cmpss(b, a, 2),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_shufflevector(a,
                                         __builtin_ia32_cmpss(b, a, 5),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_shufflevector(a,
                                         __builtin_ia32_cmpss(b, a, 6),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 3);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comineq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomineq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 a)
{
  return __builtin_ia32_cvtss2si(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ss2si(__m128 a)
{
  return _mm_cvtss_si32(a);
}

#ifdef __x86_64__

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a)
{
  return __builtin_ia32_cvtss2si64(a);
}

#endif

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvtps2pi(a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ps2pi(__m128 a)
{
  return _mm_cvtps_pi32(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a)
{
  return a[0];
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ss2si(__m128 a)
{
  return _mm_cvttss_si32(a);
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a)
{
  return a[0];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvttps2pi(a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ps2pi(__m128 a)
{
  return _mm_cvttps_pi32(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b)
{
  a[0] = b;
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_si2ss(__m128 a, int b)
{
  return _mm_cvtsi32_ss(a, b);
}

#ifdef __x86_64__

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
  a[0] = b;
  return a;
}

#endif

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 a, __m64 b)
{
  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_pi2ps(__m128 a, __m64 b)
{
  return _mm_cvtpi32_ps(a, b);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 a)
{
  return a[0];
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, const __m64 *p)
{
  __m128 b;
  b[0] = *(float*)p;
  b[1] = *((float*)p+1);
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, const __m64 *p)
{
  __m128 b;
  b[0] = *(float*)p;
  b[1] = *((float*)p+1);
  return __builtin_shufflevector(a, b, 4, 5, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(const float *p)
{
  return (__m128){ *p, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(const float *p)
{
  return (__m128){ *p, *p, *p, *p };
}

#define _mm_load_ps1(p) _mm_load1_ps(p)

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(const float *p)
{
  return *(__m128*)p;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *p)
{
  return (__m128){ p[0], p[1], p[2], p[3] };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(const float *p)
{
  __m128 a = _mm_load_ps(p);
  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float w)
{
  return (__m128){ w, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float w)
{
  return (__m128){ w, w, w, w };
}

// Microsoft specific.
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float w)
{
  return _mm_set1_ps(w);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float z, float y, float x, float w)
{
  return (__m128){ w, x, y, z };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float z, float y, float x, float w)
{
  return (__m128){ z, y, x, w };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setzero_ps(void)
{
  return (__m128){ 0, 0, 0, 0 };
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storehps((__v2si *)p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storelps((__v2si *)p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ss(float *p, __m128 a)
{
  *p = a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *p, __m128 a)
{
  __builtin_ia32_storeups(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
  _mm_storeu_ps(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps1(float *p, __m128 a)
{
  return _mm_store1_ps(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *p, __m128 a)
{
  *(__m128 *)p = a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
  _mm_store_ps(p, a);
}

#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
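
/* Example (illustrative): prefetch the cache line containing p into all
 * cache levels; the hint argument must be a compile-time constant:
 *
 *   _mm_prefetch((char *)p, _MM_HINT_T0);
 */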

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pi(__m64 *p, __m64 a)
{
  __builtin_ia32_movntq(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_ps(float *p, __m128 a)
{
  __builtin_ia32_movntps(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 a, int n)
{
  __v4hi b = (__v4hi)a;
  return (unsigned short)b[n & 3];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_insert_pi16(__m64 a, int d, int n)
{
  __v4hi b = (__v4hi)a;
  b[n & 3] = d;
  return (__m64)b;
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pi8(__m64 a)
{
  return __builtin_ia32_pmovmskb((__v8qi)a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
}

#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((a), (n)))

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmove_si64(__m64 d, __m64 n, char *p)
{
  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sad_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  return __builtin_ia32_stmxcsr();
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
  __builtin_ia32_ldmxcsr(i);
}

#define _mm_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector((__v4sf)(a), (__v4sf)(b), \
                                 (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                 (((mask) & 0x30) >> 4) + 4, \
                                 (((mask) & 0xc0) >> 6) + 4))

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi16(b, a);
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi8(b, a);
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
  __m128 c;

  c = _mm_setzero_ps();
  c = _mm_cvtpi32_ps(c, b);
  c = _mm_movelh_ps(c, c);

  return _mm_cvtpi32_ps(c, a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi16(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi32(a);
  a = _mm_movehl_ps(a, a);
  c = _mm_cvtps_pi32(a);

  return _mm_packs_pi32(b, c);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi16(a);
  c = _mm_setzero_si64();

  return _mm_packs_pi16(b, c);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 a)
{
  return __builtin_ia32_movmskps(a);
}

#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
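
/* Example (illustrative): _MM_SHUFFLE(z, y, x, w) builds the shuffle
 * immediate selecting element w for result position 0, x for 1, y for 2,
 * and z for 3. Reversing a vector v:
 *
 *   __m128 r = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3));
 *   // r = { v[3], v[2], v[1], v[0] }
 */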

#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)

#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
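
/* Example (illustrative): the helpers above read-modify-write MXCSR, e.g.
 * to run with flush-to-zero and round-toward-zero semantics:
 *
 *   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
 *   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
 *   // _MM_GET_ROUNDING_MODE() now returns _MM_ROUND_TOWARD_ZERO
 */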

#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
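
/* Example (illustrative): transpose a 4x4 matrix held in four row vectors,
 * in place:
 *
 *   __m128 r0 = _mm_setr_ps( 1,  2,  3,  4);
 *   __m128 r1 = _mm_setr_ps( 5,  6,  7,  8);
 *   __m128 r2 = _mm_setr_ps( 9, 10, 11, 12);
 *   __m128 r3 = _mm_setr_ps(13, 14, 15, 16);
 *   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
 *   // now r0 = { 1, 5, 9, 13 }, r1 = { 2, 6, 10, 14 }, ...
 */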

/* Aliases for compatibility. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_

/* Ugly hack for backwards-compatibility (compatible with gcc) */
#ifdef __SSE2__
#include <emmintrin.h>
#endif

#endif /* __SSE__ */

#endif /* __XMMINTRIN_H */