/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#ifndef __SSE__
#error "SSE instruction set not enabled"
#else

#include <mmintrin.h>

typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

// This header should only be included in a hosted environment as it depends on
// a standard library to provide allocation routines.
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b)
{
  a[0] += b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 a, __m128 b)
{
  return a + b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b)
{
  a[0] -= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 a, __m128 b)
{
  return a - b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b)
{
  a[0] *= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 a, __m128 b)
{
  return a * b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b)
{
  a[0] /= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 a, __m128 b)
{
  return a / b;
}

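/* Illustrative sketch, not part of the original header: the _ss forms above
   touch element 0 only and pass elements 1-3 of the first operand through
   unchanged, while the _ps forms operate on all four elements.  The function
   name below is hypothetical. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
__example_scalar_vs_packed(__m128 a, __m128 b)
{
  __m128 s = _mm_add_ss(a, b); /* { a[0]+b[0], a[1], a[2], a[3] }          */
  __m128 p = _mm_add_ps(a, b); /* { a[0]+b[0], a[1]+b[1], ..., a[3]+b[3] } */
  return _mm_sub_ps(p, s);     /* { 0, b[1], b[2], b[3] }                  */
}
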
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 a)
{
  return __builtin_ia32_sqrtss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 a)
{
  return __builtin_ia32_sqrtps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 a)
{
  return __builtin_ia32_rcpss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 a)
{
  return __builtin_ia32_rcpps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 a)
{
  return __builtin_ia32_rsqrtss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 a)
{
  return __builtin_ia32_rsqrtps(a);
}

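/* Illustrative sketch, not part of the original header: _mm_rcp_ps and
   _mm_rsqrt_ps return roughly 12-bit approximations.  One Newton-Raphson
   step, shown here for the reciprocal as x' = x * (2 - a*x), roughly
   doubles the precision.  The name __example_rcp_nr is hypothetical. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
__example_rcp_nr(__m128 a)
{
  __m128 x = _mm_rcp_ps(a);                 /* initial ~12-bit estimate */
  __m128 two = { 2.0f, 2.0f, 2.0f, 2.0f };
  return _mm_mul_ps(x, _mm_sub_ps(two, _mm_mul_ps(a, x)));
}
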
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_minss(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_minps(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_maxss(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_maxps(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a & (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b)
{
  return (__m128)(~(__v4si)a & (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a | (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a ^ (__v4si)b);
}

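/* Illustrative sketch, not part of the original header: _mm_andnot_ps
   computes ~a & b, so masking with the sign bits (only bit set in -0.0f)
   clears them, yielding a packed absolute value.  Hypothetical name. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
__example_abs_ps(__m128 a)
{
  const __m128 sign = { -0.0f, -0.0f, -0.0f, -0.0f };
  return _mm_andnot_ps(sign, a);  /* clear each element's sign bit */
}
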
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 a, __m128 b)
{
  /* The compare result belongs in element 0 only; elements 1-3 must come
     from a, so the swapped-operand cmpss result is shuffled back in. */
  return (__m128)__builtin_shufflevector(a,
                                         (__m128)__builtin_ia32_cmpss(b, a, 1),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 a, __m128 b)
{
  /* As with _mm_cmpgt_ss, keep elements 1-3 of a. */
  return (__m128)__builtin_shufflevector(a,
                                         (__m128)__builtin_ia32_cmpss(b, a, 2),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 a, __m128 b)
{
  /* As with _mm_cmpgt_ss, keep elements 1-3 of a. */
  return (__m128)__builtin_shufflevector(a,
                                         (__m128)__builtin_ia32_cmpss(b, a, 5),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 a, __m128 b)
{
  /* As with _mm_cmpgt_ss, keep elements 1-3 of a. */
  return (__m128)__builtin_shufflevector(a,
                                         (__m128)__builtin_ia32_cmpss(b, a, 6),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 3);
}

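/* Illustrative sketch, not part of the original header: each packed compare
   returns all-ones or all-zero per element, which composes with the logical
   ops into a branchless mask-based select.  Hypothetical name. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
__example_select_lt(__m128 a, __m128 b)
{
  __m128 mask = _mm_cmplt_ps(a, b);          /* 0xffffffff where a < b */
  return _mm_or_ps(_mm_and_ps(mask, a),      /* take a where a < b     */
                   _mm_andnot_ps(mask, b));  /* take b elsewhere       */
}
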
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comineq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomineq(a, b);
}

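/* Illustrative sketch, not part of the original header: the comi/ucomi
   forms compare element 0 and return an int usable directly in scalar
   control flow; the ucomi forms differ only in not signaling an invalid
   exception on quiet NaNs.  Hypothetical name. */
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__example_smaller_s(__m128 a, __m128 b)
{
  return _mm_comilt_ss(a, b) ? a[0] : b[0];
}
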
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 a)
{
  return __builtin_ia32_cvtss2si(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ss2si(__m128 a)
{
  return _mm_cvtss_si32(a);
}

#ifdef __x86_64__

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a)
{
  return __builtin_ia32_cvtss2si64(a);
}

#endif

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvtps2pi(a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ps2pi(__m128 a)
{
  return _mm_cvtps_pi32(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a)
{
  return a[0];
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ss2si(__m128 a)
{
  return _mm_cvttss_si32(a);
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a)
{
  return a[0];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvttps2pi(a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ps2pi(__m128 a)
{
  return _mm_cvttps_pi32(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b)
{
  a[0] = b;
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_si2ss(__m128 a, int b)
{
  return _mm_cvtsi32_ss(a, b);
}

#ifdef __x86_64__

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
  a[0] = b;
  return a;
}

#endif

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 a, __m64 b)
{
  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_pi2ps(__m128 a, __m64 b)
{
  return _mm_cvtpi32_ps(a, b);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 a)
{
  return a[0];
}

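/* Illustrative sketch, not part of the original header: _mm_cvtss_si32
   rounds according to the current MXCSR rounding mode (round-to-nearest-even
   by default), while _mm_cvttss_si32 always truncates toward zero.
   Hypothetical name. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
__example_round_vs_truncate(__m128 a)
{
  /* Under the default mode, a[0] == 2.5f gives 2 - 2 == 0 (half rounds to
     even), while a[0] == 2.75f gives 3 - 2 == 1. */
  return _mm_cvtss_si32(a) - _mm_cvttss_si32(a);
}
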
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, const __m64 *p)
{
  __m128 b;
  b[0] = *(float*)p;
  b[1] = *((float*)p+1);
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, const __m64 *p)
{
  __m128 b;
  b[0] = *(float*)p;
  b[1] = *((float*)p+1);
  return __builtin_shufflevector(a, b, 4, 5, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(const float *p)
{
  return (__m128){ *p, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(const float *p)
{
  return (__m128){ *p, *p, *p, *p };
}

#define _mm_load_ps1(p) _mm_load1_ps(p)

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(const float *p)
{
  return *(__m128*)p;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *p)
{
  struct __loadu_ps {
    __m128 v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_ps*)p)->v;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(const float *p)
{
  __m128 a = _mm_load_ps(p);
  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}

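/* Illustrative sketch, not part of the original header: _mm_load_ps
   requires 16-byte alignment and faults at run time on a misaligned
   pointer; _mm_loadu_ps has no alignment requirement.  Hypothetical name. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
__example_sum_two_quads(const float *p)
{
  /* Use the unaligned form unless p is known to be 16-byte aligned. */
  return _mm_add_ps(_mm_loadu_ps(p), _mm_loadu_ps(p + 4));
}
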
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float w)
{
  return (__m128){ w, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float w)
{
  return (__m128){ w, w, w, w };
}

// Microsoft specific.
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float w)
{
  return _mm_set1_ps(w);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float z, float y, float x, float w)
{
  return (__m128){ w, x, y, z };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float z, float y, float x, float w)
{
  return (__m128){ z, y, x, w };
}

static __inline__ __m128 __attribute__((__always_inline__))
_mm_setzero_ps(void)
{
  return (__m128){ 0, 0, 0, 0 };
}

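/* Illustrative sketch, not part of the original header: _mm_set_ps takes
   its arguments highest element first and _mm_setr_ps lowest element first,
   so the two calls below build the same vector { 1, 2, 3, 4 }.
   Hypothetical name. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
__example_set_order(void)
{
  __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
  __m128 b = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
  return _mm_sub_ps(a, b); /* all four elements are zero */
}
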
static __inline__ void __attribute__((__always_inline__))
_mm_storeh_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storehps((__v2si *)p, a);
}

static __inline__ void __attribute__((__always_inline__))
_mm_storel_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storelps((__v2si *)p, a);
}

static __inline__ void __attribute__((__always_inline__))
_mm_store_ss(float *p, __m128 a)
{
  *p = a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *p, __m128 a)
{
  __builtin_ia32_storeups(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
  _mm_storeu_ps(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps1(float *p, __m128 a)
{
  _mm_store1_ps(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *p, __m128 a)
{
  *(__m128 *)p = a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
  _mm_store_ps(p, a);
}

#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, sel))

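/* Illustrative sketch, not part of the original header: prefetch a fixed
   distance ahead while streaming over an array.  The 16-float (one 64-byte
   line) look-ahead is an assumed, untuned value, and the name is
   hypothetical. */
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__example_prefetch_sum(const float *p, int n)
{
  float s = 0.0f;
  int i;
  for (i = 0; i < n; ++i) {
    if (i + 16 < n)                            /* stay inside the buffer */
      _mm_prefetch((const char *)(p + i + 16), _MM_HINT_T0);
    s += p[i];
  }
  return s;
}
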
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pi(__m64 *p, __m64 a)
{
  __builtin_ia32_movntq(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_ps(float *p, __m128 a)
{
  __builtin_ia32_movntps(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 a, int n)
{
  __v4hi b = (__v4hi)a;
  return (unsigned short)b[n & 3];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_insert_pi16(__m64 a, int d, int n)
{
  __v4hi b = (__v4hi)a;
  b[n & 3] = d;
  return (__m64)b;
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pi8(__m64 a)
{
  return __builtin_ia32_pmovmskb((__v8qi)a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
}

#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw(a, n))

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmove_si64(__m64 d, __m64 n, char *p)
{
  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sad_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  return __builtin_ia32_stmxcsr();
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
  __builtin_ia32_ldmxcsr(i);
}

#define _mm_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector((__v4sf)(a), (__v4sf)(b), \
                                 (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                 (((mask) & 0x30) >> 4) + 4, \
                                 (((mask) & 0xc0) >> 6) + 4))

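/* Illustrative sketch, not part of the original header: each 2-bit field of
   the mask selects one element; the low two fields index into a, the high
   two into b.  Mask 0x44 (binary 01 00 01 00) picks { a[0], a[1], b[0],
   b[1] }, the same result as _mm_movelh_ps.  Hypothetical name. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
__example_interleave_lo_pairs(__m128 a, __m128 b)
{
  return _mm_shuffle_ps(a, b, 0x44);
}
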
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi16(b, a);
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi8(b, a);
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
  __m128 c;

  c = _mm_setzero_ps();
  c = _mm_cvtpi32_ps(c, b);
  c = _mm_movelh_ps(c, c);

  return _mm_cvtpi32_ps(c, a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi16(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi32(a);
  a = _mm_movehl_ps(a, a);
  c = _mm_cvtps_pi32(a);

  /* b and c each hold two 32-bit integers, so packing them down to 16 bits
     takes _mm_packs_pi32 (packssdw), not _mm_packs_pi16. */
  return _mm_packs_pi32(b, c);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi16(a);
  c = _mm_setzero_si64();

  return _mm_packs_pi16(b, c);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 a)
{
  return __builtin_ia32_movmskps(a);
}

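/* Illustrative sketch, not part of the original header: _mm_movemask_ps
   packs the four sign bits into the low 4 bits of an int, so a compare
   mask can be tested with ordinary integer code.  Hypothetical name. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
__example_all_lt(__m128 a, __m128 b)
{
  return _mm_movemask_ps(_mm_cmplt_ps(a, b)) == 0xf; /* every a[i] < b[i] */
}
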
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

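/* Illustrative sketch, not part of the original header: _MM_SHUFFLE lists
   its selectors highest field first, so _MM_SHUFFLE(0, 1, 2, 3) == 0x1b
   reverses a vector when both shuffle operands are the same register.
   Hypothetical name. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
__example_reverse_ps(__m128 a)
{
  return _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3));
}
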
#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)

#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))

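/* Illustrative sketch, not part of the original header: save the MXCSR
   word, switch to truncation and flush-to-zero, then restore.  Real code
   must restore the saved word on every exit path.  Hypothetical name. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
__example_truncating_convert(__m128 a)
{
  unsigned int saved = _mm_getcsr();
  int r;
  _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  r = _mm_cvtss_si32(a); /* now truncates, like _mm_cvttss_si32 */
  _mm_setcsr(saved);
  return r;
}
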
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)

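/* Illustrative sketch, not part of the original header: transpose a 4x4
   row-major matrix in place using the macro above.  Hypothetical name. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
__example_transpose4(float m[16])
{
  __m128 r0 = _mm_loadu_ps(m);
  __m128 r1 = _mm_loadu_ps(m + 4);
  __m128 r2 = _mm_loadu_ps(m + 8);
  __m128 r3 = _mm_loadu_ps(m + 12);
  _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
  _mm_storeu_ps(m, r0);
  _mm_storeu_ps(m + 4, r1);
  _mm_storeu_ps(m + 8, r2);
  _mm_storeu_ps(m + 12, r3);
}
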
/* Aliases for compatibility. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_

/* Ugly hack for backwards-compatibility (compatible with gcc) */
#ifdef __SSE2__
#include <emmintrin.h>
#endif

#endif /* __SSE__ */

#endif /* __XMMINTRIN_H */