/* xmmintrin.h revision 194179 */
/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
2367754Smsmith
2467754Smsmith#ifndef __XMMINTRIN_H
2567754Smsmith#define __XMMINTRIN_H
2667754Smsmith
2767754Smsmith#ifndef __SSE__
2867754Smsmith#error "SSE instruction set not enabled"
2967754Smsmith#else
3067754Smsmith
3167754Smsmith#include <mmintrin.h>
3267754Smsmith
/* 16-byte vectors of float. __v4sf is the internal spelling; __m128 is the
   type used in every intrinsic signature below. */
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));
3567754Smsmith
3667754Smsmith#include <mm_malloc.h>
3767754Smsmith
3867754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
3967754Smsmith_mm_add_ss(__m128 a, __m128 b)
4067754Smsmith{
4167754Smsmith  a[0] += b[0];
4267754Smsmith  return a;
4367754Smsmith}
4467754Smsmith
4567754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
4667754Smsmith_mm_add_ps(__m128 a, __m128 b)
4767754Smsmith{
4867754Smsmith  return a + b;
4967754Smsmith}
5067754Smsmith
5167754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
5267754Smsmith_mm_sub_ss(__m128 a, __m128 b)
5367754Smsmith{
5467754Smsmith  a[0] -= b[0];
5567754Smsmith  return a;
5667754Smsmith}
5767754Smsmith
5867754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
5967754Smsmith_mm_sub_ps(__m128 a, __m128 b)
6067754Smsmith{
6167754Smsmith  return a - b;
6267754Smsmith}
6367754Smsmith
6467754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
6567754Smsmith_mm_mul_ss(__m128 a, __m128 b)
6667754Smsmith{
6767754Smsmith  a[0] *= b[0];
6867754Smsmith  return a;
6967754Smsmith}
7067754Smsmith
7167754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
7267754Smsmith_mm_mul_ps(__m128 a, __m128 b)
7367754Smsmith{
7467754Smsmith  return a * b;
7567754Smsmith}
7667754Smsmith
7767754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
7867754Smsmith_mm_div_ss(__m128 a, __m128 b)
7967754Smsmith{
8067754Smsmith  a[0] /= b[0];
8167754Smsmith  return a;
8267754Smsmith}
8367754Smsmith
8467754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
8567754Smsmith_mm_div_ps(__m128 a, __m128 b)
8667754Smsmith{
8767754Smsmith  return a / b;
8867754Smsmith}
8967754Smsmith
9067754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
9167754Smsmith_mm_sqrt_ss(__m128 a)
9267754Smsmith{
9367754Smsmith  return __builtin_ia32_sqrtss(a);
9467754Smsmith}
9567754Smsmith
9667754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
9767754Smsmith_mm_sqrt_ps(__m128 a)
9867754Smsmith{
9967754Smsmith  return __builtin_ia32_sqrtps(a);
10067754Smsmith}
10167754Smsmith
10267754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
10367754Smsmith_mm_rcp_ss(__m128 a)
10467754Smsmith{
10567754Smsmith  return __builtin_ia32_rcpss(a);
10667754Smsmith}
10767754Smsmith
10867754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
10967754Smsmith_mm_rcp_ps(__m128 a)
11067754Smsmith{
11167754Smsmith  return __builtin_ia32_rcpps(a);
11267754Smsmith}
11367754Smsmith
11467754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
11567754Smsmith_mm_rsqrt_ss(__m128 a)
11667754Smsmith{
11767754Smsmith  return __builtin_ia32_rsqrtss(a);
11867754Smsmith}
11967754Smsmith
12067754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
12167754Smsmith_mm_rsqrt_ps(__m128 a)
12267754Smsmith{
12377424Smsmith  return __builtin_ia32_rsqrtps(a);
12491116Smsmith}
12567754Smsmith
12667754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
12767754Smsmith_mm_min_ss(__m128 a, __m128 b)
12867754Smsmith{
12967754Smsmith  return __builtin_ia32_minss(a, b);
13067754Smsmith}
13167754Smsmith
132107325Siwasakistatic inline __m128 __attribute__((__always_inline__, __nodebug__))
13367754Smsmith_mm_min_ps(__m128 a, __m128 b)
13477424Smsmith{
13567754Smsmith  return __builtin_ia32_minps(a, b);
13667754Smsmith}
13767754Smsmith
138114237Snjlstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
139114237Snjl_mm_max_ss(__m128 a, __m128 b)
140107325Siwasaki{
14167754Smsmith  return __builtin_ia32_maxss(a, b);
14267754Smsmith}
14367754Smsmith
14467754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
14567754Smsmith_mm_max_ps(__m128 a, __m128 b)
14667754Smsmith{
14767754Smsmith  return __builtin_ia32_maxps(a, b);
14867754Smsmith}
14967754Smsmith
15067754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
15167754Smsmith_mm_and_ps(__m128 a, __m128 b)
15267754Smsmith{
15367754Smsmith  typedef int __v4si __attribute__((__vector_size__(16)));
15467754Smsmith  return (__m128)((__v4si)a & (__v4si)b);
15567754Smsmith}
15667754Smsmith
15767754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
15891116Smsmith_mm_andnot_ps(__m128 a, __m128 b)
15967754Smsmith{
16067754Smsmith  typedef int __v4si __attribute__((__vector_size__(16)));
16167754Smsmith  return (__m128)(~(__v4si)a & (__v4si)b);
16267754Smsmith}
16367754Smsmith
16491116Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
16567754Smsmith_mm_or_ps(__m128 a, __m128 b)
16671867Smsmith{
167102550Siwasaki  typedef int __v4si __attribute__((__vector_size__(16)));
16882367Smsmith  return (__m128)((__v4si)a | (__v4si)b);
16967754Smsmith}
170114237Snjl
17177424Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
17291116Smsmith_mm_xor_ps(__m128 a, __m128 b)
17371867Smsmith{
17471867Smsmith  typedef int __v4si __attribute__((__vector_size__(16)));
175123315Snjl  return (__m128)((__v4si)a ^ ~(__v4si)b);
17691116Smsmith}
17771867Smsmith
17880062Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
17971867Smsmith_mm_cmpeq_ss(__m128 a, __m128 b)
18067754Smsmith{
18171867Smsmith  return (__m128)__builtin_ia32_cmpss(a, b, 0);
18267754Smsmith}
18367754Smsmith
184114237Snjlstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
185107325Siwasaki_mm_cmpeq_ps(__m128 a, __m128 b)
18667754Smsmith{
18767754Smsmith  return (__m128)__builtin_ia32_cmpps(a, b, 0);
18867754Smsmith}
18967754Smsmith
19067754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
19167754Smsmith_mm_cmplt_ss(__m128 a, __m128 b)
19299146Siwasaki{
19367754Smsmith  return (__m128)__builtin_ia32_cmpss(a, b, 1);
19467754Smsmith}
195107325Siwasaki
19667754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
19783174Smsmith_mm_cmplt_ps(__m128 a, __m128 b)
198123315Snjl{
199117521Snjl  return (__m128)__builtin_ia32_cmpps(a, b, 1);
200123315Snjl}
20167754Smsmith
20267754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
20367754Smsmith_mm_cmple_ss(__m128 a, __m128 b)
20467754Smsmith{
20567754Smsmith  return (__m128)__builtin_ia32_cmpss(a, b, 2);
20667754Smsmith}
20767754Smsmith
20867754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
20967754Smsmith_mm_cmple_ps(__m128 a, __m128 b)
21067754Smsmith{
21167754Smsmith  return (__m128)__builtin_ia32_cmpps(a, b, 2);
21267754Smsmith}
21367754Smsmith
21467754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
21567754Smsmith_mm_cmpgt_ss(__m128 a, __m128 b)
21667754Smsmith{
21767754Smsmith  return (__m128)__builtin_ia32_cmpss(b, a, 1);
21867754Smsmith}
21967754Smsmith
22067754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
22167754Smsmith_mm_cmpgt_ps(__m128 a, __m128 b)
222107325Siwasaki{
22367754Smsmith  return (__m128)__builtin_ia32_cmpps(b, a, 1);
224117521Snjl}
225123315Snjl
226117521Snjlstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
227123315Snjl_mm_cmpge_ss(__m128 a, __m128 b)
22867754Smsmith{
22967754Smsmith  return (__m128)__builtin_ia32_cmpss(b, a, 2);
23067754Smsmith}
23167754Smsmith
23267754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
23367754Smsmith_mm_cmpge_ps(__m128 a, __m128 b)
23467754Smsmith{
23567754Smsmith  return (__m128)__builtin_ia32_cmpps(b, a, 2);
23667754Smsmith}
23767754Smsmith
238107325Siwasakistatic inline __m128 __attribute__((__always_inline__, __nodebug__))
23967754Smsmith_mm_cmpneq_ss(__m128 a, __m128 b)
24077424Smsmith{
24167754Smsmith  return (__m128)__builtin_ia32_cmpss(a, b, 4);
24267754Smsmith}
24367754Smsmith
24467754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
245107325Siwasaki_mm_cmpneq_ps(__m128 a, __m128 b)
24667754Smsmith{
24767754Smsmith  return (__m128)__builtin_ia32_cmpps(a, b, 4);
24867754Smsmith}
24967754Smsmith
25067754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
25167754Smsmith_mm_cmpnlt_ss(__m128 a, __m128 b)
25267754Smsmith{
25367754Smsmith  return (__m128)__builtin_ia32_cmpss(a, b, 5);
25467754Smsmith}
25567754Smsmith
25667754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
25767754Smsmith_mm_cmpnlt_ps(__m128 a, __m128 b)
25869450Smsmith{
25967754Smsmith  return (__m128)__builtin_ia32_cmpps(a, b, 5);
26067754Smsmith}
26167754Smsmith
26291116Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
26367754Smsmith_mm_cmpnle_ss(__m128 a, __m128 b)
26467754Smsmith{
26567754Smsmith  return (__m128)__builtin_ia32_cmpss(a, b, 6);
26667754Smsmith}
26767754Smsmith
26867754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
26991116Smsmith_mm_cmpnle_ps(__m128 a, __m128 b)
27067754Smsmith{
27167754Smsmith  return (__m128)__builtin_ia32_cmpps(a, b, 6);
27291116Smsmith}
27367754Smsmith
27467754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
275114237Snjl_mm_cmpngt_ss(__m128 a, __m128 b)
276107325Siwasaki{
27767754Smsmith  return (__m128)__builtin_ia32_cmpss(b, a, 5);
278107325Siwasaki}
27967754Smsmith
280107325Siwasakistatic inline __m128 __attribute__((__always_inline__, __nodebug__))
281107325Siwasaki_mm_cmpngt_ps(__m128 a, __m128 b)
282107325Siwasaki{
283107325Siwasaki  return (__m128)__builtin_ia32_cmpps(b, a, 5);
28467754Smsmith}
285107325Siwasaki
286107325Siwasakistatic inline __m128 __attribute__((__always_inline__, __nodebug__))
287107325Siwasaki_mm_cmpnge_ss(__m128 a, __m128 b)
288107325Siwasaki{
289107325Siwasaki  return (__m128)__builtin_ia32_cmpss(b, a, 6);
29067754Smsmith}
29167754Smsmith
29267754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
29367754Smsmith_mm_cmpnge_ps(__m128 a, __m128 b)
29467754Smsmith{
29587031Smsmith  return (__m128)__builtin_ia32_cmpps(b, a, 6);
29667754Smsmith}
29767754Smsmith
29867754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
29967754Smsmith_mm_cmpord_ss(__m128 a, __m128 b)
30067754Smsmith{
30167754Smsmith  return (__m128)__builtin_ia32_cmpss(a, b, 7);
30267754Smsmith}
30391116Smsmith
30487031Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
30591116Smsmith_mm_cmpord_ps(__m128 a, __m128 b)
30687031Smsmith{
30787031Smsmith  return (__m128)__builtin_ia32_cmpps(a, b, 7);
30867754Smsmith}
30967754Smsmith
31067754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
31167754Smsmith_mm_cmpunord_ss(__m128 a, __m128 b)
31267754Smsmith{
31367754Smsmith  return (__m128)__builtin_ia32_cmpss(a, b, 3);
31467754Smsmith}
31567754Smsmith
31667754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
31767754Smsmith_mm_cmpunord_ps(__m128 a, __m128 b)
31867754Smsmith{
31991116Smsmith  return (__m128)__builtin_ia32_cmpps(a, b, 3);
32067754Smsmith}
32167754Smsmith
32267754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__))
32367754Smsmith_mm_comieq_ss(__m128 a, __m128 b)
32467754Smsmith{
32567754Smsmith  return __builtin_ia32_comieq(a, b);
32667754Smsmith}
32767754Smsmith
32867754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__))
32967754Smsmith_mm_comilt_ss(__m128 a, __m128 b)
33067754Smsmith{
33167754Smsmith  return __builtin_ia32_comilt(a, b);
33267754Smsmith}
33367754Smsmith
334107325Siwasakistatic inline int __attribute__((__always_inline__, __nodebug__))
335107325Siwasaki_mm_comile_ss(__m128 a, __m128 b)
33667754Smsmith{
33767754Smsmith  return __builtin_ia32_comile(a, b);
33867754Smsmith}
33977424Smsmith
34067754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__))
34167754Smsmith_mm_comigt_ss(__m128 a, __m128 b)
34267754Smsmith{
343107325Siwasaki  return __builtin_ia32_comigt(a, b);
34467754Smsmith}
34567754Smsmith
34667754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__))
34767754Smsmith_mm_comige_ss(__m128 a, __m128 b)
348107325Siwasaki{
34967754Smsmith  return __builtin_ia32_comige(a, b);
35067754Smsmith}
35167754Smsmith
35267754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__))
35367754Smsmith_mm_comineq_ss(__m128 a, __m128 b)
35467754Smsmith{
35567754Smsmith  return __builtin_ia32_comineq(a, b);
35667754Smsmith}
35767754Smsmith
35891116Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__))
35991116Smsmith_mm_ucomieq_ss(__m128 a, __m128 b)
36067754Smsmith{
36167754Smsmith  return __builtin_ia32_ucomieq(a, b);
36267754Smsmith}
36367754Smsmith
36467754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__))
36567754Smsmith_mm_ucomilt_ss(__m128 a, __m128 b)
36667754Smsmith{
36791116Smsmith  return __builtin_ia32_ucomilt(a, b);
36867754Smsmith}
36967754Smsmith
37067754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__))
37167754Smsmith_mm_ucomile_ss(__m128 a, __m128 b)
37267754Smsmith{
37367754Smsmith  return __builtin_ia32_ucomile(a, b);
374102550Siwasaki}
37567754Smsmith
37667754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__))
377102550Siwasaki_mm_ucomigt_ss(__m128 a, __m128 b)
37867754Smsmith{
37967754Smsmith  return __builtin_ia32_ucomigt(a, b);
38067754Smsmith}
38167754Smsmith
38267754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__))
38377424Smsmith_mm_ucomige_ss(__m128 a, __m128 b)
38467754Smsmith{
385114237Snjl  return __builtin_ia32_ucomige(a, b);
38667754Smsmith}
38767754Smsmith
38867754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__))
38967754Smsmith_mm_ucomineq_ss(__m128 a, __m128 b)
390107325Siwasaki{
39167754Smsmith  return __builtin_ia32_ucomineq(a, b);
39291116Smsmith}
39377424Smsmith
39467754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__))
39567754Smsmith_mm_cvtss_si32(__m128 a)
39667754Smsmith{
39777424Smsmith  return __builtin_ia32_cvtss2si(a);
39877424Smsmith}
39971867Smsmith
40071867Smsmith#ifdef __x86_64__
40191116Smsmith
40271867Smsmithstatic inline long long __attribute__((__always_inline__, __nodebug__))
40387031Smsmith_mm_cvtss_si64(__m128 a)
40471867Smsmith{
40571867Smsmith  return __builtin_ia32_cvtss2si64(a);
40671867Smsmith}
40767754Smsmith
40867754Smsmith#endif
40967754Smsmith
41067754Smsmithstatic inline __m64 __attribute__((__always_inline__, __nodebug__))
41167754Smsmith_mm_cvtps_pi32(__m128 a)
41267754Smsmith{
41367754Smsmith  return (__m64)__builtin_ia32_cvtps2pi(a);
41487031Smsmith}
41567754Smsmith
41667754Smsmithstatic inline int __attribute__((__always_inline__, __nodebug__))
41767754Smsmith_mm_cvttss_si32(__m128 a)
41867754Smsmith{
41967754Smsmith  return a[0];
42067754Smsmith}
42167754Smsmith
42291116Smsmithstatic inline long long __attribute__((__always_inline__, __nodebug__))
42391116Smsmith_mm_cvttss_si64(__m128 a)
42467754Smsmith{
42567754Smsmith  return a[0];
426107325Siwasaki}
42767754Smsmith
42867754Smsmithstatic inline __m64 __attribute__((__always_inline__, __nodebug__))
42967754Smsmith_mm_cvttps_pi32(__m128 a)
43067754Smsmith{
43167754Smsmith  return (__m64)__builtin_ia32_cvttps2pi(a);
43267754Smsmith}
43367754Smsmith
43467754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
43567754Smsmith_mm_cvtsi32_ss(__m128 a, int b)
43667754Smsmith{
43767754Smsmith  a[0] = b;
43867754Smsmith  return a;
43967754Smsmith}
44091116Smsmith
44167754Smsmith#ifdef __x86_64__
44282367Smsmith
44387031Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
44467754Smsmith_mm_cvtsi64_ss(__m128 a, long long b)
44567754Smsmith{
44667754Smsmith  a[0] = b;
44767754Smsmith  return a;
44867754Smsmith}
44967754Smsmith
45067754Smsmith#endif
45167754Smsmith
45267754Smsmithstatic inline __m128 __attribute__((__always_inline__, __nodebug__))
45367754Smsmith_mm_cvtpi32_ps(__m128 a, __m64 b)
45467754Smsmith{
45567754Smsmith  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
45667754Smsmith}
45767754Smsmith
45867754Smsmithstatic inline float __attribute__((__always_inline__, __nodebug__))
45967754Smsmith_mm_cvtss_f32(__m128 a)
46067754Smsmith{
46167754Smsmith  return a[0];
46267754Smsmith}
46367754Smsmith
464static inline __m128 __attribute__((__always_inline__, __nodebug__))
465_mm_loadh_pi(__m128 a, __m64 const *p)
466{
467  __m128 b;
468  b[0] = *(float*)p;
469  b[1] = *((float*)p+1);
470  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
471}
472
473static inline __m128 __attribute__((__always_inline__, __nodebug__))
474_mm_loadl_pi(__m128 a, __m64 const *p)
475{
476  __m128 b;
477  b[0] = *(float*)p;
478  b[1] = *((float*)p+1);
479  return __builtin_shufflevector(a, b, 4, 5, 2, 3);
480}
481
482static inline __m128 __attribute__((__always_inline__, __nodebug__))
483_mm_load_ss(float *p)
484{
485  return (__m128){ *p, 0, 0, 0 };
486}
487
488static inline __m128 __attribute__((__always_inline__, __nodebug__))
489_mm_load1_ps(float *p)
490{
491  return (__m128){ *p, *p, *p, *p };
492}
493
/* Alternate spelling of _mm_load1_ps. */
#define _mm_load_ps1(p) _mm_load1_ps(p)
495
496static inline __m128 __attribute__((__always_inline__, __nodebug__))
497_mm_load_ps(float *p)
498{
499  return *(__m128*)p;
500}
501
502static inline __m128 __attribute__((__always_inline__, __nodebug__))
503_mm_loadu_ps(float *p)
504{
505  return __builtin_ia32_loadups(p);
506}
507
508static inline __m128 __attribute__((__always_inline__, __nodebug__))
509_mm_loadr_ps(float *p)
510{
511  __m128 a = _mm_load_ps(p);
512  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
513}
514
515static inline __m128 __attribute__((__always_inline__, __nodebug__))
516_mm_set_ss(float w)
517{
518  return (__m128){ w, 0, 0, 0 };
519}
520
521static inline __m128 __attribute__((__always_inline__, __nodebug__))
522_mm_set1_ps(float w)
523{
524  return (__m128){ w, w, w, w };
525}
526
527// Microsoft specific.
528static inline __m128 __attribute__((__always_inline__, __nodebug__))
529_mm_set_ps1(float w)
530{
531    return _mm_set1_ps(w);
532}
533
534static inline __m128 __attribute__((__always_inline__, __nodebug__))
535_mm_set_ps(float z, float y, float x, float w)
536{
537  return (__m128){ w, x, y, z };
538}
539
540static inline __m128 __attribute__((__always_inline__, __nodebug__))
541_mm_setr_ps(float z, float y, float x, float w)
542{
543  return (__m128){ z, y, x, w };
544}
545
546static inline __m128 __attribute__((__always_inline__))
547_mm_setzero_ps(void)
548{
549  return (__m128){ 0, 0, 0, 0 };
550}
551
552static inline void __attribute__((__always_inline__))
553_mm_storeh_pi(__m64 *p, __m128 a)
554{
555  __builtin_ia32_storehps((__v2si *)p, a);
556}
557
558static inline void __attribute__((__always_inline__))
559_mm_storel_pi(__m64 *p, __m128 a)
560{
561  __builtin_ia32_storelps((__v2si *)p, a);
562}
563
564static inline void __attribute__((__always_inline__))
565_mm_store_ss(float *p, __m128 a)
566{
567  *p = a[0];
568}
569
570static inline void __attribute__((__always_inline__, __nodebug__))
571_mm_storeu_ps(float *p, __m128 a)
572{
573  __builtin_ia32_storeups(p, a);
574}
575
576static inline void __attribute__((__always_inline__, __nodebug__))
577_mm_store1_ps(float *p, __m128 a)
578{
579  a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
580  _mm_storeu_ps(p, a);
581}
582
583static inline void __attribute__((__always_inline__, __nodebug__))
584_mm_store_ps(float *p, __m128 a)
585{
586  *(__m128 *)p = a;
587}
588
589static inline void __attribute__((__always_inline__, __nodebug__))
590_mm_storer_ps(float *p, __m128 a)
591{
592  a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
593  _mm_store_ps(p, a);
594}
595
/* Prefetch hints. _mm_prefetch hands the hint to __builtin_prefetch
   unchanged as its third (temporal locality) argument, where 3 means the
   highest locality and 0 means none. The values are therefore chosen so
   that T0 requests the highest locality and NTA the lowest; the previous
   numbering (T0 = 1, T2 = 3) inverted T0 and T2. */
#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/* Issues a read prefetch (second argument 0) for address a with hint sel.
   Both macro arguments are parenthesised so that expression arguments
   expand as a unit. */
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
605
606static inline void __attribute__((__always_inline__, __nodebug__))
607_mm_stream_pi(__m64 *p, __m64 a)
608{
609  __builtin_ia32_movntq(p, a);
610}
611
612static inline void __attribute__((__always_inline__, __nodebug__))
613_mm_stream_ps(float *p, __m128 a)
614{
615  __builtin_ia32_movntps(p, a);
616}
617
/* Invokes the sfence builtin. */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
  return;
}
623
624static inline int __attribute__((__always_inline__, __nodebug__))
625_mm_extract_pi16(__m64 a, int n)
626{
627  __v4hi b = (__v4hi)a;
628  return (unsigned short)b[n & 3];
629}
630
631static inline __m64 __attribute__((__always_inline__, __nodebug__))
632_mm_insert_pi16(__m64 a, int d, int n)
633{
634   __v4hi b = (__v4hi)a;
635   b[n & 3] = d;
636   return (__m64)b;
637}
638
639static inline __m64 __attribute__((__always_inline__, __nodebug__))
640_mm_max_pi16(__m64 a, __m64 b)
641{
642  return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
643}
644
645static inline __m64 __attribute__((__always_inline__, __nodebug__))
646_mm_max_pu8(__m64 a, __m64 b)
647{
648  return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
649}
650
651static inline __m64 __attribute__((__always_inline__, __nodebug__))
652_mm_min_pi16(__m64 a, __m64 b)
653{
654  return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
655}
656
657static inline __m64 __attribute__((__always_inline__, __nodebug__))
658_mm_min_pu8(__m64 a, __m64 b)
659{
660  return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
661}
662
663static inline int __attribute__((__always_inline__, __nodebug__))
664_mm_movemask_pi8(__m64 a)
665{
666  return __builtin_ia32_pmovmskb((__v8qi)a);
667}
668
669static inline __m64 __attribute__((__always_inline__, __nodebug__))
670_mm_mulhi_pu16(__m64 a, __m64 b)
671{
672  return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
673}
674
/* Permutes the four 16-bit lanes of a. Each 2-bit field of n, starting from
   the low bits, selects the source lane for output lanes 0..3. A macro
   because the shuffle indices must be constant expressions. */
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_shufflevector((__v4hi)(a), (__v4hi) {0}, \
                                  (n) & 0x3, \
                                  ((n) & 0xc) >> 2, \
                                  ((n) & 0x30) >> 4, \
                                  ((n) & 0xc0) >> 6))
679
680static inline void __attribute__((__always_inline__, __nodebug__))
681_mm_maskmove_si64(__m64 d, __m64 n, char *p)
682{
683  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
684}
685
686static inline __m64 __attribute__((__always_inline__, __nodebug__))
687_mm_avg_pu8(__m64 a, __m64 b)
688{
689  return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
690}
691
692static inline __m64 __attribute__((__always_inline__, __nodebug__))
693_mm_avg_pu16(__m64 a, __m64 b)
694{
695  return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
696}
697
698static inline __m64 __attribute__((__always_inline__, __nodebug__))
699_mm_sad_pu8(__m64 a, __m64 b)
700{
701  return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
702}
703
/* Returns the value produced by the stmxcsr builtin. */
static inline unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  unsigned int csr = __builtin_ia32_stmxcsr();
  return csr;
}
709
/* Passes i to the ldmxcsr builtin. */
static inline void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
  __builtin_ia32_ldmxcsr(i);
  return;
}
715
/* Builds a vector whose lanes 0-1 are picked from a and lanes 2-3 from b.
   Each 2-bit field of mask, starting from the low bits, selects the source
   lane; the "+ 4" offsets address the second shuffle operand. A macro
   because the shuffle indices must be constant expressions. */
#define _mm_shuffle_ps(a, b, mask) \
        (__builtin_shufflevector((a), (b), \
                                 (mask) & 0x3, \
                                 ((mask) & 0xc) >> 2, \
                                 (((mask) & 0x30) >> 4) + 4, \
                                 (((mask) & 0xc0) >> 6) + 4))
720
721static inline __m128 __attribute__((__always_inline__, __nodebug__))
722_mm_unpackhi_ps(__m128 a, __m128 b)
723{
724  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
725}
726
727static inline __m128 __attribute__((__always_inline__, __nodebug__))
728_mm_unpacklo_ps(__m128 a, __m128 b)
729{
730  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
731}
732
733static inline __m128 __attribute__((__always_inline__, __nodebug__))
734_mm_move_ss(__m128 a, __m128 b)
735{
736  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
737}
738
739static inline __m128 __attribute__((__always_inline__, __nodebug__))
740_mm_movehl_ps(__m128 a, __m128 b)
741{
742  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
743}
744
745static inline __m128 __attribute__((__always_inline__, __nodebug__))
746_mm_movelh_ps(__m128 a, __m128 b)
747{
748  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
749}
750
751static inline __m128 __attribute__((__always_inline__, __nodebug__))
752_mm_cvtpi16_ps(__m64 a)
753{
754  __m64 b, c;
755  __m128 r;
756
757  b = _mm_setzero_si64();
758  b = _mm_cmpgt_pi16(b, a);
759  c = _mm_unpackhi_pi16(a, b);
760  r = _mm_setzero_ps();
761  r = _mm_cvtpi32_ps(r, c);
762  r = _mm_movelh_ps(r, r);
763  c = _mm_unpacklo_pi16(a, b);
764  r = _mm_cvtpi32_ps(r, c);
765
766  return r;
767}
768
769static inline __m128 __attribute__((__always_inline__, __nodebug__))
770_mm_cvtpu16_ps(__m64 a)
771{
772  __m64 b, c;
773  __m128 r;
774
775  b = _mm_setzero_si64();
776  c = _mm_unpackhi_pi16(a, b);
777  r = _mm_setzero_ps();
778  r = _mm_cvtpi32_ps(r, c);
779  r = _mm_movelh_ps(r, r);
780  c = _mm_unpacklo_pi16(a, b);
781  r = _mm_cvtpi32_ps(r, c);
782
783  return r;
784}
785
786static inline __m128 __attribute__((__always_inline__, __nodebug__))
787_mm_cvtpi8_ps(__m64 a)
788{
789  __m64 b;
790
791  b = _mm_setzero_si64();
792  b = _mm_cmpgt_pi8(b, a);
793  b = _mm_unpacklo_pi8(a, b);
794
795  return _mm_cvtpi16_ps(b);
796}
797
798static inline __m128 __attribute__((__always_inline__, __nodebug__))
799_mm_cvtpu8_ps(__m64 a)
800{
801  __m64 b;
802
803  b = _mm_setzero_si64();
804  b = _mm_unpacklo_pi8(a, b);
805
806  return _mm_cvtpi16_ps(b);
807}
808
809static inline __m128 __attribute__((__always_inline__, __nodebug__))
810_mm_cvtpi32x2_ps(__m64 a, __m64 b)
811{
812  __m128 c;
813
814  c = _mm_setzero_ps();
815  c = _mm_cvtpi32_ps(c, b);
816  c = _mm_movelh_ps(c, c);
817
818  return _mm_cvtpi32_ps(c, a);
819}
820
821static inline __m64 __attribute__((__always_inline__, __nodebug__))
822_mm_cvtps_pi16(__m128 a)
823{
824  __m64 b, c;
825
826  b = _mm_cvtps_pi32(a);
827  a = _mm_movehl_ps(a, a);
828  c = _mm_cvtps_pi32(a);
829
830  return _mm_packs_pi16(b, c);
831}
832
833static inline __m64 __attribute__((__always_inline__, __nodebug__))
834_mm_cvtps_pi8(__m128 a)
835{
836  __m64 b, c;
837
838  b = _mm_cvtps_pi16(a);
839  c = _mm_setzero_si64();
840
841  return _mm_packs_pi16(b, c);
842}
843
844static inline int __attribute__((__always_inline__, __nodebug__))
845_mm_movemask_ps(__m128 a)
846{
847  return __builtin_ia32_movmskps(a);
848}
849
/* Packs four 2-bit lane selectors into one immediate: w occupies bits 0-1,
   x bits 2-3, y bits 4-5 and z bits 6-7. */
#define _MM_SHUFFLE(z, y, x, w) \
  ((w) | ((x) << 2) | ((y) << 4) | ((z) << 6))
851
/* Bit fields of the control/status word read by _mm_getcsr() and written by
   _mm_setcsr(). Each group ends with a *_MASK value covering the whole
   field, which the _MM_GET_xxx / _MM_SET_xxx macros use to isolate it. */

/* Exception state flags. */
#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

/* Exception mask bits. */
#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

/* Rounding mode field. */
#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

/* Flush-to-zero field. OFF must leave the bit clear: it was previously
   0x8000, identical to ON, so _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF)
   set the bit instead of clearing it. */
#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)
877
/* Readers: return one field of the word reported by _mm_getcsr(). */
#define _MM_GET_EXCEPTION_MASK() \
  (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() \
  (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() \
  (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() \
  (_mm_getcsr() & _MM_ROUND_MASK)

/* Writers: read the current word, clear one field, OR in x, and write the
   result back with _mm_setcsr(). x is not masked, so it must only contain
   bits belonging to the field being set. */
#define _MM_SET_EXCEPTION_MASK(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
887
/* Transposes, in place, the 4x4 float matrix whose rows are the four __m128
   lvalues row0..row3.

   The unpack steps interleave row pairs:
     tmp0 = { r0[0], r1[0], r0[1], r1[1] }   tmp2 = { r2[0], r3[0], r2[1], r3[1] }
     tmp1 = { r0[2], r1[2], r0[3], r1[3] }   tmp3 = { r2[2], r3[2], r2[3], r3[3] }
   and the movelh/movehl steps stitch matching halves into columns. The last
   row needs the UPPER halves of tmp1 and tmp3, i.e. _mm_movehl_ps(tmp3,
   tmp1); the previous _mm_movelh_ps(tmp3, tmp1) produced a scrambled copy
   of column 2 instead of column 3.

   Every argument is expanded more than once, so pass plain lvalues without
   side effects. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
900
901/* Ugly hack for backwards-compatibility (compatible with gcc) */
902#ifdef __SSE2__
903#include <emmintrin.h>
904#endif
905
906#endif /* __SSE__ */
907
908#endif /* __XMMINTRIN_H */
909