xmmintrin.h revision 169689
1169689Skan/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007
2169689Skan   Free Software Foundation, Inc.
390075Sobrien
4132718Skan   This file is part of GCC.
590075Sobrien
6132718Skan   GCC is free software; you can redistribute it and/or modify
790075Sobrien   it under the terms of the GNU General Public License as published by
890075Sobrien   the Free Software Foundation; either version 2, or (at your option)
990075Sobrien   any later version.
1090075Sobrien
11132718Skan   GCC is distributed in the hope that it will be useful,
1290075Sobrien   but WITHOUT ANY WARRANTY; without even the implied warranty of
1390075Sobrien   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1490075Sobrien   GNU General Public License for more details.
1590075Sobrien
1690075Sobrien   You should have received a copy of the GNU General Public License
17132718Skan   along with GCC; see the file COPYING.  If not, write to
18169689Skan   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
19169689Skan   Boston, MA 02110-1301, USA.  */
2090075Sobrien
2190075Sobrien/* As a special exception, if you include this header file into source
2290075Sobrien   files compiled by GCC, this header file does not by itself cause
2390075Sobrien   the resulting executable to be covered by the GNU General Public
2490075Sobrien   License.  This exception does not however invalidate any other
2590075Sobrien   reasons why the executable file might be covered by the GNU General
2690075Sobrien   Public License.  */
2790075Sobrien
2890075Sobrien/* Implemented from the specification included in the Intel C++ Compiler
29169689Skan   User Guide and Reference, version 9.0.  */
3090075Sobrien
3190075Sobrien#ifndef _XMMINTRIN_H_INCLUDED
3290075Sobrien#define _XMMINTRIN_H_INCLUDED
3390075Sobrien
34117395Skan#ifndef __SSE__
35117395Skan# error "SSE instruction set not enabled"
36117395Skan#else
37117395Skan
3890075Sobrien/* We need type definitions from the MMX header file.  */
3990075Sobrien#include <mmintrin.h>
4090075Sobrien
41169689Skan/* Get _mm_malloc () and _mm_free ().  */
42169689Skan#include <mm_malloc.h>
4390075Sobrien
/* Public 16-byte float vector type of the SSE intrinsics.  The Intel API
   lets callers alias it with other vector types and with their scalar
   components, so the type is additionally marked __may_alias__.  */
typedef float __m128 __attribute__ ((__may_alias__, __vector_size__ (16)));
47169689Skan
/* Internal 16-byte float vector type; the intrinsics below cast their
   __m128 arguments to it before handing them to the builtins.  */
typedef float __attribute__ ((__vector_size__ (16))) __v4sf;
5090075Sobrien
/* Build a selector for use with the SHUFPS instruction.  Each argument
   occupies a two-bit field: E0 in bits 0-1, E1 in bits 2-3, E2 in bits
   4-5 and E3 in bits 6-7.  */
#define _MM_SHUFFLE(e3, e2, e1, e0) \
  ((e0) | ((e1) << 2) | ((e2) << 4) | ((e3) << 6))
5490075Sobrien
/* Hint constants for use with _mm_prefetch, listed by ascending value.  */
enum _mm_hint
{
  _MM_HINT_NTA = 0,
  _MM_HINT_T2 = 1,
  _MM_HINT_T1 = 2,
  _MM_HINT_T0 = 3
};
6390075Sobrien
/* Bits in the MXCSR.  Each group lists its individual values first and
   the mask covering the whole group last.  */

/* Exception bits.  */
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020
#define _MM_EXCEPT_MASK       0x003f

/* Exception mask bits.  */
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000
#define _MM_MASK_MASK         0x1f80

/* Rounding field.  */
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000
#define _MM_ROUND_MASK        0x6000

/* Flush-to-zero bit.  */
#define _MM_FLUSH_ZERO_OFF    0x0000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_MASK   0x8000
9090075Sobrien
91169689Skan/* Create a vector of zeros.  */
92169689Skanstatic __inline __m128 __attribute__((__always_inline__))
93169689Skan_mm_setzero_ps (void)
94169689Skan{
95169689Skan  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
96169689Skan}
97169689Skan
9890075Sobrien/* Perform the respective operation on the lower SPFP (single-precision
9990075Sobrien   floating-point) values of A and B; the upper three SPFP values are
10090075Sobrien   passed through from A.  */
10190075Sobrien
102169689Skanstatic __inline __m128 __attribute__((__always_inline__))
10390075Sobrien_mm_add_ss (__m128 __A, __m128 __B)
10490075Sobrien{
10590075Sobrien  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
10690075Sobrien}
10790075Sobrien
108169689Skanstatic __inline __m128 __attribute__((__always_inline__))
10990075Sobrien_mm_sub_ss (__m128 __A, __m128 __B)
11090075Sobrien{
11190075Sobrien  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
11290075Sobrien}
11390075Sobrien
114169689Skanstatic __inline __m128 __attribute__((__always_inline__))
11590075Sobrien_mm_mul_ss (__m128 __A, __m128 __B)
11690075Sobrien{
11790075Sobrien  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
11890075Sobrien}
11990075Sobrien
120169689Skanstatic __inline __m128 __attribute__((__always_inline__))
12190075Sobrien_mm_div_ss (__m128 __A, __m128 __B)
12290075Sobrien{
12390075Sobrien  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
12490075Sobrien}
12590075Sobrien
126169689Skanstatic __inline __m128 __attribute__((__always_inline__))
12790075Sobrien_mm_sqrt_ss (__m128 __A)
12890075Sobrien{
12990075Sobrien  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
13090075Sobrien}
13190075Sobrien
132169689Skanstatic __inline __m128 __attribute__((__always_inline__))
13390075Sobrien_mm_rcp_ss (__m128 __A)
13490075Sobrien{
13590075Sobrien  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
13690075Sobrien}
13790075Sobrien
138169689Skanstatic __inline __m128 __attribute__((__always_inline__))
13990075Sobrien_mm_rsqrt_ss (__m128 __A)
14090075Sobrien{
14190075Sobrien  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
14290075Sobrien}
14390075Sobrien
144169689Skanstatic __inline __m128 __attribute__((__always_inline__))
14590075Sobrien_mm_min_ss (__m128 __A, __m128 __B)
14690075Sobrien{
14790075Sobrien  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
14890075Sobrien}
14990075Sobrien
150169689Skanstatic __inline __m128 __attribute__((__always_inline__))
15190075Sobrien_mm_max_ss (__m128 __A, __m128 __B)
15290075Sobrien{
15390075Sobrien  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
15490075Sobrien}
15590075Sobrien
15690075Sobrien/* Perform the respective operation on the four SPFP values in A and B.  */
15790075Sobrien
158169689Skanstatic __inline __m128 __attribute__((__always_inline__))
15990075Sobrien_mm_add_ps (__m128 __A, __m128 __B)
16090075Sobrien{
16190075Sobrien  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
16290075Sobrien}
16390075Sobrien
164169689Skanstatic __inline __m128 __attribute__((__always_inline__))
16590075Sobrien_mm_sub_ps (__m128 __A, __m128 __B)
16690075Sobrien{
16790075Sobrien  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
16890075Sobrien}
16990075Sobrien
170169689Skanstatic __inline __m128 __attribute__((__always_inline__))
17190075Sobrien_mm_mul_ps (__m128 __A, __m128 __B)
17290075Sobrien{
17390075Sobrien  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
17490075Sobrien}
17590075Sobrien
176169689Skanstatic __inline __m128 __attribute__((__always_inline__))
17790075Sobrien_mm_div_ps (__m128 __A, __m128 __B)
17890075Sobrien{
17990075Sobrien  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
18090075Sobrien}
18190075Sobrien
182169689Skanstatic __inline __m128 __attribute__((__always_inline__))
18390075Sobrien_mm_sqrt_ps (__m128 __A)
18490075Sobrien{
18590075Sobrien  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
18690075Sobrien}
18790075Sobrien
188169689Skanstatic __inline __m128 __attribute__((__always_inline__))
18990075Sobrien_mm_rcp_ps (__m128 __A)
19090075Sobrien{
19190075Sobrien  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
19290075Sobrien}
19390075Sobrien
194169689Skanstatic __inline __m128 __attribute__((__always_inline__))
19590075Sobrien_mm_rsqrt_ps (__m128 __A)
19690075Sobrien{
19790075Sobrien  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
19890075Sobrien}
19990075Sobrien
200169689Skanstatic __inline __m128 __attribute__((__always_inline__))
20190075Sobrien_mm_min_ps (__m128 __A, __m128 __B)
20290075Sobrien{
20390075Sobrien  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
20490075Sobrien}
20590075Sobrien
206169689Skanstatic __inline __m128 __attribute__((__always_inline__))
20790075Sobrien_mm_max_ps (__m128 __A, __m128 __B)
20890075Sobrien{
20990075Sobrien  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
21090075Sobrien}
21190075Sobrien
21290075Sobrien/* Perform logical bit-wise operations on 128-bit values.  */
21390075Sobrien
214169689Skanstatic __inline __m128 __attribute__((__always_inline__))
21590075Sobrien_mm_and_ps (__m128 __A, __m128 __B)
21690075Sobrien{
21790075Sobrien  return __builtin_ia32_andps (__A, __B);
21890075Sobrien}
21990075Sobrien
220169689Skanstatic __inline __m128 __attribute__((__always_inline__))
22190075Sobrien_mm_andnot_ps (__m128 __A, __m128 __B)
22290075Sobrien{
22390075Sobrien  return __builtin_ia32_andnps (__A, __B);
22490075Sobrien}
22590075Sobrien
226169689Skanstatic __inline __m128 __attribute__((__always_inline__))
22790075Sobrien_mm_or_ps (__m128 __A, __m128 __B)
22890075Sobrien{
22990075Sobrien  return __builtin_ia32_orps (__A, __B);
23090075Sobrien}
23190075Sobrien
232169689Skanstatic __inline __m128 __attribute__((__always_inline__))
23390075Sobrien_mm_xor_ps (__m128 __A, __m128 __B)
23490075Sobrien{
23590075Sobrien  return __builtin_ia32_xorps (__A, __B);
23690075Sobrien}
23790075Sobrien
23890075Sobrien/* Perform a comparison on the lower SPFP values of A and B.  If the
23990075Sobrien   comparison is true, place a mask of all ones in the result, otherwise a
24090075Sobrien   mask of zeros.  The upper three SPFP values are passed through from A.  */
24190075Sobrien
242169689Skanstatic __inline __m128 __attribute__((__always_inline__))
24390075Sobrien_mm_cmpeq_ss (__m128 __A, __m128 __B)
24490075Sobrien{
24590075Sobrien  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
24690075Sobrien}
24790075Sobrien
248169689Skanstatic __inline __m128 __attribute__((__always_inline__))
24990075Sobrien_mm_cmplt_ss (__m128 __A, __m128 __B)
25090075Sobrien{
25190075Sobrien  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
25290075Sobrien}
25390075Sobrien
254169689Skanstatic __inline __m128 __attribute__((__always_inline__))
25590075Sobrien_mm_cmple_ss (__m128 __A, __m128 __B)
25690075Sobrien{
25790075Sobrien  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
25890075Sobrien}
25990075Sobrien
260169689Skanstatic __inline __m128 __attribute__((__always_inline__))
26190075Sobrien_mm_cmpgt_ss (__m128 __A, __m128 __B)
26290075Sobrien{
263107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
264107590Sobrien					(__v4sf)
265107590Sobrien					__builtin_ia32_cmpltss ((__v4sf) __B,
266107590Sobrien								(__v4sf)
267107590Sobrien								__A));
26890075Sobrien}
26990075Sobrien
270169689Skanstatic __inline __m128 __attribute__((__always_inline__))
27190075Sobrien_mm_cmpge_ss (__m128 __A, __m128 __B)
27290075Sobrien{
273107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
274107590Sobrien					(__v4sf)
275107590Sobrien					__builtin_ia32_cmpless ((__v4sf) __B,
276107590Sobrien								(__v4sf)
277107590Sobrien								__A));
27890075Sobrien}
27990075Sobrien
280169689Skanstatic __inline __m128 __attribute__((__always_inline__))
28190075Sobrien_mm_cmpneq_ss (__m128 __A, __m128 __B)
28290075Sobrien{
28390075Sobrien  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
28490075Sobrien}
28590075Sobrien
286169689Skanstatic __inline __m128 __attribute__((__always_inline__))
28790075Sobrien_mm_cmpnlt_ss (__m128 __A, __m128 __B)
28890075Sobrien{
28990075Sobrien  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
29090075Sobrien}
29190075Sobrien
292169689Skanstatic __inline __m128 __attribute__((__always_inline__))
29390075Sobrien_mm_cmpnle_ss (__m128 __A, __m128 __B)
29490075Sobrien{
29590075Sobrien  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
29690075Sobrien}
29790075Sobrien
298169689Skanstatic __inline __m128 __attribute__((__always_inline__))
29990075Sobrien_mm_cmpngt_ss (__m128 __A, __m128 __B)
30090075Sobrien{
301107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
302107590Sobrien					(__v4sf)
303107590Sobrien					__builtin_ia32_cmpnltss ((__v4sf) __B,
304107590Sobrien								 (__v4sf)
305107590Sobrien								 __A));
30690075Sobrien}
30790075Sobrien
308169689Skanstatic __inline __m128 __attribute__((__always_inline__))
30990075Sobrien_mm_cmpnge_ss (__m128 __A, __m128 __B)
31090075Sobrien{
311107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
312107590Sobrien					(__v4sf)
313107590Sobrien					__builtin_ia32_cmpnless ((__v4sf) __B,
314107590Sobrien								 (__v4sf)
315107590Sobrien								 __A));
31690075Sobrien}
31790075Sobrien
318169689Skanstatic __inline __m128 __attribute__((__always_inline__))
31990075Sobrien_mm_cmpord_ss (__m128 __A, __m128 __B)
32090075Sobrien{
32190075Sobrien  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
32290075Sobrien}
32390075Sobrien
324169689Skanstatic __inline __m128 __attribute__((__always_inline__))
32590075Sobrien_mm_cmpunord_ss (__m128 __A, __m128 __B)
32690075Sobrien{
32790075Sobrien  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
32890075Sobrien}
32990075Sobrien
33090075Sobrien/* Perform a comparison on the four SPFP values of A and B.  For each
33190075Sobrien   element, if the comparison is true, place a mask of all ones in the
33290075Sobrien   result, otherwise a mask of zeros.  */
33390075Sobrien
334169689Skanstatic __inline __m128 __attribute__((__always_inline__))
33590075Sobrien_mm_cmpeq_ps (__m128 __A, __m128 __B)
33690075Sobrien{
33790075Sobrien  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
33890075Sobrien}
33990075Sobrien
340169689Skanstatic __inline __m128 __attribute__((__always_inline__))
34190075Sobrien_mm_cmplt_ps (__m128 __A, __m128 __B)
34290075Sobrien{
34390075Sobrien  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
34490075Sobrien}
34590075Sobrien
346169689Skanstatic __inline __m128 __attribute__((__always_inline__))
34790075Sobrien_mm_cmple_ps (__m128 __A, __m128 __B)
34890075Sobrien{
34990075Sobrien  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
35090075Sobrien}
35190075Sobrien
352169689Skanstatic __inline __m128 __attribute__((__always_inline__))
35390075Sobrien_mm_cmpgt_ps (__m128 __A, __m128 __B)
35490075Sobrien{
35590075Sobrien  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
35690075Sobrien}
35790075Sobrien
358169689Skanstatic __inline __m128 __attribute__((__always_inline__))
35990075Sobrien_mm_cmpge_ps (__m128 __A, __m128 __B)
36090075Sobrien{
36190075Sobrien  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
36290075Sobrien}
36390075Sobrien
364169689Skanstatic __inline __m128 __attribute__((__always_inline__))
36590075Sobrien_mm_cmpneq_ps (__m128 __A, __m128 __B)
36690075Sobrien{
36790075Sobrien  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
36890075Sobrien}
36990075Sobrien
370169689Skanstatic __inline __m128 __attribute__((__always_inline__))
37190075Sobrien_mm_cmpnlt_ps (__m128 __A, __m128 __B)
37290075Sobrien{
37390075Sobrien  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
37490075Sobrien}
37590075Sobrien
376169689Skanstatic __inline __m128 __attribute__((__always_inline__))
37790075Sobrien_mm_cmpnle_ps (__m128 __A, __m128 __B)
37890075Sobrien{
37990075Sobrien  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
38090075Sobrien}
38190075Sobrien
382169689Skanstatic __inline __m128 __attribute__((__always_inline__))
38390075Sobrien_mm_cmpngt_ps (__m128 __A, __m128 __B)
38490075Sobrien{
38590075Sobrien  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
38690075Sobrien}
38790075Sobrien
388169689Skanstatic __inline __m128 __attribute__((__always_inline__))
38990075Sobrien_mm_cmpnge_ps (__m128 __A, __m128 __B)
39090075Sobrien{
39190075Sobrien  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
39290075Sobrien}
39390075Sobrien
394169689Skanstatic __inline __m128 __attribute__((__always_inline__))
39590075Sobrien_mm_cmpord_ps (__m128 __A, __m128 __B)
39690075Sobrien{
39790075Sobrien  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
39890075Sobrien}
39990075Sobrien
400169689Skanstatic __inline __m128 __attribute__((__always_inline__))
40190075Sobrien_mm_cmpunord_ps (__m128 __A, __m128 __B)
40290075Sobrien{
40390075Sobrien  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
40490075Sobrien}
40590075Sobrien
40690075Sobrien/* Compare the lower SPFP values of A and B and return 1 if true
40790075Sobrien   and 0 if false.  */
40890075Sobrien
409169689Skanstatic __inline int __attribute__((__always_inline__))
41090075Sobrien_mm_comieq_ss (__m128 __A, __m128 __B)
41190075Sobrien{
41290075Sobrien  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
41390075Sobrien}
41490075Sobrien
415169689Skanstatic __inline int __attribute__((__always_inline__))
41690075Sobrien_mm_comilt_ss (__m128 __A, __m128 __B)
41790075Sobrien{
41890075Sobrien  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
41990075Sobrien}
42090075Sobrien
421169689Skanstatic __inline int __attribute__((__always_inline__))
42290075Sobrien_mm_comile_ss (__m128 __A, __m128 __B)
42390075Sobrien{
42490075Sobrien  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
42590075Sobrien}
42690075Sobrien
427169689Skanstatic __inline int __attribute__((__always_inline__))
42890075Sobrien_mm_comigt_ss (__m128 __A, __m128 __B)
42990075Sobrien{
43090075Sobrien  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
43190075Sobrien}
43290075Sobrien
433169689Skanstatic __inline int __attribute__((__always_inline__))
43490075Sobrien_mm_comige_ss (__m128 __A, __m128 __B)
43590075Sobrien{
43690075Sobrien  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
43790075Sobrien}
43890075Sobrien
439169689Skanstatic __inline int __attribute__((__always_inline__))
44090075Sobrien_mm_comineq_ss (__m128 __A, __m128 __B)
44190075Sobrien{
44290075Sobrien  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
44390075Sobrien}
44490075Sobrien
445169689Skanstatic __inline int __attribute__((__always_inline__))
44690075Sobrien_mm_ucomieq_ss (__m128 __A, __m128 __B)
44790075Sobrien{
44890075Sobrien  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
44990075Sobrien}
45090075Sobrien
451169689Skanstatic __inline int __attribute__((__always_inline__))
45290075Sobrien_mm_ucomilt_ss (__m128 __A, __m128 __B)
45390075Sobrien{
45490075Sobrien  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
45590075Sobrien}
45690075Sobrien
457169689Skanstatic __inline int __attribute__((__always_inline__))
45890075Sobrien_mm_ucomile_ss (__m128 __A, __m128 __B)
45990075Sobrien{
46090075Sobrien  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
46190075Sobrien}
46290075Sobrien
463169689Skanstatic __inline int __attribute__((__always_inline__))
46490075Sobrien_mm_ucomigt_ss (__m128 __A, __m128 __B)
46590075Sobrien{
46690075Sobrien  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
46790075Sobrien}
46890075Sobrien
469169689Skanstatic __inline int __attribute__((__always_inline__))
47090075Sobrien_mm_ucomige_ss (__m128 __A, __m128 __B)
47190075Sobrien{
47290075Sobrien  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
47390075Sobrien}
47490075Sobrien
475169689Skanstatic __inline int __attribute__((__always_inline__))
47690075Sobrien_mm_ucomineq_ss (__m128 __A, __m128 __B)
47790075Sobrien{
47890075Sobrien  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
47990075Sobrien}
48090075Sobrien
48190075Sobrien/* Convert the lower SPFP value to a 32-bit integer according to the current
48290075Sobrien   rounding mode.  */
483169689Skanstatic __inline int __attribute__((__always_inline__))
48490075Sobrien_mm_cvtss_si32 (__m128 __A)
48590075Sobrien{
48690075Sobrien  return __builtin_ia32_cvtss2si ((__v4sf) __A);
48790075Sobrien}
48890075Sobrien
489169689Skanstatic __inline int __attribute__((__always_inline__))
490122180Skan_mm_cvt_ss2si (__m128 __A)
491122180Skan{
492122180Skan  return _mm_cvtss_si32 (__A);
493122180Skan}
494122180Skan
495117395Skan#ifdef __x86_64__
/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode.  */
498169689Skan
499169689Skan/* Intel intrinsic.  */
500169689Skanstatic __inline long long __attribute__((__always_inline__))
501169689Skan_mm_cvtss_si64 (__m128 __A)
502169689Skan{
503169689Skan  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
504169689Skan}
505169689Skan
506169689Skan/* Microsoft intrinsic.  */
507169689Skanstatic __inline long long __attribute__((__always_inline__))
508117395Skan_mm_cvtss_si64x (__m128 __A)
509117395Skan{
510117395Skan  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
511117395Skan}
512117395Skan#endif
513117395Skan
51490075Sobrien/* Convert the two lower SPFP values to 32-bit integers according to the
51590075Sobrien   current rounding mode.  Return the integers in packed form.  */
516169689Skanstatic __inline __m64 __attribute__((__always_inline__))
51790075Sobrien_mm_cvtps_pi32 (__m128 __A)
51890075Sobrien{
51990075Sobrien  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
52090075Sobrien}
52190075Sobrien
522169689Skanstatic __inline __m64 __attribute__((__always_inline__))
523122180Skan_mm_cvt_ps2pi (__m128 __A)
524122180Skan{
525122180Skan  return _mm_cvtps_pi32 (__A);
526122180Skan}
527122180Skan
52890075Sobrien/* Truncate the lower SPFP value to a 32-bit integer.  */
529169689Skanstatic __inline int __attribute__((__always_inline__))
53090075Sobrien_mm_cvttss_si32 (__m128 __A)
53190075Sobrien{
53290075Sobrien  return __builtin_ia32_cvttss2si ((__v4sf) __A);
53390075Sobrien}
53490075Sobrien
535169689Skanstatic __inline int __attribute__((__always_inline__))
536122180Skan_mm_cvtt_ss2si (__m128 __A)
537122180Skan{
538122180Skan  return _mm_cvttss_si32 (__A);
539122180Skan}
540122180Skan
541117395Skan#ifdef __x86_64__
/* Truncate the lower SPFP value to a 64-bit integer.  */
543169689Skan
544169689Skan/* Intel intrinsic.  */
545169689Skanstatic __inline long long __attribute__((__always_inline__))
546169689Skan_mm_cvttss_si64 (__m128 __A)
547169689Skan{
548169689Skan  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
549169689Skan}
550169689Skan
551169689Skan/* Microsoft intrinsic.  */
552169689Skanstatic __inline long long __attribute__((__always_inline__))
553117395Skan_mm_cvttss_si64x (__m128 __A)
554117395Skan{
555117395Skan  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
556117395Skan}
557117395Skan#endif
558117395Skan
55990075Sobrien/* Truncate the two lower SPFP values to 32-bit integers.  Return the
56090075Sobrien   integers in packed form.  */
561169689Skanstatic __inline __m64 __attribute__((__always_inline__))
56290075Sobrien_mm_cvttps_pi32 (__m128 __A)
56390075Sobrien{
56490075Sobrien  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
56590075Sobrien}
56690075Sobrien
567169689Skanstatic __inline __m64 __attribute__((__always_inline__))
568122180Skan_mm_cvtt_ps2pi (__m128 __A)
569122180Skan{
570122180Skan  return _mm_cvttps_pi32 (__A);
571122180Skan}
572122180Skan
57390075Sobrien/* Convert B to a SPFP value and insert it as element zero in A.  */
574169689Skanstatic __inline __m128 __attribute__((__always_inline__))
57590075Sobrien_mm_cvtsi32_ss (__m128 __A, int __B)
57690075Sobrien{
57790075Sobrien  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
57890075Sobrien}
57990075Sobrien
580169689Skanstatic __inline __m128 __attribute__((__always_inline__))
581122180Skan_mm_cvt_si2ss (__m128 __A, int __B)
582122180Skan{
583122180Skan  return _mm_cvtsi32_ss (__A, __B);
584122180Skan}
585122180Skan
586117395Skan#ifdef __x86_64__
587117395Skan/* Convert B to a SPFP value and insert it as element zero in A.  */
588169689Skan
589169689Skan/* Intel intrinsic.  */
590169689Skanstatic __inline __m128 __attribute__((__always_inline__))
591169689Skan_mm_cvtsi64_ss (__m128 __A, long long __B)
592169689Skan{
593169689Skan  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
594169689Skan}
595169689Skan
596169689Skan/* Microsoft intrinsic.  */
597169689Skanstatic __inline __m128 __attribute__((__always_inline__))
598117395Skan_mm_cvtsi64x_ss (__m128 __A, long long __B)
599117395Skan{
600117395Skan  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
601117395Skan}
602117395Skan#endif
603117395Skan
60490075Sobrien/* Convert the two 32-bit values in B to SPFP form and insert them
60590075Sobrien   as the two lower elements in A.  */
606169689Skanstatic __inline __m128 __attribute__((__always_inline__))
60790075Sobrien_mm_cvtpi32_ps (__m128 __A, __m64 __B)
60890075Sobrien{
60990075Sobrien  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
61090075Sobrien}
61190075Sobrien
612169689Skanstatic __inline __m128 __attribute__((__always_inline__))
613122180Skan_mm_cvt_pi2ps (__m128 __A, __m64 __B)
614122180Skan{
615122180Skan  return _mm_cvtpi32_ps (__A, __B);
616122180Skan}
617122180Skan
61890075Sobrien/* Convert the four signed 16-bit values in A to SPFP form.  */
619169689Skanstatic __inline __m128 __attribute__((__always_inline__))
62090075Sobrien_mm_cvtpi16_ps (__m64 __A)
62190075Sobrien{
62290075Sobrien  __v4hi __sign;
62390075Sobrien  __v2si __hisi, __losi;
62490075Sobrien  __v4sf __r;
62590075Sobrien
62690075Sobrien  /* This comparison against zero gives us a mask that can be used to
62790075Sobrien     fill in the missing sign bits in the unpack operations below, so
62890075Sobrien     that we get signed values after unpacking.  */
629169689Skan  __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);
63090075Sobrien
63190075Sobrien  /* Convert the four words to doublewords.  */
63290075Sobrien  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
63390075Sobrien  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
63490075Sobrien
63590075Sobrien  /* Convert the doublewords to floating point two at a time.  */
636169689Skan  __r = (__v4sf) _mm_setzero_ps ();
63790075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
63890075Sobrien  __r = __builtin_ia32_movlhps (__r, __r);
63990075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __losi);
64090075Sobrien
64190075Sobrien  return (__m128) __r;
64290075Sobrien}
64390075Sobrien
64490075Sobrien/* Convert the four unsigned 16-bit values in A to SPFP form.  */
645169689Skanstatic __inline __m128 __attribute__((__always_inline__))
64690075Sobrien_mm_cvtpu16_ps (__m64 __A)
64790075Sobrien{
64890075Sobrien  __v2si __hisi, __losi;
64990075Sobrien  __v4sf __r;
65090075Sobrien
65190075Sobrien  /* Convert the four words to doublewords.  */
652169689Skan  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);
653169689Skan  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);
65490075Sobrien
65590075Sobrien  /* Convert the doublewords to floating point two at a time.  */
656169689Skan  __r = (__v4sf) _mm_setzero_ps ();
65790075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
65890075Sobrien  __r = __builtin_ia32_movlhps (__r, __r);
65990075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __losi);
66090075Sobrien
66190075Sobrien  return (__m128) __r;
66290075Sobrien}
66390075Sobrien
66490075Sobrien/* Convert the low four signed 8-bit values in A to SPFP form.  */
665169689Skanstatic __inline __m128 __attribute__((__always_inline__))
66690075Sobrien_mm_cvtpi8_ps (__m64 __A)
66790075Sobrien{
66890075Sobrien  __v8qi __sign;
66990075Sobrien
67090075Sobrien  /* This comparison against zero gives us a mask that can be used to
67190075Sobrien     fill in the missing sign bits in the unpack operations below, so
67290075Sobrien     that we get signed values after unpacking.  */
673169689Skan  __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);
67490075Sobrien
67590075Sobrien  /* Convert the four low bytes to words.  */
67690075Sobrien  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);
67790075Sobrien
67890075Sobrien  return _mm_cvtpi16_ps(__A);
67990075Sobrien}
68090075Sobrien
68190075Sobrien/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
682169689Skanstatic __inline __m128 __attribute__((__always_inline__))
68390075Sobrien_mm_cvtpu8_ps(__m64 __A)
68490075Sobrien{
685169689Skan  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
68690075Sobrien  return _mm_cvtpu16_ps(__A);
68790075Sobrien}
68890075Sobrien
68990075Sobrien/* Convert the four signed 32-bit values in A and B to SPFP form.  */
690169689Skanstatic __inline __m128 __attribute__((__always_inline__))
69190075Sobrien_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
69290075Sobrien{
693169689Skan  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
69490075Sobrien  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
69590075Sobrien  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B);
69690075Sobrien  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
69790075Sobrien}
69890075Sobrien
69990075Sobrien/* Convert the four SPFP values in A to four signed 16-bit integers.  */
700169689Skanstatic __inline __m64 __attribute__((__always_inline__))
70190075Sobrien_mm_cvtps_pi16(__m128 __A)
70290075Sobrien{
70390075Sobrien  __v4sf __hisf = (__v4sf)__A;
70490075Sobrien  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
70590075Sobrien  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
70690075Sobrien  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
707117395Skan  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
70890075Sobrien}
70990075Sobrien
71090075Sobrien/* Convert the four SPFP values in A to four signed 8-bit integers.  */
711169689Skanstatic __inline __m64 __attribute__((__always_inline__))
71290075Sobrien_mm_cvtps_pi8(__m128 __A)
71390075Sobrien{
71490075Sobrien  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
715169689Skan  return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
71690075Sobrien}
71790075Sobrien
/* Selects four specific SPFP values from A and B based on MASK.
   Provided as a macro; the equivalent prototype would be
     __m128 _mm_shuffle_ps (__m128 __A, __m128 __B, int __mask).
   NOTE(review): presumably a macro because the SHUFPS selector has to
   be a compile-time constant -- confirm.  */
#define _mm_shuffle_ps(A, B, MASK)					\
  ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK)))
72990075Sobrien
73090075Sobrien
73190075Sobrien/* Selects and interleaves the upper two SPFP values from A and B.  */
732169689Skanstatic __inline __m128 __attribute__((__always_inline__))
73390075Sobrien_mm_unpackhi_ps (__m128 __A, __m128 __B)
73490075Sobrien{
73590075Sobrien  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
73690075Sobrien}
73790075Sobrien
73890075Sobrien/* Selects and interleaves the lower two SPFP values from A and B.  */
739169689Skanstatic __inline __m128 __attribute__((__always_inline__))
74090075Sobrien_mm_unpacklo_ps (__m128 __A, __m128 __B)
74190075Sobrien{
74290075Sobrien  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
74390075Sobrien}
74490075Sobrien
74590075Sobrien/* Sets the upper two SPFP values with 64-bits of data loaded from P;
74690075Sobrien   the lower two values are passed through from A.  */
747169689Skanstatic __inline __m128 __attribute__((__always_inline__))
748117395Skan_mm_loadh_pi (__m128 __A, __m64 const *__P)
74990075Sobrien{
75090075Sobrien  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P);
75190075Sobrien}
75290075Sobrien
75390075Sobrien/* Stores the upper two SPFP values of A into P.  */
754169689Skanstatic __inline void __attribute__((__always_inline__))
75590075Sobrien_mm_storeh_pi (__m64 *__P, __m128 __A)
75690075Sobrien{
75790075Sobrien  __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A);
75890075Sobrien}
75990075Sobrien
76090075Sobrien/* Moves the upper two values of B into the lower two values of A.  */
761169689Skanstatic __inline __m128 __attribute__((__always_inline__))
76290075Sobrien_mm_movehl_ps (__m128 __A, __m128 __B)
76390075Sobrien{
76490075Sobrien  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
76590075Sobrien}
76690075Sobrien
76790075Sobrien/* Moves the lower two values of B into the upper two values of A.  */
768169689Skanstatic __inline __m128 __attribute__((__always_inline__))
76990075Sobrien_mm_movelh_ps (__m128 __A, __m128 __B)
77090075Sobrien{
77190075Sobrien  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
77290075Sobrien}
77390075Sobrien
77490075Sobrien/* Sets the lower two SPFP values with 64-bits of data loaded from P;
77590075Sobrien   the upper two values are passed through from A.  */
776169689Skanstatic __inline __m128 __attribute__((__always_inline__))
777117395Skan_mm_loadl_pi (__m128 __A, __m64 const *__P)
77890075Sobrien{
77990075Sobrien  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P);
78090075Sobrien}
78190075Sobrien
78290075Sobrien/* Stores the lower two SPFP values of A into P.  */
783169689Skanstatic __inline void __attribute__((__always_inline__))
78490075Sobrien_mm_storel_pi (__m64 *__P, __m128 __A)
78590075Sobrien{
78690075Sobrien  __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A);
78790075Sobrien}
78890075Sobrien
78990075Sobrien/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
790169689Skanstatic __inline int __attribute__((__always_inline__))
79190075Sobrien_mm_movemask_ps (__m128 __A)
79290075Sobrien{
79390075Sobrien  return __builtin_ia32_movmskps ((__v4sf)__A);
79490075Sobrien}
79590075Sobrien
/* Read the control register and return its current contents.  */
static __inline unsigned int __attribute__((__always_inline__))
_mm_getcsr (void)
{
  unsigned int __csr = __builtin_ia32_stmxcsr ();
  return __csr;
}
80290075Sobrien
80390075Sobrien/* Read exception bits from the control register.  */
804169689Skanstatic __inline unsigned int __attribute__((__always_inline__))
80590075Sobrien_MM_GET_EXCEPTION_STATE (void)
80690075Sobrien{
80790075Sobrien  return _mm_getcsr() & _MM_EXCEPT_MASK;
80890075Sobrien}
80990075Sobrien
810169689Skanstatic __inline unsigned int __attribute__((__always_inline__))
81190075Sobrien_MM_GET_EXCEPTION_MASK (void)
81290075Sobrien{
81390075Sobrien  return _mm_getcsr() & _MM_MASK_MASK;
81490075Sobrien}
81590075Sobrien
816169689Skanstatic __inline unsigned int __attribute__((__always_inline__))
81790075Sobrien_MM_GET_ROUNDING_MODE (void)
81890075Sobrien{
81990075Sobrien  return _mm_getcsr() & _MM_ROUND_MASK;
82090075Sobrien}
82190075Sobrien
822169689Skanstatic __inline unsigned int __attribute__((__always_inline__))
82390075Sobrien_MM_GET_FLUSH_ZERO_MODE (void)
82490075Sobrien{
82590075Sobrien  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
82690075Sobrien}
82790075Sobrien
/* Load the control register with the value I.  */
static __inline void __attribute__((__always_inline__))
_mm_setcsr (unsigned int __I)
{
  unsigned int __new_csr = __I;
  __builtin_ia32_ldmxcsr (__new_csr);
}
83490075Sobrien
83590075Sobrien/* Set exception bits in the control register.  */
836169689Skanstatic __inline void __attribute__((__always_inline__))
83790075Sobrien_MM_SET_EXCEPTION_STATE(unsigned int __mask)
83890075Sobrien{
83990075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
84090075Sobrien}
84190075Sobrien
842169689Skanstatic __inline void __attribute__((__always_inline__))
84390075Sobrien_MM_SET_EXCEPTION_MASK (unsigned int __mask)
84490075Sobrien{
84590075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
84690075Sobrien}
84790075Sobrien
848169689Skanstatic __inline void __attribute__((__always_inline__))
84990075Sobrien_MM_SET_ROUNDING_MODE (unsigned int __mode)
85090075Sobrien{
85190075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
85290075Sobrien}
85390075Sobrien
854169689Skanstatic __inline void __attribute__((__always_inline__))
85590075Sobrien_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
85690075Sobrien{
85790075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
85890075Sobrien}
85990075Sobrien
860169689Skan/* Create a vector with element 0 as F and the rest zero.  */
861169689Skanstatic __inline __m128 __attribute__((__always_inline__))
862169689Skan_mm_set_ss (float __F)
863169689Skan{
864169689Skan  return __extension__ (__m128)(__v4sf){ __F, 0, 0, 0 };
865169689Skan}
866169689Skan
867169689Skan/* Create a vector with all four elements equal to F.  */
868169689Skanstatic __inline __m128 __attribute__((__always_inline__))
869169689Skan_mm_set1_ps (float __F)
870169689Skan{
871169689Skan  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
872169689Skan}
873169689Skan
874169689Skanstatic __inline __m128 __attribute__((__always_inline__))
875169689Skan_mm_set_ps1 (float __F)
876169689Skan{
877169689Skan  return _mm_set1_ps (__F);
878169689Skan}
879169689Skan
88090075Sobrien/* Create a vector with element 0 as *P and the rest zero.  */
881169689Skanstatic __inline __m128 __attribute__((__always_inline__))
882117395Skan_mm_load_ss (float const *__P)
88390075Sobrien{
884169689Skan  return _mm_set_ss (*__P);
88590075Sobrien}
88690075Sobrien
88790075Sobrien/* Create a vector with all four elements equal to *P.  */
888169689Skanstatic __inline __m128 __attribute__((__always_inline__))
889117395Skan_mm_load1_ps (float const *__P)
89090075Sobrien{
891169689Skan  return _mm_set1_ps (*__P);
89290075Sobrien}
89390075Sobrien
894169689Skanstatic __inline __m128 __attribute__((__always_inline__))
895117395Skan_mm_load_ps1 (float const *__P)
89690075Sobrien{
89790075Sobrien  return _mm_load1_ps (__P);
89890075Sobrien}
89990075Sobrien
90090075Sobrien/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
901169689Skanstatic __inline __m128 __attribute__((__always_inline__))
902117395Skan_mm_load_ps (float const *__P)
90390075Sobrien{
904169689Skan  return (__m128) *(__v4sf *)__P;
90590075Sobrien}
90690075Sobrien
90790075Sobrien/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
908169689Skanstatic __inline __m128 __attribute__((__always_inline__))
909117395Skan_mm_loadu_ps (float const *__P)
91090075Sobrien{
91190075Sobrien  return (__m128) __builtin_ia32_loadups (__P);
91290075Sobrien}
91390075Sobrien
91490075Sobrien/* Load four SPFP values in reverse order.  The address must be aligned.  */
915169689Skanstatic __inline __m128 __attribute__((__always_inline__))
916117395Skan_mm_loadr_ps (float const *__P)
91790075Sobrien{
918169689Skan  __v4sf __tmp = *(__v4sf *)__P;
91990075Sobrien  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
92090075Sobrien}
92190075Sobrien
922169689Skan/* Create the vector [Z Y X W].  */
923169689Skanstatic __inline __m128 __attribute__((__always_inline__))
924169689Skan_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
92590075Sobrien{
926169689Skan  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
92790075Sobrien}
92890075Sobrien
929169689Skan/* Create the vector [W X Y Z].  */
930169689Skanstatic __inline __m128 __attribute__((__always_inline__))
931169689Skan_mm_setr_ps (float __Z, float __Y, float __X, float __W)
93290075Sobrien{
933169689Skan  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
93490075Sobrien}
93590075Sobrien
936169689Skan/* Stores the lower SPFP value.  */
937169689Skanstatic __inline void __attribute__((__always_inline__))
938169689Skan_mm_store_ss (float *__P, __m128 __A)
93990075Sobrien{
940169689Skan  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
94190075Sobrien}
94290075Sobrien
943169689Skanstatic __inline float __attribute__((__always_inline__))
944169689Skan_mm_cvtss_f32 (__m128 __A)
94590075Sobrien{
946169689Skan  return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
94790075Sobrien}
94890075Sobrien
949169689Skan/* Store four SPFP values.  The address must be 16-byte aligned.  */
950169689Skanstatic __inline void __attribute__((__always_inline__))
951169689Skan_mm_store_ps (float *__P, __m128 __A)
95290075Sobrien{
953169689Skan  *(__v4sf *)__P = (__v4sf)__A;
95490075Sobrien}
95590075Sobrien
956169689Skan/* Store four SPFP values.  The address need not be 16-byte aligned.  */
957169689Skanstatic __inline void __attribute__((__always_inline__))
958169689Skan_mm_storeu_ps (float *__P, __m128 __A)
95990075Sobrien{
960169689Skan  __builtin_ia32_storeups (__P, (__v4sf)__A);
96190075Sobrien}
96290075Sobrien
96390075Sobrien/* Store the lower SPFP value across four words.  */
964169689Skanstatic __inline void __attribute__((__always_inline__))
96590075Sobrien_mm_store1_ps (float *__P, __m128 __A)
96690075Sobrien{
96790075Sobrien  __v4sf __va = (__v4sf)__A;
96890075Sobrien  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
969169689Skan  _mm_storeu_ps (__P, __tmp);
97090075Sobrien}
97190075Sobrien
972169689Skanstatic __inline void __attribute__((__always_inline__))
97390075Sobrien_mm_store_ps1 (float *__P, __m128 __A)
97490075Sobrien{
97590075Sobrien  _mm_store1_ps (__P, __A);
97690075Sobrien}
97790075Sobrien
978117395Skan/* Store four SPFP values in reverse order.  The address must be aligned.  */
979169689Skanstatic __inline void __attribute__((__always_inline__))
98090075Sobrien_mm_storer_ps (float *__P, __m128 __A)
98190075Sobrien{
98290075Sobrien  __v4sf __va = (__v4sf)__A;
98390075Sobrien  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
984169689Skan  _mm_store_ps (__P, __tmp);
98590075Sobrien}
98690075Sobrien
98790075Sobrien/* Sets the low SPFP value of A from the low value of B.  */
988169689Skanstatic __inline __m128 __attribute__((__always_inline__))
98990075Sobrien_mm_move_ss (__m128 __A, __m128 __B)
99090075Sobrien{
99190075Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
99290075Sobrien}
99390075Sobrien
/* Read one of the four 16-bit words of A.  The selector N must be an
   immediate.  Only the macro forms are compiled; the function forms are
   kept disabled as a reference for the intended prototypes.  */
#if 0
static __inline int __attribute__((__always_inline__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  __v4hi __words = (__v4hi)__A;
  return __builtin_ia32_vec_ext_v4hi (__words, __N);
}

/* Alternate name for _mm_extract_pi16.  */
static __inline int __attribute__((__always_inline__))
_m_pextrw (__m64 const __A, int const __N)
{
  return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
}
#else
#define _mm_extract_pi16(A, N) \
  __builtin_ia32_vec_ext_v4hi ((__v4hi)(A), (N))
#define _m_pextrw(A, N) \
  _mm_extract_pi16((A), (N))
#endif
101190075Sobrien
/* Return A with one of its four 16-bit words replaced by D.  The
   selector N must be an immediate.  Only the macro forms are compiled;
   the function forms are kept disabled as a reference for the intended
   prototypes.  */
#if 0
static __inline __m64 __attribute__((__always_inline__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  __v4hi __words = (__v4hi)__A;
  return (__m64) __builtin_ia32_vec_set_v4hi (__words, __D, __N);
}

/* Alternate name for _mm_insert_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N) \
  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(A), \
                                        (D), \
                                        (N)))
#define _m_pinsrw(A, D, N) \
  _mm_insert_pi16((A), (D), (N))
#endif
103190075Sobrien
103290075Sobrien/* Compute the element-wise maximum of signed 16-bit values.  */
1033169689Skanstatic __inline __m64 __attribute__((__always_inline__))
103490075Sobrien_mm_max_pi16 (__m64 __A, __m64 __B)
103590075Sobrien{
103690075Sobrien  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
103790075Sobrien}
103890075Sobrien
1039169689Skanstatic __inline __m64 __attribute__((__always_inline__))
1040122180Skan_m_pmaxsw (__m64 __A, __m64 __B)
1041122180Skan{
1042122180Skan  return _mm_max_pi16 (__A, __B);
1043122180Skan}
1044122180Skan
104590075Sobrien/* Compute the element-wise maximum of unsigned 8-bit values.  */
1046169689Skanstatic __inline __m64 __attribute__((__always_inline__))
104790075Sobrien_mm_max_pu8 (__m64 __A, __m64 __B)
104890075Sobrien{
104990075Sobrien  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
105090075Sobrien}
105190075Sobrien
1052169689Skanstatic __inline __m64 __attribute__((__always_inline__))
1053122180Skan_m_pmaxub (__m64 __A, __m64 __B)
1054122180Skan{
1055122180Skan  return _mm_max_pu8 (__A, __B);
1056122180Skan}
1057122180Skan
105890075Sobrien/* Compute the element-wise minimum of signed 16-bit values.  */
1059169689Skanstatic __inline __m64 __attribute__((__always_inline__))
106090075Sobrien_mm_min_pi16 (__m64 __A, __m64 __B)
106190075Sobrien{
106290075Sobrien  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
106390075Sobrien}
106490075Sobrien
1065169689Skanstatic __inline __m64 __attribute__((__always_inline__))
1066122180Skan_m_pminsw (__m64 __A, __m64 __B)
1067122180Skan{
1068122180Skan  return _mm_min_pi16 (__A, __B);
1069122180Skan}
1070122180Skan
107190075Sobrien/* Compute the element-wise minimum of unsigned 8-bit values.  */
1072169689Skanstatic __inline __m64 __attribute__((__always_inline__))
107390075Sobrien_mm_min_pu8 (__m64 __A, __m64 __B)
107490075Sobrien{
107590075Sobrien  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
107690075Sobrien}
107790075Sobrien
1078169689Skanstatic __inline __m64 __attribute__((__always_inline__))
1079122180Skan_m_pminub (__m64 __A, __m64 __B)
1080122180Skan{
1081122180Skan  return _mm_min_pu8 (__A, __B);
1082122180Skan}
1083122180Skan
108490075Sobrien/* Create an 8-bit mask of the signs of 8-bit values.  */
1085169689Skanstatic __inline int __attribute__((__always_inline__))
108690075Sobrien_mm_movemask_pi8 (__m64 __A)
108790075Sobrien{
108890075Sobrien  return __builtin_ia32_pmovmskb ((__v8qi)__A);
108990075Sobrien}
109090075Sobrien
1091169689Skanstatic __inline int __attribute__((__always_inline__))
1092122180Skan_m_pmovmskb (__m64 __A)
1093122180Skan{
1094122180Skan  return _mm_movemask_pi8 (__A);
1095122180Skan}
1096122180Skan
109790075Sobrien/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
109890075Sobrien   in B and produce the high 16 bits of the 32-bit results.  */
1099169689Skanstatic __inline __m64 __attribute__((__always_inline__))
110090075Sobrien_mm_mulhi_pu16 (__m64 __A, __m64 __B)
110190075Sobrien{
110290075Sobrien  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
110390075Sobrien}
110490075Sobrien
1105169689Skanstatic __inline __m64 __attribute__((__always_inline__))
1106122180Skan_m_pmulhuw (__m64 __A, __m64 __B)
1107122180Skan{
1108122180Skan  return _mm_mulhi_pu16 (__A, __B);
1109122180Skan}
1110122180Skan
/* Return a rearrangement of the four 16-bit values of A chosen by the
   selector N, which must be an immediate.  Only the macro forms are
   compiled; the function forms are kept disabled as a reference for the
   intended prototypes.  */
#if 0
static __inline __m64 __attribute__((__always_inline__))
_mm_shuffle_pi16 (__m64 __A, int __N)
{
  __v4hi __words = (__v4hi)__A;
  return (__m64) __builtin_ia32_pshufw (__words, __N);
}

/* Alternate name for _mm_shuffle_pi16.  */
static __inline __m64 __attribute__((__always_inline__))
_m_pshufw (__m64 __A, int __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), \
                                  (N)))
#define _m_pshufw(A, N) \
  _mm_shuffle_pi16 ((A), (N))
#endif
113090075Sobrien
113190075Sobrien/* Conditionally store byte elements of A into P.  The high bit of each
113290075Sobrien   byte in the selector N determines whether the corresponding byte from
113390075Sobrien   A is stored.  */
1134169689Skanstatic __inline void __attribute__((__always_inline__))
113590075Sobrien_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
113690075Sobrien{
113790075Sobrien  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
113890075Sobrien}
113990075Sobrien
1140169689Skanstatic __inline void __attribute__((__always_inline__))
1141122180Skan_m_maskmovq (__m64 __A, __m64 __N, char *__P)
1142122180Skan{
1143122180Skan  _mm_maskmove_si64 (__A, __N, __P);
1144122180Skan}
1145122180Skan
114690075Sobrien/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
1147169689Skanstatic __inline __m64 __attribute__((__always_inline__))
114890075Sobrien_mm_avg_pu8 (__m64 __A, __m64 __B)
114990075Sobrien{
115090075Sobrien  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
115190075Sobrien}
115290075Sobrien
1153169689Skanstatic __inline __m64 __attribute__((__always_inline__))
1154122180Skan_m_pavgb (__m64 __A, __m64 __B)
1155122180Skan{
1156122180Skan  return _mm_avg_pu8 (__A, __B);
1157122180Skan}
1158122180Skan
115990075Sobrien/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
1160169689Skanstatic __inline __m64 __attribute__((__always_inline__))
116190075Sobrien_mm_avg_pu16 (__m64 __A, __m64 __B)
116290075Sobrien{
116390075Sobrien  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
116490075Sobrien}
116590075Sobrien
1166169689Skanstatic __inline __m64 __attribute__((__always_inline__))
1167122180Skan_m_pavgw (__m64 __A, __m64 __B)
1168122180Skan{
1169122180Skan  return _mm_avg_pu16 (__A, __B);
1170122180Skan}
1171122180Skan
117290075Sobrien/* Compute the sum of the absolute differences of the unsigned 8-bit
117390075Sobrien   values in A and B.  Return the value in the lower 16-bit word; the
117490075Sobrien   upper words are cleared.  */
1175169689Skanstatic __inline __m64 __attribute__((__always_inline__))
117690075Sobrien_mm_sad_pu8 (__m64 __A, __m64 __B)
117790075Sobrien{
117890075Sobrien  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
117990075Sobrien}
118090075Sobrien
1181169689Skanstatic __inline __m64 __attribute__((__always_inline__))
1182122180Skan_m_psadbw (__m64 __A, __m64 __B)
1183122180Skan{
1184122180Skan  return _mm_sad_pu8 (__A, __B);
1185122180Skan}
1186122180Skan
/* Bring the cache line at address P "closer" to the processor.  The
   selector I picks the kind of prefetch and is passed straight through
   as the third argument of __builtin_prefetch; the second argument is
   fixed at 0.  Only the macro form is compiled; the function form is
   kept disabled as a reference for the intended prototype.  */
#if 0
static __inline void __attribute__((__always_inline__))
_mm_prefetch (void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I)  __builtin_prefetch ((P), 0, (I))
#endif
119990075Sobrien
120090075Sobrien/* Stores the data in A to the address P without polluting the caches.  */
1201169689Skanstatic __inline void __attribute__((__always_inline__))
120290075Sobrien_mm_stream_pi (__m64 *__P, __m64 __A)
120390075Sobrien{
1204117395Skan  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
120590075Sobrien}
120690075Sobrien
120790075Sobrien/* Likewise.  The address must be 16-byte aligned.  */
1208169689Skanstatic __inline void __attribute__((__always_inline__))
120990075Sobrien_mm_stream_ps (float *__P, __m128 __A)
121090075Sobrien{
121190075Sobrien  __builtin_ia32_movntps (__P, (__v4sf)__A);
121290075Sobrien}
121390075Sobrien
/* Store fence: every store issued before this call becomes globally
   visible before any store issued after it.  */
static __inline void __attribute__((__always_inline__))
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}
122190075Sobrien
/* Delay execution of the next instruction by an implementation specific
   amount of time without modifying the architectural state.  The
   instruction is emitted through inline assembly as "rep; nop".  */
static __inline void __attribute__((__always_inline__))
_mm_pause (void)
{
  __asm__ __volatile__ ("rep; nop"
			: /* no outputs */
			: /* no inputs */);
}
123090075Sobrien
/* Transpose in place the 4x4 matrix whose rows are row0..row3.  Each
   argument is read once into a local copy and assigned once at the end,
   so all four arguments must be lvalues.  The row pairs (0,1) and (2,3)
   are first interleaved, low halves and high halves separately, and the
   matching 64-bit halves of those results are then joined to form the
   output rows.  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __v4sf __in0 = (row0); \
  __v4sf __in1 = (row1); \
  __v4sf __in2 = (row2); \
  __v4sf __in3 = (row3); \
  __v4sf __lo01 = __builtin_ia32_unpcklps (__in0, __in1); \
  __v4sf __lo23 = __builtin_ia32_unpcklps (__in2, __in3); \
  __v4sf __hi01 = __builtin_ia32_unpckhps (__in0, __in1); \
  __v4sf __hi23 = __builtin_ia32_unpckhps (__in2, __in3); \
  (row0) = __builtin_ia32_movlhps (__lo01, __lo23); \
  (row1) = __builtin_ia32_movhlps (__lo23, __lo01); \
  (row2) = __builtin_ia32_movlhps (__hi01, __hi23); \
  (row3) = __builtin_ia32_movhlps (__hi23, __hi01); \
} while (0)
124490075Sobrien
1245122180Skan/* For backward source compatibility.  */
1246122180Skan#include <emmintrin.h>
1247117395Skan
1248117395Skan#endif /* __SSE__ */
124990075Sobrien#endif /* _XMMINTRIN_H_INCLUDED */
1250