xmmintrin.h revision 122180
1122180Skan/* Copyright (C) 2002, 2003 Free Software Foundation, Inc.
290075Sobrien
390075Sobrien   This file is part of GNU CC.
490075Sobrien
590075Sobrien   GNU CC is free software; you can redistribute it and/or modify
690075Sobrien   it under the terms of the GNU General Public License as published by
790075Sobrien   the Free Software Foundation; either version 2, or (at your option)
890075Sobrien   any later version.
990075Sobrien
1090075Sobrien   GNU CC is distributed in the hope that it will be useful,
1190075Sobrien   but WITHOUT ANY WARRANTY; without even the implied warranty of
1290075Sobrien   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1390075Sobrien   GNU General Public License for more details.
1490075Sobrien
1590075Sobrien   You should have received a copy of the GNU General Public License
1690075Sobrien   along with GNU CC; see the file COPYING.  If not, write to
1790075Sobrien   the Free Software Foundation, 59 Temple Place - Suite 330,
1890075Sobrien   Boston, MA 02111-1307, USA.  */
1990075Sobrien
2090075Sobrien/* As a special exception, if you include this header file into source
2190075Sobrien   files compiled by GCC, this header file does not by itself cause
2290075Sobrien   the resulting executable to be covered by the GNU General Public
2390075Sobrien   License.  This exception does not however invalidate any other
2490075Sobrien   reasons why the executable file might be covered by the GNU General
2590075Sobrien   Public License.  */
2690075Sobrien
2790075Sobrien/* Implemented from the specification included in the Intel C++ Compiler
28122180Skan   User Guide and Reference, version 8.0.  */
2990075Sobrien
3090075Sobrien#ifndef _XMMINTRIN_H_INCLUDED
3190075Sobrien#define _XMMINTRIN_H_INCLUDED
3290075Sobrien
33117395Skan#ifndef __SSE__
34117395Skan# error "SSE instruction set not enabled"
35117395Skan#else
36117395Skan
3790075Sobrien/* We need type definitions from the MMX header file.  */
3890075Sobrien#include <mmintrin.h>
3990075Sobrien
4090075Sobrien/* The data type indended for user use.  */
4190075Sobrientypedef int __m128 __attribute__ ((__mode__(__V4SF__)));
4290075Sobrien
4390075Sobrien/* Internal data types for implementing the instrinsics.  */
4490075Sobrientypedef int __v4sf __attribute__ ((__mode__(__V4SF__)));
4590075Sobrientypedef int __v4si __attribute__ ((__mode__(__V4SI__)));
4690075Sobrien
/* Create a selector for use with the SHUFPS instruction.  Each argument
   occupies a two-bit field: FP0 in bits 0-1, FP1 in bits 2-3, FP2 in
   bits 4-5 and FP3 in bits 6-7.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
5090075Sobrien
/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};
5990075Sobrien
/* Bits in the MXCSR.  */

/* Exception state field; read by _MM_GET_EXCEPTION_STATE.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

/* Exception mask field; read by _MM_GET_EXCEPTION_MASK.  */
#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

/* Rounding mode field; read by _MM_GET_ROUNDING_MODE.  */
#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

/* Flush-to-zero field; read by _MM_GET_FLUSH_ZERO_MODE.  */
#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000
8690075Sobrien
/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */
9090075Sobrien
9190075Sobrienstatic __inline __m128
9290075Sobrien_mm_add_ss (__m128 __A, __m128 __B)
9390075Sobrien{
9490075Sobrien  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
9590075Sobrien}
9690075Sobrien
9790075Sobrienstatic __inline __m128
9890075Sobrien_mm_sub_ss (__m128 __A, __m128 __B)
9990075Sobrien{
10090075Sobrien  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
10190075Sobrien}
10290075Sobrien
10390075Sobrienstatic __inline __m128
10490075Sobrien_mm_mul_ss (__m128 __A, __m128 __B)
10590075Sobrien{
10690075Sobrien  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
10790075Sobrien}
10890075Sobrien
10990075Sobrienstatic __inline __m128
11090075Sobrien_mm_div_ss (__m128 __A, __m128 __B)
11190075Sobrien{
11290075Sobrien  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
11390075Sobrien}
11490075Sobrien
11590075Sobrienstatic __inline __m128
11690075Sobrien_mm_sqrt_ss (__m128 __A)
11790075Sobrien{
11890075Sobrien  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
11990075Sobrien}
12090075Sobrien
12190075Sobrienstatic __inline __m128
12290075Sobrien_mm_rcp_ss (__m128 __A)
12390075Sobrien{
12490075Sobrien  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
12590075Sobrien}
12690075Sobrien
12790075Sobrienstatic __inline __m128
12890075Sobrien_mm_rsqrt_ss (__m128 __A)
12990075Sobrien{
13090075Sobrien  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
13190075Sobrien}
13290075Sobrien
13390075Sobrienstatic __inline __m128
13490075Sobrien_mm_min_ss (__m128 __A, __m128 __B)
13590075Sobrien{
13690075Sobrien  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
13790075Sobrien}
13890075Sobrien
13990075Sobrienstatic __inline __m128
14090075Sobrien_mm_max_ss (__m128 __A, __m128 __B)
14190075Sobrien{
14290075Sobrien  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
14390075Sobrien}
14490075Sobrien
/* Perform the respective operation on the four SPFP values in A and B.  */
14690075Sobrien
14790075Sobrienstatic __inline __m128
14890075Sobrien_mm_add_ps (__m128 __A, __m128 __B)
14990075Sobrien{
15090075Sobrien  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
15190075Sobrien}
15290075Sobrien
15390075Sobrienstatic __inline __m128
15490075Sobrien_mm_sub_ps (__m128 __A, __m128 __B)
15590075Sobrien{
15690075Sobrien  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
15790075Sobrien}
15890075Sobrien
15990075Sobrienstatic __inline __m128
16090075Sobrien_mm_mul_ps (__m128 __A, __m128 __B)
16190075Sobrien{
16290075Sobrien  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
16390075Sobrien}
16490075Sobrien
16590075Sobrienstatic __inline __m128
16690075Sobrien_mm_div_ps (__m128 __A, __m128 __B)
16790075Sobrien{
16890075Sobrien  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
16990075Sobrien}
17090075Sobrien
17190075Sobrienstatic __inline __m128
17290075Sobrien_mm_sqrt_ps (__m128 __A)
17390075Sobrien{
17490075Sobrien  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
17590075Sobrien}
17690075Sobrien
17790075Sobrienstatic __inline __m128
17890075Sobrien_mm_rcp_ps (__m128 __A)
17990075Sobrien{
18090075Sobrien  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
18190075Sobrien}
18290075Sobrien
18390075Sobrienstatic __inline __m128
18490075Sobrien_mm_rsqrt_ps (__m128 __A)
18590075Sobrien{
18690075Sobrien  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
18790075Sobrien}
18890075Sobrien
18990075Sobrienstatic __inline __m128
19090075Sobrien_mm_min_ps (__m128 __A, __m128 __B)
19190075Sobrien{
19290075Sobrien  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
19390075Sobrien}
19490075Sobrien
19590075Sobrienstatic __inline __m128
19690075Sobrien_mm_max_ps (__m128 __A, __m128 __B)
19790075Sobrien{
19890075Sobrien  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
19990075Sobrien}
20090075Sobrien
/* Perform logical bit-wise operations on 128-bit values.  */
20290075Sobrien
20390075Sobrienstatic __inline __m128
20490075Sobrien_mm_and_ps (__m128 __A, __m128 __B)
20590075Sobrien{
20690075Sobrien  return __builtin_ia32_andps (__A, __B);
20790075Sobrien}
20890075Sobrien
20990075Sobrienstatic __inline __m128
21090075Sobrien_mm_andnot_ps (__m128 __A, __m128 __B)
21190075Sobrien{
21290075Sobrien  return __builtin_ia32_andnps (__A, __B);
21390075Sobrien}
21490075Sobrien
21590075Sobrienstatic __inline __m128
21690075Sobrien_mm_or_ps (__m128 __A, __m128 __B)
21790075Sobrien{
21890075Sobrien  return __builtin_ia32_orps (__A, __B);
21990075Sobrien}
22090075Sobrien
22190075Sobrienstatic __inline __m128
22290075Sobrien_mm_xor_ps (__m128 __A, __m128 __B)
22390075Sobrien{
22490075Sobrien  return __builtin_ia32_xorps (__A, __B);
22590075Sobrien}
22690075Sobrien
/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */
23090075Sobrien
23190075Sobrienstatic __inline __m128
23290075Sobrien_mm_cmpeq_ss (__m128 __A, __m128 __B)
23390075Sobrien{
23490075Sobrien  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
23590075Sobrien}
23690075Sobrien
23790075Sobrienstatic __inline __m128
23890075Sobrien_mm_cmplt_ss (__m128 __A, __m128 __B)
23990075Sobrien{
24090075Sobrien  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
24190075Sobrien}
24290075Sobrien
24390075Sobrienstatic __inline __m128
24490075Sobrien_mm_cmple_ss (__m128 __A, __m128 __B)
24590075Sobrien{
24690075Sobrien  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
24790075Sobrien}
24890075Sobrien
24990075Sobrienstatic __inline __m128
25090075Sobrien_mm_cmpgt_ss (__m128 __A, __m128 __B)
25190075Sobrien{
252107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
253107590Sobrien					(__v4sf)
254107590Sobrien					__builtin_ia32_cmpltss ((__v4sf) __B,
255107590Sobrien								(__v4sf)
256107590Sobrien								__A));
25790075Sobrien}
25890075Sobrien
25990075Sobrienstatic __inline __m128
26090075Sobrien_mm_cmpge_ss (__m128 __A, __m128 __B)
26190075Sobrien{
262107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
263107590Sobrien					(__v4sf)
264107590Sobrien					__builtin_ia32_cmpless ((__v4sf) __B,
265107590Sobrien								(__v4sf)
266107590Sobrien								__A));
26790075Sobrien}
26890075Sobrien
26990075Sobrienstatic __inline __m128
27090075Sobrien_mm_cmpneq_ss (__m128 __A, __m128 __B)
27190075Sobrien{
27290075Sobrien  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
27390075Sobrien}
27490075Sobrien
27590075Sobrienstatic __inline __m128
27690075Sobrien_mm_cmpnlt_ss (__m128 __A, __m128 __B)
27790075Sobrien{
27890075Sobrien  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
27990075Sobrien}
28090075Sobrien
28190075Sobrienstatic __inline __m128
28290075Sobrien_mm_cmpnle_ss (__m128 __A, __m128 __B)
28390075Sobrien{
28490075Sobrien  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
28590075Sobrien}
28690075Sobrien
28790075Sobrienstatic __inline __m128
28890075Sobrien_mm_cmpngt_ss (__m128 __A, __m128 __B)
28990075Sobrien{
290107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
291107590Sobrien					(__v4sf)
292107590Sobrien					__builtin_ia32_cmpnltss ((__v4sf) __B,
293107590Sobrien								 (__v4sf)
294107590Sobrien								 __A));
29590075Sobrien}
29690075Sobrien
29790075Sobrienstatic __inline __m128
29890075Sobrien_mm_cmpnge_ss (__m128 __A, __m128 __B)
29990075Sobrien{
300107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
301107590Sobrien					(__v4sf)
302107590Sobrien					__builtin_ia32_cmpnless ((__v4sf) __B,
303107590Sobrien								 (__v4sf)
304107590Sobrien								 __A));
30590075Sobrien}
30690075Sobrien
30790075Sobrienstatic __inline __m128
30890075Sobrien_mm_cmpord_ss (__m128 __A, __m128 __B)
30990075Sobrien{
31090075Sobrien  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
31190075Sobrien}
31290075Sobrien
31390075Sobrienstatic __inline __m128
31490075Sobrien_mm_cmpunord_ss (__m128 __A, __m128 __B)
31590075Sobrien{
31690075Sobrien  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
31790075Sobrien}
31890075Sobrien
/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */
32290075Sobrien
32390075Sobrienstatic __inline __m128
32490075Sobrien_mm_cmpeq_ps (__m128 __A, __m128 __B)
32590075Sobrien{
32690075Sobrien  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
32790075Sobrien}
32890075Sobrien
32990075Sobrienstatic __inline __m128
33090075Sobrien_mm_cmplt_ps (__m128 __A, __m128 __B)
33190075Sobrien{
33290075Sobrien  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
33390075Sobrien}
33490075Sobrien
33590075Sobrienstatic __inline __m128
33690075Sobrien_mm_cmple_ps (__m128 __A, __m128 __B)
33790075Sobrien{
33890075Sobrien  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
33990075Sobrien}
34090075Sobrien
34190075Sobrienstatic __inline __m128
34290075Sobrien_mm_cmpgt_ps (__m128 __A, __m128 __B)
34390075Sobrien{
34490075Sobrien  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
34590075Sobrien}
34690075Sobrien
34790075Sobrienstatic __inline __m128
34890075Sobrien_mm_cmpge_ps (__m128 __A, __m128 __B)
34990075Sobrien{
35090075Sobrien  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
35190075Sobrien}
35290075Sobrien
35390075Sobrienstatic __inline __m128
35490075Sobrien_mm_cmpneq_ps (__m128 __A, __m128 __B)
35590075Sobrien{
35690075Sobrien  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
35790075Sobrien}
35890075Sobrien
35990075Sobrienstatic __inline __m128
36090075Sobrien_mm_cmpnlt_ps (__m128 __A, __m128 __B)
36190075Sobrien{
36290075Sobrien  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
36390075Sobrien}
36490075Sobrien
36590075Sobrienstatic __inline __m128
36690075Sobrien_mm_cmpnle_ps (__m128 __A, __m128 __B)
36790075Sobrien{
36890075Sobrien  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
36990075Sobrien}
37090075Sobrien
37190075Sobrienstatic __inline __m128
37290075Sobrien_mm_cmpngt_ps (__m128 __A, __m128 __B)
37390075Sobrien{
37490075Sobrien  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
37590075Sobrien}
37690075Sobrien
37790075Sobrienstatic __inline __m128
37890075Sobrien_mm_cmpnge_ps (__m128 __A, __m128 __B)
37990075Sobrien{
38090075Sobrien  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
38190075Sobrien}
38290075Sobrien
38390075Sobrienstatic __inline __m128
38490075Sobrien_mm_cmpord_ps (__m128 __A, __m128 __B)
38590075Sobrien{
38690075Sobrien  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
38790075Sobrien}
38890075Sobrien
38990075Sobrienstatic __inline __m128
39090075Sobrien_mm_cmpunord_ps (__m128 __A, __m128 __B)
39190075Sobrien{
39290075Sobrien  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
39390075Sobrien}
39490075Sobrien
/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */
39790075Sobrien
39890075Sobrienstatic __inline int
39990075Sobrien_mm_comieq_ss (__m128 __A, __m128 __B)
40090075Sobrien{
40190075Sobrien  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
40290075Sobrien}
40390075Sobrien
40490075Sobrienstatic __inline int
40590075Sobrien_mm_comilt_ss (__m128 __A, __m128 __B)
40690075Sobrien{
40790075Sobrien  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
40890075Sobrien}
40990075Sobrien
41090075Sobrienstatic __inline int
41190075Sobrien_mm_comile_ss (__m128 __A, __m128 __B)
41290075Sobrien{
41390075Sobrien  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
41490075Sobrien}
41590075Sobrien
41690075Sobrienstatic __inline int
41790075Sobrien_mm_comigt_ss (__m128 __A, __m128 __B)
41890075Sobrien{
41990075Sobrien  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
42090075Sobrien}
42190075Sobrien
42290075Sobrienstatic __inline int
42390075Sobrien_mm_comige_ss (__m128 __A, __m128 __B)
42490075Sobrien{
42590075Sobrien  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
42690075Sobrien}
42790075Sobrien
42890075Sobrienstatic __inline int
42990075Sobrien_mm_comineq_ss (__m128 __A, __m128 __B)
43090075Sobrien{
43190075Sobrien  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
43290075Sobrien}
43390075Sobrien
43490075Sobrienstatic __inline int
43590075Sobrien_mm_ucomieq_ss (__m128 __A, __m128 __B)
43690075Sobrien{
43790075Sobrien  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
43890075Sobrien}
43990075Sobrien
44090075Sobrienstatic __inline int
44190075Sobrien_mm_ucomilt_ss (__m128 __A, __m128 __B)
44290075Sobrien{
44390075Sobrien  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
44490075Sobrien}
44590075Sobrien
44690075Sobrienstatic __inline int
44790075Sobrien_mm_ucomile_ss (__m128 __A, __m128 __B)
44890075Sobrien{
44990075Sobrien  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
45090075Sobrien}
45190075Sobrien
45290075Sobrienstatic __inline int
45390075Sobrien_mm_ucomigt_ss (__m128 __A, __m128 __B)
45490075Sobrien{
45590075Sobrien  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
45690075Sobrien}
45790075Sobrien
45890075Sobrienstatic __inline int
45990075Sobrien_mm_ucomige_ss (__m128 __A, __m128 __B)
46090075Sobrien{
46190075Sobrien  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
46290075Sobrien}
46390075Sobrien
46490075Sobrienstatic __inline int
46590075Sobrien_mm_ucomineq_ss (__m128 __A, __m128 __B)
46690075Sobrien{
46790075Sobrien  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
46890075Sobrien}
46990075Sobrien
47090075Sobrien/* Convert the lower SPFP value to a 32-bit integer according to the current
47190075Sobrien   rounding mode.  */
47290075Sobrienstatic __inline int
47390075Sobrien_mm_cvtss_si32 (__m128 __A)
47490075Sobrien{
47590075Sobrien  return __builtin_ia32_cvtss2si ((__v4sf) __A);
47690075Sobrien}
47790075Sobrien
478122180Skanstatic __inline int
479122180Skan_mm_cvt_ss2si (__m128 __A)
480122180Skan{
481122180Skan  return _mm_cvtss_si32 (__A);
482122180Skan}
483122180Skan
484117395Skan#ifdef __x86_64__
485117395Skan/* Convert the lower SPFP value to a 32-bit integer according to the current
486117395Skan   rounding mode.  */
487117395Skanstatic __inline long long
488117395Skan_mm_cvtss_si64x (__m128 __A)
489117395Skan{
490117395Skan  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
491117395Skan}
492117395Skan#endif
493117395Skan
49490075Sobrien/* Convert the two lower SPFP values to 32-bit integers according to the
49590075Sobrien   current rounding mode.  Return the integers in packed form.  */
49690075Sobrienstatic __inline __m64
49790075Sobrien_mm_cvtps_pi32 (__m128 __A)
49890075Sobrien{
49990075Sobrien  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
50090075Sobrien}
50190075Sobrien
502122180Skanstatic __inline __m64
503122180Skan_mm_cvt_ps2pi (__m128 __A)
504122180Skan{
505122180Skan  return _mm_cvtps_pi32 (__A);
506122180Skan}
507122180Skan
50890075Sobrien/* Truncate the lower SPFP value to a 32-bit integer.  */
50990075Sobrienstatic __inline int
51090075Sobrien_mm_cvttss_si32 (__m128 __A)
51190075Sobrien{
51290075Sobrien  return __builtin_ia32_cvttss2si ((__v4sf) __A);
51390075Sobrien}
51490075Sobrien
515122180Skanstatic __inline int
516122180Skan_mm_cvtt_ss2si (__m128 __A)
517122180Skan{
518122180Skan  return _mm_cvttss_si32 (__A);
519122180Skan}
520122180Skan
521117395Skan#ifdef __x86_64__
522117395Skan/* Truncate the lower SPFP value to a 32-bit integer.  */
523117395Skanstatic __inline long long
524117395Skan_mm_cvttss_si64x (__m128 __A)
525117395Skan{
526117395Skan  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
527117395Skan}
528117395Skan#endif
529117395Skan
53090075Sobrien/* Truncate the two lower SPFP values to 32-bit integers.  Return the
53190075Sobrien   integers in packed form.  */
53290075Sobrienstatic __inline __m64
53390075Sobrien_mm_cvttps_pi32 (__m128 __A)
53490075Sobrien{
53590075Sobrien  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
53690075Sobrien}
53790075Sobrien
538122180Skanstatic __inline __m64
539122180Skan_mm_cvtt_ps2pi (__m128 __A)
540122180Skan{
541122180Skan  return _mm_cvttps_pi32 (__A);
542122180Skan}
543122180Skan
54490075Sobrien/* Convert B to a SPFP value and insert it as element zero in A.  */
54590075Sobrienstatic __inline __m128
54690075Sobrien_mm_cvtsi32_ss (__m128 __A, int __B)
54790075Sobrien{
54890075Sobrien  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
54990075Sobrien}
55090075Sobrien
551122180Skanstatic __inline __m128
552122180Skan_mm_cvt_si2ss (__m128 __A, int __B)
553122180Skan{
554122180Skan  return _mm_cvtsi32_ss (__A, __B);
555122180Skan}
556122180Skan
557117395Skan#ifdef __x86_64__
558117395Skan/* Convert B to a SPFP value and insert it as element zero in A.  */
559117395Skanstatic __inline __m128
560117395Skan_mm_cvtsi64x_ss (__m128 __A, long long __B)
561117395Skan{
562117395Skan  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
563117395Skan}
564117395Skan#endif
565117395Skan
56690075Sobrien/* Convert the two 32-bit values in B to SPFP form and insert them
56790075Sobrien   as the two lower elements in A.  */
56890075Sobrienstatic __inline __m128
56990075Sobrien_mm_cvtpi32_ps (__m128 __A, __m64 __B)
57090075Sobrien{
57190075Sobrien  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
57290075Sobrien}
57390075Sobrien
574122180Skanstatic __inline __m128
575122180Skan_mm_cvt_pi2ps (__m128 __A, __m64 __B)
576122180Skan{
577122180Skan  return _mm_cvtpi32_ps (__A, __B);
578122180Skan}
579122180Skan
58090075Sobrien/* Convert the four signed 16-bit values in A to SPFP form.  */
58190075Sobrienstatic __inline __m128
58290075Sobrien_mm_cvtpi16_ps (__m64 __A)
58390075Sobrien{
58490075Sobrien  __v4hi __sign;
58590075Sobrien  __v2si __hisi, __losi;
58690075Sobrien  __v4sf __r;
58790075Sobrien
58890075Sobrien  /* This comparison against zero gives us a mask that can be used to
58990075Sobrien     fill in the missing sign bits in the unpack operations below, so
59090075Sobrien     that we get signed values after unpacking.  */
59190075Sobrien  __sign = (__v4hi) __builtin_ia32_mmx_zero ();
59290075Sobrien  __sign = __builtin_ia32_pcmpgtw (__sign, (__v4hi)__A);
59390075Sobrien
59490075Sobrien  /* Convert the four words to doublewords.  */
59590075Sobrien  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
59690075Sobrien  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
59790075Sobrien
59890075Sobrien  /* Convert the doublewords to floating point two at a time.  */
59990075Sobrien  __r = (__v4sf) __builtin_ia32_setzerops ();
60090075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
60190075Sobrien  __r = __builtin_ia32_movlhps (__r, __r);
60290075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __losi);
60390075Sobrien
60490075Sobrien  return (__m128) __r;
60590075Sobrien}
60690075Sobrien
60790075Sobrien/* Convert the four unsigned 16-bit values in A to SPFP form.  */
60890075Sobrienstatic __inline __m128
60990075Sobrien_mm_cvtpu16_ps (__m64 __A)
61090075Sobrien{
61190075Sobrien  __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero ();
61290075Sobrien  __v2si __hisi, __losi;
61390075Sobrien  __v4sf __r;
61490075Sobrien
61590075Sobrien  /* Convert the four words to doublewords.  */
61690075Sobrien  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __zero);
61790075Sobrien  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __zero);
61890075Sobrien
61990075Sobrien  /* Convert the doublewords to floating point two at a time.  */
62090075Sobrien  __r = (__v4sf) __builtin_ia32_setzerops ();
62190075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
62290075Sobrien  __r = __builtin_ia32_movlhps (__r, __r);
62390075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __losi);
62490075Sobrien
62590075Sobrien  return (__m128) __r;
62690075Sobrien}
62790075Sobrien
62890075Sobrien/* Convert the low four signed 8-bit values in A to SPFP form.  */
62990075Sobrienstatic __inline __m128
63090075Sobrien_mm_cvtpi8_ps (__m64 __A)
63190075Sobrien{
63290075Sobrien  __v8qi __sign;
63390075Sobrien
63490075Sobrien  /* This comparison against zero gives us a mask that can be used to
63590075Sobrien     fill in the missing sign bits in the unpack operations below, so
63690075Sobrien     that we get signed values after unpacking.  */
63790075Sobrien  __sign = (__v8qi) __builtin_ia32_mmx_zero ();
63890075Sobrien  __sign = __builtin_ia32_pcmpgtb (__sign, (__v8qi)__A);
63990075Sobrien
64090075Sobrien  /* Convert the four low bytes to words.  */
64190075Sobrien  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);
64290075Sobrien
64390075Sobrien  return _mm_cvtpi16_ps(__A);
64490075Sobrien}
64590075Sobrien
64690075Sobrien/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
64790075Sobrienstatic __inline __m128
64890075Sobrien_mm_cvtpu8_ps(__m64 __A)
64990075Sobrien{
65090075Sobrien  __v8qi __zero = (__v8qi) __builtin_ia32_mmx_zero ();
65190075Sobrien  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __zero);
65290075Sobrien  return _mm_cvtpu16_ps(__A);
65390075Sobrien}
65490075Sobrien
65590075Sobrien/* Convert the four signed 32-bit values in A and B to SPFP form.  */
65690075Sobrienstatic __inline __m128
65790075Sobrien_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
65890075Sobrien{
65990075Sobrien  __v4sf __zero = (__v4sf) __builtin_ia32_setzerops ();
66090075Sobrien  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
66190075Sobrien  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B);
66290075Sobrien  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
66390075Sobrien}
66490075Sobrien
66590075Sobrien/* Convert the four SPFP values in A to four signed 16-bit integers.  */
66690075Sobrienstatic __inline __m64
66790075Sobrien_mm_cvtps_pi16(__m128 __A)
66890075Sobrien{
66990075Sobrien  __v4sf __hisf = (__v4sf)__A;
67090075Sobrien  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
67190075Sobrien  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
67290075Sobrien  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
673117395Skan  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
67490075Sobrien}
67590075Sobrien
67690075Sobrien/* Convert the four SPFP values in A to four signed 8-bit integers.  */
67790075Sobrienstatic __inline __m64
67890075Sobrien_mm_cvtps_pi8(__m128 __A)
67990075Sobrien{
68090075Sobrien  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
68190075Sobrien  __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero ();
68290075Sobrien  return (__m64) __builtin_ia32_packsswb (__tmp, __zero);
68390075Sobrien}
68490075Sobrien
/* Selects four specific SPFP values from A and B based on MASK.
   Behaves like a function with the prototype
     __m128 _mm_shuffle_ps (__m128 __A, __m128 __B, int __mask)
   but is provided as a macro.
   NOTE(review): presumably because the SHUFPS builtin needs MASK as a
   compile-time constant (see _MM_SHUFFLE) -- confirm.  */
#define _mm_shuffle_ps(A, B, MASK) \
 ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK)))
69690075Sobrien
69790075Sobrien
69890075Sobrien/* Selects and interleaves the upper two SPFP values from A and B.  */
69990075Sobrienstatic __inline __m128
70090075Sobrien_mm_unpackhi_ps (__m128 __A, __m128 __B)
70190075Sobrien{
70290075Sobrien  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
70390075Sobrien}
70490075Sobrien
70590075Sobrien/* Selects and interleaves the lower two SPFP values from A and B.  */
70690075Sobrienstatic __inline __m128
70790075Sobrien_mm_unpacklo_ps (__m128 __A, __m128 __B)
70890075Sobrien{
70990075Sobrien  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
71090075Sobrien}
71190075Sobrien
71290075Sobrien/* Sets the upper two SPFP values with 64-bits of data loaded from P;
71390075Sobrien   the lower two values are passed through from A.  */
71490075Sobrienstatic __inline __m128
715117395Skan_mm_loadh_pi (__m128 __A, __m64 const *__P)
71690075Sobrien{
71790075Sobrien  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P);
71890075Sobrien}
71990075Sobrien
72090075Sobrien/* Stores the upper two SPFP values of A into P.  */
72190075Sobrienstatic __inline void
72290075Sobrien_mm_storeh_pi (__m64 *__P, __m128 __A)
72390075Sobrien{
72490075Sobrien  __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A);
72590075Sobrien}
72690075Sobrien
72790075Sobrien/* Moves the upper two values of B into the lower two values of A.  */
72890075Sobrienstatic __inline __m128
72990075Sobrien_mm_movehl_ps (__m128 __A, __m128 __B)
73090075Sobrien{
73190075Sobrien  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
73290075Sobrien}
73390075Sobrien
73490075Sobrien/* Moves the lower two values of B into the upper two values of A.  */
73590075Sobrienstatic __inline __m128
73690075Sobrien_mm_movelh_ps (__m128 __A, __m128 __B)
73790075Sobrien{
73890075Sobrien  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
73990075Sobrien}
74090075Sobrien
74190075Sobrien/* Sets the lower two SPFP values with 64-bits of data loaded from P;
74290075Sobrien   the upper two values are passed through from A.  */
74390075Sobrienstatic __inline __m128
744117395Skan_mm_loadl_pi (__m128 __A, __m64 const *__P)
74590075Sobrien{
74690075Sobrien  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P);
74790075Sobrien}
74890075Sobrien
74990075Sobrien/* Stores the lower two SPFP values of A into P.  */
75090075Sobrienstatic __inline void
75190075Sobrien_mm_storel_pi (__m64 *__P, __m128 __A)
75290075Sobrien{
75390075Sobrien  __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A);
75490075Sobrien}
75590075Sobrien
75690075Sobrien/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
75790075Sobrienstatic __inline int
75890075Sobrien_mm_movemask_ps (__m128 __A)
75990075Sobrien{
76090075Sobrien  return __builtin_ia32_movmskps ((__v4sf)__A);
76190075Sobrien}
76290075Sobrien
/* Return the contents of the control register.  */
static __inline unsigned int
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}
76990075Sobrien
77090075Sobrien/* Read exception bits from the control register.  */
77190075Sobrienstatic __inline unsigned int
77290075Sobrien_MM_GET_EXCEPTION_STATE (void)
77390075Sobrien{
77490075Sobrien  return _mm_getcsr() & _MM_EXCEPT_MASK;
77590075Sobrien}
77690075Sobrien
77790075Sobrienstatic __inline unsigned int
77890075Sobrien_MM_GET_EXCEPTION_MASK (void)
77990075Sobrien{
78090075Sobrien  return _mm_getcsr() & _MM_MASK_MASK;
78190075Sobrien}
78290075Sobrien
78390075Sobrienstatic __inline unsigned int
78490075Sobrien_MM_GET_ROUNDING_MODE (void)
78590075Sobrien{
78690075Sobrien  return _mm_getcsr() & _MM_ROUND_MASK;
78790075Sobrien}
78890075Sobrien
78990075Sobrienstatic __inline unsigned int
79090075Sobrien_MM_GET_FLUSH_ZERO_MODE (void)
79190075Sobrien{
79290075Sobrien  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
79390075Sobrien}
79490075Sobrien
/* Set the control register to I.  */
static __inline void
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}
80190075Sobrien
80290075Sobrien/* Set exception bits in the control register.  */
80390075Sobrienstatic __inline void
80490075Sobrien_MM_SET_EXCEPTION_STATE(unsigned int __mask)
80590075Sobrien{
80690075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
80790075Sobrien}
80890075Sobrien
80990075Sobrienstatic __inline void
81090075Sobrien_MM_SET_EXCEPTION_MASK (unsigned int __mask)
81190075Sobrien{
81290075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
81390075Sobrien}
81490075Sobrien
81590075Sobrienstatic __inline void
81690075Sobrien_MM_SET_ROUNDING_MODE (unsigned int __mode)
81790075Sobrien{
81890075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
81990075Sobrien}
82090075Sobrien
82190075Sobrienstatic __inline void
82290075Sobrien_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
82390075Sobrien{
82490075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
82590075Sobrien}
82690075Sobrien
82790075Sobrien/* Create a vector with element 0 as *P and the rest zero.  */
82890075Sobrienstatic __inline __m128
829117395Skan_mm_load_ss (float const *__P)
83090075Sobrien{
83190075Sobrien  return (__m128) __builtin_ia32_loadss (__P);
83290075Sobrien}
83390075Sobrien
83490075Sobrien/* Create a vector with all four elements equal to *P.  */
83590075Sobrienstatic __inline __m128
836117395Skan_mm_load1_ps (float const *__P)
83790075Sobrien{
83890075Sobrien  __v4sf __tmp = __builtin_ia32_loadss (__P);
83990075Sobrien  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0));
84090075Sobrien}
84190075Sobrien
84290075Sobrienstatic __inline __m128
843117395Skan_mm_load_ps1 (float const *__P)
84490075Sobrien{
84590075Sobrien  return _mm_load1_ps (__P);
84690075Sobrien}
84790075Sobrien
84890075Sobrien/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
84990075Sobrienstatic __inline __m128
850117395Skan_mm_load_ps (float const *__P)
85190075Sobrien{
85290075Sobrien  return (__m128) __builtin_ia32_loadaps (__P);
85390075Sobrien}
85490075Sobrien
85590075Sobrien/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
85690075Sobrienstatic __inline __m128
857117395Skan_mm_loadu_ps (float const *__P)
85890075Sobrien{
85990075Sobrien  return (__m128) __builtin_ia32_loadups (__P);
86090075Sobrien}
86190075Sobrien
86290075Sobrien/* Load four SPFP values in reverse order.  The address must be aligned.  */
86390075Sobrienstatic __inline __m128
864117395Skan_mm_loadr_ps (float const *__P)
86590075Sobrien{
86690075Sobrien  __v4sf __tmp = __builtin_ia32_loadaps (__P);
86790075Sobrien  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
86890075Sobrien}
86990075Sobrien
87090075Sobrien/* Create a vector with element 0 as F and the rest zero.  */
87190075Sobrienstatic __inline __m128
87290075Sobrien_mm_set_ss (float __F)
87390075Sobrien{
87490075Sobrien  return (__m128) __builtin_ia32_loadss (&__F);
87590075Sobrien}
87690075Sobrien
87790075Sobrien/* Create a vector with all four elements equal to F.  */
87890075Sobrienstatic __inline __m128
87990075Sobrien_mm_set1_ps (float __F)
88090075Sobrien{
88190075Sobrien  __v4sf __tmp = __builtin_ia32_loadss (&__F);
88290075Sobrien  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0));
88390075Sobrien}
88490075Sobrien
88590075Sobrienstatic __inline __m128
88690075Sobrien_mm_set_ps1 (float __F)
88790075Sobrien{
88890075Sobrien  return _mm_set1_ps (__F);
88990075Sobrien}
89090075Sobrien
89190075Sobrien/* Create the vector [Z Y X W].  */
89290075Sobrienstatic __inline __m128
89390075Sobrien_mm_set_ps (float __Z, float __Y, float __X, float __W)
89490075Sobrien{
89590075Sobrien  union {
89690075Sobrien    float __a[4];
89790075Sobrien    __m128 __v;
89890075Sobrien  } __u;
89990075Sobrien
90090075Sobrien  __u.__a[0] = __W;
90190075Sobrien  __u.__a[1] = __X;
90290075Sobrien  __u.__a[2] = __Y;
90390075Sobrien  __u.__a[3] = __Z;
90490075Sobrien
90590075Sobrien  return __u.__v;
90690075Sobrien}
90790075Sobrien
90890075Sobrien/* Create the vector [W X Y Z].  */
90990075Sobrienstatic __inline __m128
91090075Sobrien_mm_setr_ps (float __Z, float __Y, float __X, float __W)
91190075Sobrien{
91290075Sobrien  return _mm_set_ps (__W, __X, __Y, __Z);
91390075Sobrien}
91490075Sobrien
91590075Sobrien/* Create a vector of zeros.  */
91690075Sobrienstatic __inline __m128
91790075Sobrien_mm_setzero_ps (void)
91890075Sobrien{
91990075Sobrien  return (__m128) __builtin_ia32_setzerops ();
92090075Sobrien}
92190075Sobrien
92290075Sobrien/* Stores the lower SPFP value.  */
92390075Sobrienstatic __inline void
92490075Sobrien_mm_store_ss (float *__P, __m128 __A)
92590075Sobrien{
92690075Sobrien  __builtin_ia32_storess (__P, (__v4sf)__A);
92790075Sobrien}
92890075Sobrien
92990075Sobrien/* Store the lower SPFP value across four words.  */
93090075Sobrienstatic __inline void
93190075Sobrien_mm_store1_ps (float *__P, __m128 __A)
93290075Sobrien{
93390075Sobrien  __v4sf __va = (__v4sf)__A;
93490075Sobrien  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
93590075Sobrien  __builtin_ia32_storeaps (__P, __tmp);
93690075Sobrien}
93790075Sobrien
93890075Sobrienstatic __inline void
93990075Sobrien_mm_store_ps1 (float *__P, __m128 __A)
94090075Sobrien{
94190075Sobrien  _mm_store1_ps (__P, __A);
94290075Sobrien}
94390075Sobrien
94490075Sobrien/* Store four SPFP values.  The address must be 16-byte aligned.  */
94590075Sobrienstatic __inline void
94690075Sobrien_mm_store_ps (float *__P, __m128 __A)
94790075Sobrien{
94890075Sobrien  __builtin_ia32_storeaps (__P, (__v4sf)__A);
94990075Sobrien}
95090075Sobrien
95190075Sobrien/* Store four SPFP values.  The address need not be 16-byte aligned.  */
95290075Sobrienstatic __inline void
95390075Sobrien_mm_storeu_ps (float *__P, __m128 __A)
95490075Sobrien{
95590075Sobrien  __builtin_ia32_storeups (__P, (__v4sf)__A);
95690075Sobrien}
95790075Sobrien
958117395Skan/* Store four SPFP values in reverse order.  The address must be aligned.  */
95990075Sobrienstatic __inline void
96090075Sobrien_mm_storer_ps (float *__P, __m128 __A)
96190075Sobrien{
96290075Sobrien  __v4sf __va = (__v4sf)__A;
96390075Sobrien  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
96490075Sobrien  __builtin_ia32_storeaps (__P, __tmp);
96590075Sobrien}
96690075Sobrien
96790075Sobrien/* Sets the low SPFP value of A from the low value of B.  */
96890075Sobrienstatic __inline __m128
96990075Sobrien_mm_move_ss (__m128 __A, __m128 __B)
97090075Sobrien{
97190075Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
97290075Sobrien}
97390075Sobrien
/* Extract one of the four 16-bit words of A.  The selector N must be an
   immediate.  The inline-function form is compiled out and the macro form
   is the one in effect -- presumably so that N reaches the builtin as a
   literal constant (NOTE(review): confirm rationale).  */
#if 0
static __inline int
_mm_extract_pi16 (__m64 __A, int __N)
{
  return __builtin_ia32_pextrw ((__v4hi)__A, __N);
}

static __inline int
_m_pextrw (__m64 __A, int __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
#define _mm_extract_pi16(A, N) \
  (__builtin_ia32_pextrw ((__v4hi)(A), (N)))
/* Alternate name for _mm_extract_pi16.  */
#define _m_pextrw(A, N) \
  _mm_extract_pi16 ((A), (N))
#endif
99290075Sobrien
/* Return A with one of its four 16-bit words replaced by D.  The selector
   N must be an immediate.  The inline-function form is compiled out and
   the macro form is the one in effect -- presumably so that N reaches the
   builtin as a literal constant (NOTE(review): confirm rationale).  */
#if 0
static __inline __m64
_mm_insert_pi16 (__m64 __A, int __D, int __N)
{
  return (__m64)__builtin_ia32_pinsrw ((__v4hi)__A, __D, __N);
}

static __inline __m64
_m_pinsrw (__m64 __A, int __D, int __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N) \
  ((__m64) __builtin_ia32_pinsrw ((__v4hi)(A), (D), (N)))
/* Alternate name for _mm_insert_pi16.  */
#define _m_pinsrw(A, D, N) \
  _mm_insert_pi16 ((A), (D), (N))
#endif
101290075Sobrien
101390075Sobrien/* Compute the element-wise maximum of signed 16-bit values.  */
101490075Sobrienstatic __inline __m64
101590075Sobrien_mm_max_pi16 (__m64 __A, __m64 __B)
101690075Sobrien{
101790075Sobrien  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
101890075Sobrien}
101990075Sobrien
1020122180Skanstatic __inline __m64
1021122180Skan_m_pmaxsw (__m64 __A, __m64 __B)
1022122180Skan{
1023122180Skan  return _mm_max_pi16 (__A, __B);
1024122180Skan}
1025122180Skan
102690075Sobrien/* Compute the element-wise maximum of unsigned 8-bit values.  */
102790075Sobrienstatic __inline __m64
102890075Sobrien_mm_max_pu8 (__m64 __A, __m64 __B)
102990075Sobrien{
103090075Sobrien  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
103190075Sobrien}
103290075Sobrien
1033122180Skanstatic __inline __m64
1034122180Skan_m_pmaxub (__m64 __A, __m64 __B)
1035122180Skan{
1036122180Skan  return _mm_max_pu8 (__A, __B);
1037122180Skan}
1038122180Skan
103990075Sobrien/* Compute the element-wise minimum of signed 16-bit values.  */
104090075Sobrienstatic __inline __m64
104190075Sobrien_mm_min_pi16 (__m64 __A, __m64 __B)
104290075Sobrien{
104390075Sobrien  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
104490075Sobrien}
104590075Sobrien
1046122180Skanstatic __inline __m64
1047122180Skan_m_pminsw (__m64 __A, __m64 __B)
1048122180Skan{
1049122180Skan  return _mm_min_pi16 (__A, __B);
1050122180Skan}
1051122180Skan
105290075Sobrien/* Compute the element-wise minimum of unsigned 8-bit values.  */
105390075Sobrienstatic __inline __m64
105490075Sobrien_mm_min_pu8 (__m64 __A, __m64 __B)
105590075Sobrien{
105690075Sobrien  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
105790075Sobrien}
105890075Sobrien
1059122180Skanstatic __inline __m64
1060122180Skan_m_pminub (__m64 __A, __m64 __B)
1061122180Skan{
1062122180Skan  return _mm_min_pu8 (__A, __B);
1063122180Skan}
1064122180Skan
106590075Sobrien/* Create an 8-bit mask of the signs of 8-bit values.  */
106690075Sobrienstatic __inline int
106790075Sobrien_mm_movemask_pi8 (__m64 __A)
106890075Sobrien{
106990075Sobrien  return __builtin_ia32_pmovmskb ((__v8qi)__A);
107090075Sobrien}
107190075Sobrien
1072122180Skanstatic __inline int
1073122180Skan_m_pmovmskb (__m64 __A)
1074122180Skan{
1075122180Skan  return _mm_movemask_pi8 (__A);
1076122180Skan}
1077122180Skan
107890075Sobrien/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
107990075Sobrien   in B and produce the high 16 bits of the 32-bit results.  */
108090075Sobrienstatic __inline __m64
108190075Sobrien_mm_mulhi_pu16 (__m64 __A, __m64 __B)
108290075Sobrien{
108390075Sobrien  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
108490075Sobrien}
108590075Sobrien
1086122180Skanstatic __inline __m64
1087122180Skan_m_pmulhuw (__m64 __A, __m64 __B)
1088122180Skan{
1089122180Skan  return _mm_mulhi_pu16 (__A, __B);
1090122180Skan}
1091122180Skan
/* Return a rearrangement of the four 16-bit values of A chosen by the
   selector N, which must be an immediate.  The inline-function form is
   compiled out and the macro form is the one in effect -- presumably so
   that N reaches the builtin as a literal constant (NOTE(review): confirm
   rationale).  */
#if 0
static __inline __m64
_mm_shuffle_pi16 (__m64 __A, int __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

static __inline __m64
_m_pshufw (__m64 __A, int __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N)))
/* Alternate name for _mm_shuffle_pi16.  */
#define _m_pshufw(A, N) \
  _mm_shuffle_pi16 ((A), (N))
#endif
111190075Sobrien
111290075Sobrien/* Conditionally store byte elements of A into P.  The high bit of each
111390075Sobrien   byte in the selector N determines whether the corresponding byte from
111490075Sobrien   A is stored.  */
111590075Sobrienstatic __inline void
111690075Sobrien_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
111790075Sobrien{
111890075Sobrien  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
111990075Sobrien}
112090075Sobrien
1121122180Skanstatic __inline void
1122122180Skan_m_maskmovq (__m64 __A, __m64 __N, char *__P)
1123122180Skan{
1124122180Skan  _mm_maskmove_si64 (__A, __N, __P);
1125122180Skan}
1126122180Skan
112790075Sobrien/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
112890075Sobrienstatic __inline __m64
112990075Sobrien_mm_avg_pu8 (__m64 __A, __m64 __B)
113090075Sobrien{
113190075Sobrien  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
113290075Sobrien}
113390075Sobrien
1134122180Skanstatic __inline __m64
1135122180Skan_m_pavgb (__m64 __A, __m64 __B)
1136122180Skan{
1137122180Skan  return _mm_avg_pu8 (__A, __B);
1138122180Skan}
1139122180Skan
114090075Sobrien/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
114190075Sobrienstatic __inline __m64
114290075Sobrien_mm_avg_pu16 (__m64 __A, __m64 __B)
114390075Sobrien{
114490075Sobrien  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
114590075Sobrien}
114690075Sobrien
1147122180Skanstatic __inline __m64
1148122180Skan_m_pavgw (__m64 __A, __m64 __B)
1149122180Skan{
1150122180Skan  return _mm_avg_pu16 (__A, __B);
1151122180Skan}
1152122180Skan
115390075Sobrien/* Compute the sum of the absolute differences of the unsigned 8-bit
115490075Sobrien   values in A and B.  Return the value in the lower 16-bit word; the
115590075Sobrien   upper words are cleared.  */
115690075Sobrienstatic __inline __m64
115790075Sobrien_mm_sad_pu8 (__m64 __A, __m64 __B)
115890075Sobrien{
115990075Sobrien  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
116090075Sobrien}
116190075Sobrien
1162122180Skanstatic __inline __m64
1163122180Skan_m_psadbw (__m64 __A, __m64 __B)
1164122180Skan{
1165122180Skan  return _mm_sad_pu8 (__A, __B);
1166122180Skan}
1167122180Skan
/* Bring the cache line containing address P "closer" to the processor.
   The selector I chooses the kind of prefetch.  The inline-function form
   is compiled out; the macro form passes I straight to
   __builtin_prefetch with a constant 0 as the second argument.  */
#if 0
static __inline void
_mm_prefetch (void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I) \
  (__builtin_prefetch ((P), 0, (I)))
#endif
118090075Sobrien
118190075Sobrien/* Stores the data in A to the address P without polluting the caches.  */
118290075Sobrienstatic __inline void
118390075Sobrien_mm_stream_pi (__m64 *__P, __m64 __A)
118490075Sobrien{
1185117395Skan  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
118690075Sobrien}
118790075Sobrien
118890075Sobrien/* Likewise.  The address must be 16-byte aligned.  */
118990075Sobrienstatic __inline void
119090075Sobrien_mm_stream_ps (float *__P, __m128 __A)
119190075Sobrien{
119290075Sobrien  __builtin_ia32_movntps (__P, (__v4sf)__A);
119390075Sobrien}
119490075Sobrien
/* Store fence: guarantees that every preceding store is globally visible
   before any subsequent store.  */
static __inline void
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
  return;
}
120290075Sobrien
/* Delay execution of the next instruction by an implementation-specific
   amount of time, without modifying the architectural state.  */
static __inline void
_mm_pause (void)
{
  /* Emitted as the two-mnemonic sequence "rep; nop"; __volatile__ keeps
     the compiler from discarding the otherwise operand-less asm.  */
  __asm__ __volatile__ ("rep; nop" : : );
  return;
}
121190075Sobrien
/* Transpose, in place, the 4x4 matrix whose rows are row0..row3.  Each
   argument is read once into a temporary before any row is written back,
   so the four arguments must be assignable vector expressions.

   Stage 1 pairs rows (0,1) and (2,3): selector 0x44 takes elements 0-1 of
   both inputs, selector 0xEE takes elements 2-3 of both inputs.
   Stage 2 combines the pairs: selector 0x88 takes the even elements and
   selector 0xDD the odd elements of both inputs.  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __in0 = (row0), __in1 = (row1);				\
  __v4sf __in2 = (row2), __in3 = (row3);				\
  __v4sf __lo01 = __builtin_ia32_shufps (__in0, __in1, 0x44);		\
  __v4sf __lo23 = __builtin_ia32_shufps (__in2, __in3, 0x44);		\
  __v4sf __hi01 = __builtin_ia32_shufps (__in0, __in1, 0xEE);		\
  __v4sf __hi23 = __builtin_ia32_shufps (__in2, __in3, 0xEE);		\
  (row0) = __builtin_ia32_shufps (__lo01, __lo23, 0x88);		\
  (row1) = __builtin_ia32_shufps (__lo01, __lo23, 0xDD);		\
  (row2) = __builtin_ia32_shufps (__hi01, __hi23, 0x88);		\
  (row3) = __builtin_ia32_shufps (__hi01, __hi23, 0xDD);		\
} while (0)
122590075Sobrien
1226122180Skan/* For backward source compatibility.  */
1227122180Skan#include <emmintrin.h>
1228117395Skan
1229117395Skan#endif /* __SSE__ */
123090075Sobrien#endif /* _XMMINTRIN_H_INCLUDED */
1231