xmmintrin.h revision 132718
1132718Skan/* Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
290075Sobrien
3132718Skan   This file is part of GCC.
490075Sobrien
5132718Skan   GCC is free software; you can redistribute it and/or modify
690075Sobrien   it under the terms of the GNU General Public License as published by
790075Sobrien   the Free Software Foundation; either version 2, or (at your option)
890075Sobrien   any later version.
990075Sobrien
10132718Skan   GCC is distributed in the hope that it will be useful,
1190075Sobrien   but WITHOUT ANY WARRANTY; without even the implied warranty of
1290075Sobrien   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1390075Sobrien   GNU General Public License for more details.
1490075Sobrien
1590075Sobrien   You should have received a copy of the GNU General Public License
16132718Skan   along with GCC; see the file COPYING.  If not, write to
1790075Sobrien   the Free Software Foundation, 59 Temple Place - Suite 330,
1890075Sobrien   Boston, MA 02111-1307, USA.  */
1990075Sobrien
2090075Sobrien/* As a special exception, if you include this header file into source
2190075Sobrien   files compiled by GCC, this header file does not by itself cause
2290075Sobrien   the resulting executable to be covered by the GNU General Public
2390075Sobrien   License.  This exception does not however invalidate any other
2490075Sobrien   reasons why the executable file might be covered by the GNU General
2590075Sobrien   Public License.  */
2690075Sobrien
2790075Sobrien/* Implemented from the specification included in the Intel C++ Compiler
28122180Skan   User Guide and Reference, version 8.0.  */
2990075Sobrien
3090075Sobrien#ifndef _XMMINTRIN_H_INCLUDED
3190075Sobrien#define _XMMINTRIN_H_INCLUDED
3290075Sobrien
33117395Skan#ifndef __SSE__
34117395Skan# error "SSE instruction set not enabled"
35117395Skan#else
36117395Skan
3790075Sobrien/* We need type definitions from the MMX header file.  */
3890075Sobrien#include <mmintrin.h>
3990075Sobrien
40132718Skan/* The data type intended for user use.  */
4190075Sobrientypedef int __m128 __attribute__ ((__mode__(__V4SF__)));
4290075Sobrien
43132718Skan/* Internal data types for implementing the intrinsics.  */
4490075Sobrientypedef int __v4sf __attribute__ ((__mode__(__V4SF__)));
4590075Sobrien
/* Build a selector for the SHUFPS instruction.  The four arguments are
   packed into successive 2-bit fields: fp0 occupies bits 0-1, fp1 bits
   2-3, fp2 bits 4-5 and fp3 bits 6-7.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 ((fp0) | ((fp1) << 2) | ((fp2) << 4) | ((fp3) << 6))
4990075Sobrien
/* Hint values accepted by _mm_prefetch, listed in ascending numeric
   order.  */
enum _mm_hint
{
  _MM_HINT_NTA = 0,
  _MM_HINT_T2 = 1,
  _MM_HINT_T1 = 2,
  _MM_HINT_T0 = 3
};
5890075Sobrien
/* Bits in the MXCSR.  */

/* Exception bits; _MM_EXCEPT_MASK is the OR of all six.  */
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020
#define _MM_EXCEPT_MASK       0x003f

/* Exception mask bits; each is the matching _MM_EXCEPT_* bit shifted left
   by seven, and _MM_MASK_MASK is the OR of all six.  */
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000
#define _MM_MASK_MASK         0x1f80

/* Rounding mode: a two-bit field selected by _MM_ROUND_MASK.  */
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000
#define _MM_ROUND_MASK        0x6000

/* Flush-to-zero mode: a single bit.  */
#define _MM_FLUSH_ZERO_OFF    0x0000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_MASK   0x8000
8590075Sobrien
8690075Sobrien/* Perform the respective operation on the lower SPFP (single-precision
8790075Sobrien   floating-point) values of A and B; the upper three SPFP values are
8890075Sobrien   passed through from A.  */
8990075Sobrien
9090075Sobrienstatic __inline __m128
9190075Sobrien_mm_add_ss (__m128 __A, __m128 __B)
9290075Sobrien{
9390075Sobrien  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
9490075Sobrien}
9590075Sobrien
9690075Sobrienstatic __inline __m128
9790075Sobrien_mm_sub_ss (__m128 __A, __m128 __B)
9890075Sobrien{
9990075Sobrien  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
10090075Sobrien}
10190075Sobrien
10290075Sobrienstatic __inline __m128
10390075Sobrien_mm_mul_ss (__m128 __A, __m128 __B)
10490075Sobrien{
10590075Sobrien  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
10690075Sobrien}
10790075Sobrien
10890075Sobrienstatic __inline __m128
10990075Sobrien_mm_div_ss (__m128 __A, __m128 __B)
11090075Sobrien{
11190075Sobrien  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
11290075Sobrien}
11390075Sobrien
11490075Sobrienstatic __inline __m128
11590075Sobrien_mm_sqrt_ss (__m128 __A)
11690075Sobrien{
11790075Sobrien  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
11890075Sobrien}
11990075Sobrien
12090075Sobrienstatic __inline __m128
12190075Sobrien_mm_rcp_ss (__m128 __A)
12290075Sobrien{
12390075Sobrien  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
12490075Sobrien}
12590075Sobrien
12690075Sobrienstatic __inline __m128
12790075Sobrien_mm_rsqrt_ss (__m128 __A)
12890075Sobrien{
12990075Sobrien  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
13090075Sobrien}
13190075Sobrien
13290075Sobrienstatic __inline __m128
13390075Sobrien_mm_min_ss (__m128 __A, __m128 __B)
13490075Sobrien{
13590075Sobrien  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
13690075Sobrien}
13790075Sobrien
13890075Sobrienstatic __inline __m128
13990075Sobrien_mm_max_ss (__m128 __A, __m128 __B)
14090075Sobrien{
14190075Sobrien  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
14290075Sobrien}
14390075Sobrien
14490075Sobrien/* Perform the respective operation on the four SPFP values in A and B.  */
14590075Sobrien
14690075Sobrienstatic __inline __m128
14790075Sobrien_mm_add_ps (__m128 __A, __m128 __B)
14890075Sobrien{
14990075Sobrien  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
15090075Sobrien}
15190075Sobrien
15290075Sobrienstatic __inline __m128
15390075Sobrien_mm_sub_ps (__m128 __A, __m128 __B)
15490075Sobrien{
15590075Sobrien  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
15690075Sobrien}
15790075Sobrien
15890075Sobrienstatic __inline __m128
15990075Sobrien_mm_mul_ps (__m128 __A, __m128 __B)
16090075Sobrien{
16190075Sobrien  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
16290075Sobrien}
16390075Sobrien
16490075Sobrienstatic __inline __m128
16590075Sobrien_mm_div_ps (__m128 __A, __m128 __B)
16690075Sobrien{
16790075Sobrien  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
16890075Sobrien}
16990075Sobrien
17090075Sobrienstatic __inline __m128
17190075Sobrien_mm_sqrt_ps (__m128 __A)
17290075Sobrien{
17390075Sobrien  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
17490075Sobrien}
17590075Sobrien
17690075Sobrienstatic __inline __m128
17790075Sobrien_mm_rcp_ps (__m128 __A)
17890075Sobrien{
17990075Sobrien  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
18090075Sobrien}
18190075Sobrien
18290075Sobrienstatic __inline __m128
18390075Sobrien_mm_rsqrt_ps (__m128 __A)
18490075Sobrien{
18590075Sobrien  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
18690075Sobrien}
18790075Sobrien
18890075Sobrienstatic __inline __m128
18990075Sobrien_mm_min_ps (__m128 __A, __m128 __B)
19090075Sobrien{
19190075Sobrien  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
19290075Sobrien}
19390075Sobrien
19490075Sobrienstatic __inline __m128
19590075Sobrien_mm_max_ps (__m128 __A, __m128 __B)
19690075Sobrien{
19790075Sobrien  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
19890075Sobrien}
19990075Sobrien
20090075Sobrien/* Perform logical bit-wise operations on 128-bit values.  */
20190075Sobrien
20290075Sobrienstatic __inline __m128
20390075Sobrien_mm_and_ps (__m128 __A, __m128 __B)
20490075Sobrien{
20590075Sobrien  return __builtin_ia32_andps (__A, __B);
20690075Sobrien}
20790075Sobrien
20890075Sobrienstatic __inline __m128
20990075Sobrien_mm_andnot_ps (__m128 __A, __m128 __B)
21090075Sobrien{
21190075Sobrien  return __builtin_ia32_andnps (__A, __B);
21290075Sobrien}
21390075Sobrien
21490075Sobrienstatic __inline __m128
21590075Sobrien_mm_or_ps (__m128 __A, __m128 __B)
21690075Sobrien{
21790075Sobrien  return __builtin_ia32_orps (__A, __B);
21890075Sobrien}
21990075Sobrien
22090075Sobrienstatic __inline __m128
22190075Sobrien_mm_xor_ps (__m128 __A, __m128 __B)
22290075Sobrien{
22390075Sobrien  return __builtin_ia32_xorps (__A, __B);
22490075Sobrien}
22590075Sobrien
22690075Sobrien/* Perform a comparison on the lower SPFP values of A and B.  If the
22790075Sobrien   comparison is true, place a mask of all ones in the result, otherwise a
22890075Sobrien   mask of zeros.  The upper three SPFP values are passed through from A.  */
22990075Sobrien
23090075Sobrienstatic __inline __m128
23190075Sobrien_mm_cmpeq_ss (__m128 __A, __m128 __B)
23290075Sobrien{
23390075Sobrien  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
23490075Sobrien}
23590075Sobrien
23690075Sobrienstatic __inline __m128
23790075Sobrien_mm_cmplt_ss (__m128 __A, __m128 __B)
23890075Sobrien{
23990075Sobrien  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
24090075Sobrien}
24190075Sobrien
24290075Sobrienstatic __inline __m128
24390075Sobrien_mm_cmple_ss (__m128 __A, __m128 __B)
24490075Sobrien{
24590075Sobrien  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
24690075Sobrien}
24790075Sobrien
24890075Sobrienstatic __inline __m128
24990075Sobrien_mm_cmpgt_ss (__m128 __A, __m128 __B)
25090075Sobrien{
251107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
252107590Sobrien					(__v4sf)
253107590Sobrien					__builtin_ia32_cmpltss ((__v4sf) __B,
254107590Sobrien								(__v4sf)
255107590Sobrien								__A));
25690075Sobrien}
25790075Sobrien
25890075Sobrienstatic __inline __m128
25990075Sobrien_mm_cmpge_ss (__m128 __A, __m128 __B)
26090075Sobrien{
261107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
262107590Sobrien					(__v4sf)
263107590Sobrien					__builtin_ia32_cmpless ((__v4sf) __B,
264107590Sobrien								(__v4sf)
265107590Sobrien								__A));
26690075Sobrien}
26790075Sobrien
26890075Sobrienstatic __inline __m128
26990075Sobrien_mm_cmpneq_ss (__m128 __A, __m128 __B)
27090075Sobrien{
27190075Sobrien  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
27290075Sobrien}
27390075Sobrien
27490075Sobrienstatic __inline __m128
27590075Sobrien_mm_cmpnlt_ss (__m128 __A, __m128 __B)
27690075Sobrien{
27790075Sobrien  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
27890075Sobrien}
27990075Sobrien
28090075Sobrienstatic __inline __m128
28190075Sobrien_mm_cmpnle_ss (__m128 __A, __m128 __B)
28290075Sobrien{
28390075Sobrien  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
28490075Sobrien}
28590075Sobrien
28690075Sobrienstatic __inline __m128
28790075Sobrien_mm_cmpngt_ss (__m128 __A, __m128 __B)
28890075Sobrien{
289107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
290107590Sobrien					(__v4sf)
291107590Sobrien					__builtin_ia32_cmpnltss ((__v4sf) __B,
292107590Sobrien								 (__v4sf)
293107590Sobrien								 __A));
29490075Sobrien}
29590075Sobrien
29690075Sobrienstatic __inline __m128
29790075Sobrien_mm_cmpnge_ss (__m128 __A, __m128 __B)
29890075Sobrien{
299107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
300107590Sobrien					(__v4sf)
301107590Sobrien					__builtin_ia32_cmpnless ((__v4sf) __B,
302107590Sobrien								 (__v4sf)
303107590Sobrien								 __A));
30490075Sobrien}
30590075Sobrien
30690075Sobrienstatic __inline __m128
30790075Sobrien_mm_cmpord_ss (__m128 __A, __m128 __B)
30890075Sobrien{
30990075Sobrien  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
31090075Sobrien}
31190075Sobrien
31290075Sobrienstatic __inline __m128
31390075Sobrien_mm_cmpunord_ss (__m128 __A, __m128 __B)
31490075Sobrien{
31590075Sobrien  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
31690075Sobrien}
31790075Sobrien
31890075Sobrien/* Perform a comparison on the four SPFP values of A and B.  For each
31990075Sobrien   element, if the comparison is true, place a mask of all ones in the
32090075Sobrien   result, otherwise a mask of zeros.  */
32190075Sobrien
32290075Sobrienstatic __inline __m128
32390075Sobrien_mm_cmpeq_ps (__m128 __A, __m128 __B)
32490075Sobrien{
32590075Sobrien  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
32690075Sobrien}
32790075Sobrien
32890075Sobrienstatic __inline __m128
32990075Sobrien_mm_cmplt_ps (__m128 __A, __m128 __B)
33090075Sobrien{
33190075Sobrien  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
33290075Sobrien}
33390075Sobrien
33490075Sobrienstatic __inline __m128
33590075Sobrien_mm_cmple_ps (__m128 __A, __m128 __B)
33690075Sobrien{
33790075Sobrien  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
33890075Sobrien}
33990075Sobrien
34090075Sobrienstatic __inline __m128
34190075Sobrien_mm_cmpgt_ps (__m128 __A, __m128 __B)
34290075Sobrien{
34390075Sobrien  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
34490075Sobrien}
34590075Sobrien
34690075Sobrienstatic __inline __m128
34790075Sobrien_mm_cmpge_ps (__m128 __A, __m128 __B)
34890075Sobrien{
34990075Sobrien  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
35090075Sobrien}
35190075Sobrien
35290075Sobrienstatic __inline __m128
35390075Sobrien_mm_cmpneq_ps (__m128 __A, __m128 __B)
35490075Sobrien{
35590075Sobrien  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
35690075Sobrien}
35790075Sobrien
35890075Sobrienstatic __inline __m128
35990075Sobrien_mm_cmpnlt_ps (__m128 __A, __m128 __B)
36090075Sobrien{
36190075Sobrien  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
36290075Sobrien}
36390075Sobrien
36490075Sobrienstatic __inline __m128
36590075Sobrien_mm_cmpnle_ps (__m128 __A, __m128 __B)
36690075Sobrien{
36790075Sobrien  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
36890075Sobrien}
36990075Sobrien
37090075Sobrienstatic __inline __m128
37190075Sobrien_mm_cmpngt_ps (__m128 __A, __m128 __B)
37290075Sobrien{
37390075Sobrien  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
37490075Sobrien}
37590075Sobrien
37690075Sobrienstatic __inline __m128
37790075Sobrien_mm_cmpnge_ps (__m128 __A, __m128 __B)
37890075Sobrien{
37990075Sobrien  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
38090075Sobrien}
38190075Sobrien
38290075Sobrienstatic __inline __m128
38390075Sobrien_mm_cmpord_ps (__m128 __A, __m128 __B)
38490075Sobrien{
38590075Sobrien  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
38690075Sobrien}
38790075Sobrien
38890075Sobrienstatic __inline __m128
38990075Sobrien_mm_cmpunord_ps (__m128 __A, __m128 __B)
39090075Sobrien{
39190075Sobrien  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
39290075Sobrien}
39390075Sobrien
39490075Sobrien/* Compare the lower SPFP values of A and B and return 1 if true
39590075Sobrien   and 0 if false.  */
39690075Sobrien
39790075Sobrienstatic __inline int
39890075Sobrien_mm_comieq_ss (__m128 __A, __m128 __B)
39990075Sobrien{
40090075Sobrien  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
40190075Sobrien}
40290075Sobrien
40390075Sobrienstatic __inline int
40490075Sobrien_mm_comilt_ss (__m128 __A, __m128 __B)
40590075Sobrien{
40690075Sobrien  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
40790075Sobrien}
40890075Sobrien
40990075Sobrienstatic __inline int
41090075Sobrien_mm_comile_ss (__m128 __A, __m128 __B)
41190075Sobrien{
41290075Sobrien  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
41390075Sobrien}
41490075Sobrien
41590075Sobrienstatic __inline int
41690075Sobrien_mm_comigt_ss (__m128 __A, __m128 __B)
41790075Sobrien{
41890075Sobrien  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
41990075Sobrien}
42090075Sobrien
42190075Sobrienstatic __inline int
42290075Sobrien_mm_comige_ss (__m128 __A, __m128 __B)
42390075Sobrien{
42490075Sobrien  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
42590075Sobrien}
42690075Sobrien
42790075Sobrienstatic __inline int
42890075Sobrien_mm_comineq_ss (__m128 __A, __m128 __B)
42990075Sobrien{
43090075Sobrien  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
43190075Sobrien}
43290075Sobrien
43390075Sobrienstatic __inline int
43490075Sobrien_mm_ucomieq_ss (__m128 __A, __m128 __B)
43590075Sobrien{
43690075Sobrien  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
43790075Sobrien}
43890075Sobrien
43990075Sobrienstatic __inline int
44090075Sobrien_mm_ucomilt_ss (__m128 __A, __m128 __B)
44190075Sobrien{
44290075Sobrien  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
44390075Sobrien}
44490075Sobrien
44590075Sobrienstatic __inline int
44690075Sobrien_mm_ucomile_ss (__m128 __A, __m128 __B)
44790075Sobrien{
44890075Sobrien  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
44990075Sobrien}
45090075Sobrien
45190075Sobrienstatic __inline int
45290075Sobrien_mm_ucomigt_ss (__m128 __A, __m128 __B)
45390075Sobrien{
45490075Sobrien  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
45590075Sobrien}
45690075Sobrien
45790075Sobrienstatic __inline int
45890075Sobrien_mm_ucomige_ss (__m128 __A, __m128 __B)
45990075Sobrien{
46090075Sobrien  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
46190075Sobrien}
46290075Sobrien
46390075Sobrienstatic __inline int
46490075Sobrien_mm_ucomineq_ss (__m128 __A, __m128 __B)
46590075Sobrien{
46690075Sobrien  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
46790075Sobrien}
46890075Sobrien
46990075Sobrien/* Convert the lower SPFP value to a 32-bit integer according to the current
47090075Sobrien   rounding mode.  */
47190075Sobrienstatic __inline int
47290075Sobrien_mm_cvtss_si32 (__m128 __A)
47390075Sobrien{
47490075Sobrien  return __builtin_ia32_cvtss2si ((__v4sf) __A);
47590075Sobrien}
47690075Sobrien
477122180Skanstatic __inline int
478122180Skan_mm_cvt_ss2si (__m128 __A)
479122180Skan{
480122180Skan  return _mm_cvtss_si32 (__A);
481122180Skan}
482122180Skan
483117395Skan#ifdef __x86_64__
484117395Skan/* Convert the lower SPFP value to a 32-bit integer according to the current
485117395Skan   rounding mode.  */
486117395Skanstatic __inline long long
487117395Skan_mm_cvtss_si64x (__m128 __A)
488117395Skan{
489117395Skan  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
490117395Skan}
491117395Skan#endif
492117395Skan
49390075Sobrien/* Convert the two lower SPFP values to 32-bit integers according to the
49490075Sobrien   current rounding mode.  Return the integers in packed form.  */
49590075Sobrienstatic __inline __m64
49690075Sobrien_mm_cvtps_pi32 (__m128 __A)
49790075Sobrien{
49890075Sobrien  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
49990075Sobrien}
50090075Sobrien
501122180Skanstatic __inline __m64
502122180Skan_mm_cvt_ps2pi (__m128 __A)
503122180Skan{
504122180Skan  return _mm_cvtps_pi32 (__A);
505122180Skan}
506122180Skan
50790075Sobrien/* Truncate the lower SPFP value to a 32-bit integer.  */
50890075Sobrienstatic __inline int
50990075Sobrien_mm_cvttss_si32 (__m128 __A)
51090075Sobrien{
51190075Sobrien  return __builtin_ia32_cvttss2si ((__v4sf) __A);
51290075Sobrien}
51390075Sobrien
514122180Skanstatic __inline int
515122180Skan_mm_cvtt_ss2si (__m128 __A)
516122180Skan{
517122180Skan  return _mm_cvttss_si32 (__A);
518122180Skan}
519122180Skan
520117395Skan#ifdef __x86_64__
521117395Skan/* Truncate the lower SPFP value to a 32-bit integer.  */
522117395Skanstatic __inline long long
523117395Skan_mm_cvttss_si64x (__m128 __A)
524117395Skan{
525117395Skan  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
526117395Skan}
527117395Skan#endif
528117395Skan
52990075Sobrien/* Truncate the two lower SPFP values to 32-bit integers.  Return the
53090075Sobrien   integers in packed form.  */
53190075Sobrienstatic __inline __m64
53290075Sobrien_mm_cvttps_pi32 (__m128 __A)
53390075Sobrien{
53490075Sobrien  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
53590075Sobrien}
53690075Sobrien
537122180Skanstatic __inline __m64
538122180Skan_mm_cvtt_ps2pi (__m128 __A)
539122180Skan{
540122180Skan  return _mm_cvttps_pi32 (__A);
541122180Skan}
542122180Skan
54390075Sobrien/* Convert B to a SPFP value and insert it as element zero in A.  */
54490075Sobrienstatic __inline __m128
54590075Sobrien_mm_cvtsi32_ss (__m128 __A, int __B)
54690075Sobrien{
54790075Sobrien  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
54890075Sobrien}
54990075Sobrien
550122180Skanstatic __inline __m128
551122180Skan_mm_cvt_si2ss (__m128 __A, int __B)
552122180Skan{
553122180Skan  return _mm_cvtsi32_ss (__A, __B);
554122180Skan}
555122180Skan
556117395Skan#ifdef __x86_64__
557117395Skan/* Convert B to a SPFP value and insert it as element zero in A.  */
558117395Skanstatic __inline __m128
559117395Skan_mm_cvtsi64x_ss (__m128 __A, long long __B)
560117395Skan{
561117395Skan  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
562117395Skan}
563117395Skan#endif
564117395Skan
56590075Sobrien/* Convert the two 32-bit values in B to SPFP form and insert them
56690075Sobrien   as the two lower elements in A.  */
56790075Sobrienstatic __inline __m128
56890075Sobrien_mm_cvtpi32_ps (__m128 __A, __m64 __B)
56990075Sobrien{
57090075Sobrien  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
57190075Sobrien}
57290075Sobrien
573122180Skanstatic __inline __m128
574122180Skan_mm_cvt_pi2ps (__m128 __A, __m64 __B)
575122180Skan{
576122180Skan  return _mm_cvtpi32_ps (__A, __B);
577122180Skan}
578122180Skan
57990075Sobrien/* Convert the four signed 16-bit values in A to SPFP form.  */
58090075Sobrienstatic __inline __m128
58190075Sobrien_mm_cvtpi16_ps (__m64 __A)
58290075Sobrien{
58390075Sobrien  __v4hi __sign;
58490075Sobrien  __v2si __hisi, __losi;
58590075Sobrien  __v4sf __r;
58690075Sobrien
58790075Sobrien  /* This comparison against zero gives us a mask that can be used to
58890075Sobrien     fill in the missing sign bits in the unpack operations below, so
58990075Sobrien     that we get signed values after unpacking.  */
59090075Sobrien  __sign = (__v4hi) __builtin_ia32_mmx_zero ();
59190075Sobrien  __sign = __builtin_ia32_pcmpgtw (__sign, (__v4hi)__A);
59290075Sobrien
59390075Sobrien  /* Convert the four words to doublewords.  */
59490075Sobrien  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
59590075Sobrien  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
59690075Sobrien
59790075Sobrien  /* Convert the doublewords to floating point two at a time.  */
59890075Sobrien  __r = (__v4sf) __builtin_ia32_setzerops ();
59990075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
60090075Sobrien  __r = __builtin_ia32_movlhps (__r, __r);
60190075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __losi);
60290075Sobrien
60390075Sobrien  return (__m128) __r;
60490075Sobrien}
60590075Sobrien
60690075Sobrien/* Convert the four unsigned 16-bit values in A to SPFP form.  */
60790075Sobrienstatic __inline __m128
60890075Sobrien_mm_cvtpu16_ps (__m64 __A)
60990075Sobrien{
61090075Sobrien  __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero ();
61190075Sobrien  __v2si __hisi, __losi;
61290075Sobrien  __v4sf __r;
61390075Sobrien
61490075Sobrien  /* Convert the four words to doublewords.  */
61590075Sobrien  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __zero);
61690075Sobrien  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __zero);
61790075Sobrien
61890075Sobrien  /* Convert the doublewords to floating point two at a time.  */
61990075Sobrien  __r = (__v4sf) __builtin_ia32_setzerops ();
62090075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
62190075Sobrien  __r = __builtin_ia32_movlhps (__r, __r);
62290075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __losi);
62390075Sobrien
62490075Sobrien  return (__m128) __r;
62590075Sobrien}
62690075Sobrien
62790075Sobrien/* Convert the low four signed 8-bit values in A to SPFP form.  */
62890075Sobrienstatic __inline __m128
62990075Sobrien_mm_cvtpi8_ps (__m64 __A)
63090075Sobrien{
63190075Sobrien  __v8qi __sign;
63290075Sobrien
63390075Sobrien  /* This comparison against zero gives us a mask that can be used to
63490075Sobrien     fill in the missing sign bits in the unpack operations below, so
63590075Sobrien     that we get signed values after unpacking.  */
63690075Sobrien  __sign = (__v8qi) __builtin_ia32_mmx_zero ();
63790075Sobrien  __sign = __builtin_ia32_pcmpgtb (__sign, (__v8qi)__A);
63890075Sobrien
63990075Sobrien  /* Convert the four low bytes to words.  */
64090075Sobrien  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);
64190075Sobrien
64290075Sobrien  return _mm_cvtpi16_ps(__A);
64390075Sobrien}
64490075Sobrien
64590075Sobrien/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
64690075Sobrienstatic __inline __m128
64790075Sobrien_mm_cvtpu8_ps(__m64 __A)
64890075Sobrien{
64990075Sobrien  __v8qi __zero = (__v8qi) __builtin_ia32_mmx_zero ();
65090075Sobrien  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __zero);
65190075Sobrien  return _mm_cvtpu16_ps(__A);
65290075Sobrien}
65390075Sobrien
65490075Sobrien/* Convert the four signed 32-bit values in A and B to SPFP form.  */
65590075Sobrienstatic __inline __m128
65690075Sobrien_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
65790075Sobrien{
65890075Sobrien  __v4sf __zero = (__v4sf) __builtin_ia32_setzerops ();
65990075Sobrien  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
66090075Sobrien  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B);
66190075Sobrien  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
66290075Sobrien}
66390075Sobrien
66490075Sobrien/* Convert the four SPFP values in A to four signed 16-bit integers.  */
66590075Sobrienstatic __inline __m64
66690075Sobrien_mm_cvtps_pi16(__m128 __A)
66790075Sobrien{
66890075Sobrien  __v4sf __hisf = (__v4sf)__A;
66990075Sobrien  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
67090075Sobrien  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
67190075Sobrien  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
672117395Skan  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
67390075Sobrien}
67490075Sobrien
67590075Sobrien/* Convert the four SPFP values in A to four signed 8-bit integers.  */
67690075Sobrienstatic __inline __m64
67790075Sobrien_mm_cvtps_pi8(__m128 __A)
67890075Sobrien{
67990075Sobrien  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
68090075Sobrien  __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero ();
68190075Sobrien  return (__m64) __builtin_ia32_packsswb (__tmp, __zero);
68290075Sobrien}
68390075Sobrien
/* Select four specific SPFP values from A and B based on MASK (builtin
   shufps).  The macro form is the one in use; the inline-function form
   is kept under "#if 0" for reference only.
   NOTE(review): presumably the function form is disabled because MASK
   must reach the builtin as a constant -- confirm.  */
#if 0
static __inline __m128
_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK) \
 ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK)))
#endif
69590075Sobrien
69690075Sobrien
69790075Sobrien/* Selects and interleaves the upper two SPFP values from A and B.  */
69890075Sobrienstatic __inline __m128
69990075Sobrien_mm_unpackhi_ps (__m128 __A, __m128 __B)
70090075Sobrien{
70190075Sobrien  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
70290075Sobrien}
70390075Sobrien
70490075Sobrien/* Selects and interleaves the lower two SPFP values from A and B.  */
70590075Sobrienstatic __inline __m128
70690075Sobrien_mm_unpacklo_ps (__m128 __A, __m128 __B)
70790075Sobrien{
70890075Sobrien  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
70990075Sobrien}
71090075Sobrien
71190075Sobrien/* Sets the upper two SPFP values with 64-bits of data loaded from P;
71290075Sobrien   the lower two values are passed through from A.  */
71390075Sobrienstatic __inline __m128
714117395Skan_mm_loadh_pi (__m128 __A, __m64 const *__P)
71590075Sobrien{
71690075Sobrien  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P);
71790075Sobrien}
71890075Sobrien
71990075Sobrien/* Stores the upper two SPFP values of A into P.  */
72090075Sobrienstatic __inline void
72190075Sobrien_mm_storeh_pi (__m64 *__P, __m128 __A)
72290075Sobrien{
72390075Sobrien  __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A);
72490075Sobrien}
72590075Sobrien
72690075Sobrien/* Moves the upper two values of B into the lower two values of A.  */
72790075Sobrienstatic __inline __m128
72890075Sobrien_mm_movehl_ps (__m128 __A, __m128 __B)
72990075Sobrien{
73090075Sobrien  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
73190075Sobrien}
73290075Sobrien
73390075Sobrien/* Moves the lower two values of B into the upper two values of A.  */
73490075Sobrienstatic __inline __m128
73590075Sobrien_mm_movelh_ps (__m128 __A, __m128 __B)
73690075Sobrien{
73790075Sobrien  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
73890075Sobrien}
73990075Sobrien
74090075Sobrien/* Sets the lower two SPFP values with 64-bits of data loaded from P;
74190075Sobrien   the upper two values are passed through from A.  */
74290075Sobrienstatic __inline __m128
743117395Skan_mm_loadl_pi (__m128 __A, __m64 const *__P)
74490075Sobrien{
74590075Sobrien  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P);
74690075Sobrien}
74790075Sobrien
74890075Sobrien/* Stores the lower two SPFP values of A into P.  */
74990075Sobrienstatic __inline void
75090075Sobrien_mm_storel_pi (__m64 *__P, __m128 __A)
75190075Sobrien{
75290075Sobrien  __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A);
75390075Sobrien}
75490075Sobrien
75590075Sobrien/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
75690075Sobrienstatic __inline int
75790075Sobrien_mm_movemask_ps (__m128 __A)
75890075Sobrien{
75990075Sobrien  return __builtin_ia32_movmskps ((__v4sf)__A);
76090075Sobrien}
76190075Sobrien
/* Read and return the current contents of the control register (builtin
   stmxcsr).  */
static __inline unsigned int
_mm_getcsr (void)
{
  unsigned int __csr = __builtin_ia32_stmxcsr ();
  return __csr;
}
76890075Sobrien
76990075Sobrien/* Read exception bits from the control register.  */
77090075Sobrienstatic __inline unsigned int
77190075Sobrien_MM_GET_EXCEPTION_STATE (void)
77290075Sobrien{
77390075Sobrien  return _mm_getcsr() & _MM_EXCEPT_MASK;
77490075Sobrien}
77590075Sobrien
77690075Sobrienstatic __inline unsigned int
77790075Sobrien_MM_GET_EXCEPTION_MASK (void)
77890075Sobrien{
77990075Sobrien  return _mm_getcsr() & _MM_MASK_MASK;
78090075Sobrien}
78190075Sobrien
78290075Sobrienstatic __inline unsigned int
78390075Sobrien_MM_GET_ROUNDING_MODE (void)
78490075Sobrien{
78590075Sobrien  return _mm_getcsr() & _MM_ROUND_MASK;
78690075Sobrien}
78790075Sobrien
78890075Sobrienstatic __inline unsigned int
78990075Sobrien_MM_GET_FLUSH_ZERO_MODE (void)
79090075Sobrien{
79190075Sobrien  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
79290075Sobrien}
79390075Sobrien
/* Load the value I into the control register (builtin ldmxcsr).  */
static __inline void
_mm_setcsr (unsigned int __I)
{
  unsigned int __value = __I;
  __builtin_ia32_ldmxcsr (__value);
}
80090075Sobrien
80190075Sobrien/* Set exception bits in the control register.  */
80290075Sobrienstatic __inline void
80390075Sobrien_MM_SET_EXCEPTION_STATE(unsigned int __mask)
80490075Sobrien{
80590075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
80690075Sobrien}
80790075Sobrien
80890075Sobrienstatic __inline void
80990075Sobrien_MM_SET_EXCEPTION_MASK (unsigned int __mask)
81090075Sobrien{
81190075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
81290075Sobrien}
81390075Sobrien
81490075Sobrienstatic __inline void
81590075Sobrien_MM_SET_ROUNDING_MODE (unsigned int __mode)
81690075Sobrien{
81790075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
81890075Sobrien}
81990075Sobrien
82090075Sobrienstatic __inline void
82190075Sobrien_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
82290075Sobrien{
82390075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
82490075Sobrien}
82590075Sobrien
82690075Sobrien/* Create a vector with element 0 as *P and the rest zero.  */
82790075Sobrienstatic __inline __m128
828117395Skan_mm_load_ss (float const *__P)
82990075Sobrien{
83090075Sobrien  return (__m128) __builtin_ia32_loadss (__P);
83190075Sobrien}
83290075Sobrien
83390075Sobrien/* Create a vector with all four elements equal to *P.  */
83490075Sobrienstatic __inline __m128
835117395Skan_mm_load1_ps (float const *__P)
83690075Sobrien{
83790075Sobrien  __v4sf __tmp = __builtin_ia32_loadss (__P);
83890075Sobrien  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0));
83990075Sobrien}
84090075Sobrien
84190075Sobrienstatic __inline __m128
842117395Skan_mm_load_ps1 (float const *__P)
84390075Sobrien{
84490075Sobrien  return _mm_load1_ps (__P);
84590075Sobrien}
84690075Sobrien
84790075Sobrien/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
84890075Sobrienstatic __inline __m128
849117395Skan_mm_load_ps (float const *__P)
85090075Sobrien{
85190075Sobrien  return (__m128) __builtin_ia32_loadaps (__P);
85290075Sobrien}
85390075Sobrien
85490075Sobrien/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
85590075Sobrienstatic __inline __m128
856117395Skan_mm_loadu_ps (float const *__P)
85790075Sobrien{
85890075Sobrien  return (__m128) __builtin_ia32_loadups (__P);
85990075Sobrien}
86090075Sobrien
86190075Sobrien/* Load four SPFP values in reverse order.  The address must be aligned.  */
86290075Sobrienstatic __inline __m128
863117395Skan_mm_loadr_ps (float const *__P)
86490075Sobrien{
86590075Sobrien  __v4sf __tmp = __builtin_ia32_loadaps (__P);
86690075Sobrien  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
86790075Sobrien}
86890075Sobrien
86990075Sobrien/* Create a vector with element 0 as F and the rest zero.  */
87090075Sobrienstatic __inline __m128
87190075Sobrien_mm_set_ss (float __F)
87290075Sobrien{
87390075Sobrien  return (__m128) __builtin_ia32_loadss (&__F);
87490075Sobrien}
87590075Sobrien
87690075Sobrien/* Create a vector with all four elements equal to F.  */
87790075Sobrienstatic __inline __m128
87890075Sobrien_mm_set1_ps (float __F)
87990075Sobrien{
88090075Sobrien  __v4sf __tmp = __builtin_ia32_loadss (&__F);
88190075Sobrien  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0));
88290075Sobrien}
88390075Sobrien
88490075Sobrienstatic __inline __m128
88590075Sobrien_mm_set_ps1 (float __F)
88690075Sobrien{
88790075Sobrien  return _mm_set1_ps (__F);
88890075Sobrien}
88990075Sobrien
89090075Sobrien/* Create the vector [Z Y X W].  */
89190075Sobrienstatic __inline __m128
892132718Skan_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
89390075Sobrien{
894132718Skan  return (__v4sf) {__W, __X, __Y, __Z};
89590075Sobrien}
89690075Sobrien
89790075Sobrien/* Create the vector [W X Y Z].  */
89890075Sobrienstatic __inline __m128
89990075Sobrien_mm_setr_ps (float __Z, float __Y, float __X, float __W)
90090075Sobrien{
90190075Sobrien  return _mm_set_ps (__W, __X, __Y, __Z);
90290075Sobrien}
90390075Sobrien
90490075Sobrien/* Create a vector of zeros.  */
90590075Sobrienstatic __inline __m128
90690075Sobrien_mm_setzero_ps (void)
90790075Sobrien{
90890075Sobrien  return (__m128) __builtin_ia32_setzerops ();
90990075Sobrien}
91090075Sobrien
91190075Sobrien/* Stores the lower SPFP value.  */
91290075Sobrienstatic __inline void
91390075Sobrien_mm_store_ss (float *__P, __m128 __A)
91490075Sobrien{
91590075Sobrien  __builtin_ia32_storess (__P, (__v4sf)__A);
91690075Sobrien}
91790075Sobrien
91890075Sobrien/* Store the lower SPFP value across four words.  */
91990075Sobrienstatic __inline void
92090075Sobrien_mm_store1_ps (float *__P, __m128 __A)
92190075Sobrien{
92290075Sobrien  __v4sf __va = (__v4sf)__A;
92390075Sobrien  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
92490075Sobrien  __builtin_ia32_storeaps (__P, __tmp);
92590075Sobrien}
92690075Sobrien
92790075Sobrienstatic __inline void
92890075Sobrien_mm_store_ps1 (float *__P, __m128 __A)
92990075Sobrien{
93090075Sobrien  _mm_store1_ps (__P, __A);
93190075Sobrien}
93290075Sobrien
93390075Sobrien/* Store four SPFP values.  The address must be 16-byte aligned.  */
93490075Sobrienstatic __inline void
93590075Sobrien_mm_store_ps (float *__P, __m128 __A)
93690075Sobrien{
93790075Sobrien  __builtin_ia32_storeaps (__P, (__v4sf)__A);
93890075Sobrien}
93990075Sobrien
94090075Sobrien/* Store four SPFP values.  The address need not be 16-byte aligned.  */
94190075Sobrienstatic __inline void
94290075Sobrien_mm_storeu_ps (float *__P, __m128 __A)
94390075Sobrien{
94490075Sobrien  __builtin_ia32_storeups (__P, (__v4sf)__A);
94590075Sobrien}
94690075Sobrien
947117395Skan/* Store four SPFP values in reverse order.  The address must be aligned.  */
94890075Sobrienstatic __inline void
94990075Sobrien_mm_storer_ps (float *__P, __m128 __A)
95090075Sobrien{
95190075Sobrien  __v4sf __va = (__v4sf)__A;
95290075Sobrien  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
95390075Sobrien  __builtin_ia32_storeaps (__P, __tmp);
95490075Sobrien}
95590075Sobrien
95690075Sobrien/* Sets the low SPFP value of A from the low value of B.  */
95790075Sobrienstatic __inline __m128
95890075Sobrien_mm_move_ss (__m128 __A, __m128 __B)
95990075Sobrien{
96090075Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
96190075Sobrien}
96290075Sobrien
/* Extract one of the four 16-bit words of A, chosen by N.  N must be an
   immediate; the live definitions are the macros in the #else branch,
   and the inline-function form is kept disabled under #if 0.  */
#if 0
static __inline int
_mm_extract_pi16 (__m64 __A, int __N)
{
  return __builtin_ia32_pextrw ((__v4hi)__A, __N);
}

static __inline int
_m_pextrw (__m64 __A, int __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
#define _mm_extract_pi16(A, N)	__builtin_ia32_pextrw ((__v4hi)(A), (N))
/* Alternate name for _mm_extract_pi16.  */
#define _m_pextrw(A, N)		_mm_extract_pi16((A), (N))
#endif
98190075Sobrien
/* Return A with one of its four 16-bit words, chosen by N, replaced by
   D.  N must be an immediate; the live definitions are the macros in the
   #else branch, and the inline-function form is kept disabled under
   #if 0.  */
#if 0
static __inline __m64
_mm_insert_pi16 (__m64 __A, int __D, int __N)
{
  return (__m64)__builtin_ia32_pinsrw ((__v4hi)__A, __D, __N);
}

static __inline __m64
_m_pinsrw (__m64 __A, int __D, int __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N)					\
  ((__m64) __builtin_ia32_pinsrw ((__v4hi)(A), (D), (N)))
/* Alternate name for _mm_insert_pi16.  */
#define _m_pinsrw(A, D, N)	 _mm_insert_pi16((A), (D), (N))
#endif
100190075Sobrien
100290075Sobrien/* Compute the element-wise maximum of signed 16-bit values.  */
100390075Sobrienstatic __inline __m64
100490075Sobrien_mm_max_pi16 (__m64 __A, __m64 __B)
100590075Sobrien{
100690075Sobrien  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
100790075Sobrien}
100890075Sobrien
1009122180Skanstatic __inline __m64
1010122180Skan_m_pmaxsw (__m64 __A, __m64 __B)
1011122180Skan{
1012122180Skan  return _mm_max_pi16 (__A, __B);
1013122180Skan}
1014122180Skan
101590075Sobrien/* Compute the element-wise maximum of unsigned 8-bit values.  */
101690075Sobrienstatic __inline __m64
101790075Sobrien_mm_max_pu8 (__m64 __A, __m64 __B)
101890075Sobrien{
101990075Sobrien  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
102090075Sobrien}
102190075Sobrien
1022122180Skanstatic __inline __m64
1023122180Skan_m_pmaxub (__m64 __A, __m64 __B)
1024122180Skan{
1025122180Skan  return _mm_max_pu8 (__A, __B);
1026122180Skan}
1027122180Skan
102890075Sobrien/* Compute the element-wise minimum of signed 16-bit values.  */
102990075Sobrienstatic __inline __m64
103090075Sobrien_mm_min_pi16 (__m64 __A, __m64 __B)
103190075Sobrien{
103290075Sobrien  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
103390075Sobrien}
103490075Sobrien
1035122180Skanstatic __inline __m64
1036122180Skan_m_pminsw (__m64 __A, __m64 __B)
1037122180Skan{
1038122180Skan  return _mm_min_pi16 (__A, __B);
1039122180Skan}
1040122180Skan
104190075Sobrien/* Compute the element-wise minimum of unsigned 8-bit values.  */
104290075Sobrienstatic __inline __m64
104390075Sobrien_mm_min_pu8 (__m64 __A, __m64 __B)
104490075Sobrien{
104590075Sobrien  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
104690075Sobrien}
104790075Sobrien
1048122180Skanstatic __inline __m64
1049122180Skan_m_pminub (__m64 __A, __m64 __B)
1050122180Skan{
1051122180Skan  return _mm_min_pu8 (__A, __B);
1052122180Skan}
1053122180Skan
105490075Sobrien/* Create an 8-bit mask of the signs of 8-bit values.  */
105590075Sobrienstatic __inline int
105690075Sobrien_mm_movemask_pi8 (__m64 __A)
105790075Sobrien{
105890075Sobrien  return __builtin_ia32_pmovmskb ((__v8qi)__A);
105990075Sobrien}
106090075Sobrien
1061122180Skanstatic __inline int
1062122180Skan_m_pmovmskb (__m64 __A)
1063122180Skan{
1064122180Skan  return _mm_movemask_pi8 (__A);
1065122180Skan}
1066122180Skan
106790075Sobrien/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
106890075Sobrien   in B and produce the high 16 bits of the 32-bit results.  */
106990075Sobrienstatic __inline __m64
107090075Sobrien_mm_mulhi_pu16 (__m64 __A, __m64 __B)
107190075Sobrien{
107290075Sobrien  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
107390075Sobrien}
107490075Sobrien
1075122180Skanstatic __inline __m64
1076122180Skan_m_pmulhuw (__m64 __A, __m64 __B)
1077122180Skan{
1078122180Skan  return _mm_mulhi_pu16 (__A, __B);
1079122180Skan}
1080122180Skan
/* Return a combination of the four 16-bit words of A, as chosen by the
   selector N.  N must be an immediate; the live definitions are the
   macros in the #else branch, and the inline-function form is kept
   disabled under #if 0.  */
#if 0
static __inline __m64
_mm_shuffle_pi16 (__m64 __A, int __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

static __inline __m64
_m_pshufw (__m64 __A, int __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N)						\
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N)))
/* Alternate name for _mm_shuffle_pi16.  */
#define _m_pshufw(A, N)		_mm_shuffle_pi16 ((A), (N))
#endif
110090075Sobrien
110190075Sobrien/* Conditionally store byte elements of A into P.  The high bit of each
110290075Sobrien   byte in the selector N determines whether the corresponding byte from
110390075Sobrien   A is stored.  */
110490075Sobrienstatic __inline void
110590075Sobrien_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
110690075Sobrien{
110790075Sobrien  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
110890075Sobrien}
110990075Sobrien
1110122180Skanstatic __inline void
1111122180Skan_m_maskmovq (__m64 __A, __m64 __N, char *__P)
1112122180Skan{
1113122180Skan  _mm_maskmove_si64 (__A, __N, __P);
1114122180Skan}
1115122180Skan
111690075Sobrien/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
111790075Sobrienstatic __inline __m64
111890075Sobrien_mm_avg_pu8 (__m64 __A, __m64 __B)
111990075Sobrien{
112090075Sobrien  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
112190075Sobrien}
112290075Sobrien
1123122180Skanstatic __inline __m64
1124122180Skan_m_pavgb (__m64 __A, __m64 __B)
1125122180Skan{
1126122180Skan  return _mm_avg_pu8 (__A, __B);
1127122180Skan}
1128122180Skan
112990075Sobrien/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
113090075Sobrienstatic __inline __m64
113190075Sobrien_mm_avg_pu16 (__m64 __A, __m64 __B)
113290075Sobrien{
113390075Sobrien  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
113490075Sobrien}
113590075Sobrien
1136122180Skanstatic __inline __m64
1137122180Skan_m_pavgw (__m64 __A, __m64 __B)
1138122180Skan{
1139122180Skan  return _mm_avg_pu16 (__A, __B);
1140122180Skan}
1141122180Skan
114290075Sobrien/* Compute the sum of the absolute differences of the unsigned 8-bit
114390075Sobrien   values in A and B.  Return the value in the lower 16-bit word; the
114490075Sobrien   upper words are cleared.  */
114590075Sobrienstatic __inline __m64
114690075Sobrien_mm_sad_pu8 (__m64 __A, __m64 __B)
114790075Sobrien{
114890075Sobrien  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
114990075Sobrien}
115090075Sobrien
1151122180Skanstatic __inline __m64
1152122180Skan_m_psadbw (__m64 __A, __m64 __B)
1153122180Skan{
1154122180Skan  return _mm_sad_pu8 (__A, __B);
1155122180Skan}
1156122180Skan
/* Fetch one cache line from address P to a location "closer" to the
   processor.  The selector I picks the type of prefetch; it is passed
   through as the third argument of __builtin_prefetch, with the second
   argument fixed at 0.  The live definition is the macro in the #else
   branch; the inline-function form is kept disabled under #if 0.  */
#if 0
static __inline void
_mm_prefetch (void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I)	__builtin_prefetch ((P), 0, (I))
#endif
116990075Sobrien
117090075Sobrien/* Stores the data in A to the address P without polluting the caches.  */
117190075Sobrienstatic __inline void
117290075Sobrien_mm_stream_pi (__m64 *__P, __m64 __A)
117390075Sobrien{
1174117395Skan  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
117590075Sobrien}
117690075Sobrien
117790075Sobrien/* Likewise.  The address must be 16-byte aligned.  */
117890075Sobrienstatic __inline void
117990075Sobrien_mm_stream_ps (float *__P, __m128 __A)
118090075Sobrien{
118190075Sobrien  __builtin_ia32_movntps (__P, (__v4sf)__A);
118290075Sobrien}
118390075Sobrien
/* Store fence: every store issued before this call becomes globally
   visible before any store issued after it.  */
static __inline void
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
  return;
}
119190075Sobrien
/* Delay execution of the next instruction by an implementation-specific
   amount of time without modifying the architectural state.  Emitted as
   the "rep; nop" instruction sequence.  */
static __inline void
_mm_pause (void)
{
  __asm__ __volatile__ ("rep; nop" : : );
  return;
}
120090075Sobrien
/* Transpose, in place, the 4x4 matrix whose rows are row0..row3.  Each
   argument is first copied into a temporary and later assigned to, so
   all four must be modifiable vector lvalues.  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  /* 0x44 pairs elements 0,1 of two rows; 0xEE pairs elements 2,3.  */	\
  __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44);		\
  __v4sf __t1 = __builtin_ia32_shufps (__r2, __r3, 0x44);		\
  __v4sf __t2 = __builtin_ia32_shufps (__r0, __r1, 0xEE);		\
  __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE);		\
  /* 0x88 takes elements 0,2 of each input; 0xDD takes elements 1,3.  */ \
  (row0) = __builtin_ia32_shufps (__t0, __t1, 0x88);			\
  (row1) = __builtin_ia32_shufps (__t0, __t1, 0xDD);			\
  (row2) = __builtin_ia32_shufps (__t2, __t3, 0x88);			\
  (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD);			\
} while (0)
121490075Sobrien
1215122180Skan/* For backward source compatibility.  */
1216122180Skan#include <emmintrin.h>
1217117395Skan
1218117395Skan#endif /* __SSE__ */
121990075Sobrien#endif /* _XMMINTRIN_H_INCLUDED */
1220