xmmintrin.h revision 107590
190075Sobrien/* Copyright (C) 2002 Free Software Foundation, Inc.
290075Sobrien
390075Sobrien   This file is part of GNU CC.
490075Sobrien
590075Sobrien   GNU CC is free software; you can redistribute it and/or modify
690075Sobrien   it under the terms of the GNU General Public License as published by
790075Sobrien   the Free Software Foundation; either version 2, or (at your option)
890075Sobrien   any later version.
990075Sobrien
1090075Sobrien   GNU CC is distributed in the hope that it will be useful,
1190075Sobrien   but WITHOUT ANY WARRANTY; without even the implied warranty of
1290075Sobrien   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1390075Sobrien   GNU General Public License for more details.
1490075Sobrien
1590075Sobrien   You should have received a copy of the GNU General Public License
1690075Sobrien   along with GNU CC; see the file COPYING.  If not, write to
1790075Sobrien   the Free Software Foundation, 59 Temple Place - Suite 330,
1890075Sobrien   Boston, MA 02111-1307, USA.  */
1990075Sobrien
2090075Sobrien/* As a special exception, if you include this header file into source
2190075Sobrien   files compiled by GCC, this header file does not by itself cause
2290075Sobrien   the resulting executable to be covered by the GNU General Public
2390075Sobrien   License.  This exception does not however invalidate any other
2490075Sobrien   reasons why the executable file might be covered by the GNU General
2590075Sobrien   Public License.  */
2690075Sobrien
2790075Sobrien/* Implemented from the specification included in the Intel C++ Compiler
2890075Sobrien   User Guide and Reference, version 5.0.  */
2990075Sobrien
3090075Sobrien#ifndef _XMMINTRIN_H_INCLUDED
3190075Sobrien#define _XMMINTRIN_H_INCLUDED
3290075Sobrien
3390075Sobrien/* We need type definitions from the MMX header file.  */
3490075Sobrien#include <mmintrin.h>
3590075Sobrien
3690075Sobrien/* The data type indended for user use.  */
3790075Sobrientypedef int __m128 __attribute__ ((__mode__(__V4SF__)));
3890075Sobrien
3990075Sobrien/* Internal data types for implementing the instrinsics.  */
4090075Sobrientypedef int __v4sf __attribute__ ((__mode__(__V4SF__)));
4190075Sobrientypedef int __v4si __attribute__ ((__mode__(__V4SI__)));
4290075Sobrien
/* Create a selector for use with the SHUFPS instruction.  fp0 is placed
   at bit 0, fp1 at bit 2, fp2 at bit 4 and fp3 at bit 6 of the result.
   The arguments are not range-checked; each is expected to fit in two
   bits.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
4690075Sobrien
/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};
5590075Sobrien
/* Bits in the MXCSR.  */

/* Exception state flags; _MM_EXCEPT_MASK is the union of all six.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

/* Exception mask bits; each is the matching _MM_EXCEPT_* value shifted
   left by 7, and _MM_MASK_MASK is the union of all six.  */
#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

/* Rounding mode field (_MM_ROUND_MASK) and its four encodings.  */
#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

/* Flush-to-zero field (_MM_FLUSH_ZERO_MASK) and its two encodings.  */
#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000
8290075Sobrien
8390075Sobrien/* Perform the respective operation on the lower SPFP (single-precision
8490075Sobrien   floating-point) values of A and B; the upper three SPFP values are
8590075Sobrien   passed through from A.  */
8690075Sobrien
8790075Sobrienstatic __inline __m128
8890075Sobrien_mm_add_ss (__m128 __A, __m128 __B)
8990075Sobrien{
9090075Sobrien  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
9190075Sobrien}
9290075Sobrien
9390075Sobrienstatic __inline __m128
9490075Sobrien_mm_sub_ss (__m128 __A, __m128 __B)
9590075Sobrien{
9690075Sobrien  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
9790075Sobrien}
9890075Sobrien
9990075Sobrienstatic __inline __m128
10090075Sobrien_mm_mul_ss (__m128 __A, __m128 __B)
10190075Sobrien{
10290075Sobrien  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
10390075Sobrien}
10490075Sobrien
10590075Sobrienstatic __inline __m128
10690075Sobrien_mm_div_ss (__m128 __A, __m128 __B)
10790075Sobrien{
10890075Sobrien  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
10990075Sobrien}
11090075Sobrien
11190075Sobrienstatic __inline __m128
11290075Sobrien_mm_sqrt_ss (__m128 __A)
11390075Sobrien{
11490075Sobrien  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
11590075Sobrien}
11690075Sobrien
11790075Sobrienstatic __inline __m128
11890075Sobrien_mm_rcp_ss (__m128 __A)
11990075Sobrien{
12090075Sobrien  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
12190075Sobrien}
12290075Sobrien
12390075Sobrienstatic __inline __m128
12490075Sobrien_mm_rsqrt_ss (__m128 __A)
12590075Sobrien{
12690075Sobrien  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
12790075Sobrien}
12890075Sobrien
12990075Sobrienstatic __inline __m128
13090075Sobrien_mm_min_ss (__m128 __A, __m128 __B)
13190075Sobrien{
13290075Sobrien  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
13390075Sobrien}
13490075Sobrien
13590075Sobrienstatic __inline __m128
13690075Sobrien_mm_max_ss (__m128 __A, __m128 __B)
13790075Sobrien{
13890075Sobrien  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
13990075Sobrien}
14090075Sobrien
14190075Sobrien/* Perform the respective operation on the four SPFP values in A and B.  */
14290075Sobrien
14390075Sobrienstatic __inline __m128
14490075Sobrien_mm_add_ps (__m128 __A, __m128 __B)
14590075Sobrien{
14690075Sobrien  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
14790075Sobrien}
14890075Sobrien
14990075Sobrienstatic __inline __m128
15090075Sobrien_mm_sub_ps (__m128 __A, __m128 __B)
15190075Sobrien{
15290075Sobrien  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
15390075Sobrien}
15490075Sobrien
15590075Sobrienstatic __inline __m128
15690075Sobrien_mm_mul_ps (__m128 __A, __m128 __B)
15790075Sobrien{
15890075Sobrien  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
15990075Sobrien}
16090075Sobrien
16190075Sobrienstatic __inline __m128
16290075Sobrien_mm_div_ps (__m128 __A, __m128 __B)
16390075Sobrien{
16490075Sobrien  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
16590075Sobrien}
16690075Sobrien
16790075Sobrienstatic __inline __m128
16890075Sobrien_mm_sqrt_ps (__m128 __A)
16990075Sobrien{
17090075Sobrien  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
17190075Sobrien}
17290075Sobrien
17390075Sobrienstatic __inline __m128
17490075Sobrien_mm_rcp_ps (__m128 __A)
17590075Sobrien{
17690075Sobrien  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
17790075Sobrien}
17890075Sobrien
17990075Sobrienstatic __inline __m128
18090075Sobrien_mm_rsqrt_ps (__m128 __A)
18190075Sobrien{
18290075Sobrien  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
18390075Sobrien}
18490075Sobrien
18590075Sobrienstatic __inline __m128
18690075Sobrien_mm_min_ps (__m128 __A, __m128 __B)
18790075Sobrien{
18890075Sobrien  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
18990075Sobrien}
19090075Sobrien
19190075Sobrienstatic __inline __m128
19290075Sobrien_mm_max_ps (__m128 __A, __m128 __B)
19390075Sobrien{
19490075Sobrien  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
19590075Sobrien}
19690075Sobrien
19790075Sobrien/* Perform logical bit-wise operations on 128-bit values.  */
19890075Sobrien
19990075Sobrienstatic __inline __m128
20090075Sobrien_mm_and_ps (__m128 __A, __m128 __B)
20190075Sobrien{
20290075Sobrien  return __builtin_ia32_andps (__A, __B);
20390075Sobrien}
20490075Sobrien
20590075Sobrienstatic __inline __m128
20690075Sobrien_mm_andnot_ps (__m128 __A, __m128 __B)
20790075Sobrien{
20890075Sobrien  return __builtin_ia32_andnps (__A, __B);
20990075Sobrien}
21090075Sobrien
21190075Sobrienstatic __inline __m128
21290075Sobrien_mm_or_ps (__m128 __A, __m128 __B)
21390075Sobrien{
21490075Sobrien  return __builtin_ia32_orps (__A, __B);
21590075Sobrien}
21690075Sobrien
21790075Sobrienstatic __inline __m128
21890075Sobrien_mm_xor_ps (__m128 __A, __m128 __B)
21990075Sobrien{
22090075Sobrien  return __builtin_ia32_xorps (__A, __B);
22190075Sobrien}
22290075Sobrien
22390075Sobrien/* Perform a comparison on the lower SPFP values of A and B.  If the
22490075Sobrien   comparison is true, place a mask of all ones in the result, otherwise a
22590075Sobrien   mask of zeros.  The upper three SPFP values are passed through from A.  */
22690075Sobrien
22790075Sobrienstatic __inline __m128
22890075Sobrien_mm_cmpeq_ss (__m128 __A, __m128 __B)
22990075Sobrien{
23090075Sobrien  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
23190075Sobrien}
23290075Sobrien
23390075Sobrienstatic __inline __m128
23490075Sobrien_mm_cmplt_ss (__m128 __A, __m128 __B)
23590075Sobrien{
23690075Sobrien  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
23790075Sobrien}
23890075Sobrien
23990075Sobrienstatic __inline __m128
24090075Sobrien_mm_cmple_ss (__m128 __A, __m128 __B)
24190075Sobrien{
24290075Sobrien  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
24390075Sobrien}
24490075Sobrien
24590075Sobrienstatic __inline __m128
24690075Sobrien_mm_cmpgt_ss (__m128 __A, __m128 __B)
24790075Sobrien{
248107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
249107590Sobrien					(__v4sf)
250107590Sobrien					__builtin_ia32_cmpltss ((__v4sf) __B,
251107590Sobrien								(__v4sf)
252107590Sobrien								__A));
25390075Sobrien}
25490075Sobrien
25590075Sobrienstatic __inline __m128
25690075Sobrien_mm_cmpge_ss (__m128 __A, __m128 __B)
25790075Sobrien{
258107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
259107590Sobrien					(__v4sf)
260107590Sobrien					__builtin_ia32_cmpless ((__v4sf) __B,
261107590Sobrien								(__v4sf)
262107590Sobrien								__A));
26390075Sobrien}
26490075Sobrien
26590075Sobrienstatic __inline __m128
26690075Sobrien_mm_cmpneq_ss (__m128 __A, __m128 __B)
26790075Sobrien{
26890075Sobrien  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
26990075Sobrien}
27090075Sobrien
27190075Sobrienstatic __inline __m128
27290075Sobrien_mm_cmpnlt_ss (__m128 __A, __m128 __B)
27390075Sobrien{
27490075Sobrien  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
27590075Sobrien}
27690075Sobrien
27790075Sobrienstatic __inline __m128
27890075Sobrien_mm_cmpnle_ss (__m128 __A, __m128 __B)
27990075Sobrien{
28090075Sobrien  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
28190075Sobrien}
28290075Sobrien
28390075Sobrienstatic __inline __m128
28490075Sobrien_mm_cmpngt_ss (__m128 __A, __m128 __B)
28590075Sobrien{
286107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
287107590Sobrien					(__v4sf)
288107590Sobrien					__builtin_ia32_cmpnltss ((__v4sf) __B,
289107590Sobrien								 (__v4sf)
290107590Sobrien								 __A));
29190075Sobrien}
29290075Sobrien
29390075Sobrienstatic __inline __m128
29490075Sobrien_mm_cmpnge_ss (__m128 __A, __m128 __B)
29590075Sobrien{
296107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
297107590Sobrien					(__v4sf)
298107590Sobrien					__builtin_ia32_cmpnless ((__v4sf) __B,
299107590Sobrien								 (__v4sf)
300107590Sobrien								 __A));
30190075Sobrien}
30290075Sobrien
30390075Sobrienstatic __inline __m128
30490075Sobrien_mm_cmpord_ss (__m128 __A, __m128 __B)
30590075Sobrien{
30690075Sobrien  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
30790075Sobrien}
30890075Sobrien
30990075Sobrienstatic __inline __m128
31090075Sobrien_mm_cmpunord_ss (__m128 __A, __m128 __B)
31190075Sobrien{
31290075Sobrien  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
31390075Sobrien}
31490075Sobrien
31590075Sobrien/* Perform a comparison on the four SPFP values of A and B.  For each
31690075Sobrien   element, if the comparison is true, place a mask of all ones in the
31790075Sobrien   result, otherwise a mask of zeros.  */
31890075Sobrien
31990075Sobrienstatic __inline __m128
32090075Sobrien_mm_cmpeq_ps (__m128 __A, __m128 __B)
32190075Sobrien{
32290075Sobrien  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
32390075Sobrien}
32490075Sobrien
32590075Sobrienstatic __inline __m128
32690075Sobrien_mm_cmplt_ps (__m128 __A, __m128 __B)
32790075Sobrien{
32890075Sobrien  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
32990075Sobrien}
33090075Sobrien
33190075Sobrienstatic __inline __m128
33290075Sobrien_mm_cmple_ps (__m128 __A, __m128 __B)
33390075Sobrien{
33490075Sobrien  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
33590075Sobrien}
33690075Sobrien
33790075Sobrienstatic __inline __m128
33890075Sobrien_mm_cmpgt_ps (__m128 __A, __m128 __B)
33990075Sobrien{
34090075Sobrien  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
34190075Sobrien}
34290075Sobrien
34390075Sobrienstatic __inline __m128
34490075Sobrien_mm_cmpge_ps (__m128 __A, __m128 __B)
34590075Sobrien{
34690075Sobrien  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
34790075Sobrien}
34890075Sobrien
34990075Sobrienstatic __inline __m128
35090075Sobrien_mm_cmpneq_ps (__m128 __A, __m128 __B)
35190075Sobrien{
35290075Sobrien  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
35390075Sobrien}
35490075Sobrien
35590075Sobrienstatic __inline __m128
35690075Sobrien_mm_cmpnlt_ps (__m128 __A, __m128 __B)
35790075Sobrien{
35890075Sobrien  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
35990075Sobrien}
36090075Sobrien
36190075Sobrienstatic __inline __m128
36290075Sobrien_mm_cmpnle_ps (__m128 __A, __m128 __B)
36390075Sobrien{
36490075Sobrien  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
36590075Sobrien}
36690075Sobrien
36790075Sobrienstatic __inline __m128
36890075Sobrien_mm_cmpngt_ps (__m128 __A, __m128 __B)
36990075Sobrien{
37090075Sobrien  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
37190075Sobrien}
37290075Sobrien
37390075Sobrienstatic __inline __m128
37490075Sobrien_mm_cmpnge_ps (__m128 __A, __m128 __B)
37590075Sobrien{
37690075Sobrien  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
37790075Sobrien}
37890075Sobrien
37990075Sobrienstatic __inline __m128
38090075Sobrien_mm_cmpord_ps (__m128 __A, __m128 __B)
38190075Sobrien{
38290075Sobrien  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
38390075Sobrien}
38490075Sobrien
38590075Sobrienstatic __inline __m128
38690075Sobrien_mm_cmpunord_ps (__m128 __A, __m128 __B)
38790075Sobrien{
38890075Sobrien  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
38990075Sobrien}
39090075Sobrien
39190075Sobrien/* Compare the lower SPFP values of A and B and return 1 if true
39290075Sobrien   and 0 if false.  */
39390075Sobrien
39490075Sobrienstatic __inline int
39590075Sobrien_mm_comieq_ss (__m128 __A, __m128 __B)
39690075Sobrien{
39790075Sobrien  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
39890075Sobrien}
39990075Sobrien
40090075Sobrienstatic __inline int
40190075Sobrien_mm_comilt_ss (__m128 __A, __m128 __B)
40290075Sobrien{
40390075Sobrien  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
40490075Sobrien}
40590075Sobrien
40690075Sobrienstatic __inline int
40790075Sobrien_mm_comile_ss (__m128 __A, __m128 __B)
40890075Sobrien{
40990075Sobrien  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
41090075Sobrien}
41190075Sobrien
41290075Sobrienstatic __inline int
41390075Sobrien_mm_comigt_ss (__m128 __A, __m128 __B)
41490075Sobrien{
41590075Sobrien  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
41690075Sobrien}
41790075Sobrien
41890075Sobrienstatic __inline int
41990075Sobrien_mm_comige_ss (__m128 __A, __m128 __B)
42090075Sobrien{
42190075Sobrien  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
42290075Sobrien}
42390075Sobrien
42490075Sobrienstatic __inline int
42590075Sobrien_mm_comineq_ss (__m128 __A, __m128 __B)
42690075Sobrien{
42790075Sobrien  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
42890075Sobrien}
42990075Sobrien
43090075Sobrienstatic __inline int
43190075Sobrien_mm_ucomieq_ss (__m128 __A, __m128 __B)
43290075Sobrien{
43390075Sobrien  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
43490075Sobrien}
43590075Sobrien
43690075Sobrienstatic __inline int
43790075Sobrien_mm_ucomilt_ss (__m128 __A, __m128 __B)
43890075Sobrien{
43990075Sobrien  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
44090075Sobrien}
44190075Sobrien
44290075Sobrienstatic __inline int
44390075Sobrien_mm_ucomile_ss (__m128 __A, __m128 __B)
44490075Sobrien{
44590075Sobrien  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
44690075Sobrien}
44790075Sobrien
44890075Sobrienstatic __inline int
44990075Sobrien_mm_ucomigt_ss (__m128 __A, __m128 __B)
45090075Sobrien{
45190075Sobrien  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
45290075Sobrien}
45390075Sobrien
45490075Sobrienstatic __inline int
45590075Sobrien_mm_ucomige_ss (__m128 __A, __m128 __B)
45690075Sobrien{
45790075Sobrien  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
45890075Sobrien}
45990075Sobrien
46090075Sobrienstatic __inline int
46190075Sobrien_mm_ucomineq_ss (__m128 __A, __m128 __B)
46290075Sobrien{
46390075Sobrien  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
46490075Sobrien}
46590075Sobrien
46690075Sobrien/* Convert the lower SPFP value to a 32-bit integer according to the current
46790075Sobrien   rounding mode.  */
46890075Sobrienstatic __inline int
46990075Sobrien_mm_cvtss_si32 (__m128 __A)
47090075Sobrien{
47190075Sobrien  return __builtin_ia32_cvtss2si ((__v4sf) __A);
47290075Sobrien}
47390075Sobrien
47490075Sobrien/* Convert the two lower SPFP values to 32-bit integers according to the
47590075Sobrien   current rounding mode.  Return the integers in packed form.  */
47690075Sobrienstatic __inline __m64
47790075Sobrien_mm_cvtps_pi32 (__m128 __A)
47890075Sobrien{
47990075Sobrien  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
48090075Sobrien}
48190075Sobrien
48290075Sobrien/* Truncate the lower SPFP value to a 32-bit integer.  */
48390075Sobrienstatic __inline int
48490075Sobrien_mm_cvttss_si32 (__m128 __A)
48590075Sobrien{
48690075Sobrien  return __builtin_ia32_cvttss2si ((__v4sf) __A);
48790075Sobrien}
48890075Sobrien
48990075Sobrien/* Truncate the two lower SPFP values to 32-bit integers.  Return the
49090075Sobrien   integers in packed form.  */
49190075Sobrienstatic __inline __m64
49290075Sobrien_mm_cvttps_pi32 (__m128 __A)
49390075Sobrien{
49490075Sobrien  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
49590075Sobrien}
49690075Sobrien
49790075Sobrien/* Convert B to a SPFP value and insert it as element zero in A.  */
49890075Sobrienstatic __inline __m128
49990075Sobrien_mm_cvtsi32_ss (__m128 __A, int __B)
50090075Sobrien{
50190075Sobrien  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
50290075Sobrien}
50390075Sobrien
50490075Sobrien/* Convert the two 32-bit values in B to SPFP form and insert them
50590075Sobrien   as the two lower elements in A.  */
50690075Sobrienstatic __inline __m128
50790075Sobrien_mm_cvtpi32_ps (__m128 __A, __m64 __B)
50890075Sobrien{
50990075Sobrien  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
51090075Sobrien}
51190075Sobrien
51290075Sobrien/* Convert the four signed 16-bit values in A to SPFP form.  */
51390075Sobrienstatic __inline __m128
51490075Sobrien_mm_cvtpi16_ps (__m64 __A)
51590075Sobrien{
51690075Sobrien  __v4hi __sign;
51790075Sobrien  __v2si __hisi, __losi;
51890075Sobrien  __v4sf __r;
51990075Sobrien
52090075Sobrien  /* This comparison against zero gives us a mask that can be used to
52190075Sobrien     fill in the missing sign bits in the unpack operations below, so
52290075Sobrien     that we get signed values after unpacking.  */
52390075Sobrien  __sign = (__v4hi) __builtin_ia32_mmx_zero ();
52490075Sobrien  __sign = __builtin_ia32_pcmpgtw (__sign, (__v4hi)__A);
52590075Sobrien
52690075Sobrien  /* Convert the four words to doublewords.  */
52790075Sobrien  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
52890075Sobrien  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
52990075Sobrien
53090075Sobrien  /* Convert the doublewords to floating point two at a time.  */
53190075Sobrien  __r = (__v4sf) __builtin_ia32_setzerops ();
53290075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
53390075Sobrien  __r = __builtin_ia32_movlhps (__r, __r);
53490075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __losi);
53590075Sobrien
53690075Sobrien  return (__m128) __r;
53790075Sobrien}
53890075Sobrien
53990075Sobrien/* Convert the four unsigned 16-bit values in A to SPFP form.  */
54090075Sobrienstatic __inline __m128
54190075Sobrien_mm_cvtpu16_ps (__m64 __A)
54290075Sobrien{
54390075Sobrien  __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero ();
54490075Sobrien  __v2si __hisi, __losi;
54590075Sobrien  __v4sf __r;
54690075Sobrien
54790075Sobrien  /* Convert the four words to doublewords.  */
54890075Sobrien  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __zero);
54990075Sobrien  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __zero);
55090075Sobrien
55190075Sobrien  /* Convert the doublewords to floating point two at a time.  */
55290075Sobrien  __r = (__v4sf) __builtin_ia32_setzerops ();
55390075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
55490075Sobrien  __r = __builtin_ia32_movlhps (__r, __r);
55590075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __losi);
55690075Sobrien
55790075Sobrien  return (__m128) __r;
55890075Sobrien}
55990075Sobrien
56090075Sobrien/* Convert the low four signed 8-bit values in A to SPFP form.  */
56190075Sobrienstatic __inline __m128
56290075Sobrien_mm_cvtpi8_ps (__m64 __A)
56390075Sobrien{
56490075Sobrien  __v8qi __sign;
56590075Sobrien
56690075Sobrien  /* This comparison against zero gives us a mask that can be used to
56790075Sobrien     fill in the missing sign bits in the unpack operations below, so
56890075Sobrien     that we get signed values after unpacking.  */
56990075Sobrien  __sign = (__v8qi) __builtin_ia32_mmx_zero ();
57090075Sobrien  __sign = __builtin_ia32_pcmpgtb (__sign, (__v8qi)__A);
57190075Sobrien
57290075Sobrien  /* Convert the four low bytes to words.  */
57390075Sobrien  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);
57490075Sobrien
57590075Sobrien  return _mm_cvtpi16_ps(__A);
57690075Sobrien}
57790075Sobrien
57890075Sobrien/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
57990075Sobrienstatic __inline __m128
58090075Sobrien_mm_cvtpu8_ps(__m64 __A)
58190075Sobrien{
58290075Sobrien  __v8qi __zero = (__v8qi) __builtin_ia32_mmx_zero ();
58390075Sobrien  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __zero);
58490075Sobrien  return _mm_cvtpu16_ps(__A);
58590075Sobrien}
58690075Sobrien
58790075Sobrien/* Convert the four signed 32-bit values in A and B to SPFP form.  */
58890075Sobrienstatic __inline __m128
58990075Sobrien_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
59090075Sobrien{
59190075Sobrien  __v4sf __zero = (__v4sf) __builtin_ia32_setzerops ();
59290075Sobrien  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
59390075Sobrien  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B);
59490075Sobrien  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
59590075Sobrien}
59690075Sobrien
59790075Sobrien/* Convert the four SPFP values in A to four signed 16-bit integers.  */
59890075Sobrienstatic __inline __m64
59990075Sobrien_mm_cvtps_pi16(__m128 __A)
60090075Sobrien{
60190075Sobrien  __v4sf __hisf = (__v4sf)__A;
60290075Sobrien  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
60390075Sobrien  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
60490075Sobrien  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
60590075Sobrien  return (__m64) __builtin_ia32_packssdw (__losi, __hisi);
60690075Sobrien}
60790075Sobrien
60890075Sobrien/* Convert the four SPFP values in A to four signed 8-bit integers.  */
60990075Sobrienstatic __inline __m64
61090075Sobrien_mm_cvtps_pi8(__m128 __A)
61190075Sobrien{
61290075Sobrien  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
61390075Sobrien  __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero ();
61490075Sobrien  return (__m64) __builtin_ia32_packsswb (__tmp, __zero);
61590075Sobrien}
61690075Sobrien
/* Selects four specific SPFP values from A and B based on MASK.
   This is a macro rather than an inline function so that MASK reaches
   __builtin_ia32_shufps exactly as written (presumably because the
   builtin needs a compile-time constant selector -- confirm against
   the compiler documentation).  MASK can be built with _MM_SHUFFLE.  */
#define _mm_shuffle_ps(A, B, MASK) \
 ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK)))
62890075Sobrien
62990075Sobrien
63090075Sobrien/* Selects and interleaves the upper two SPFP values from A and B.  */
63190075Sobrienstatic __inline __m128
63290075Sobrien_mm_unpackhi_ps (__m128 __A, __m128 __B)
63390075Sobrien{
63490075Sobrien  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
63590075Sobrien}
63690075Sobrien
63790075Sobrien/* Selects and interleaves the lower two SPFP values from A and B.  */
63890075Sobrienstatic __inline __m128
63990075Sobrien_mm_unpacklo_ps (__m128 __A, __m128 __B)
64090075Sobrien{
64190075Sobrien  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
64290075Sobrien}
64390075Sobrien
64490075Sobrien/* Sets the upper two SPFP values with 64-bits of data loaded from P;
64590075Sobrien   the lower two values are passed through from A.  */
64690075Sobrienstatic __inline __m128
64790075Sobrien_mm_loadh_pi (__m128 __A, __m64 *__P)
64890075Sobrien{
64990075Sobrien  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P);
65090075Sobrien}
65190075Sobrien
65290075Sobrien/* Stores the upper two SPFP values of A into P.  */
65390075Sobrienstatic __inline void
65490075Sobrien_mm_storeh_pi (__m64 *__P, __m128 __A)
65590075Sobrien{
65690075Sobrien  __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A);
65790075Sobrien}
65890075Sobrien
65990075Sobrien/* Moves the upper two values of B into the lower two values of A.  */
66090075Sobrienstatic __inline __m128
66190075Sobrien_mm_movehl_ps (__m128 __A, __m128 __B)
66290075Sobrien{
66390075Sobrien  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
66490075Sobrien}
66590075Sobrien
66690075Sobrien/* Moves the lower two values of B into the upper two values of A.  */
66790075Sobrienstatic __inline __m128
66890075Sobrien_mm_movelh_ps (__m128 __A, __m128 __B)
66990075Sobrien{
67090075Sobrien  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
67190075Sobrien}
67290075Sobrien
67390075Sobrien/* Sets the lower two SPFP values with 64-bits of data loaded from P;
67490075Sobrien   the upper two values are passed through from A.  */
67590075Sobrienstatic __inline __m128
67690075Sobrien_mm_loadl_pi (__m128 __A, __m64 *__P)
67790075Sobrien{
67890075Sobrien  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P);
67990075Sobrien}
68090075Sobrien
68190075Sobrien/* Stores the lower two SPFP values of A into P.  */
68290075Sobrienstatic __inline void
68390075Sobrien_mm_storel_pi (__m64 *__P, __m128 __A)
68490075Sobrien{
68590075Sobrien  __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A);
68690075Sobrien}
68790075Sobrien
68890075Sobrien/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
68990075Sobrienstatic __inline int
69090075Sobrien_mm_movemask_ps (__m128 __A)
69190075Sobrien{
69290075Sobrien  return __builtin_ia32_movmskps ((__v4sf)__A);
69390075Sobrien}
69490075Sobrien
/* Return the contents of the control register
   (__builtin_ia32_stmxcsr).  */
static __inline unsigned int
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}
70190075Sobrien
70290075Sobrien/* Read exception bits from the control register.  */
70390075Sobrienstatic __inline unsigned int
70490075Sobrien_MM_GET_EXCEPTION_STATE (void)
70590075Sobrien{
70690075Sobrien  return _mm_getcsr() & _MM_EXCEPT_MASK;
70790075Sobrien}
70890075Sobrien
70990075Sobrienstatic __inline unsigned int
71090075Sobrien_MM_GET_EXCEPTION_MASK (void)
71190075Sobrien{
71290075Sobrien  return _mm_getcsr() & _MM_MASK_MASK;
71390075Sobrien}
71490075Sobrien
71590075Sobrienstatic __inline unsigned int
71690075Sobrien_MM_GET_ROUNDING_MODE (void)
71790075Sobrien{
71890075Sobrien  return _mm_getcsr() & _MM_ROUND_MASK;
71990075Sobrien}
72090075Sobrien
72190075Sobrienstatic __inline unsigned int
72290075Sobrien_MM_GET_FLUSH_ZERO_MODE (void)
72390075Sobrien{
72490075Sobrien  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
72590075Sobrien}
72690075Sobrien
/* Set the control register to I (__builtin_ia32_ldmxcsr).  */
static __inline void
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}
73390075Sobrien
73490075Sobrien/* Set exception bits in the control register.  */
73590075Sobrienstatic __inline void
73690075Sobrien_MM_SET_EXCEPTION_STATE(unsigned int __mask)
73790075Sobrien{
73890075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
73990075Sobrien}
74090075Sobrien
74190075Sobrienstatic __inline void
74290075Sobrien_MM_SET_EXCEPTION_MASK (unsigned int __mask)
74390075Sobrien{
74490075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
74590075Sobrien}
74690075Sobrien
74790075Sobrienstatic __inline void
74890075Sobrien_MM_SET_ROUNDING_MODE (unsigned int __mode)
74990075Sobrien{
75090075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
75190075Sobrien}
75290075Sobrien
75390075Sobrienstatic __inline void
75490075Sobrien_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
75590075Sobrien{
75690075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
75790075Sobrien}
75890075Sobrien
75990075Sobrien/* Create a vector with element 0 as *P and the rest zero.  */
76090075Sobrienstatic __inline __m128
76190075Sobrien_mm_load_ss (float *__P)
76290075Sobrien{
76390075Sobrien  return (__m128) __builtin_ia32_loadss (__P);
76490075Sobrien}
76590075Sobrien
76690075Sobrien/* Create a vector with all four elements equal to *P.  */
76790075Sobrienstatic __inline __m128
76890075Sobrien_mm_load1_ps (float *__P)
76990075Sobrien{
77090075Sobrien  __v4sf __tmp = __builtin_ia32_loadss (__P);
77190075Sobrien  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0));
77290075Sobrien}
77390075Sobrien
77490075Sobrienstatic __inline __m128
77590075Sobrien_mm_load_ps1 (float *__P)
77690075Sobrien{
77790075Sobrien  return _mm_load1_ps (__P);
77890075Sobrien}
77990075Sobrien
78090075Sobrien/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
78190075Sobrienstatic __inline __m128
78290075Sobrien_mm_load_ps (float *__P)
78390075Sobrien{
78490075Sobrien  return (__m128) __builtin_ia32_loadaps (__P);
78590075Sobrien}
78690075Sobrien
78790075Sobrien/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
78890075Sobrienstatic __inline __m128
78990075Sobrien_mm_loadu_ps (float *__P)
79090075Sobrien{
79190075Sobrien  return (__m128) __builtin_ia32_loadups (__P);
79290075Sobrien}
79390075Sobrien
79490075Sobrien/* Load four SPFP values in reverse order.  The address must be aligned.  */
79590075Sobrienstatic __inline __m128
79690075Sobrien_mm_loadr_ps (float *__P)
79790075Sobrien{
79890075Sobrien  __v4sf __tmp = __builtin_ia32_loadaps (__P);
79990075Sobrien  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
80090075Sobrien}
80190075Sobrien
80290075Sobrien/* Create a vector with element 0 as F and the rest zero.  */
80390075Sobrienstatic __inline __m128
80490075Sobrien_mm_set_ss (float __F)
80590075Sobrien{
80690075Sobrien  return (__m128) __builtin_ia32_loadss (&__F);
80790075Sobrien}
80890075Sobrien
80990075Sobrien/* Create a vector with all four elements equal to F.  */
81090075Sobrienstatic __inline __m128
81190075Sobrien_mm_set1_ps (float __F)
81290075Sobrien{
81390075Sobrien  __v4sf __tmp = __builtin_ia32_loadss (&__F);
81490075Sobrien  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0));
81590075Sobrien}
81690075Sobrien
81790075Sobrienstatic __inline __m128
81890075Sobrien_mm_set_ps1 (float __F)
81990075Sobrien{
82090075Sobrien  return _mm_set1_ps (__F);
82190075Sobrien}
82290075Sobrien
82390075Sobrien/* Create the vector [Z Y X W].  */
82490075Sobrienstatic __inline __m128
82590075Sobrien_mm_set_ps (float __Z, float __Y, float __X, float __W)
82690075Sobrien{
82790075Sobrien  union {
82890075Sobrien    float __a[4];
82990075Sobrien    __m128 __v;
83090075Sobrien  } __u;
83190075Sobrien
83290075Sobrien  __u.__a[0] = __W;
83390075Sobrien  __u.__a[1] = __X;
83490075Sobrien  __u.__a[2] = __Y;
83590075Sobrien  __u.__a[3] = __Z;
83690075Sobrien
83790075Sobrien  return __u.__v;
83890075Sobrien}
83990075Sobrien
84090075Sobrien/* Create the vector [W X Y Z].  */
84190075Sobrienstatic __inline __m128
84290075Sobrien_mm_setr_ps (float __Z, float __Y, float __X, float __W)
84390075Sobrien{
84490075Sobrien  return _mm_set_ps (__W, __X, __Y, __Z);
84590075Sobrien}
84690075Sobrien
84790075Sobrien/* Create a vector of zeros.  */
84890075Sobrienstatic __inline __m128
84990075Sobrien_mm_setzero_ps (void)
85090075Sobrien{
85190075Sobrien  return (__m128) __builtin_ia32_setzerops ();
85290075Sobrien}
85390075Sobrien
85490075Sobrien/* Stores the lower SPFP value.  */
85590075Sobrienstatic __inline void
85690075Sobrien_mm_store_ss (float *__P, __m128 __A)
85790075Sobrien{
85890075Sobrien  __builtin_ia32_storess (__P, (__v4sf)__A);
85990075Sobrien}
86090075Sobrien
86190075Sobrien/* Store the lower SPFP value across four words.  */
86290075Sobrienstatic __inline void
86390075Sobrien_mm_store1_ps (float *__P, __m128 __A)
86490075Sobrien{
86590075Sobrien  __v4sf __va = (__v4sf)__A;
86690075Sobrien  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
86790075Sobrien  __builtin_ia32_storeaps (__P, __tmp);
86890075Sobrien}
86990075Sobrien
87090075Sobrienstatic __inline void
87190075Sobrien_mm_store_ps1 (float *__P, __m128 __A)
87290075Sobrien{
87390075Sobrien  _mm_store1_ps (__P, __A);
87490075Sobrien}
87590075Sobrien
87690075Sobrien/* Store four SPFP values.  The address must be 16-byte aligned.  */
87790075Sobrienstatic __inline void
87890075Sobrien_mm_store_ps (float *__P, __m128 __A)
87990075Sobrien{
88090075Sobrien  __builtin_ia32_storeaps (__P, (__v4sf)__A);
88190075Sobrien}
88290075Sobrien
88390075Sobrien/* Store four SPFP values.  The address need not be 16-byte aligned.  */
88490075Sobrienstatic __inline void
88590075Sobrien_mm_storeu_ps (float *__P, __m128 __A)
88690075Sobrien{
88790075Sobrien  __builtin_ia32_storeups (__P, (__v4sf)__A);
88890075Sobrien}
88990075Sobrien
89090075Sobrien/* Store four SPFP values in reverse order.  The addres must be aligned.  */
89190075Sobrienstatic __inline void
89290075Sobrien_mm_storer_ps (float *__P, __m128 __A)
89390075Sobrien{
89490075Sobrien  __v4sf __va = (__v4sf)__A;
89590075Sobrien  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
89690075Sobrien  __builtin_ia32_storeaps (__P, __tmp);
89790075Sobrien}
89890075Sobrien
89990075Sobrien/* Sets the low SPFP value of A from the low value of B.  */
90090075Sobrienstatic __inline __m128
90190075Sobrien_mm_move_ss (__m128 __A, __m128 __B)
90290075Sobrien{
90390075Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
90490075Sobrien}
90590075Sobrien
/* Extract one of the four words of A.  The selector N must be an
   immediate.  Only the macro form is compiled; the disabled inline
   function shows the intended prototype.  */
#if 0
static __inline int
_mm_extract_pi16 (__m64 __A, int __N)
{
  __v4hi __words = (__v4hi) __A;

  return __builtin_ia32_pextrw (__words, __N);
}
#else
#define _mm_extract_pi16(A, N) __builtin_ia32_pextrw ((__v4hi)(A), (N))
#endif
91790075Sobrien
/* Insert word D into one of the four words of A and yield the result as
   an __m64.  The selector N must be an immediate.  Only the macro form
   is compiled; the disabled inline function shows the intended
   prototype.  */
#if 0
static __inline __m64
_mm_insert_pi16 (__m64 __A, int __D, int __N)
{
  __v4hi __words = (__v4hi) __A;

  return (__m64) __builtin_ia32_pinsrw (__words, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N)					\
  ((__m64) __builtin_ia32_pinsrw ((__v4hi)(A), (D), (N)))
#endif
93090075Sobrien
93190075Sobrien/* Compute the element-wise maximum of signed 16-bit values.  */
93290075Sobrienstatic __inline __m64
93390075Sobrien_mm_max_pi16 (__m64 __A, __m64 __B)
93490075Sobrien{
93590075Sobrien  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
93690075Sobrien}
93790075Sobrien
93890075Sobrien/* Compute the element-wise maximum of unsigned 8-bit values.  */
93990075Sobrienstatic __inline __m64
94090075Sobrien_mm_max_pu8 (__m64 __A, __m64 __B)
94190075Sobrien{
94290075Sobrien  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
94390075Sobrien}
94490075Sobrien
94590075Sobrien/* Compute the element-wise minimum of signed 16-bit values.  */
94690075Sobrienstatic __inline __m64
94790075Sobrien_mm_min_pi16 (__m64 __A, __m64 __B)
94890075Sobrien{
94990075Sobrien  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
95090075Sobrien}
95190075Sobrien
95290075Sobrien/* Compute the element-wise minimum of unsigned 8-bit values.  */
95390075Sobrienstatic __inline __m64
95490075Sobrien_mm_min_pu8 (__m64 __A, __m64 __B)
95590075Sobrien{
95690075Sobrien  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
95790075Sobrien}
95890075Sobrien
95990075Sobrien/* Create an 8-bit mask of the signs of 8-bit values.  */
96090075Sobrienstatic __inline int
96190075Sobrien_mm_movemask_pi8 (__m64 __A)
96290075Sobrien{
96390075Sobrien  return __builtin_ia32_pmovmskb ((__v8qi)__A);
96490075Sobrien}
96590075Sobrien
96690075Sobrien/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
96790075Sobrien   in B and produce the high 16 bits of the 32-bit results.  */
96890075Sobrienstatic __inline __m64
96990075Sobrien_mm_mulhi_pu16 (__m64 __A, __m64 __B)
97090075Sobrien{
97190075Sobrien  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
97290075Sobrien}
97390075Sobrien
/* Yield a combination of the four 16-bit values in A, chosen by the
   selector N, as an __m64.  N must be an immediate.  Only the macro
   form is compiled; the disabled inline function shows the intended
   prototype.  */
#if 0
static __inline __m64
_mm_shuffle_pi16 (__m64 __A, int __N)
{
  __v4hi __words = (__v4hi) __A;

  return (__m64) __builtin_ia32_pshufw (__words, __N);
}
#else
#define _mm_shuffle_pi16(A, N)						\
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N)))
#endif
98690075Sobrien
98790075Sobrien/* Conditionally store byte elements of A into P.  The high bit of each
98890075Sobrien   byte in the selector N determines whether the corresponding byte from
98990075Sobrien   A is stored.  */
99090075Sobrienstatic __inline void
99190075Sobrien_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
99290075Sobrien{
99390075Sobrien  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
99490075Sobrien}
99590075Sobrien
99690075Sobrien/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
99790075Sobrienstatic __inline __m64
99890075Sobrien_mm_avg_pu8 (__m64 __A, __m64 __B)
99990075Sobrien{
100090075Sobrien  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
100190075Sobrien}
100290075Sobrien
100390075Sobrien/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
100490075Sobrienstatic __inline __m64
100590075Sobrien_mm_avg_pu16 (__m64 __A, __m64 __B)
100690075Sobrien{
100790075Sobrien  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
100890075Sobrien}
100990075Sobrien
101090075Sobrien/* Compute the sum of the absolute differences of the unsigned 8-bit
101190075Sobrien   values in A and B.  Return the value in the lower 16-bit word; the
101290075Sobrien   upper words are cleared.  */
101390075Sobrienstatic __inline __m64
101490075Sobrien_mm_sad_pu8 (__m64 __A, __m64 __B)
101590075Sobrien{
101690075Sobrien  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
101790075Sobrien}
101890075Sobrien
/* Load one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation
   and is passed as the third argument of __builtin_prefetch; the second
   argument is always 0.  Only the macro form is compiled; the disabled
   inline function shows the intended prototype.  */
#if 0
static __inline void
_mm_prefetch (void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I) __builtin_prefetch ((P), 0, (I))
#endif
103190075Sobrien
103290075Sobrien/* Stores the data in A to the address P without polluting the caches.  */
103390075Sobrienstatic __inline void
103490075Sobrien_mm_stream_pi (__m64 *__P, __m64 __A)
103590075Sobrien{
1036107590Sobrien  __builtin_ia32_movntq (__P, (long long)__A);
103790075Sobrien}
103890075Sobrien
103990075Sobrien/* Likewise.  The address must be 16-byte aligned.  */
104090075Sobrienstatic __inline void
104190075Sobrien_mm_stream_ps (float *__P, __m128 __A)
104290075Sobrien{
104390075Sobrien  __builtin_ia32_movntps (__P, (__v4sf)__A);
104490075Sobrien}
104590075Sobrien
/* Store fence: guarantees that every preceding store is globally
   visible before any subsequent store.  */
static __inline void
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}
105390075Sobrien
/* Delay execution of the next instruction by an implementation-specific
   amount of time.  The instruction does not modify the architectural
   state.  Emitted as a volatile inline asm with no operands.  */
static __inline void
_mm_pause (void)
{
  __asm__ __volatile__ ("rep; nop"
			: /* no outputs */
			: /* no inputs */);
}
106290075Sobrien
/* Transpose, in place, the 4x4 matrix whose rows are row0..row3.  Each
   argument is read once into a temporary before any of them is written,
   and each is then assigned to, so all four must be modifiable lvalues.
   Expands to a single do { ... } while (0) statement; the caller
   supplies the trailing semicolon.  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0);							\
  __v4sf __r1 = (row1);							\
  __v4sf __r2 = (row2);							\
  __v4sf __r3 = (row3);							\
  __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44);		\
  __v4sf __t1 = __builtin_ia32_shufps (__r2, __r3, 0x44);		\
  __v4sf __t2 = __builtin_ia32_shufps (__r0, __r1, 0xEE);		\
  __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE);		\
  (row0) = __builtin_ia32_shufps (__t0, __t1, 0x88);			\
  (row1) = __builtin_ia32_shufps (__t0, __t1, 0xDD);			\
  (row2) = __builtin_ia32_shufps (__t2, __t3, 0x88);			\
  (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD);			\
} while (0)
107690075Sobrien
107790075Sobrien#endif /* _XMMINTRIN_H_INCLUDED */
1078