xmmintrin.h revision 90075
190075Sobrien/* Copyright (C) 2002 Free Software Foundation, Inc.
290075Sobrien
390075Sobrien   This file is part of GNU CC.
490075Sobrien
590075Sobrien   GNU CC is free software; you can redistribute it and/or modify
690075Sobrien   it under the terms of the GNU General Public License as published by
790075Sobrien   the Free Software Foundation; either version 2, or (at your option)
890075Sobrien   any later version.
990075Sobrien
1090075Sobrien   GNU CC is distributed in the hope that it will be useful,
1190075Sobrien   but WITHOUT ANY WARRANTY; without even the implied warranty of
1290075Sobrien   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1390075Sobrien   GNU General Public License for more details.
1490075Sobrien
1590075Sobrien   You should have received a copy of the GNU General Public License
1690075Sobrien   along with GNU CC; see the file COPYING.  If not, write to
1790075Sobrien   the Free Software Foundation, 59 Temple Place - Suite 330,
1890075Sobrien   Boston, MA 02111-1307, USA.  */
1990075Sobrien
2090075Sobrien/* As a special exception, if you include this header file into source
2190075Sobrien   files compiled by GCC, this header file does not by itself cause
2290075Sobrien   the resulting executable to be covered by the GNU General Public
2390075Sobrien   License.  This exception does not however invalidate any other
2490075Sobrien   reasons why the executable file might be covered by the GNU General
2590075Sobrien   Public License.  */
2690075Sobrien
2790075Sobrien/* Implemented from the specification included in the Intel C++ Compiler
2890075Sobrien   User Guide and Reference, version 5.0.  */
2990075Sobrien
3090075Sobrien#ifndef _XMMINTRIN_H_INCLUDED
3190075Sobrien#define _XMMINTRIN_H_INCLUDED
3290075Sobrien
3390075Sobrien/* We need type definitions from the MMX header file.  */
3490075Sobrien#include <mmintrin.h>
3590075Sobrien
3690075Sobrien/* The data type intended for user use.  It is declared with the V4SF
   machine mode, the same mode as the internal __v4sf type.  */
3790075Sobrientypedef int __m128 __attribute__ ((__mode__(__V4SF__)));
3890075Sobrien
3990075Sobrien/* Internal data types for implementing the intrinsics.  __v4sf uses
   the V4SF machine mode and __v4si the V4SI machine mode.  */
4090075Sobrientypedef int __v4sf __attribute__ ((__mode__(__V4SF__)));
4190075Sobrientypedef int __v4si __attribute__ ((__mode__(__V4SI__)));
4290075Sobrien
/* Build a selector for use with the SHUFPS instruction.  FP0 lands in
   bits 0-1, FP1 in bits 2-3, FP2 in bits 4-5 and FP3 in bits 6-7; each
   argument is expected to be in the range 0..3.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 ((fp0) | ((fp1) << 2) | ((fp2) << 4) | ((fp3) << 6))
4690075Sobrien
/* Hint constants for use with _mm_prefetch, listed in ascending
   numeric order.  */
enum _mm_hint
{
  _MM_HINT_NTA = 0,
  _MM_HINT_T2 = 1,
  _MM_HINT_T1 = 2,
  _MM_HINT_T0 = 3
};
5590075Sobrien
/* Bits in the MXCSR.  */

/* Exception state flags, followed by the mask covering all of them.  */
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020
#define _MM_EXCEPT_MASK       0x003f

/* Exception mask bits, followed by the mask covering all of them.  */
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000
#define _MM_MASK_MASK         0x1f80

/* Rounding mode values, followed by the mask of the rounding field.  */
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000
#define _MM_ROUND_MASK        0x6000

/* Flush-to-zero mode values, followed by the mask of that bit.  */
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000
#define _MM_FLUSH_ZERO_MASK   0x8000
8290075Sobrien
8390075Sobrien/* Perform the respective operation on the lower SPFP (single-precision
8490075Sobrien   floating-point) values of A and B; the upper three SPFP values are
8590075Sobrien   passed through from A.  */
8690075Sobrien
8790075Sobrienstatic __inline __m128
8890075Sobrien_mm_add_ss (__m128 __A, __m128 __B)
8990075Sobrien{
9090075Sobrien  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
9190075Sobrien}
9290075Sobrien
9390075Sobrienstatic __inline __m128
9490075Sobrien_mm_sub_ss (__m128 __A, __m128 __B)
9590075Sobrien{
9690075Sobrien  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
9790075Sobrien}
9890075Sobrien
9990075Sobrienstatic __inline __m128
10090075Sobrien_mm_mul_ss (__m128 __A, __m128 __B)
10190075Sobrien{
10290075Sobrien  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
10390075Sobrien}
10490075Sobrien
10590075Sobrienstatic __inline __m128
10690075Sobrien_mm_div_ss (__m128 __A, __m128 __B)
10790075Sobrien{
10890075Sobrien  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
10990075Sobrien}
11090075Sobrien
11190075Sobrienstatic __inline __m128
11290075Sobrien_mm_sqrt_ss (__m128 __A)
11390075Sobrien{
11490075Sobrien  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
11590075Sobrien}
11690075Sobrien
11790075Sobrienstatic __inline __m128
11890075Sobrien_mm_rcp_ss (__m128 __A)
11990075Sobrien{
12090075Sobrien  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
12190075Sobrien}
12290075Sobrien
12390075Sobrienstatic __inline __m128
12490075Sobrien_mm_rsqrt_ss (__m128 __A)
12590075Sobrien{
12690075Sobrien  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
12790075Sobrien}
12890075Sobrien
12990075Sobrienstatic __inline __m128
13090075Sobrien_mm_min_ss (__m128 __A, __m128 __B)
13190075Sobrien{
13290075Sobrien  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
13390075Sobrien}
13490075Sobrien
13590075Sobrienstatic __inline __m128
13690075Sobrien_mm_max_ss (__m128 __A, __m128 __B)
13790075Sobrien{
13890075Sobrien  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
13990075Sobrien}
14090075Sobrien
14190075Sobrien/* Perform the respective operation on the four SPFP values in A and B.  */
14290075Sobrien
14390075Sobrienstatic __inline __m128
14490075Sobrien_mm_add_ps (__m128 __A, __m128 __B)
14590075Sobrien{
14690075Sobrien  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
14790075Sobrien}
14890075Sobrien
14990075Sobrienstatic __inline __m128
15090075Sobrien_mm_sub_ps (__m128 __A, __m128 __B)
15190075Sobrien{
15290075Sobrien  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
15390075Sobrien}
15490075Sobrien
15590075Sobrienstatic __inline __m128
15690075Sobrien_mm_mul_ps (__m128 __A, __m128 __B)
15790075Sobrien{
15890075Sobrien  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
15990075Sobrien}
16090075Sobrien
16190075Sobrienstatic __inline __m128
16290075Sobrien_mm_div_ps (__m128 __A, __m128 __B)
16390075Sobrien{
16490075Sobrien  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
16590075Sobrien}
16690075Sobrien
16790075Sobrienstatic __inline __m128
16890075Sobrien_mm_sqrt_ps (__m128 __A)
16990075Sobrien{
17090075Sobrien  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
17190075Sobrien}
17290075Sobrien
17390075Sobrienstatic __inline __m128
17490075Sobrien_mm_rcp_ps (__m128 __A)
17590075Sobrien{
17690075Sobrien  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
17790075Sobrien}
17890075Sobrien
17990075Sobrienstatic __inline __m128
18090075Sobrien_mm_rsqrt_ps (__m128 __A)
18190075Sobrien{
18290075Sobrien  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
18390075Sobrien}
18490075Sobrien
18590075Sobrienstatic __inline __m128
18690075Sobrien_mm_min_ps (__m128 __A, __m128 __B)
18790075Sobrien{
18890075Sobrien  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
18990075Sobrien}
19090075Sobrien
19190075Sobrienstatic __inline __m128
19290075Sobrien_mm_max_ps (__m128 __A, __m128 __B)
19390075Sobrien{
19490075Sobrien  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
19590075Sobrien}
19690075Sobrien
19790075Sobrien/* Perform logical bit-wise operations on 128-bit values.  */
19890075Sobrien
19990075Sobrienstatic __inline __m128
20090075Sobrien_mm_and_ps (__m128 __A, __m128 __B)
20190075Sobrien{
20290075Sobrien  return __builtin_ia32_andps (__A, __B);
20390075Sobrien}
20490075Sobrien
20590075Sobrienstatic __inline __m128
20690075Sobrien_mm_andnot_ps (__m128 __A, __m128 __B)
20790075Sobrien{
20890075Sobrien  return __builtin_ia32_andnps (__A, __B);
20990075Sobrien}
21090075Sobrien
21190075Sobrienstatic __inline __m128
21290075Sobrien_mm_or_ps (__m128 __A, __m128 __B)
21390075Sobrien{
21490075Sobrien  return __builtin_ia32_orps (__A, __B);
21590075Sobrien}
21690075Sobrien
21790075Sobrienstatic __inline __m128
21890075Sobrien_mm_xor_ps (__m128 __A, __m128 __B)
21990075Sobrien{
22090075Sobrien  return __builtin_ia32_xorps (__A, __B);
22190075Sobrien}
22290075Sobrien
22390075Sobrien/* Perform a comparison on the lower SPFP values of A and B.  If the
22490075Sobrien   comparison is true, place a mask of all ones in the result, otherwise a
22590075Sobrien   mask of zeros.  The upper three SPFP values are passed through from A.  */
22690075Sobrien
22790075Sobrienstatic __inline __m128
22890075Sobrien_mm_cmpeq_ss (__m128 __A, __m128 __B)
22990075Sobrien{
23090075Sobrien  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
23190075Sobrien}
23290075Sobrien
23390075Sobrienstatic __inline __m128
23490075Sobrien_mm_cmplt_ss (__m128 __A, __m128 __B)
23590075Sobrien{
23690075Sobrien  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
23790075Sobrien}
23890075Sobrien
23990075Sobrienstatic __inline __m128
24090075Sobrien_mm_cmple_ss (__m128 __A, __m128 __B)
24190075Sobrien{
24290075Sobrien  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
24390075Sobrien}
24490075Sobrien
24590075Sobrienstatic __inline __m128
24690075Sobrien_mm_cmpgt_ss (__m128 __A, __m128 __B)
24790075Sobrien{
24890075Sobrien  return (__m128) __builtin_ia32_cmpgtss ((__v4sf)__A, (__v4sf)__B);
24990075Sobrien}
25090075Sobrien
25190075Sobrienstatic __inline __m128
25290075Sobrien_mm_cmpge_ss (__m128 __A, __m128 __B)
25390075Sobrien{
25490075Sobrien  return (__m128) __builtin_ia32_cmpgess ((__v4sf)__A, (__v4sf)__B);
25590075Sobrien}
25690075Sobrien
25790075Sobrienstatic __inline __m128
25890075Sobrien_mm_cmpneq_ss (__m128 __A, __m128 __B)
25990075Sobrien{
26090075Sobrien  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
26190075Sobrien}
26290075Sobrien
26390075Sobrienstatic __inline __m128
26490075Sobrien_mm_cmpnlt_ss (__m128 __A, __m128 __B)
26590075Sobrien{
26690075Sobrien  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
26790075Sobrien}
26890075Sobrien
26990075Sobrienstatic __inline __m128
27090075Sobrien_mm_cmpnle_ss (__m128 __A, __m128 __B)
27190075Sobrien{
27290075Sobrien  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
27390075Sobrien}
27490075Sobrien
27590075Sobrienstatic __inline __m128
27690075Sobrien_mm_cmpngt_ss (__m128 __A, __m128 __B)
27790075Sobrien{
27890075Sobrien  return (__m128) __builtin_ia32_cmpngtss ((__v4sf)__A, (__v4sf)__B);
27990075Sobrien}
28090075Sobrien
28190075Sobrienstatic __inline __m128
28290075Sobrien_mm_cmpnge_ss (__m128 __A, __m128 __B)
28390075Sobrien{
28490075Sobrien  return (__m128) __builtin_ia32_cmpngess ((__v4sf)__A, (__v4sf)__B);
28590075Sobrien}
28690075Sobrien
28790075Sobrienstatic __inline __m128
28890075Sobrien_mm_cmpord_ss (__m128 __A, __m128 __B)
28990075Sobrien{
29090075Sobrien  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
29190075Sobrien}
29290075Sobrien
29390075Sobrienstatic __inline __m128
29490075Sobrien_mm_cmpunord_ss (__m128 __A, __m128 __B)
29590075Sobrien{
29690075Sobrien  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
29790075Sobrien}
29890075Sobrien
29990075Sobrien/* Perform a comparison on the four SPFP values of A and B.  For each
30090075Sobrien   element, if the comparison is true, place a mask of all ones in the
30190075Sobrien   result, otherwise a mask of zeros.  */
30290075Sobrien
30390075Sobrienstatic __inline __m128
30490075Sobrien_mm_cmpeq_ps (__m128 __A, __m128 __B)
30590075Sobrien{
30690075Sobrien  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
30790075Sobrien}
30890075Sobrien
30990075Sobrienstatic __inline __m128
31090075Sobrien_mm_cmplt_ps (__m128 __A, __m128 __B)
31190075Sobrien{
31290075Sobrien  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
31390075Sobrien}
31490075Sobrien
31590075Sobrienstatic __inline __m128
31690075Sobrien_mm_cmple_ps (__m128 __A, __m128 __B)
31790075Sobrien{
31890075Sobrien  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
31990075Sobrien}
32090075Sobrien
32190075Sobrienstatic __inline __m128
32290075Sobrien_mm_cmpgt_ps (__m128 __A, __m128 __B)
32390075Sobrien{
32490075Sobrien  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
32590075Sobrien}
32690075Sobrien
32790075Sobrienstatic __inline __m128
32890075Sobrien_mm_cmpge_ps (__m128 __A, __m128 __B)
32990075Sobrien{
33090075Sobrien  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
33190075Sobrien}
33290075Sobrien
33390075Sobrienstatic __inline __m128
33490075Sobrien_mm_cmpneq_ps (__m128 __A, __m128 __B)
33590075Sobrien{
33690075Sobrien  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
33790075Sobrien}
33890075Sobrien
33990075Sobrienstatic __inline __m128
34090075Sobrien_mm_cmpnlt_ps (__m128 __A, __m128 __B)
34190075Sobrien{
34290075Sobrien  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
34390075Sobrien}
34490075Sobrien
34590075Sobrienstatic __inline __m128
34690075Sobrien_mm_cmpnle_ps (__m128 __A, __m128 __B)
34790075Sobrien{
34890075Sobrien  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
34990075Sobrien}
35090075Sobrien
35190075Sobrienstatic __inline __m128
35290075Sobrien_mm_cmpngt_ps (__m128 __A, __m128 __B)
35390075Sobrien{
35490075Sobrien  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
35590075Sobrien}
35690075Sobrien
35790075Sobrienstatic __inline __m128
35890075Sobrien_mm_cmpnge_ps (__m128 __A, __m128 __B)
35990075Sobrien{
36090075Sobrien  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
36190075Sobrien}
36290075Sobrien
36390075Sobrienstatic __inline __m128
36490075Sobrien_mm_cmpord_ps (__m128 __A, __m128 __B)
36590075Sobrien{
36690075Sobrien  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
36790075Sobrien}
36890075Sobrien
36990075Sobrienstatic __inline __m128
37090075Sobrien_mm_cmpunord_ps (__m128 __A, __m128 __B)
37190075Sobrien{
37290075Sobrien  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
37390075Sobrien}
37490075Sobrien
37590075Sobrien/* Compare the lower SPFP values of A and B and return 1 if true
37690075Sobrien   and 0 if false.  */
37790075Sobrien
37890075Sobrienstatic __inline int
37990075Sobrien_mm_comieq_ss (__m128 __A, __m128 __B)
38090075Sobrien{
38190075Sobrien  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
38290075Sobrien}
38390075Sobrien
38490075Sobrienstatic __inline int
38590075Sobrien_mm_comilt_ss (__m128 __A, __m128 __B)
38690075Sobrien{
38790075Sobrien  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
38890075Sobrien}
38990075Sobrien
39090075Sobrienstatic __inline int
39190075Sobrien_mm_comile_ss (__m128 __A, __m128 __B)
39290075Sobrien{
39390075Sobrien  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
39490075Sobrien}
39590075Sobrien
39690075Sobrienstatic __inline int
39790075Sobrien_mm_comigt_ss (__m128 __A, __m128 __B)
39890075Sobrien{
39990075Sobrien  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
40090075Sobrien}
40190075Sobrien
40290075Sobrienstatic __inline int
40390075Sobrien_mm_comige_ss (__m128 __A, __m128 __B)
40490075Sobrien{
40590075Sobrien  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
40690075Sobrien}
40790075Sobrien
40890075Sobrienstatic __inline int
40990075Sobrien_mm_comineq_ss (__m128 __A, __m128 __B)
41090075Sobrien{
41190075Sobrien  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
41290075Sobrien}
41390075Sobrien
41490075Sobrienstatic __inline int
41590075Sobrien_mm_ucomieq_ss (__m128 __A, __m128 __B)
41690075Sobrien{
41790075Sobrien  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
41890075Sobrien}
41990075Sobrien
42090075Sobrienstatic __inline int
42190075Sobrien_mm_ucomilt_ss (__m128 __A, __m128 __B)
42290075Sobrien{
42390075Sobrien  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
42490075Sobrien}
42590075Sobrien
42690075Sobrienstatic __inline int
42790075Sobrien_mm_ucomile_ss (__m128 __A, __m128 __B)
42890075Sobrien{
42990075Sobrien  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
43090075Sobrien}
43190075Sobrien
43290075Sobrienstatic __inline int
43390075Sobrien_mm_ucomigt_ss (__m128 __A, __m128 __B)
43490075Sobrien{
43590075Sobrien  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
43690075Sobrien}
43790075Sobrien
43890075Sobrienstatic __inline int
43990075Sobrien_mm_ucomige_ss (__m128 __A, __m128 __B)
44090075Sobrien{
44190075Sobrien  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
44290075Sobrien}
44390075Sobrien
44490075Sobrienstatic __inline int
44590075Sobrien_mm_ucomineq_ss (__m128 __A, __m128 __B)
44690075Sobrien{
44790075Sobrien  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
44890075Sobrien}
44990075Sobrien
45090075Sobrien/* Convert the lower SPFP value to a 32-bit integer according to the current
45190075Sobrien   rounding mode.  */
45290075Sobrienstatic __inline int
45390075Sobrien_mm_cvtss_si32 (__m128 __A)
45490075Sobrien{
45590075Sobrien  return __builtin_ia32_cvtss2si ((__v4sf) __A);
45690075Sobrien}
45790075Sobrien
45890075Sobrien/* Convert the two lower SPFP values to 32-bit integers according to the
45990075Sobrien   current rounding mode.  Return the integers in packed form.  */
46090075Sobrienstatic __inline __m64
46190075Sobrien_mm_cvtps_pi32 (__m128 __A)
46290075Sobrien{
46390075Sobrien  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
46490075Sobrien}
46590075Sobrien
46690075Sobrien/* Truncate the lower SPFP value to a 32-bit integer.  */
46790075Sobrienstatic __inline int
46890075Sobrien_mm_cvttss_si32 (__m128 __A)
46990075Sobrien{
47090075Sobrien  return __builtin_ia32_cvttss2si ((__v4sf) __A);
47190075Sobrien}
47290075Sobrien
47390075Sobrien/* Truncate the two lower SPFP values to 32-bit integers.  Return the
47490075Sobrien   integers in packed form.  */
47590075Sobrienstatic __inline __m64
47690075Sobrien_mm_cvttps_pi32 (__m128 __A)
47790075Sobrien{
47890075Sobrien  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
47990075Sobrien}
48090075Sobrien
48190075Sobrien/* Convert B to a SPFP value and insert it as element zero in A.  */
48290075Sobrienstatic __inline __m128
48390075Sobrien_mm_cvtsi32_ss (__m128 __A, int __B)
48490075Sobrien{
48590075Sobrien  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
48690075Sobrien}
48790075Sobrien
48890075Sobrien/* Convert the two 32-bit values in B to SPFP form and insert them
48990075Sobrien   as the two lower elements in A.  */
49090075Sobrienstatic __inline __m128
49190075Sobrien_mm_cvtpi32_ps (__m128 __A, __m64 __B)
49290075Sobrien{
49390075Sobrien  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
49490075Sobrien}
49590075Sobrien
49690075Sobrien/* Convert the four signed 16-bit values in A to SPFP form.  */
49790075Sobrienstatic __inline __m128
49890075Sobrien_mm_cvtpi16_ps (__m64 __A)
49990075Sobrien{
50090075Sobrien  __v4hi __sign;
50190075Sobrien  __v2si __hisi, __losi;
50290075Sobrien  __v4sf __r;
50390075Sobrien
50490075Sobrien  /* This comparison against zero gives us a mask that can be used to
50590075Sobrien     fill in the missing sign bits in the unpack operations below, so
50690075Sobrien     that we get signed values after unpacking.  */
50790075Sobrien  __sign = (__v4hi) __builtin_ia32_mmx_zero ();
50890075Sobrien  __sign = __builtin_ia32_pcmpgtw (__sign, (__v4hi)__A);
50990075Sobrien
51090075Sobrien  /* Convert the four words to doublewords.  */
51190075Sobrien  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
51290075Sobrien  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
51390075Sobrien
51490075Sobrien  /* Convert the doublewords to floating point two at a time.  */
51590075Sobrien  __r = (__v4sf) __builtin_ia32_setzerops ();
51690075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
51790075Sobrien  __r = __builtin_ia32_movlhps (__r, __r);
51890075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __losi);
51990075Sobrien
52090075Sobrien  return (__m128) __r;
52190075Sobrien}
52290075Sobrien
52390075Sobrien/* Convert the four unsigned 16-bit values in A to SPFP form.  */
52490075Sobrienstatic __inline __m128
52590075Sobrien_mm_cvtpu16_ps (__m64 __A)
52690075Sobrien{
52790075Sobrien  __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero ();
52890075Sobrien  __v2si __hisi, __losi;
52990075Sobrien  __v4sf __r;
53090075Sobrien
53190075Sobrien  /* Convert the four words to doublewords.  */
53290075Sobrien  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __zero);
53390075Sobrien  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __zero);
53490075Sobrien
53590075Sobrien  /* Convert the doublewords to floating point two at a time.  */
53690075Sobrien  __r = (__v4sf) __builtin_ia32_setzerops ();
53790075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
53890075Sobrien  __r = __builtin_ia32_movlhps (__r, __r);
53990075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __losi);
54090075Sobrien
54190075Sobrien  return (__m128) __r;
54290075Sobrien}
54390075Sobrien
54490075Sobrien/* Convert the low four signed 8-bit values in A to SPFP form.  */
54590075Sobrienstatic __inline __m128
54690075Sobrien_mm_cvtpi8_ps (__m64 __A)
54790075Sobrien{
54890075Sobrien  __v8qi __sign;
54990075Sobrien
55090075Sobrien  /* This comparison against zero gives us a mask that can be used to
55190075Sobrien     fill in the missing sign bits in the unpack operations below, so
55290075Sobrien     that we get signed values after unpacking.  */
55390075Sobrien  __sign = (__v8qi) __builtin_ia32_mmx_zero ();
55490075Sobrien  __sign = __builtin_ia32_pcmpgtb (__sign, (__v8qi)__A);
55590075Sobrien
55690075Sobrien  /* Convert the four low bytes to words.  */
55790075Sobrien  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);
55890075Sobrien
55990075Sobrien  return _mm_cvtpi16_ps(__A);
56090075Sobrien}
56190075Sobrien
56290075Sobrien/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
56390075Sobrienstatic __inline __m128
56490075Sobrien_mm_cvtpu8_ps(__m64 __A)
56590075Sobrien{
56690075Sobrien  __v8qi __zero = (__v8qi) __builtin_ia32_mmx_zero ();
56790075Sobrien  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __zero);
56890075Sobrien  return _mm_cvtpu16_ps(__A);
56990075Sobrien}
57090075Sobrien
57190075Sobrien/* Convert the four signed 32-bit values in A and B to SPFP form.  */
57290075Sobrienstatic __inline __m128
57390075Sobrien_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
57490075Sobrien{
57590075Sobrien  __v4sf __zero = (__v4sf) __builtin_ia32_setzerops ();
57690075Sobrien  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
57790075Sobrien  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B);
57890075Sobrien  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
57990075Sobrien}
58090075Sobrien
58190075Sobrien/* Convert the four SPFP values in A to four signed 16-bit integers.  */
58290075Sobrienstatic __inline __m64
58390075Sobrien_mm_cvtps_pi16(__m128 __A)
58490075Sobrien{
58590075Sobrien  __v4sf __hisf = (__v4sf)__A;
58690075Sobrien  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
58790075Sobrien  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
58890075Sobrien  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
58990075Sobrien  return (__m64) __builtin_ia32_packssdw (__losi, __hisi);
59090075Sobrien}
59190075Sobrien
59290075Sobrien/* Convert the four SPFP values in A to four signed 8-bit integers.  */
59390075Sobrienstatic __inline __m64
59490075Sobrien_mm_cvtps_pi8(__m128 __A)
59590075Sobrien{
59690075Sobrien  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
59790075Sobrien  __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero ();
59890075Sobrien  return (__m64) __builtin_ia32_packsswb (__tmp, __zero);
59990075Sobrien}
60090075Sobrien
/* Selects four specific SPFP values from A and B based on MASK (see
   _MM_SHUFFLE for building MASK).  The inline-function form is kept
   disabled; the macro form below is the one in effect.  */
#if 0
static __inline __m128
_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK) \
 ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK)))
#endif
61290075Sobrien
61390075Sobrien
61490075Sobrien/* Selects and interleaves the upper two SPFP values from A and B.  */
61590075Sobrienstatic __inline __m128
61690075Sobrien_mm_unpackhi_ps (__m128 __A, __m128 __B)
61790075Sobrien{
61890075Sobrien  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
61990075Sobrien}
62090075Sobrien
62190075Sobrien/* Selects and interleaves the lower two SPFP values from A and B.  */
62290075Sobrienstatic __inline __m128
62390075Sobrien_mm_unpacklo_ps (__m128 __A, __m128 __B)
62490075Sobrien{
62590075Sobrien  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
62690075Sobrien}
62790075Sobrien
62890075Sobrien/* Sets the upper two SPFP values with 64-bits of data loaded from P;
62990075Sobrien   the lower two values are passed through from A.  */
63090075Sobrienstatic __inline __m128
63190075Sobrien_mm_loadh_pi (__m128 __A, __m64 *__P)
63290075Sobrien{
63390075Sobrien  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P);
63490075Sobrien}
63590075Sobrien
63690075Sobrien/* Stores the upper two SPFP values of A into P.  */
63790075Sobrienstatic __inline void
63890075Sobrien_mm_storeh_pi (__m64 *__P, __m128 __A)
63990075Sobrien{
64090075Sobrien  __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A);
64190075Sobrien}
64290075Sobrien
64390075Sobrien/* Moves the upper two values of B into the lower two values of A.  */
64490075Sobrienstatic __inline __m128
64590075Sobrien_mm_movehl_ps (__m128 __A, __m128 __B)
64690075Sobrien{
64790075Sobrien  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
64890075Sobrien}
64990075Sobrien
65090075Sobrien/* Moves the lower two values of B into the upper two values of A.  */
65190075Sobrienstatic __inline __m128
65290075Sobrien_mm_movelh_ps (__m128 __A, __m128 __B)
65390075Sobrien{
65490075Sobrien  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
65590075Sobrien}
65690075Sobrien
65790075Sobrien/* Sets the lower two SPFP values with 64-bits of data loaded from P;
65890075Sobrien   the upper two values are passed through from A.  */
65990075Sobrienstatic __inline __m128
66090075Sobrien_mm_loadl_pi (__m128 __A, __m64 *__P)
66190075Sobrien{
66290075Sobrien  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P);
66390075Sobrien}
66490075Sobrien
66590075Sobrien/* Stores the lower two SPFP values of A into P.  */
66690075Sobrienstatic __inline void
66790075Sobrien_mm_storel_pi (__m64 *__P, __m128 __A)
66890075Sobrien{
66990075Sobrien  __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A);
67090075Sobrien}
67190075Sobrien
67290075Sobrien/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
67390075Sobrienstatic __inline int
67490075Sobrien_mm_movemask_ps (__m128 __A)
67590075Sobrien{
67690075Sobrien  return __builtin_ia32_movmskps ((__v4sf)__A);
67790075Sobrien}
67890075Sobrien
/* Read and return the contents of the control register (MXCSR).  */
static __inline unsigned int
_mm_getcsr (void)
{
  unsigned int __csr = __builtin_ia32_stmxcsr ();

  return __csr;
}
68590075Sobrien
68690075Sobrien/* Read exception bits from the control register.  */
68790075Sobrienstatic __inline unsigned int
68890075Sobrien_MM_GET_EXCEPTION_STATE (void)
68990075Sobrien{
69090075Sobrien  return _mm_getcsr() & _MM_EXCEPT_MASK;
69190075Sobrien}
69290075Sobrien
69390075Sobrienstatic __inline unsigned int
69490075Sobrien_MM_GET_EXCEPTION_MASK (void)
69590075Sobrien{
69690075Sobrien  return _mm_getcsr() & _MM_MASK_MASK;
69790075Sobrien}
69890075Sobrien
69990075Sobrienstatic __inline unsigned int
70090075Sobrien_MM_GET_ROUNDING_MODE (void)
70190075Sobrien{
70290075Sobrien  return _mm_getcsr() & _MM_ROUND_MASK;
70390075Sobrien}
70490075Sobrien
70590075Sobrienstatic __inline unsigned int
70690075Sobrien_MM_GET_FLUSH_ZERO_MODE (void)
70790075Sobrien{
70890075Sobrien  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
70990075Sobrien}
71090075Sobrien
/* Load the value I into the control register (MXCSR).  */
static __inline void
_mm_setcsr (unsigned int __I)
{
  unsigned int __value = __I;

  __builtin_ia32_ldmxcsr (__value);
}
71790075Sobrien
71890075Sobrien/* Set exception bits in the control register.  */
71990075Sobrienstatic __inline void
72090075Sobrien_MM_SET_EXCEPTION_STATE(unsigned int __mask)
72190075Sobrien{
72290075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
72390075Sobrien}
72490075Sobrien
72590075Sobrienstatic __inline void
72690075Sobrien_MM_SET_EXCEPTION_MASK (unsigned int __mask)
72790075Sobrien{
72890075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
72990075Sobrien}
73090075Sobrien
73190075Sobrienstatic __inline void
73290075Sobrien_MM_SET_ROUNDING_MODE (unsigned int __mode)
73390075Sobrien{
73490075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
73590075Sobrien}
73690075Sobrien
73790075Sobrienstatic __inline void
73890075Sobrien_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
73990075Sobrien{
74090075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
74190075Sobrien}
74290075Sobrien
74390075Sobrien/* Create a vector with element 0 as *P and the rest zero.  */
74490075Sobrienstatic __inline __m128
74590075Sobrien_mm_load_ss (float *__P)
74690075Sobrien{
74790075Sobrien  return (__m128) __builtin_ia32_loadss (__P);
74890075Sobrien}
74990075Sobrien
75090075Sobrien/* Create a vector with all four elements equal to *P.  */
75190075Sobrienstatic __inline __m128
75290075Sobrien_mm_load1_ps (float *__P)
75390075Sobrien{
75490075Sobrien  __v4sf __tmp = __builtin_ia32_loadss (__P);
75590075Sobrien  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0));
75690075Sobrien}
75790075Sobrien
75890075Sobrienstatic __inline __m128
75990075Sobrien_mm_load_ps1 (float *__P)
76090075Sobrien{
76190075Sobrien  return _mm_load1_ps (__P);
76290075Sobrien}
76390075Sobrien
76490075Sobrien/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
76590075Sobrienstatic __inline __m128
76690075Sobrien_mm_load_ps (float *__P)
76790075Sobrien{
76890075Sobrien  return (__m128) __builtin_ia32_loadaps (__P);
76990075Sobrien}
77090075Sobrien
77190075Sobrien/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
77290075Sobrienstatic __inline __m128
77390075Sobrien_mm_loadu_ps (float *__P)
77490075Sobrien{
77590075Sobrien  return (__m128) __builtin_ia32_loadups (__P);
77690075Sobrien}
77790075Sobrien
77890075Sobrien/* Load four SPFP values in reverse order.  The address must be aligned.  */
77990075Sobrienstatic __inline __m128
78090075Sobrien_mm_loadr_ps (float *__P)
78190075Sobrien{
78290075Sobrien  __v4sf __tmp = __builtin_ia32_loadaps (__P);
78390075Sobrien  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
78490075Sobrien}
78590075Sobrien
78690075Sobrien/* Create a vector with element 0 as F and the rest zero.  */
78790075Sobrienstatic __inline __m128
78890075Sobrien_mm_set_ss (float __F)
78990075Sobrien{
79090075Sobrien  return (__m128) __builtin_ia32_loadss (&__F);
79190075Sobrien}
79290075Sobrien
79390075Sobrien/* Create a vector with all four elements equal to F.  */
79490075Sobrienstatic __inline __m128
79590075Sobrien_mm_set1_ps (float __F)
79690075Sobrien{
79790075Sobrien  __v4sf __tmp = __builtin_ia32_loadss (&__F);
79890075Sobrien  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0));
79990075Sobrien}
80090075Sobrien
80190075Sobrienstatic __inline __m128
80290075Sobrien_mm_set_ps1 (float __F)
80390075Sobrien{
80490075Sobrien  return _mm_set1_ps (__F);
80590075Sobrien}
80690075Sobrien
80790075Sobrien/* Create the vector [Z Y X W].  */
80890075Sobrienstatic __inline __m128
80990075Sobrien_mm_set_ps (float __Z, float __Y, float __X, float __W)
81090075Sobrien{
81190075Sobrien  union {
81290075Sobrien    float __a[4];
81390075Sobrien    __m128 __v;
81490075Sobrien  } __u;
81590075Sobrien
81690075Sobrien  __u.__a[0] = __W;
81790075Sobrien  __u.__a[1] = __X;
81890075Sobrien  __u.__a[2] = __Y;
81990075Sobrien  __u.__a[3] = __Z;
82090075Sobrien
82190075Sobrien  return __u.__v;
82290075Sobrien}
82390075Sobrien
82490075Sobrien/* Create the vector [W X Y Z].  */
82590075Sobrienstatic __inline __m128
82690075Sobrien_mm_setr_ps (float __Z, float __Y, float __X, float __W)
82790075Sobrien{
82890075Sobrien  return _mm_set_ps (__W, __X, __Y, __Z);
82990075Sobrien}
83090075Sobrien
83190075Sobrien/* Create a vector of zeros.  */
83290075Sobrienstatic __inline __m128
83390075Sobrien_mm_setzero_ps (void)
83490075Sobrien{
83590075Sobrien  return (__m128) __builtin_ia32_setzerops ();
83690075Sobrien}
83790075Sobrien
83890075Sobrien/* Stores the lower SPFP value.  */
83990075Sobrienstatic __inline void
84090075Sobrien_mm_store_ss (float *__P, __m128 __A)
84190075Sobrien{
84290075Sobrien  __builtin_ia32_storess (__P, (__v4sf)__A);
84390075Sobrien}
84490075Sobrien
84590075Sobrien/* Store the lower SPFP value across four words.  */
84690075Sobrienstatic __inline void
84790075Sobrien_mm_store1_ps (float *__P, __m128 __A)
84890075Sobrien{
84990075Sobrien  __v4sf __va = (__v4sf)__A;
85090075Sobrien  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
85190075Sobrien  __builtin_ia32_storeaps (__P, __tmp);
85290075Sobrien}
85390075Sobrien
85490075Sobrienstatic __inline void
85590075Sobrien_mm_store_ps1 (float *__P, __m128 __A)
85690075Sobrien{
85790075Sobrien  _mm_store1_ps (__P, __A);
85890075Sobrien}
85990075Sobrien
86090075Sobrien/* Store four SPFP values.  The address must be 16-byte aligned.  */
86190075Sobrienstatic __inline void
86290075Sobrien_mm_store_ps (float *__P, __m128 __A)
86390075Sobrien{
86490075Sobrien  __builtin_ia32_storeaps (__P, (__v4sf)__A);
86590075Sobrien}
86690075Sobrien
86790075Sobrien/* Store four SPFP values.  The address need not be 16-byte aligned.  */
86890075Sobrienstatic __inline void
86990075Sobrien_mm_storeu_ps (float *__P, __m128 __A)
87090075Sobrien{
87190075Sobrien  __builtin_ia32_storeups (__P, (__v4sf)__A);
87290075Sobrien}
87390075Sobrien
87490075Sobrien/* Store four SPFP values in reverse order.  The addres must be aligned.  */
87590075Sobrienstatic __inline void
87690075Sobrien_mm_storer_ps (float *__P, __m128 __A)
87790075Sobrien{
87890075Sobrien  __v4sf __va = (__v4sf)__A;
87990075Sobrien  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
88090075Sobrien  __builtin_ia32_storeaps (__P, __tmp);
88190075Sobrien}
88290075Sobrien
88390075Sobrien/* Sets the low SPFP value of A from the low value of B.  */
88490075Sobrienstatic __inline __m128
88590075Sobrien_mm_move_ss (__m128 __A, __m128 __B)
88690075Sobrien{
88790075Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
88890075Sobrien}
88990075Sobrien
/* Extract one of the four 16-bit words of A.  The selector N must be an
   immediate.  Only the macro form is compiled; the inline-function form
   is kept under "#if 0" as a record of the intended prototype.  */
#if 0
static __inline int
_mm_extract_pi16 (__m64 __A, int __N)
{
  return __builtin_ia32_pextrw ((__v4hi)__A, __N);
}
#else
#define _mm_extract_pi16(A, N) \
  (__builtin_ia32_pextrw ((__v4hi)(A), (N)))
#endif
90190075Sobrien
/* Return A with one of its four 16-bit words replaced by D.  The selector
   N, which chooses the word, must be an immediate.  Only the macro form
   is compiled; the inline-function form is kept under "#if 0" as a record
   of the intended prototype.  */
#if 0
static __inline __m64
_mm_insert_pi16 (__m64 __A, int __D, int __N)
{
  return (__m64)__builtin_ia32_pinsrw ((__v4hi)__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N)				\
  ((__m64) __builtin_ia32_pinsrw ((__v4hi)(A), (D), (N)))
#endif
91490075Sobrien
91590075Sobrien/* Compute the element-wise maximum of signed 16-bit values.  */
91690075Sobrienstatic __inline __m64
91790075Sobrien_mm_max_pi16 (__m64 __A, __m64 __B)
91890075Sobrien{
91990075Sobrien  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
92090075Sobrien}
92190075Sobrien
92290075Sobrien/* Compute the element-wise maximum of unsigned 8-bit values.  */
92390075Sobrienstatic __inline __m64
92490075Sobrien_mm_max_pu8 (__m64 __A, __m64 __B)
92590075Sobrien{
92690075Sobrien  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
92790075Sobrien}
92890075Sobrien
92990075Sobrien/* Compute the element-wise minimum of signed 16-bit values.  */
93090075Sobrienstatic __inline __m64
93190075Sobrien_mm_min_pi16 (__m64 __A, __m64 __B)
93290075Sobrien{
93390075Sobrien  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
93490075Sobrien}
93590075Sobrien
93690075Sobrien/* Compute the element-wise minimum of unsigned 8-bit values.  */
93790075Sobrienstatic __inline __m64
93890075Sobrien_mm_min_pu8 (__m64 __A, __m64 __B)
93990075Sobrien{
94090075Sobrien  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
94190075Sobrien}
94290075Sobrien
94390075Sobrien/* Create an 8-bit mask of the signs of 8-bit values.  */
94490075Sobrienstatic __inline int
94590075Sobrien_mm_movemask_pi8 (__m64 __A)
94690075Sobrien{
94790075Sobrien  return __builtin_ia32_pmovmskb ((__v8qi)__A);
94890075Sobrien}
94990075Sobrien
95090075Sobrien/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
95190075Sobrien   in B and produce the high 16 bits of the 32-bit results.  */
95290075Sobrienstatic __inline __m64
95390075Sobrien_mm_mulhi_pu16 (__m64 __A, __m64 __B)
95490075Sobrien{
95590075Sobrien  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
95690075Sobrien}
95790075Sobrien
/* Return a combination of the four 16-bit values in A, chosen by the
   selector N.  The selector must be an immediate.  Only the macro form is
   compiled; the inline-function form is kept under "#if 0" as a record of
   the intended prototype.  */
#if 0
static __inline __m64
_mm_shuffle_pi16 (__m64 __A, int __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N)					\
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N)))
#endif
97090075Sobrien
97190075Sobrien/* Conditionally store byte elements of A into P.  The high bit of each
97290075Sobrien   byte in the selector N determines whether the corresponding byte from
97390075Sobrien   A is stored.  */
97490075Sobrienstatic __inline void
97590075Sobrien_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
97690075Sobrien{
97790075Sobrien  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
97890075Sobrien}
97990075Sobrien
98090075Sobrien/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
98190075Sobrienstatic __inline __m64
98290075Sobrien_mm_avg_pu8 (__m64 __A, __m64 __B)
98390075Sobrien{
98490075Sobrien  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
98590075Sobrien}
98690075Sobrien
98790075Sobrien/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
98890075Sobrienstatic __inline __m64
98990075Sobrien_mm_avg_pu16 (__m64 __A, __m64 __B)
99090075Sobrien{
99190075Sobrien  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
99290075Sobrien}
99390075Sobrien
99490075Sobrien/* Compute the sum of the absolute differences of the unsigned 8-bit
99590075Sobrien   values in A and B.  Return the value in the lower 16-bit word; the
99690075Sobrien   upper words are cleared.  */
99790075Sobrienstatic __inline __m64
99890075Sobrien_mm_sad_pu8 (__m64 __A, __m64 __B)
99990075Sobrien{
100090075Sobrien  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
100190075Sobrien}
100290075Sobrien
/* Load one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation and
   is passed through as the third argument of __builtin_prefetch; the
   second argument is the constant 0.  Only the macro form is compiled;
   the inline-function form is kept under "#if 0" as a record of the
   intended prototype.  */
#if 0
static __inline void
_mm_prefetch (void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I)					\
  (__builtin_prefetch ((P), 0, (I)))
#endif
101590075Sobrien
101690075Sobrien/* Stores the data in A to the address P without polluting the caches.  */
101790075Sobrienstatic __inline void
101890075Sobrien_mm_stream_pi (__m64 *__P, __m64 __A)
101990075Sobrien{
102090075Sobrien  __builtin_ia32_movntq (__P, __A);
102190075Sobrien}
102290075Sobrien
102390075Sobrien/* Likewise.  The address must be 16-byte aligned.  */
102490075Sobrienstatic __inline void
102590075Sobrien_mm_stream_ps (float *__P, __m128 __A)
102690075Sobrien{
102790075Sobrien  __builtin_ia32_movntps (__P, (__v4sf)__A);
102890075Sobrien}
102990075Sobrien
/* Store fence: guarantees that every preceding store is globally visible
   before any subsequent store.  */
static __inline void
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}
103790075Sobrien
/* Delay the execution of the next instruction by an
   implementation-specific amount of time.  The instruction does not
   modify the architectural state.  It is emitted as "rep; nop" through
   inline assembly with no operands.  */
static __inline void
_mm_pause (void)
{
  __asm__ __volatile__ ("rep; nop"
			: /* no outputs */
			: /* no inputs */);
}
104690075Sobrien
/* Transpose, in place, the 4x4 matrix whose rows are row0..row3.  Each
   argument must be a modifiable lvalue of vector type; it is read once
   and then assigned once.

   Stage 1 gathers the low and the high halves of each pair of rows:
     __t0 = [r0[0] r0[1] r1[0] r1[1]]    __t1 = [r0[2] r0[3] r1[2] r1[3]]
     __t2 = [r2[0] r2[1] r3[0] r3[1]]    __t3 = [r2[2] r2[3] r3[2] r3[3]]
   Stage 2 must combine a temporary built from rows 0/1 with the matching
   temporary built from rows 2/3 (__t0 with __t2, __t1 with __t3) so that
   every output row receives one element from each input row.  Selector
   0x88 picks elements 0 and 2 of each operand, 0xDD picks elements 1
   and 3.  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44);		\
  __v4sf __t1 = __builtin_ia32_shufps (__r0, __r1, 0xEE);		\
  __v4sf __t2 = __builtin_ia32_shufps (__r2, __r3, 0x44);		\
  __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE);		\
  (row0) = __builtin_ia32_shufps (__t0, __t2, 0x88);			\
  (row1) = __builtin_ia32_shufps (__t0, __t2, 0xDD);			\
  (row2) = __builtin_ia32_shufps (__t1, __t3, 0x88);			\
  (row3) = __builtin_ia32_shufps (__t1, __t3, 0xDD);			\
} while (0)
106090075Sobrien
106190075Sobrien#endif /* _XMMINTRIN_H_INCLUDED */
1062