1169689Skan/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007
2169689Skan   Free Software Foundation, Inc.
390075Sobrien
4132718Skan   This file is part of GCC.
590075Sobrien
6132718Skan   GCC is free software; you can redistribute it and/or modify
790075Sobrien   it under the terms of the GNU General Public License as published by
890075Sobrien   the Free Software Foundation; either version 2, or (at your option)
990075Sobrien   any later version.
1090075Sobrien
11132718Skan   GCC is distributed in the hope that it will be useful,
1290075Sobrien   but WITHOUT ANY WARRANTY; without even the implied warranty of
1390075Sobrien   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1490075Sobrien   GNU General Public License for more details.
1590075Sobrien
1690075Sobrien   You should have received a copy of the GNU General Public License
17132718Skan   along with GCC; see the file COPYING.  If not, write to
18169689Skan   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
19169689Skan   Boston, MA 02110-1301, USA.  */
2090075Sobrien
2190075Sobrien/* As a special exception, if you include this header file into source
2290075Sobrien   files compiled by GCC, this header file does not by itself cause
2390075Sobrien   the resulting executable to be covered by the GNU General Public
2490075Sobrien   License.  This exception does not however invalidate any other
2590075Sobrien   reasons why the executable file might be covered by the GNU General
2690075Sobrien   Public License.  */
2790075Sobrien
2890075Sobrien/* Implemented from the specification included in the Intel C++ Compiler
29169689Skan   User Guide and Reference, version 9.0.  */
3090075Sobrien
3190075Sobrien#ifndef _XMMINTRIN_H_INCLUDED
3290075Sobrien#define _XMMINTRIN_H_INCLUDED
3390075Sobrien
34117395Skan#ifndef __SSE__
35117395Skan# error "SSE instruction set not enabled"
36117395Skan#else
37117395Skan
3890075Sobrien/* We need type definitions from the MMX header file.  */
3990075Sobrien#include <mmintrin.h>
4090075Sobrien
41169689Skan/* Get _mm_malloc () and _mm_free ().  */
42242182Skan#if __STDC_HOSTED__
43169689Skan#include <mm_malloc.h>
44242182Skan#endif
4590075Sobrien
/* Public 128-bit vector of four floats.  The Intel API is flexible
   enough that values of this type must be allowed to alias other vector
   types and their scalar components, hence __may_alias__.  */
typedef float __m128
  __attribute__ ((__vector_size__ (16), __may_alias__));

/* Internal four-float vector type used when calling the builtins that
   implement the intrinsics.  */
typedef float __v4sf
  __attribute__ ((__vector_size__ (16)));
5290075Sobrien
/* Build a selector for use with the SHUFPS instruction.  Each argument
   occupies a two-bit field: fp0 in bits 0-1, fp1 in bits 2-3, fp2 in
   bits 4-5 and fp3 in bits 6-7.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 ((fp0) | ((fp1) << 2) | ((fp2) << 4) | ((fp3) << 6))
5690075Sobrien
/* Constants for use with _mm_prefetch, listed in increasing numeric
   order.  */
enum _mm_hint
{
  _MM_HINT_NTA = 0,
  _MM_HINT_T2 = 1,
  _MM_HINT_T1 = 2,
  _MM_HINT_T0 = 3
};
6590075Sobrien
/* Bits in the MXCSR.  */

/* Exception bits; _MM_EXCEPT_MASK is the OR of all six.  */
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020
#define _MM_EXCEPT_MASK       0x003f

/* Mask bits; each one is the matching _MM_EXCEPT_* bit shifted left by
   seven, and _MM_MASK_MASK is the OR of all six.  */
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000
#define _MM_MASK_MASK         0x1f80

/* Two-bit rounding field (_MM_ROUND_MASK) and its four encodings.  */
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000
#define _MM_ROUND_MASK        0x6000

/* Single-bit flush-to-zero field and its two settings.  */
#define _MM_FLUSH_ZERO_OFF    0x0000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_MASK   0x8000
9290075Sobrien
93169689Skan/* Create a vector of zeros.  */
94169689Skanstatic __inline __m128 __attribute__((__always_inline__))
95169689Skan_mm_setzero_ps (void)
96169689Skan{
97169689Skan  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
98169689Skan}
99169689Skan
10090075Sobrien/* Perform the respective operation on the lower SPFP (single-precision
10190075Sobrien   floating-point) values of A and B; the upper three SPFP values are
10290075Sobrien   passed through from A.  */
10390075Sobrien
104169689Skanstatic __inline __m128 __attribute__((__always_inline__))
10590075Sobrien_mm_add_ss (__m128 __A, __m128 __B)
10690075Sobrien{
10790075Sobrien  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
10890075Sobrien}
10990075Sobrien
110169689Skanstatic __inline __m128 __attribute__((__always_inline__))
11190075Sobrien_mm_sub_ss (__m128 __A, __m128 __B)
11290075Sobrien{
11390075Sobrien  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
11490075Sobrien}
11590075Sobrien
116169689Skanstatic __inline __m128 __attribute__((__always_inline__))
11790075Sobrien_mm_mul_ss (__m128 __A, __m128 __B)
11890075Sobrien{
11990075Sobrien  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
12090075Sobrien}
12190075Sobrien
122169689Skanstatic __inline __m128 __attribute__((__always_inline__))
12390075Sobrien_mm_div_ss (__m128 __A, __m128 __B)
12490075Sobrien{
12590075Sobrien  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
12690075Sobrien}
12790075Sobrien
128169689Skanstatic __inline __m128 __attribute__((__always_inline__))
12990075Sobrien_mm_sqrt_ss (__m128 __A)
13090075Sobrien{
13190075Sobrien  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
13290075Sobrien}
13390075Sobrien
134169689Skanstatic __inline __m128 __attribute__((__always_inline__))
13590075Sobrien_mm_rcp_ss (__m128 __A)
13690075Sobrien{
13790075Sobrien  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
13890075Sobrien}
13990075Sobrien
140169689Skanstatic __inline __m128 __attribute__((__always_inline__))
14190075Sobrien_mm_rsqrt_ss (__m128 __A)
14290075Sobrien{
14390075Sobrien  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
14490075Sobrien}
14590075Sobrien
146169689Skanstatic __inline __m128 __attribute__((__always_inline__))
14790075Sobrien_mm_min_ss (__m128 __A, __m128 __B)
14890075Sobrien{
14990075Sobrien  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
15090075Sobrien}
15190075Sobrien
152169689Skanstatic __inline __m128 __attribute__((__always_inline__))
15390075Sobrien_mm_max_ss (__m128 __A, __m128 __B)
15490075Sobrien{
15590075Sobrien  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
15690075Sobrien}
15790075Sobrien
15890075Sobrien/* Perform the respective operation on the four SPFP values in A and B.  */
15990075Sobrien
160169689Skanstatic __inline __m128 __attribute__((__always_inline__))
16190075Sobrien_mm_add_ps (__m128 __A, __m128 __B)
16290075Sobrien{
16390075Sobrien  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
16490075Sobrien}
16590075Sobrien
166169689Skanstatic __inline __m128 __attribute__((__always_inline__))
16790075Sobrien_mm_sub_ps (__m128 __A, __m128 __B)
16890075Sobrien{
16990075Sobrien  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
17090075Sobrien}
17190075Sobrien
172169689Skanstatic __inline __m128 __attribute__((__always_inline__))
17390075Sobrien_mm_mul_ps (__m128 __A, __m128 __B)
17490075Sobrien{
17590075Sobrien  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
17690075Sobrien}
17790075Sobrien
178169689Skanstatic __inline __m128 __attribute__((__always_inline__))
17990075Sobrien_mm_div_ps (__m128 __A, __m128 __B)
18090075Sobrien{
18190075Sobrien  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
18290075Sobrien}
18390075Sobrien
184169689Skanstatic __inline __m128 __attribute__((__always_inline__))
18590075Sobrien_mm_sqrt_ps (__m128 __A)
18690075Sobrien{
18790075Sobrien  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
18890075Sobrien}
18990075Sobrien
190169689Skanstatic __inline __m128 __attribute__((__always_inline__))
19190075Sobrien_mm_rcp_ps (__m128 __A)
19290075Sobrien{
19390075Sobrien  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
19490075Sobrien}
19590075Sobrien
196169689Skanstatic __inline __m128 __attribute__((__always_inline__))
19790075Sobrien_mm_rsqrt_ps (__m128 __A)
19890075Sobrien{
19990075Sobrien  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
20090075Sobrien}
20190075Sobrien
202169689Skanstatic __inline __m128 __attribute__((__always_inline__))
20390075Sobrien_mm_min_ps (__m128 __A, __m128 __B)
20490075Sobrien{
20590075Sobrien  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
20690075Sobrien}
20790075Sobrien
208169689Skanstatic __inline __m128 __attribute__((__always_inline__))
20990075Sobrien_mm_max_ps (__m128 __A, __m128 __B)
21090075Sobrien{
21190075Sobrien  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
21290075Sobrien}
21390075Sobrien
21490075Sobrien/* Perform logical bit-wise operations on 128-bit values.  */
21590075Sobrien
216169689Skanstatic __inline __m128 __attribute__((__always_inline__))
21790075Sobrien_mm_and_ps (__m128 __A, __m128 __B)
21890075Sobrien{
21990075Sobrien  return __builtin_ia32_andps (__A, __B);
22090075Sobrien}
22190075Sobrien
222169689Skanstatic __inline __m128 __attribute__((__always_inline__))
22390075Sobrien_mm_andnot_ps (__m128 __A, __m128 __B)
22490075Sobrien{
22590075Sobrien  return __builtin_ia32_andnps (__A, __B);
22690075Sobrien}
22790075Sobrien
228169689Skanstatic __inline __m128 __attribute__((__always_inline__))
22990075Sobrien_mm_or_ps (__m128 __A, __m128 __B)
23090075Sobrien{
23190075Sobrien  return __builtin_ia32_orps (__A, __B);
23290075Sobrien}
23390075Sobrien
234169689Skanstatic __inline __m128 __attribute__((__always_inline__))
23590075Sobrien_mm_xor_ps (__m128 __A, __m128 __B)
23690075Sobrien{
23790075Sobrien  return __builtin_ia32_xorps (__A, __B);
23890075Sobrien}
23990075Sobrien
24090075Sobrien/* Perform a comparison on the lower SPFP values of A and B.  If the
24190075Sobrien   comparison is true, place a mask of all ones in the result, otherwise a
24290075Sobrien   mask of zeros.  The upper three SPFP values are passed through from A.  */
24390075Sobrien
244169689Skanstatic __inline __m128 __attribute__((__always_inline__))
24590075Sobrien_mm_cmpeq_ss (__m128 __A, __m128 __B)
24690075Sobrien{
24790075Sobrien  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
24890075Sobrien}
24990075Sobrien
250169689Skanstatic __inline __m128 __attribute__((__always_inline__))
25190075Sobrien_mm_cmplt_ss (__m128 __A, __m128 __B)
25290075Sobrien{
25390075Sobrien  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
25490075Sobrien}
25590075Sobrien
256169689Skanstatic __inline __m128 __attribute__((__always_inline__))
25790075Sobrien_mm_cmple_ss (__m128 __A, __m128 __B)
25890075Sobrien{
25990075Sobrien  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
26090075Sobrien}
26190075Sobrien
262169689Skanstatic __inline __m128 __attribute__((__always_inline__))
26390075Sobrien_mm_cmpgt_ss (__m128 __A, __m128 __B)
26490075Sobrien{
265107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
266107590Sobrien					(__v4sf)
267107590Sobrien					__builtin_ia32_cmpltss ((__v4sf) __B,
268107590Sobrien								(__v4sf)
269107590Sobrien								__A));
27090075Sobrien}
27190075Sobrien
272169689Skanstatic __inline __m128 __attribute__((__always_inline__))
27390075Sobrien_mm_cmpge_ss (__m128 __A, __m128 __B)
27490075Sobrien{
275107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
276107590Sobrien					(__v4sf)
277107590Sobrien					__builtin_ia32_cmpless ((__v4sf) __B,
278107590Sobrien								(__v4sf)
279107590Sobrien								__A));
28090075Sobrien}
28190075Sobrien
282169689Skanstatic __inline __m128 __attribute__((__always_inline__))
28390075Sobrien_mm_cmpneq_ss (__m128 __A, __m128 __B)
28490075Sobrien{
28590075Sobrien  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
28690075Sobrien}
28790075Sobrien
288169689Skanstatic __inline __m128 __attribute__((__always_inline__))
28990075Sobrien_mm_cmpnlt_ss (__m128 __A, __m128 __B)
29090075Sobrien{
29190075Sobrien  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
29290075Sobrien}
29390075Sobrien
294169689Skanstatic __inline __m128 __attribute__((__always_inline__))
29590075Sobrien_mm_cmpnle_ss (__m128 __A, __m128 __B)
29690075Sobrien{
29790075Sobrien  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
29890075Sobrien}
29990075Sobrien
300169689Skanstatic __inline __m128 __attribute__((__always_inline__))
30190075Sobrien_mm_cmpngt_ss (__m128 __A, __m128 __B)
30290075Sobrien{
303107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
304107590Sobrien					(__v4sf)
305107590Sobrien					__builtin_ia32_cmpnltss ((__v4sf) __B,
306107590Sobrien								 (__v4sf)
307107590Sobrien								 __A));
30890075Sobrien}
30990075Sobrien
310169689Skanstatic __inline __m128 __attribute__((__always_inline__))
31190075Sobrien_mm_cmpnge_ss (__m128 __A, __m128 __B)
31290075Sobrien{
313107590Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
314107590Sobrien					(__v4sf)
315107590Sobrien					__builtin_ia32_cmpnless ((__v4sf) __B,
316107590Sobrien								 (__v4sf)
317107590Sobrien								 __A));
31890075Sobrien}
31990075Sobrien
320169689Skanstatic __inline __m128 __attribute__((__always_inline__))
32190075Sobrien_mm_cmpord_ss (__m128 __A, __m128 __B)
32290075Sobrien{
32390075Sobrien  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
32490075Sobrien}
32590075Sobrien
326169689Skanstatic __inline __m128 __attribute__((__always_inline__))
32790075Sobrien_mm_cmpunord_ss (__m128 __A, __m128 __B)
32890075Sobrien{
32990075Sobrien  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
33090075Sobrien}
33190075Sobrien
33290075Sobrien/* Perform a comparison on the four SPFP values of A and B.  For each
33390075Sobrien   element, if the comparison is true, place a mask of all ones in the
33490075Sobrien   result, otherwise a mask of zeros.  */
33590075Sobrien
336169689Skanstatic __inline __m128 __attribute__((__always_inline__))
33790075Sobrien_mm_cmpeq_ps (__m128 __A, __m128 __B)
33890075Sobrien{
33990075Sobrien  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
34090075Sobrien}
34190075Sobrien
342169689Skanstatic __inline __m128 __attribute__((__always_inline__))
34390075Sobrien_mm_cmplt_ps (__m128 __A, __m128 __B)
34490075Sobrien{
34590075Sobrien  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
34690075Sobrien}
34790075Sobrien
348169689Skanstatic __inline __m128 __attribute__((__always_inline__))
34990075Sobrien_mm_cmple_ps (__m128 __A, __m128 __B)
35090075Sobrien{
35190075Sobrien  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
35290075Sobrien}
35390075Sobrien
354169689Skanstatic __inline __m128 __attribute__((__always_inline__))
35590075Sobrien_mm_cmpgt_ps (__m128 __A, __m128 __B)
35690075Sobrien{
35790075Sobrien  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
35890075Sobrien}
35990075Sobrien
360169689Skanstatic __inline __m128 __attribute__((__always_inline__))
36190075Sobrien_mm_cmpge_ps (__m128 __A, __m128 __B)
36290075Sobrien{
36390075Sobrien  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
36490075Sobrien}
36590075Sobrien
366169689Skanstatic __inline __m128 __attribute__((__always_inline__))
36790075Sobrien_mm_cmpneq_ps (__m128 __A, __m128 __B)
36890075Sobrien{
36990075Sobrien  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
37090075Sobrien}
37190075Sobrien
372169689Skanstatic __inline __m128 __attribute__((__always_inline__))
37390075Sobrien_mm_cmpnlt_ps (__m128 __A, __m128 __B)
37490075Sobrien{
37590075Sobrien  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
37690075Sobrien}
37790075Sobrien
378169689Skanstatic __inline __m128 __attribute__((__always_inline__))
37990075Sobrien_mm_cmpnle_ps (__m128 __A, __m128 __B)
38090075Sobrien{
38190075Sobrien  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
38290075Sobrien}
38390075Sobrien
384169689Skanstatic __inline __m128 __attribute__((__always_inline__))
38590075Sobrien_mm_cmpngt_ps (__m128 __A, __m128 __B)
38690075Sobrien{
38790075Sobrien  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
38890075Sobrien}
38990075Sobrien
390169689Skanstatic __inline __m128 __attribute__((__always_inline__))
39190075Sobrien_mm_cmpnge_ps (__m128 __A, __m128 __B)
39290075Sobrien{
39390075Sobrien  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
39490075Sobrien}
39590075Sobrien
396169689Skanstatic __inline __m128 __attribute__((__always_inline__))
39790075Sobrien_mm_cmpord_ps (__m128 __A, __m128 __B)
39890075Sobrien{
39990075Sobrien  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
40090075Sobrien}
40190075Sobrien
402169689Skanstatic __inline __m128 __attribute__((__always_inline__))
40390075Sobrien_mm_cmpunord_ps (__m128 __A, __m128 __B)
40490075Sobrien{
40590075Sobrien  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
40690075Sobrien}
40790075Sobrien
40890075Sobrien/* Compare the lower SPFP values of A and B and return 1 if true
40990075Sobrien   and 0 if false.  */
41090075Sobrien
411169689Skanstatic __inline int __attribute__((__always_inline__))
41290075Sobrien_mm_comieq_ss (__m128 __A, __m128 __B)
41390075Sobrien{
41490075Sobrien  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
41590075Sobrien}
41690075Sobrien
417169689Skanstatic __inline int __attribute__((__always_inline__))
41890075Sobrien_mm_comilt_ss (__m128 __A, __m128 __B)
41990075Sobrien{
42090075Sobrien  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
42190075Sobrien}
42290075Sobrien
423169689Skanstatic __inline int __attribute__((__always_inline__))
42490075Sobrien_mm_comile_ss (__m128 __A, __m128 __B)
42590075Sobrien{
42690075Sobrien  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
42790075Sobrien}
42890075Sobrien
429169689Skanstatic __inline int __attribute__((__always_inline__))
43090075Sobrien_mm_comigt_ss (__m128 __A, __m128 __B)
43190075Sobrien{
43290075Sobrien  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
43390075Sobrien}
43490075Sobrien
435169689Skanstatic __inline int __attribute__((__always_inline__))
43690075Sobrien_mm_comige_ss (__m128 __A, __m128 __B)
43790075Sobrien{
43890075Sobrien  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
43990075Sobrien}
44090075Sobrien
441169689Skanstatic __inline int __attribute__((__always_inline__))
44290075Sobrien_mm_comineq_ss (__m128 __A, __m128 __B)
44390075Sobrien{
44490075Sobrien  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
44590075Sobrien}
44690075Sobrien
447169689Skanstatic __inline int __attribute__((__always_inline__))
44890075Sobrien_mm_ucomieq_ss (__m128 __A, __m128 __B)
44990075Sobrien{
45090075Sobrien  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
45190075Sobrien}
45290075Sobrien
453169689Skanstatic __inline int __attribute__((__always_inline__))
45490075Sobrien_mm_ucomilt_ss (__m128 __A, __m128 __B)
45590075Sobrien{
45690075Sobrien  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
45790075Sobrien}
45890075Sobrien
459169689Skanstatic __inline int __attribute__((__always_inline__))
46090075Sobrien_mm_ucomile_ss (__m128 __A, __m128 __B)
46190075Sobrien{
46290075Sobrien  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
46390075Sobrien}
46490075Sobrien
465169689Skanstatic __inline int __attribute__((__always_inline__))
46690075Sobrien_mm_ucomigt_ss (__m128 __A, __m128 __B)
46790075Sobrien{
46890075Sobrien  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
46990075Sobrien}
47090075Sobrien
471169689Skanstatic __inline int __attribute__((__always_inline__))
47290075Sobrien_mm_ucomige_ss (__m128 __A, __m128 __B)
47390075Sobrien{
47490075Sobrien  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
47590075Sobrien}
47690075Sobrien
477169689Skanstatic __inline int __attribute__((__always_inline__))
47890075Sobrien_mm_ucomineq_ss (__m128 __A, __m128 __B)
47990075Sobrien{
48090075Sobrien  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
48190075Sobrien}
48290075Sobrien
48390075Sobrien/* Convert the lower SPFP value to a 32-bit integer according to the current
48490075Sobrien   rounding mode.  */
485169689Skanstatic __inline int __attribute__((__always_inline__))
48690075Sobrien_mm_cvtss_si32 (__m128 __A)
48790075Sobrien{
48890075Sobrien  return __builtin_ia32_cvtss2si ((__v4sf) __A);
48990075Sobrien}
49090075Sobrien
491169689Skanstatic __inline int __attribute__((__always_inline__))
492122180Skan_mm_cvt_ss2si (__m128 __A)
493122180Skan{
494122180Skan  return _mm_cvtss_si32 (__A);
495122180Skan}
496122180Skan
497117395Skan#ifdef __x86_64__
/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode.  */
500169689Skan
501169689Skan/* Intel intrinsic.  */
502169689Skanstatic __inline long long __attribute__((__always_inline__))
503169689Skan_mm_cvtss_si64 (__m128 __A)
504169689Skan{
505169689Skan  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
506169689Skan}
507169689Skan
508169689Skan/* Microsoft intrinsic.  */
509169689Skanstatic __inline long long __attribute__((__always_inline__))
510117395Skan_mm_cvtss_si64x (__m128 __A)
511117395Skan{
512117395Skan  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
513117395Skan}
514117395Skan#endif
515117395Skan
51690075Sobrien/* Convert the two lower SPFP values to 32-bit integers according to the
51790075Sobrien   current rounding mode.  Return the integers in packed form.  */
518169689Skanstatic __inline __m64 __attribute__((__always_inline__))
51990075Sobrien_mm_cvtps_pi32 (__m128 __A)
52090075Sobrien{
52190075Sobrien  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
52290075Sobrien}
52390075Sobrien
524169689Skanstatic __inline __m64 __attribute__((__always_inline__))
525122180Skan_mm_cvt_ps2pi (__m128 __A)
526122180Skan{
527122180Skan  return _mm_cvtps_pi32 (__A);
528122180Skan}
529122180Skan
53090075Sobrien/* Truncate the lower SPFP value to a 32-bit integer.  */
531169689Skanstatic __inline int __attribute__((__always_inline__))
53290075Sobrien_mm_cvttss_si32 (__m128 __A)
53390075Sobrien{
53490075Sobrien  return __builtin_ia32_cvttss2si ((__v4sf) __A);
53590075Sobrien}
53690075Sobrien
537169689Skanstatic __inline int __attribute__((__always_inline__))
538122180Skan_mm_cvtt_ss2si (__m128 __A)
539122180Skan{
540122180Skan  return _mm_cvttss_si32 (__A);
541122180Skan}
542122180Skan
543117395Skan#ifdef __x86_64__
/* Truncate the lower SPFP value to a 64-bit integer.  */
545169689Skan
546169689Skan/* Intel intrinsic.  */
547169689Skanstatic __inline long long __attribute__((__always_inline__))
548169689Skan_mm_cvttss_si64 (__m128 __A)
549169689Skan{
550169689Skan  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
551169689Skan}
552169689Skan
553169689Skan/* Microsoft intrinsic.  */
554169689Skanstatic __inline long long __attribute__((__always_inline__))
555117395Skan_mm_cvttss_si64x (__m128 __A)
556117395Skan{
557117395Skan  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
558117395Skan}
559117395Skan#endif
560117395Skan
56190075Sobrien/* Truncate the two lower SPFP values to 32-bit integers.  Return the
56290075Sobrien   integers in packed form.  */
563169689Skanstatic __inline __m64 __attribute__((__always_inline__))
56490075Sobrien_mm_cvttps_pi32 (__m128 __A)
56590075Sobrien{
56690075Sobrien  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
56790075Sobrien}
56890075Sobrien
569169689Skanstatic __inline __m64 __attribute__((__always_inline__))
570122180Skan_mm_cvtt_ps2pi (__m128 __A)
571122180Skan{
572122180Skan  return _mm_cvttps_pi32 (__A);
573122180Skan}
574122180Skan
57590075Sobrien/* Convert B to a SPFP value and insert it as element zero in A.  */
576169689Skanstatic __inline __m128 __attribute__((__always_inline__))
57790075Sobrien_mm_cvtsi32_ss (__m128 __A, int __B)
57890075Sobrien{
57990075Sobrien  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
58090075Sobrien}
58190075Sobrien
582169689Skanstatic __inline __m128 __attribute__((__always_inline__))
583122180Skan_mm_cvt_si2ss (__m128 __A, int __B)
584122180Skan{
585122180Skan  return _mm_cvtsi32_ss (__A, __B);
586122180Skan}
587122180Skan
588117395Skan#ifdef __x86_64__
589117395Skan/* Convert B to a SPFP value and insert it as element zero in A.  */
590169689Skan
591169689Skan/* Intel intrinsic.  */
592169689Skanstatic __inline __m128 __attribute__((__always_inline__))
593169689Skan_mm_cvtsi64_ss (__m128 __A, long long __B)
594169689Skan{
595169689Skan  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
596169689Skan}
597169689Skan
598169689Skan/* Microsoft intrinsic.  */
599169689Skanstatic __inline __m128 __attribute__((__always_inline__))
600117395Skan_mm_cvtsi64x_ss (__m128 __A, long long __B)
601117395Skan{
602117395Skan  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
603117395Skan}
604117395Skan#endif
605117395Skan
60690075Sobrien/* Convert the two 32-bit values in B to SPFP form and insert them
60790075Sobrien   as the two lower elements in A.  */
608169689Skanstatic __inline __m128 __attribute__((__always_inline__))
60990075Sobrien_mm_cvtpi32_ps (__m128 __A, __m64 __B)
61090075Sobrien{
61190075Sobrien  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
61290075Sobrien}
61390075Sobrien
614169689Skanstatic __inline __m128 __attribute__((__always_inline__))
615122180Skan_mm_cvt_pi2ps (__m128 __A, __m64 __B)
616122180Skan{
617122180Skan  return _mm_cvtpi32_ps (__A, __B);
618122180Skan}
619122180Skan
62090075Sobrien/* Convert the four signed 16-bit values in A to SPFP form.  */
621169689Skanstatic __inline __m128 __attribute__((__always_inline__))
62290075Sobrien_mm_cvtpi16_ps (__m64 __A)
62390075Sobrien{
62490075Sobrien  __v4hi __sign;
62590075Sobrien  __v2si __hisi, __losi;
62690075Sobrien  __v4sf __r;
62790075Sobrien
62890075Sobrien  /* This comparison against zero gives us a mask that can be used to
62990075Sobrien     fill in the missing sign bits in the unpack operations below, so
63090075Sobrien     that we get signed values after unpacking.  */
631169689Skan  __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);
63290075Sobrien
63390075Sobrien  /* Convert the four words to doublewords.  */
63490075Sobrien  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
63590075Sobrien  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
63690075Sobrien
63790075Sobrien  /* Convert the doublewords to floating point two at a time.  */
638169689Skan  __r = (__v4sf) _mm_setzero_ps ();
63990075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
64090075Sobrien  __r = __builtin_ia32_movlhps (__r, __r);
64190075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __losi);
64290075Sobrien
64390075Sobrien  return (__m128) __r;
64490075Sobrien}
64590075Sobrien
64690075Sobrien/* Convert the four unsigned 16-bit values in A to SPFP form.  */
647169689Skanstatic __inline __m128 __attribute__((__always_inline__))
64890075Sobrien_mm_cvtpu16_ps (__m64 __A)
64990075Sobrien{
65090075Sobrien  __v2si __hisi, __losi;
65190075Sobrien  __v4sf __r;
65290075Sobrien
65390075Sobrien  /* Convert the four words to doublewords.  */
654169689Skan  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);
655169689Skan  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);
65690075Sobrien
65790075Sobrien  /* Convert the doublewords to floating point two at a time.  */
658169689Skan  __r = (__v4sf) _mm_setzero_ps ();
65990075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
66090075Sobrien  __r = __builtin_ia32_movlhps (__r, __r);
66190075Sobrien  __r = __builtin_ia32_cvtpi2ps (__r, __losi);
66290075Sobrien
66390075Sobrien  return (__m128) __r;
66490075Sobrien}
66590075Sobrien
66690075Sobrien/* Convert the low four signed 8-bit values in A to SPFP form.  */
667169689Skanstatic __inline __m128 __attribute__((__always_inline__))
66890075Sobrien_mm_cvtpi8_ps (__m64 __A)
66990075Sobrien{
67090075Sobrien  __v8qi __sign;
67190075Sobrien
67290075Sobrien  /* This comparison against zero gives us a mask that can be used to
67390075Sobrien     fill in the missing sign bits in the unpack operations below, so
67490075Sobrien     that we get signed values after unpacking.  */
675169689Skan  __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);
67690075Sobrien
67790075Sobrien  /* Convert the four low bytes to words.  */
67890075Sobrien  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);
67990075Sobrien
68090075Sobrien  return _mm_cvtpi16_ps(__A);
68190075Sobrien}
68290075Sobrien
68390075Sobrien/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
684169689Skanstatic __inline __m128 __attribute__((__always_inline__))
68590075Sobrien_mm_cvtpu8_ps(__m64 __A)
68690075Sobrien{
687169689Skan  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
68890075Sobrien  return _mm_cvtpu16_ps(__A);
68990075Sobrien}
69090075Sobrien
69190075Sobrien/* Convert the four signed 32-bit values in A and B to SPFP form.  */
692169689Skanstatic __inline __m128 __attribute__((__always_inline__))
69390075Sobrien_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
69490075Sobrien{
695169689Skan  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
69690075Sobrien  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
69790075Sobrien  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B);
69890075Sobrien  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
69990075Sobrien}
70090075Sobrien
70190075Sobrien/* Convert the four SPFP values in A to four signed 16-bit integers.  */
702169689Skanstatic __inline __m64 __attribute__((__always_inline__))
70390075Sobrien_mm_cvtps_pi16(__m128 __A)
70490075Sobrien{
70590075Sobrien  __v4sf __hisf = (__v4sf)__A;
70690075Sobrien  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
70790075Sobrien  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
70890075Sobrien  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
709117395Skan  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
71090075Sobrien}
71190075Sobrien
71290075Sobrien/* Convert the four SPFP values in A to four signed 8-bit integers.  */
713169689Skanstatic __inline __m64 __attribute__((__always_inline__))
71490075Sobrien_mm_cvtps_pi8(__m128 __A)
71590075Sobrien{
71690075Sobrien  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
717169689Skan  return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
71890075Sobrien}
71990075Sobrien
/* Selects four specific SPFP values from A and B based on MASK (build
   MASK with _MM_SHUFFLE).
   NOTE(review): kept as a macro rather than an inline function,
   presumably because the shufps builtin needs MASK as a compile-time
   constant; the disabled inline-function variant that used to sit next
   to it under "#if 0" was never compiled and has been removed.  */
#define _mm_shuffle_ps(A, B, MASK) \
 ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK)))
73190075Sobrien
73290075Sobrien
73390075Sobrien/* Selects and interleaves the upper two SPFP values from A and B.  */
734169689Skanstatic __inline __m128 __attribute__((__always_inline__))
73590075Sobrien_mm_unpackhi_ps (__m128 __A, __m128 __B)
73690075Sobrien{
73790075Sobrien  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
73890075Sobrien}
73990075Sobrien
74090075Sobrien/* Selects and interleaves the lower two SPFP values from A and B.  */
741169689Skanstatic __inline __m128 __attribute__((__always_inline__))
74290075Sobrien_mm_unpacklo_ps (__m128 __A, __m128 __B)
74390075Sobrien{
74490075Sobrien  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
74590075Sobrien}
74690075Sobrien
74790075Sobrien/* Sets the upper two SPFP values with 64-bits of data loaded from P;
74890075Sobrien   the lower two values are passed through from A.  */
749169689Skanstatic __inline __m128 __attribute__((__always_inline__))
750117395Skan_mm_loadh_pi (__m128 __A, __m64 const *__P)
75190075Sobrien{
75290075Sobrien  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P);
75390075Sobrien}
75490075Sobrien
75590075Sobrien/* Stores the upper two SPFP values of A into P.  */
756169689Skanstatic __inline void __attribute__((__always_inline__))
75790075Sobrien_mm_storeh_pi (__m64 *__P, __m128 __A)
75890075Sobrien{
75990075Sobrien  __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A);
76090075Sobrien}
76190075Sobrien
76290075Sobrien/* Moves the upper two values of B into the lower two values of A.  */
763169689Skanstatic __inline __m128 __attribute__((__always_inline__))
76490075Sobrien_mm_movehl_ps (__m128 __A, __m128 __B)
76590075Sobrien{
76690075Sobrien  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
76790075Sobrien}
76890075Sobrien
76990075Sobrien/* Moves the lower two values of B into the upper two values of A.  */
770169689Skanstatic __inline __m128 __attribute__((__always_inline__))
77190075Sobrien_mm_movelh_ps (__m128 __A, __m128 __B)
77290075Sobrien{
77390075Sobrien  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
77490075Sobrien}
77590075Sobrien
77690075Sobrien/* Sets the lower two SPFP values with 64-bits of data loaded from P;
77790075Sobrien   the upper two values are passed through from A.  */
778169689Skanstatic __inline __m128 __attribute__((__always_inline__))
779117395Skan_mm_loadl_pi (__m128 __A, __m64 const *__P)
78090075Sobrien{
78190075Sobrien  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P);
78290075Sobrien}
78390075Sobrien
78490075Sobrien/* Stores the lower two SPFP values of A into P.  */
785169689Skanstatic __inline void __attribute__((__always_inline__))
78690075Sobrien_mm_storel_pi (__m64 *__P, __m128 __A)
78790075Sobrien{
78890075Sobrien  __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A);
78990075Sobrien}
79090075Sobrien
79190075Sobrien/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
792169689Skanstatic __inline int __attribute__((__always_inline__))
79390075Sobrien_mm_movemask_ps (__m128 __A)
79490075Sobrien{
79590075Sobrien  return __builtin_ia32_movmskps ((__v4sf)__A);
79690075Sobrien}
79790075Sobrien
/* Read the control register and return its contents.  */
static __inline unsigned int __attribute__((__always_inline__))
_mm_getcsr (void)
{
  unsigned int __csr = __builtin_ia32_stmxcsr ();
  return __csr;
}
80490075Sobrien
80590075Sobrien/* Read exception bits from the control register.  */
806169689Skanstatic __inline unsigned int __attribute__((__always_inline__))
80790075Sobrien_MM_GET_EXCEPTION_STATE (void)
80890075Sobrien{
80990075Sobrien  return _mm_getcsr() & _MM_EXCEPT_MASK;
81090075Sobrien}
81190075Sobrien
812169689Skanstatic __inline unsigned int __attribute__((__always_inline__))
81390075Sobrien_MM_GET_EXCEPTION_MASK (void)
81490075Sobrien{
81590075Sobrien  return _mm_getcsr() & _MM_MASK_MASK;
81690075Sobrien}
81790075Sobrien
818169689Skanstatic __inline unsigned int __attribute__((__always_inline__))
81990075Sobrien_MM_GET_ROUNDING_MODE (void)
82090075Sobrien{
82190075Sobrien  return _mm_getcsr() & _MM_ROUND_MASK;
82290075Sobrien}
82390075Sobrien
824169689Skanstatic __inline unsigned int __attribute__((__always_inline__))
82590075Sobrien_MM_GET_FLUSH_ZERO_MODE (void)
82690075Sobrien{
82790075Sobrien  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
82890075Sobrien}
82990075Sobrien
/* Load the value I into the control register.  */
static __inline void __attribute__((__always_inline__))
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}
83690075Sobrien
83790075Sobrien/* Set exception bits in the control register.  */
838169689Skanstatic __inline void __attribute__((__always_inline__))
83990075Sobrien_MM_SET_EXCEPTION_STATE(unsigned int __mask)
84090075Sobrien{
84190075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
84290075Sobrien}
84390075Sobrien
844169689Skanstatic __inline void __attribute__((__always_inline__))
84590075Sobrien_MM_SET_EXCEPTION_MASK (unsigned int __mask)
84690075Sobrien{
84790075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
84890075Sobrien}
84990075Sobrien
850169689Skanstatic __inline void __attribute__((__always_inline__))
85190075Sobrien_MM_SET_ROUNDING_MODE (unsigned int __mode)
85290075Sobrien{
85390075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
85490075Sobrien}
85590075Sobrien
856169689Skanstatic __inline void __attribute__((__always_inline__))
85790075Sobrien_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
85890075Sobrien{
85990075Sobrien  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
86090075Sobrien}
86190075Sobrien
862169689Skan/* Create a vector with element 0 as F and the rest zero.  */
863169689Skanstatic __inline __m128 __attribute__((__always_inline__))
864169689Skan_mm_set_ss (float __F)
865169689Skan{
866169689Skan  return __extension__ (__m128)(__v4sf){ __F, 0, 0, 0 };
867169689Skan}
868169689Skan
869169689Skan/* Create a vector with all four elements equal to F.  */
870169689Skanstatic __inline __m128 __attribute__((__always_inline__))
871169689Skan_mm_set1_ps (float __F)
872169689Skan{
873169689Skan  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
874169689Skan}
875169689Skan
876169689Skanstatic __inline __m128 __attribute__((__always_inline__))
877169689Skan_mm_set_ps1 (float __F)
878169689Skan{
879169689Skan  return _mm_set1_ps (__F);
880169689Skan}
881169689Skan
88290075Sobrien/* Create a vector with element 0 as *P and the rest zero.  */
883169689Skanstatic __inline __m128 __attribute__((__always_inline__))
884117395Skan_mm_load_ss (float const *__P)
88590075Sobrien{
886169689Skan  return _mm_set_ss (*__P);
88790075Sobrien}
88890075Sobrien
88990075Sobrien/* Create a vector with all four elements equal to *P.  */
890169689Skanstatic __inline __m128 __attribute__((__always_inline__))
891117395Skan_mm_load1_ps (float const *__P)
89290075Sobrien{
893169689Skan  return _mm_set1_ps (*__P);
89490075Sobrien}
89590075Sobrien
896169689Skanstatic __inline __m128 __attribute__((__always_inline__))
897117395Skan_mm_load_ps1 (float const *__P)
89890075Sobrien{
89990075Sobrien  return _mm_load1_ps (__P);
90090075Sobrien}
90190075Sobrien
90290075Sobrien/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
903169689Skanstatic __inline __m128 __attribute__((__always_inline__))
904117395Skan_mm_load_ps (float const *__P)
90590075Sobrien{
906169689Skan  return (__m128) *(__v4sf *)__P;
90790075Sobrien}
90890075Sobrien
90990075Sobrien/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
910169689Skanstatic __inline __m128 __attribute__((__always_inline__))
911117395Skan_mm_loadu_ps (float const *__P)
91290075Sobrien{
91390075Sobrien  return (__m128) __builtin_ia32_loadups (__P);
91490075Sobrien}
91590075Sobrien
91690075Sobrien/* Load four SPFP values in reverse order.  The address must be aligned.  */
917169689Skanstatic __inline __m128 __attribute__((__always_inline__))
918117395Skan_mm_loadr_ps (float const *__P)
91990075Sobrien{
920169689Skan  __v4sf __tmp = *(__v4sf *)__P;
92190075Sobrien  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
92290075Sobrien}
92390075Sobrien
924169689Skan/* Create the vector [Z Y X W].  */
925169689Skanstatic __inline __m128 __attribute__((__always_inline__))
926169689Skan_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
92790075Sobrien{
928169689Skan  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
92990075Sobrien}
93090075Sobrien
931169689Skan/* Create the vector [W X Y Z].  */
932169689Skanstatic __inline __m128 __attribute__((__always_inline__))
933169689Skan_mm_setr_ps (float __Z, float __Y, float __X, float __W)
93490075Sobrien{
935169689Skan  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
93690075Sobrien}
93790075Sobrien
938169689Skan/* Stores the lower SPFP value.  */
939169689Skanstatic __inline void __attribute__((__always_inline__))
940169689Skan_mm_store_ss (float *__P, __m128 __A)
94190075Sobrien{
942169689Skan  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
94390075Sobrien}
94490075Sobrien
945169689Skanstatic __inline float __attribute__((__always_inline__))
946169689Skan_mm_cvtss_f32 (__m128 __A)
94790075Sobrien{
948169689Skan  return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
94990075Sobrien}
95090075Sobrien
951169689Skan/* Store four SPFP values.  The address must be 16-byte aligned.  */
952169689Skanstatic __inline void __attribute__((__always_inline__))
953169689Skan_mm_store_ps (float *__P, __m128 __A)
95490075Sobrien{
955169689Skan  *(__v4sf *)__P = (__v4sf)__A;
95690075Sobrien}
95790075Sobrien
958169689Skan/* Store four SPFP values.  The address need not be 16-byte aligned.  */
959169689Skanstatic __inline void __attribute__((__always_inline__))
960169689Skan_mm_storeu_ps (float *__P, __m128 __A)
96190075Sobrien{
962169689Skan  __builtin_ia32_storeups (__P, (__v4sf)__A);
96390075Sobrien}
96490075Sobrien
96590075Sobrien/* Store the lower SPFP value across four words.  */
966169689Skanstatic __inline void __attribute__((__always_inline__))
96790075Sobrien_mm_store1_ps (float *__P, __m128 __A)
96890075Sobrien{
96990075Sobrien  __v4sf __va = (__v4sf)__A;
97090075Sobrien  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
971169689Skan  _mm_storeu_ps (__P, __tmp);
97290075Sobrien}
97390075Sobrien
974169689Skanstatic __inline void __attribute__((__always_inline__))
97590075Sobrien_mm_store_ps1 (float *__P, __m128 __A)
97690075Sobrien{
97790075Sobrien  _mm_store1_ps (__P, __A);
97890075Sobrien}
97990075Sobrien
980117395Skan/* Store four SPFP values in reverse order.  The address must be aligned.  */
981169689Skanstatic __inline void __attribute__((__always_inline__))
98290075Sobrien_mm_storer_ps (float *__P, __m128 __A)
98390075Sobrien{
98490075Sobrien  __v4sf __va = (__v4sf)__A;
98590075Sobrien  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
986169689Skan  _mm_store_ps (__P, __tmp);
98790075Sobrien}
98890075Sobrien
98990075Sobrien/* Sets the low SPFP value of A from the low value of B.  */
990169689Skanstatic __inline __m128 __attribute__((__always_inline__))
99190075Sobrien_mm_move_ss (__m128 __A, __m128 __B)
99290075Sobrien{
99390075Sobrien  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
99490075Sobrien}
99590075Sobrien
/* Extracts one of the four words of A.  The selector N must be immediate.
   The selected 16-bit word is zero-extended to int, as the Intel
   intrinsic specifies.  The element extracted by the builtin is
   presumably a signed 16-bit value (__v4hi is declared in <mmintrin.h>),
   which would otherwise be sign-extended when widened to int; the
   (unsigned short) conversion prevents that.
   The inline-function forms are compiled out; the macros below are the
   definitions in effect.  _m_pextrw is an alternate name.  */
#if 0
static __inline int __attribute__((__always_inline__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  return (unsigned short) __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
}

static __inline int __attribute__((__always_inline__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
#define _mm_extract_pi16(A, N) \
  ((int) (unsigned short) __builtin_ia32_vec_ext_v4hi ((__v4hi)(A), (N)))
#define _m_pextrw(A, N)		_mm_extract_pi16((A), (N))
#endif
101390075Sobrien
/* Returns A with one of its four words replaced by word D.  The
   selector N, which picks the word, must be immediate.
   The inline-function forms are compiled out; the macros below are the
   definitions in effect.  _m_pinsrw is an alternate name.  */
#if 0
static __inline __m64 __attribute__((__always_inline__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  __v4hi __words = (__v4hi)__A;
  return (__m64) __builtin_ia32_vec_set_v4hi (__words, __D, __N);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N)					\
  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(A), (D), (N)))
#define _m_pinsrw(A, D, N)						\
  _mm_insert_pi16((A), (D), (N))
#endif
103390075Sobrien
103490075Sobrien/* Compute the element-wise maximum of signed 16-bit values.  */
1035169689Skanstatic __inline __m64 __attribute__((__always_inline__))
103690075Sobrien_mm_max_pi16 (__m64 __A, __m64 __B)
103790075Sobrien{
103890075Sobrien  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
103990075Sobrien}
104090075Sobrien
1041169689Skanstatic __inline __m64 __attribute__((__always_inline__))
1042122180Skan_m_pmaxsw (__m64 __A, __m64 __B)
1043122180Skan{
1044122180Skan  return _mm_max_pi16 (__A, __B);
1045122180Skan}
1046122180Skan
104790075Sobrien/* Compute the element-wise maximum of unsigned 8-bit values.  */
1048169689Skanstatic __inline __m64 __attribute__((__always_inline__))
104990075Sobrien_mm_max_pu8 (__m64 __A, __m64 __B)
105090075Sobrien{
105190075Sobrien  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
105290075Sobrien}
105390075Sobrien
1054169689Skanstatic __inline __m64 __attribute__((__always_inline__))
1055122180Skan_m_pmaxub (__m64 __A, __m64 __B)
1056122180Skan{
1057122180Skan  return _mm_max_pu8 (__A, __B);
1058122180Skan}
1059122180Skan
106090075Sobrien/* Compute the element-wise minimum of signed 16-bit values.  */
1061169689Skanstatic __inline __m64 __attribute__((__always_inline__))
106290075Sobrien_mm_min_pi16 (__m64 __A, __m64 __B)
106390075Sobrien{
106490075Sobrien  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
106590075Sobrien}
106690075Sobrien
1067169689Skanstatic __inline __m64 __attribute__((__always_inline__))
1068122180Skan_m_pminsw (__m64 __A, __m64 __B)
1069122180Skan{
1070122180Skan  return _mm_min_pi16 (__A, __B);
1071122180Skan}
1072122180Skan
107390075Sobrien/* Compute the element-wise minimum of unsigned 8-bit values.  */
1074169689Skanstatic __inline __m64 __attribute__((__always_inline__))
107590075Sobrien_mm_min_pu8 (__m64 __A, __m64 __B)
107690075Sobrien{
107790075Sobrien  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
107890075Sobrien}
107990075Sobrien
1080169689Skanstatic __inline __m64 __attribute__((__always_inline__))
1081122180Skan_m_pminub (__m64 __A, __m64 __B)
1082122180Skan{
1083122180Skan  return _mm_min_pu8 (__A, __B);
1084122180Skan}
1085122180Skan
108690075Sobrien/* Create an 8-bit mask of the signs of 8-bit values.  */
1087169689Skanstatic __inline int __attribute__((__always_inline__))
108890075Sobrien_mm_movemask_pi8 (__m64 __A)
108990075Sobrien{
109090075Sobrien  return __builtin_ia32_pmovmskb ((__v8qi)__A);
109190075Sobrien}
109290075Sobrien
1093169689Skanstatic __inline int __attribute__((__always_inline__))
1094122180Skan_m_pmovmskb (__m64 __A)
1095122180Skan{
1096122180Skan  return _mm_movemask_pi8 (__A);
1097122180Skan}
1098122180Skan
109990075Sobrien/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
110090075Sobrien   in B and produce the high 16 bits of the 32-bit results.  */
1101169689Skanstatic __inline __m64 __attribute__((__always_inline__))
110290075Sobrien_mm_mulhi_pu16 (__m64 __A, __m64 __B)
110390075Sobrien{
110490075Sobrien  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
110590075Sobrien}
110690075Sobrien
1107169689Skanstatic __inline __m64 __attribute__((__always_inline__))
1108122180Skan_m_pmulhuw (__m64 __A, __m64 __B)
1109122180Skan{
1110122180Skan  return _mm_mulhi_pu16 (__A, __B);
1111122180Skan}
1112122180Skan
/* Returns a combination of the four 16-bit values in A chosen by the
   selector N, which must be an immediate.
   The inline-function forms are compiled out; the macros below are the
   definitions in effect.  _m_pshufw is an alternate name.  */
#if 0
static __inline __m64 __attribute__((__always_inline__))
_mm_shuffle_pi16 (__m64 __A, int __N)
{
  __v4hi __words = (__v4hi)__A;
  return (__m64) __builtin_ia32_pshufw (__words, __N);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pshufw (__m64 __A, int __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N)						\
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N)))
#define _m_pshufw(A, N)							\
  _mm_shuffle_pi16 ((A), (N))
#endif
113290075Sobrien
113390075Sobrien/* Conditionally store byte elements of A into P.  The high bit of each
113490075Sobrien   byte in the selector N determines whether the corresponding byte from
113590075Sobrien   A is stored.  */
1136169689Skanstatic __inline void __attribute__((__always_inline__))
113790075Sobrien_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
113890075Sobrien{
113990075Sobrien  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
114090075Sobrien}
114190075Sobrien
1142169689Skanstatic __inline void __attribute__((__always_inline__))
1143122180Skan_m_maskmovq (__m64 __A, __m64 __N, char *__P)
1144122180Skan{
1145122180Skan  _mm_maskmove_si64 (__A, __N, __P);
1146122180Skan}
1147122180Skan
114890075Sobrien/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
1149169689Skanstatic __inline __m64 __attribute__((__always_inline__))
115090075Sobrien_mm_avg_pu8 (__m64 __A, __m64 __B)
115190075Sobrien{
115290075Sobrien  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
115390075Sobrien}
115490075Sobrien
1155169689Skanstatic __inline __m64 __attribute__((__always_inline__))
1156122180Skan_m_pavgb (__m64 __A, __m64 __B)
1157122180Skan{
1158122180Skan  return _mm_avg_pu8 (__A, __B);
1159122180Skan}
1160122180Skan
116190075Sobrien/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
1162169689Skanstatic __inline __m64 __attribute__((__always_inline__))
116390075Sobrien_mm_avg_pu16 (__m64 __A, __m64 __B)
116490075Sobrien{
116590075Sobrien  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
116690075Sobrien}
116790075Sobrien
1168169689Skanstatic __inline __m64 __attribute__((__always_inline__))
1169122180Skan_m_pavgw (__m64 __A, __m64 __B)
1170122180Skan{
1171122180Skan  return _mm_avg_pu16 (__A, __B);
1172122180Skan}
1173122180Skan
117490075Sobrien/* Compute the sum of the absolute differences of the unsigned 8-bit
117590075Sobrien   values in A and B.  Return the value in the lower 16-bit word; the
117690075Sobrien   upper words are cleared.  */
1177169689Skanstatic __inline __m64 __attribute__((__always_inline__))
117890075Sobrien_mm_sad_pu8 (__m64 __A, __m64 __B)
117990075Sobrien{
118090075Sobrien  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
118190075Sobrien}
118290075Sobrien
1183169689Skanstatic __inline __m64 __attribute__((__always_inline__))
1184122180Skan_m_psadbw (__m64 __A, __m64 __B)
1185122180Skan{
1186122180Skan  return _mm_sad_pu8 (__A, __B);
1187122180Skan}
1188122180Skan
/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation
   and is passed as the third argument of __builtin_prefetch.
   The inline-function form is compiled out; the macro below is the
   definition in effect.  */
#if 0
static __inline void __attribute__((__always_inline__))
_mm_prefetch (void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I)						\
  __builtin_prefetch ((P), 0, (I))
#endif
120190075Sobrien
120290075Sobrien/* Stores the data in A to the address P without polluting the caches.  */
1203169689Skanstatic __inline void __attribute__((__always_inline__))
120490075Sobrien_mm_stream_pi (__m64 *__P, __m64 __A)
120590075Sobrien{
1206117395Skan  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
120790075Sobrien}
120890075Sobrien
120990075Sobrien/* Likewise.  The address must be 16-byte aligned.  */
1210169689Skanstatic __inline void __attribute__((__always_inline__))
121190075Sobrien_mm_stream_ps (float *__P, __m128 __A)
121290075Sobrien{
121390075Sobrien  __builtin_ia32_movntps (__P, (__v4sf)__A);
121490075Sobrien}
121590075Sobrien
/* Store fence: every store that precedes this call is made globally
   visible before any store that follows it.  */
static __inline void __attribute__((__always_inline__))
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}
122390075Sobrien
/* Delays the execution of the next instruction by an
   implementation-specific amount of time without modifying the
   architectural state.  Emitted as the instruction sequence
   "rep; nop".  */
static __inline void __attribute__((__always_inline__))
_mm_pause (void)
{
  __asm__ __volatile__ ("rep; nop" : : );
}
123290075Sobrien
/* Transpose the 4x4 matrix composed of row[0-3].  Each row argument is
   first copied into a temporary and afterwards assigned its transposed
   row, so all four arguments must be assignable.  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __in0 = (row0);						\
  __v4sf __in1 = (row1);						\
  __v4sf __in2 = (row2);						\
  __v4sf __in3 = (row3);						\
  __v4sf __lo01 = __builtin_ia32_unpcklps (__in0, __in1);		\
  __v4sf __lo23 = __builtin_ia32_unpcklps (__in2, __in3);		\
  __v4sf __hi01 = __builtin_ia32_unpckhps (__in0, __in1);		\
  __v4sf __hi23 = __builtin_ia32_unpckhps (__in2, __in3);		\
  (row0) = __builtin_ia32_movlhps (__lo01, __lo23);			\
  (row1) = __builtin_ia32_movhlps (__lo23, __lo01);			\
  (row2) = __builtin_ia32_movlhps (__hi01, __hi23);			\
  (row3) = __builtin_ia32_movhlps (__hi23, __hi01);			\
} while (0)
124690075Sobrien
1247122180Skan/* For backward source compatibility.  */
1248219639Smm#ifdef __SSE2__
1249122180Skan#include <emmintrin.h>
1250219639Smm#endif
1251117395Skan
1252117395Skan#endif /* __SSE__ */
125390075Sobrien#endif /* _XMMINTRIN_H_INCLUDED */
1254