/* xmmintrin.h revision 219639 */
1169689Skan/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 2169689Skan Free Software Foundation, Inc. 390075Sobrien 4132718Skan This file is part of GCC. 590075Sobrien 6132718Skan GCC is free software; you can redistribute it and/or modify 790075Sobrien it under the terms of the GNU General Public License as published by 890075Sobrien the Free Software Foundation; either version 2, or (at your option) 990075Sobrien any later version. 1090075Sobrien 11132718Skan GCC is distributed in the hope that it will be useful, 1290075Sobrien but WITHOUT ANY WARRANTY; without even the implied warranty of 1390075Sobrien MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 1490075Sobrien GNU General Public License for more details. 1590075Sobrien 1690075Sobrien You should have received a copy of the GNU General Public License 17132718Skan along with GCC; see the file COPYING. If not, write to 18169689Skan the Free Software Foundation, 51 Franklin Street, Fifth Floor, 19169689Skan Boston, MA 02110-1301, USA. */ 2090075Sobrien 2190075Sobrien/* As a special exception, if you include this header file into source 2290075Sobrien files compiled by GCC, this header file does not by itself cause 2390075Sobrien the resulting executable to be covered by the GNU General Public 2490075Sobrien License. This exception does not however invalidate any other 2590075Sobrien reasons why the executable file might be covered by the GNU General 2690075Sobrien Public License. */ 2790075Sobrien 2890075Sobrien/* Implemented from the specification included in the Intel C++ Compiler 29169689Skan User Guide and Reference, version 9.0. */ 3090075Sobrien 3190075Sobrien#ifndef _XMMINTRIN_H_INCLUDED 3290075Sobrien#define _XMMINTRIN_H_INCLUDED 3390075Sobrien 34117395Skan#ifndef __SSE__ 35117395Skan# error "SSE instruction set not enabled" 36117395Skan#else 37117395Skan 3890075Sobrien/* We need type definitions from the MMX header file. 
*/ 3990075Sobrien#include <mmintrin.h> 4090075Sobrien 41169689Skan/* Get _mm_malloc () and _mm_free (). */ 42169689Skan#include <mm_malloc.h> 4390075Sobrien 44169689Skan/* The Intel API is flexible enough that we must allow aliasing with other 45169689Skan vector types, and their scalar components. */ 46169689Skantypedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); 47169689Skan 48132718Skan/* Internal data types for implementing the intrinsics. */ 49169689Skantypedef float __v4sf __attribute__ ((__vector_size__ (16))); 5090075Sobrien 5190075Sobrien/* Create a selector for use with the SHUFPS instruction. */ 5290075Sobrien#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ 5390075Sobrien (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) 5490075Sobrien 5590075Sobrien/* Constants for use with _mm_prefetch. */ 5690075Sobrienenum _mm_hint 5790075Sobrien{ 5890075Sobrien _MM_HINT_T0 = 3, 5990075Sobrien _MM_HINT_T1 = 2, 6090075Sobrien _MM_HINT_T2 = 1, 6190075Sobrien _MM_HINT_NTA = 0 6290075Sobrien}; 6390075Sobrien 6490075Sobrien/* Bits in the MXCSR. 
*/ 6590075Sobrien#define _MM_EXCEPT_MASK 0x003f 6690075Sobrien#define _MM_EXCEPT_INVALID 0x0001 6790075Sobrien#define _MM_EXCEPT_DENORM 0x0002 6890075Sobrien#define _MM_EXCEPT_DIV_ZERO 0x0004 6990075Sobrien#define _MM_EXCEPT_OVERFLOW 0x0008 7090075Sobrien#define _MM_EXCEPT_UNDERFLOW 0x0010 7190075Sobrien#define _MM_EXCEPT_INEXACT 0x0020 7290075Sobrien 7390075Sobrien#define _MM_MASK_MASK 0x1f80 7490075Sobrien#define _MM_MASK_INVALID 0x0080 7590075Sobrien#define _MM_MASK_DENORM 0x0100 7690075Sobrien#define _MM_MASK_DIV_ZERO 0x0200 7790075Sobrien#define _MM_MASK_OVERFLOW 0x0400 7890075Sobrien#define _MM_MASK_UNDERFLOW 0x0800 7990075Sobrien#define _MM_MASK_INEXACT 0x1000 8090075Sobrien 8190075Sobrien#define _MM_ROUND_MASK 0x6000 8290075Sobrien#define _MM_ROUND_NEAREST 0x0000 8390075Sobrien#define _MM_ROUND_DOWN 0x2000 8490075Sobrien#define _MM_ROUND_UP 0x4000 8590075Sobrien#define _MM_ROUND_TOWARD_ZERO 0x6000 8690075Sobrien 8790075Sobrien#define _MM_FLUSH_ZERO_MASK 0x8000 8890075Sobrien#define _MM_FLUSH_ZERO_ON 0x8000 8990075Sobrien#define _MM_FLUSH_ZERO_OFF 0x0000 9090075Sobrien 91169689Skan/* Create a vector of zeros. */ 92169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 93169689Skan_mm_setzero_ps (void) 94169689Skan{ 95169689Skan return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f }; 96169689Skan} 97169689Skan 9890075Sobrien/* Perform the respective operation on the lower SPFP (single-precision 9990075Sobrien floating-point) values of A and B; the upper three SPFP values are 10090075Sobrien passed through from A. 
*/ 10190075Sobrien 102169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 10390075Sobrien_mm_add_ss (__m128 __A, __m128 __B) 10490075Sobrien{ 10590075Sobrien return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B); 10690075Sobrien} 10790075Sobrien 108169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 10990075Sobrien_mm_sub_ss (__m128 __A, __m128 __B) 11090075Sobrien{ 11190075Sobrien return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B); 11290075Sobrien} 11390075Sobrien 114169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 11590075Sobrien_mm_mul_ss (__m128 __A, __m128 __B) 11690075Sobrien{ 11790075Sobrien return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B); 11890075Sobrien} 11990075Sobrien 120169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 12190075Sobrien_mm_div_ss (__m128 __A, __m128 __B) 12290075Sobrien{ 12390075Sobrien return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B); 12490075Sobrien} 12590075Sobrien 126169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 12790075Sobrien_mm_sqrt_ss (__m128 __A) 12890075Sobrien{ 12990075Sobrien return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A); 13090075Sobrien} 13190075Sobrien 132169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 13390075Sobrien_mm_rcp_ss (__m128 __A) 13490075Sobrien{ 13590075Sobrien return (__m128) __builtin_ia32_rcpss ((__v4sf)__A); 13690075Sobrien} 13790075Sobrien 138169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 13990075Sobrien_mm_rsqrt_ss (__m128 __A) 14090075Sobrien{ 14190075Sobrien return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A); 14290075Sobrien} 14390075Sobrien 144169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 14590075Sobrien_mm_min_ss (__m128 __A, __m128 __B) 14690075Sobrien{ 14790075Sobrien return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B); 14890075Sobrien} 14990075Sobrien 150169689Skanstatic 
__inline __m128 __attribute__((__always_inline__)) 15190075Sobrien_mm_max_ss (__m128 __A, __m128 __B) 15290075Sobrien{ 15390075Sobrien return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B); 15490075Sobrien} 15590075Sobrien 15690075Sobrien/* Perform the respective operation on the four SPFP values in A and B. */ 15790075Sobrien 158169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 15990075Sobrien_mm_add_ps (__m128 __A, __m128 __B) 16090075Sobrien{ 16190075Sobrien return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B); 16290075Sobrien} 16390075Sobrien 164169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 16590075Sobrien_mm_sub_ps (__m128 __A, __m128 __B) 16690075Sobrien{ 16790075Sobrien return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B); 16890075Sobrien} 16990075Sobrien 170169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 17190075Sobrien_mm_mul_ps (__m128 __A, __m128 __B) 17290075Sobrien{ 17390075Sobrien return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B); 17490075Sobrien} 17590075Sobrien 176169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 17790075Sobrien_mm_div_ps (__m128 __A, __m128 __B) 17890075Sobrien{ 17990075Sobrien return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B); 18090075Sobrien} 18190075Sobrien 182169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 18390075Sobrien_mm_sqrt_ps (__m128 __A) 18490075Sobrien{ 18590075Sobrien return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A); 18690075Sobrien} 18790075Sobrien 188169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 18990075Sobrien_mm_rcp_ps (__m128 __A) 19090075Sobrien{ 19190075Sobrien return (__m128) __builtin_ia32_rcpps ((__v4sf)__A); 19290075Sobrien} 19390075Sobrien 194169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 19590075Sobrien_mm_rsqrt_ps (__m128 __A) 19690075Sobrien{ 19790075Sobrien return (__m128) __builtin_ia32_rsqrtps 
((__v4sf)__A); 19890075Sobrien} 19990075Sobrien 200169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 20190075Sobrien_mm_min_ps (__m128 __A, __m128 __B) 20290075Sobrien{ 20390075Sobrien return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B); 20490075Sobrien} 20590075Sobrien 206169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 20790075Sobrien_mm_max_ps (__m128 __A, __m128 __B) 20890075Sobrien{ 20990075Sobrien return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B); 21090075Sobrien} 21190075Sobrien 21290075Sobrien/* Perform logical bit-wise operations on 128-bit values. */ 21390075Sobrien 214169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 21590075Sobrien_mm_and_ps (__m128 __A, __m128 __B) 21690075Sobrien{ 21790075Sobrien return __builtin_ia32_andps (__A, __B); 21890075Sobrien} 21990075Sobrien 220169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 22190075Sobrien_mm_andnot_ps (__m128 __A, __m128 __B) 22290075Sobrien{ 22390075Sobrien return __builtin_ia32_andnps (__A, __B); 22490075Sobrien} 22590075Sobrien 226169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 22790075Sobrien_mm_or_ps (__m128 __A, __m128 __B) 22890075Sobrien{ 22990075Sobrien return __builtin_ia32_orps (__A, __B); 23090075Sobrien} 23190075Sobrien 232169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 23390075Sobrien_mm_xor_ps (__m128 __A, __m128 __B) 23490075Sobrien{ 23590075Sobrien return __builtin_ia32_xorps (__A, __B); 23690075Sobrien} 23790075Sobrien 23890075Sobrien/* Perform a comparison on the lower SPFP values of A and B. If the 23990075Sobrien comparison is true, place a mask of all ones in the result, otherwise a 24090075Sobrien mask of zeros. The upper three SPFP values are passed through from A. 
*/ 24190075Sobrien 242169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 24390075Sobrien_mm_cmpeq_ss (__m128 __A, __m128 __B) 24490075Sobrien{ 24590075Sobrien return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B); 24690075Sobrien} 24790075Sobrien 248169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 24990075Sobrien_mm_cmplt_ss (__m128 __A, __m128 __B) 25090075Sobrien{ 25190075Sobrien return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B); 25290075Sobrien} 25390075Sobrien 254169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 25590075Sobrien_mm_cmple_ss (__m128 __A, __m128 __B) 25690075Sobrien{ 25790075Sobrien return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B); 25890075Sobrien} 25990075Sobrien 260169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 26190075Sobrien_mm_cmpgt_ss (__m128 __A, __m128 __B) 26290075Sobrien{ 263107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 264107590Sobrien (__v4sf) 265107590Sobrien __builtin_ia32_cmpltss ((__v4sf) __B, 266107590Sobrien (__v4sf) 267107590Sobrien __A)); 26890075Sobrien} 26990075Sobrien 270169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 27190075Sobrien_mm_cmpge_ss (__m128 __A, __m128 __B) 27290075Sobrien{ 273107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 274107590Sobrien (__v4sf) 275107590Sobrien __builtin_ia32_cmpless ((__v4sf) __B, 276107590Sobrien (__v4sf) 277107590Sobrien __A)); 27890075Sobrien} 27990075Sobrien 280169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 28190075Sobrien_mm_cmpneq_ss (__m128 __A, __m128 __B) 28290075Sobrien{ 28390075Sobrien return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B); 28490075Sobrien} 28590075Sobrien 286169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 28790075Sobrien_mm_cmpnlt_ss (__m128 __A, __m128 __B) 28890075Sobrien{ 28990075Sobrien return (__m128) __builtin_ia32_cmpnltss 
((__v4sf)__A, (__v4sf)__B); 29090075Sobrien} 29190075Sobrien 292169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 29390075Sobrien_mm_cmpnle_ss (__m128 __A, __m128 __B) 29490075Sobrien{ 29590075Sobrien return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B); 29690075Sobrien} 29790075Sobrien 298169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 29990075Sobrien_mm_cmpngt_ss (__m128 __A, __m128 __B) 30090075Sobrien{ 301107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 302107590Sobrien (__v4sf) 303107590Sobrien __builtin_ia32_cmpnltss ((__v4sf) __B, 304107590Sobrien (__v4sf) 305107590Sobrien __A)); 30690075Sobrien} 30790075Sobrien 308169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 30990075Sobrien_mm_cmpnge_ss (__m128 __A, __m128 __B) 31090075Sobrien{ 311107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 312107590Sobrien (__v4sf) 313107590Sobrien __builtin_ia32_cmpnless ((__v4sf) __B, 314107590Sobrien (__v4sf) 315107590Sobrien __A)); 31690075Sobrien} 31790075Sobrien 318169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 31990075Sobrien_mm_cmpord_ss (__m128 __A, __m128 __B) 32090075Sobrien{ 32190075Sobrien return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B); 32290075Sobrien} 32390075Sobrien 324169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 32590075Sobrien_mm_cmpunord_ss (__m128 __A, __m128 __B) 32690075Sobrien{ 32790075Sobrien return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B); 32890075Sobrien} 32990075Sobrien 33090075Sobrien/* Perform a comparison on the four SPFP values of A and B. For each 33190075Sobrien element, if the comparison is true, place a mask of all ones in the 33290075Sobrien result, otherwise a mask of zeros. 
*/ 33390075Sobrien 334169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 33590075Sobrien_mm_cmpeq_ps (__m128 __A, __m128 __B) 33690075Sobrien{ 33790075Sobrien return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B); 33890075Sobrien} 33990075Sobrien 340169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 34190075Sobrien_mm_cmplt_ps (__m128 __A, __m128 __B) 34290075Sobrien{ 34390075Sobrien return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B); 34490075Sobrien} 34590075Sobrien 346169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 34790075Sobrien_mm_cmple_ps (__m128 __A, __m128 __B) 34890075Sobrien{ 34990075Sobrien return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B); 35090075Sobrien} 35190075Sobrien 352169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 35390075Sobrien_mm_cmpgt_ps (__m128 __A, __m128 __B) 35490075Sobrien{ 35590075Sobrien return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B); 35690075Sobrien} 35790075Sobrien 358169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 35990075Sobrien_mm_cmpge_ps (__m128 __A, __m128 __B) 36090075Sobrien{ 36190075Sobrien return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B); 36290075Sobrien} 36390075Sobrien 364169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 36590075Sobrien_mm_cmpneq_ps (__m128 __A, __m128 __B) 36690075Sobrien{ 36790075Sobrien return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B); 36890075Sobrien} 36990075Sobrien 370169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 37190075Sobrien_mm_cmpnlt_ps (__m128 __A, __m128 __B) 37290075Sobrien{ 37390075Sobrien return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B); 37490075Sobrien} 37590075Sobrien 376169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 37790075Sobrien_mm_cmpnle_ps (__m128 __A, __m128 __B) 37890075Sobrien{ 37990075Sobrien return (__m128) 
__builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B); 38090075Sobrien} 38190075Sobrien 382169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 38390075Sobrien_mm_cmpngt_ps (__m128 __A, __m128 __B) 38490075Sobrien{ 38590075Sobrien return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B); 38690075Sobrien} 38790075Sobrien 388169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 38990075Sobrien_mm_cmpnge_ps (__m128 __A, __m128 __B) 39090075Sobrien{ 39190075Sobrien return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B); 39290075Sobrien} 39390075Sobrien 394169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 39590075Sobrien_mm_cmpord_ps (__m128 __A, __m128 __B) 39690075Sobrien{ 39790075Sobrien return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B); 39890075Sobrien} 39990075Sobrien 400169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 40190075Sobrien_mm_cmpunord_ps (__m128 __A, __m128 __B) 40290075Sobrien{ 40390075Sobrien return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B); 40490075Sobrien} 40590075Sobrien 40690075Sobrien/* Compare the lower SPFP values of A and B and return 1 if true 40790075Sobrien and 0 if false. 
*/ 40890075Sobrien 409169689Skanstatic __inline int __attribute__((__always_inline__)) 41090075Sobrien_mm_comieq_ss (__m128 __A, __m128 __B) 41190075Sobrien{ 41290075Sobrien return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B); 41390075Sobrien} 41490075Sobrien 415169689Skanstatic __inline int __attribute__((__always_inline__)) 41690075Sobrien_mm_comilt_ss (__m128 __A, __m128 __B) 41790075Sobrien{ 41890075Sobrien return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B); 41990075Sobrien} 42090075Sobrien 421169689Skanstatic __inline int __attribute__((__always_inline__)) 42290075Sobrien_mm_comile_ss (__m128 __A, __m128 __B) 42390075Sobrien{ 42490075Sobrien return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B); 42590075Sobrien} 42690075Sobrien 427169689Skanstatic __inline int __attribute__((__always_inline__)) 42890075Sobrien_mm_comigt_ss (__m128 __A, __m128 __B) 42990075Sobrien{ 43090075Sobrien return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B); 43190075Sobrien} 43290075Sobrien 433169689Skanstatic __inline int __attribute__((__always_inline__)) 43490075Sobrien_mm_comige_ss (__m128 __A, __m128 __B) 43590075Sobrien{ 43690075Sobrien return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B); 43790075Sobrien} 43890075Sobrien 439169689Skanstatic __inline int __attribute__((__always_inline__)) 44090075Sobrien_mm_comineq_ss (__m128 __A, __m128 __B) 44190075Sobrien{ 44290075Sobrien return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B); 44390075Sobrien} 44490075Sobrien 445169689Skanstatic __inline int __attribute__((__always_inline__)) 44690075Sobrien_mm_ucomieq_ss (__m128 __A, __m128 __B) 44790075Sobrien{ 44890075Sobrien return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B); 44990075Sobrien} 45090075Sobrien 451169689Skanstatic __inline int __attribute__((__always_inline__)) 45290075Sobrien_mm_ucomilt_ss (__m128 __A, __m128 __B) 45390075Sobrien{ 45490075Sobrien return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B); 45590075Sobrien} 45690075Sobrien 
457169689Skanstatic __inline int __attribute__((__always_inline__)) 45890075Sobrien_mm_ucomile_ss (__m128 __A, __m128 __B) 45990075Sobrien{ 46090075Sobrien return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B); 46190075Sobrien} 46290075Sobrien 463169689Skanstatic __inline int __attribute__((__always_inline__)) 46490075Sobrien_mm_ucomigt_ss (__m128 __A, __m128 __B) 46590075Sobrien{ 46690075Sobrien return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B); 46790075Sobrien} 46890075Sobrien 469169689Skanstatic __inline int __attribute__((__always_inline__)) 47090075Sobrien_mm_ucomige_ss (__m128 __A, __m128 __B) 47190075Sobrien{ 47290075Sobrien return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B); 47390075Sobrien} 47490075Sobrien 475169689Skanstatic __inline int __attribute__((__always_inline__)) 47690075Sobrien_mm_ucomineq_ss (__m128 __A, __m128 __B) 47790075Sobrien{ 47890075Sobrien return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B); 47990075Sobrien} 48090075Sobrien 48190075Sobrien/* Convert the lower SPFP value to a 32-bit integer according to the current 48290075Sobrien rounding mode. */ 483169689Skanstatic __inline int __attribute__((__always_inline__)) 48490075Sobrien_mm_cvtss_si32 (__m128 __A) 48590075Sobrien{ 48690075Sobrien return __builtin_ia32_cvtss2si ((__v4sf) __A); 48790075Sobrien} 48890075Sobrien 489169689Skanstatic __inline int __attribute__((__always_inline__)) 490122180Skan_mm_cvt_ss2si (__m128 __A) 491122180Skan{ 492122180Skan return _mm_cvtss_si32 (__A); 493122180Skan} 494122180Skan 495117395Skan#ifdef __x86_64__ 496169689Skan/* Convert the lower SPFP value to a 32-bit integer according to the 497169689Skan current rounding mode. */ 498169689Skan 499169689Skan/* Intel intrinsic. */ 500169689Skanstatic __inline long long __attribute__((__always_inline__)) 501169689Skan_mm_cvtss_si64 (__m128 __A) 502169689Skan{ 503169689Skan return __builtin_ia32_cvtss2si64 ((__v4sf) __A); 504169689Skan} 505169689Skan 506169689Skan/* Microsoft intrinsic. 
*/ 507169689Skanstatic __inline long long __attribute__((__always_inline__)) 508117395Skan_mm_cvtss_si64x (__m128 __A) 509117395Skan{ 510117395Skan return __builtin_ia32_cvtss2si64 ((__v4sf) __A); 511117395Skan} 512117395Skan#endif 513117395Skan 51490075Sobrien/* Convert the two lower SPFP values to 32-bit integers according to the 51590075Sobrien current rounding mode. Return the integers in packed form. */ 516169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 51790075Sobrien_mm_cvtps_pi32 (__m128 __A) 51890075Sobrien{ 51990075Sobrien return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A); 52090075Sobrien} 52190075Sobrien 522169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 523122180Skan_mm_cvt_ps2pi (__m128 __A) 524122180Skan{ 525122180Skan return _mm_cvtps_pi32 (__A); 526122180Skan} 527122180Skan 52890075Sobrien/* Truncate the lower SPFP value to a 32-bit integer. */ 529169689Skanstatic __inline int __attribute__((__always_inline__)) 53090075Sobrien_mm_cvttss_si32 (__m128 __A) 53190075Sobrien{ 53290075Sobrien return __builtin_ia32_cvttss2si ((__v4sf) __A); 53390075Sobrien} 53490075Sobrien 535169689Skanstatic __inline int __attribute__((__always_inline__)) 536122180Skan_mm_cvtt_ss2si (__m128 __A) 537122180Skan{ 538122180Skan return _mm_cvttss_si32 (__A); 539122180Skan} 540122180Skan 541117395Skan#ifdef __x86_64__ 542117395Skan/* Truncate the lower SPFP value to a 32-bit integer. */ 543169689Skan 544169689Skan/* Intel intrinsic. */ 545169689Skanstatic __inline long long __attribute__((__always_inline__)) 546169689Skan_mm_cvttss_si64 (__m128 __A) 547169689Skan{ 548169689Skan return __builtin_ia32_cvttss2si64 ((__v4sf) __A); 549169689Skan} 550169689Skan 551169689Skan/* Microsoft intrinsic. 
*/ 552169689Skanstatic __inline long long __attribute__((__always_inline__)) 553117395Skan_mm_cvttss_si64x (__m128 __A) 554117395Skan{ 555117395Skan return __builtin_ia32_cvttss2si64 ((__v4sf) __A); 556117395Skan} 557117395Skan#endif 558117395Skan 55990075Sobrien/* Truncate the two lower SPFP values to 32-bit integers. Return the 56090075Sobrien integers in packed form. */ 561169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 56290075Sobrien_mm_cvttps_pi32 (__m128 __A) 56390075Sobrien{ 56490075Sobrien return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A); 56590075Sobrien} 56690075Sobrien 567169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 568122180Skan_mm_cvtt_ps2pi (__m128 __A) 569122180Skan{ 570122180Skan return _mm_cvttps_pi32 (__A); 571122180Skan} 572122180Skan 57390075Sobrien/* Convert B to a SPFP value and insert it as element zero in A. */ 574169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 57590075Sobrien_mm_cvtsi32_ss (__m128 __A, int __B) 57690075Sobrien{ 57790075Sobrien return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B); 57890075Sobrien} 57990075Sobrien 580169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 581122180Skan_mm_cvt_si2ss (__m128 __A, int __B) 582122180Skan{ 583122180Skan return _mm_cvtsi32_ss (__A, __B); 584122180Skan} 585122180Skan 586117395Skan#ifdef __x86_64__ 587117395Skan/* Convert B to a SPFP value and insert it as element zero in A. */ 588169689Skan 589169689Skan/* Intel intrinsic. */ 590169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 591169689Skan_mm_cvtsi64_ss (__m128 __A, long long __B) 592169689Skan{ 593169689Skan return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B); 594169689Skan} 595169689Skan 596169689Skan/* Microsoft intrinsic. 
*/ 597169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 598117395Skan_mm_cvtsi64x_ss (__m128 __A, long long __B) 599117395Skan{ 600117395Skan return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B); 601117395Skan} 602117395Skan#endif 603117395Skan 60490075Sobrien/* Convert the two 32-bit values in B to SPFP form and insert them 60590075Sobrien as the two lower elements in A. */ 606169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 60790075Sobrien_mm_cvtpi32_ps (__m128 __A, __m64 __B) 60890075Sobrien{ 60990075Sobrien return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B); 61090075Sobrien} 61190075Sobrien 612169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 613122180Skan_mm_cvt_pi2ps (__m128 __A, __m64 __B) 614122180Skan{ 615122180Skan return _mm_cvtpi32_ps (__A, __B); 616122180Skan} 617122180Skan 61890075Sobrien/* Convert the four signed 16-bit values in A to SPFP form. */ 619169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 62090075Sobrien_mm_cvtpi16_ps (__m64 __A) 62190075Sobrien{ 62290075Sobrien __v4hi __sign; 62390075Sobrien __v2si __hisi, __losi; 62490075Sobrien __v4sf __r; 62590075Sobrien 62690075Sobrien /* This comparison against zero gives us a mask that can be used to 62790075Sobrien fill in the missing sign bits in the unpack operations below, so 62890075Sobrien that we get signed values after unpacking. */ 629169689Skan __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A); 63090075Sobrien 63190075Sobrien /* Convert the four words to doublewords. */ 63290075Sobrien __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign); 63390075Sobrien __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign); 63490075Sobrien 63590075Sobrien /* Convert the doublewords to floating point two at a time. 
*/ 636169689Skan __r = (__v4sf) _mm_setzero_ps (); 63790075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __hisi); 63890075Sobrien __r = __builtin_ia32_movlhps (__r, __r); 63990075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __losi); 64090075Sobrien 64190075Sobrien return (__m128) __r; 64290075Sobrien} 64390075Sobrien 64490075Sobrien/* Convert the four unsigned 16-bit values in A to SPFP form. */ 645169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 64690075Sobrien_mm_cvtpu16_ps (__m64 __A) 64790075Sobrien{ 64890075Sobrien __v2si __hisi, __losi; 64990075Sobrien __v4sf __r; 65090075Sobrien 65190075Sobrien /* Convert the four words to doublewords. */ 652169689Skan __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL); 653169689Skan __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL); 65490075Sobrien 65590075Sobrien /* Convert the doublewords to floating point two at a time. */ 656169689Skan __r = (__v4sf) _mm_setzero_ps (); 65790075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __hisi); 65890075Sobrien __r = __builtin_ia32_movlhps (__r, __r); 65990075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __losi); 66090075Sobrien 66190075Sobrien return (__m128) __r; 66290075Sobrien} 66390075Sobrien 66490075Sobrien/* Convert the low four signed 8-bit values in A to SPFP form. */ 665169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 66690075Sobrien_mm_cvtpi8_ps (__m64 __A) 66790075Sobrien{ 66890075Sobrien __v8qi __sign; 66990075Sobrien 67090075Sobrien /* This comparison against zero gives us a mask that can be used to 67190075Sobrien fill in the missing sign bits in the unpack operations below, so 67290075Sobrien that we get signed values after unpacking. */ 673169689Skan __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A); 67490075Sobrien 67590075Sobrien /* Convert the four low bytes to words. 
*/ 67690075Sobrien __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign); 67790075Sobrien 67890075Sobrien return _mm_cvtpi16_ps(__A); 67990075Sobrien} 68090075Sobrien 68190075Sobrien/* Convert the low four unsigned 8-bit values in A to SPFP form. */ 682169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 68390075Sobrien_mm_cvtpu8_ps(__m64 __A) 68490075Sobrien{ 685169689Skan __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL); 68690075Sobrien return _mm_cvtpu16_ps(__A); 68790075Sobrien} 68890075Sobrien 68990075Sobrien/* Convert the four signed 32-bit values in A and B to SPFP form. */ 690169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 69190075Sobrien_mm_cvtpi32x2_ps(__m64 __A, __m64 __B) 69290075Sobrien{ 693169689Skan __v4sf __zero = (__v4sf) _mm_setzero_ps (); 69490075Sobrien __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A); 69590075Sobrien __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B); 69690075Sobrien return (__m128) __builtin_ia32_movlhps (__sfa, __sfb); 69790075Sobrien} 69890075Sobrien 69990075Sobrien/* Convert the four SPFP values in A to four signed 16-bit integers. */ 700169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 70190075Sobrien_mm_cvtps_pi16(__m128 __A) 70290075Sobrien{ 70390075Sobrien __v4sf __hisf = (__v4sf)__A; 70490075Sobrien __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf); 70590075Sobrien __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf); 70690075Sobrien __v2si __losi = __builtin_ia32_cvtps2pi (__losf); 707117395Skan return (__m64) __builtin_ia32_packssdw (__hisi, __losi); 70890075Sobrien} 70990075Sobrien 71090075Sobrien/* Convert the four SPFP values in A to four signed 8-bit integers. 
*/ 711169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 71290075Sobrien_mm_cvtps_pi8(__m128 __A) 71390075Sobrien{ 71490075Sobrien __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A); 715169689Skan return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL); 71690075Sobrien} 71790075Sobrien 71890075Sobrien/* Selects four specific SPFP values from A and B based on MASK. */ 71990075Sobrien#if 0 720169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 72190075Sobrien_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask) 72290075Sobrien{ 72390075Sobrien return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask); 72490075Sobrien} 72590075Sobrien#else 72690075Sobrien#define _mm_shuffle_ps(A, B, MASK) \ 72790075Sobrien ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK))) 72890075Sobrien#endif 72990075Sobrien 73090075Sobrien 73190075Sobrien/* Selects and interleaves the upper two SPFP values from A and B. */ 732169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 73390075Sobrien_mm_unpackhi_ps (__m128 __A, __m128 __B) 73490075Sobrien{ 73590075Sobrien return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B); 73690075Sobrien} 73790075Sobrien 73890075Sobrien/* Selects and interleaves the lower two SPFP values from A and B. */ 739169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 74090075Sobrien_mm_unpacklo_ps (__m128 __A, __m128 __B) 74190075Sobrien{ 74290075Sobrien return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B); 74390075Sobrien} 74490075Sobrien 74590075Sobrien/* Sets the upper two SPFP values with 64-bits of data loaded from P; 74690075Sobrien the lower two values are passed through from A. 
*/ 747169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 748117395Skan_mm_loadh_pi (__m128 __A, __m64 const *__P) 74990075Sobrien{ 75090075Sobrien return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P); 75190075Sobrien} 75290075Sobrien 75390075Sobrien/* Stores the upper two SPFP values of A into P. */ 754169689Skanstatic __inline void __attribute__((__always_inline__)) 75590075Sobrien_mm_storeh_pi (__m64 *__P, __m128 __A) 75690075Sobrien{ 75790075Sobrien __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A); 75890075Sobrien} 75990075Sobrien 76090075Sobrien/* Moves the upper two values of B into the lower two values of A. */ 761169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 76290075Sobrien_mm_movehl_ps (__m128 __A, __m128 __B) 76390075Sobrien{ 76490075Sobrien return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B); 76590075Sobrien} 76690075Sobrien 76790075Sobrien/* Moves the lower two values of B into the upper two values of A. */ 768169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 76990075Sobrien_mm_movelh_ps (__m128 __A, __m128 __B) 77090075Sobrien{ 77190075Sobrien return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B); 77290075Sobrien} 77390075Sobrien 77490075Sobrien/* Sets the lower two SPFP values with 64-bits of data loaded from P; 77590075Sobrien the upper two values are passed through from A. */ 776169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 777117395Skan_mm_loadl_pi (__m128 __A, __m64 const *__P) 77890075Sobrien{ 77990075Sobrien return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P); 78090075Sobrien} 78190075Sobrien 78290075Sobrien/* Stores the lower two SPFP values of A into P. 
*/ 783169689Skanstatic __inline void __attribute__((__always_inline__)) 78490075Sobrien_mm_storel_pi (__m64 *__P, __m128 __A) 78590075Sobrien{ 78690075Sobrien __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A); 78790075Sobrien} 78890075Sobrien 78990075Sobrien/* Creates a 4-bit mask from the most significant bits of the SPFP values. */ 790169689Skanstatic __inline int __attribute__((__always_inline__)) 79190075Sobrien_mm_movemask_ps (__m128 __A) 79290075Sobrien{ 79390075Sobrien return __builtin_ia32_movmskps ((__v4sf)__A); 79490075Sobrien} 79590075Sobrien 79690075Sobrien/* Return the contents of the control register. */ 797169689Skanstatic __inline unsigned int __attribute__((__always_inline__)) 79890075Sobrien_mm_getcsr (void) 79990075Sobrien{ 80090075Sobrien return __builtin_ia32_stmxcsr (); 80190075Sobrien} 80290075Sobrien 80390075Sobrien/* Read exception bits from the control register. */ 804169689Skanstatic __inline unsigned int __attribute__((__always_inline__)) 80590075Sobrien_MM_GET_EXCEPTION_STATE (void) 80690075Sobrien{ 80790075Sobrien return _mm_getcsr() & _MM_EXCEPT_MASK; 80890075Sobrien} 80990075Sobrien 810169689Skanstatic __inline unsigned int __attribute__((__always_inline__)) 81190075Sobrien_MM_GET_EXCEPTION_MASK (void) 81290075Sobrien{ 81390075Sobrien return _mm_getcsr() & _MM_MASK_MASK; 81490075Sobrien} 81590075Sobrien 816169689Skanstatic __inline unsigned int __attribute__((__always_inline__)) 81790075Sobrien_MM_GET_ROUNDING_MODE (void) 81890075Sobrien{ 81990075Sobrien return _mm_getcsr() & _MM_ROUND_MASK; 82090075Sobrien} 82190075Sobrien 822169689Skanstatic __inline unsigned int __attribute__((__always_inline__)) 82390075Sobrien_MM_GET_FLUSH_ZERO_MODE (void) 82490075Sobrien{ 82590075Sobrien return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; 82690075Sobrien} 82790075Sobrien 82890075Sobrien/* Set the control register to I. 
*/ 829169689Skanstatic __inline void __attribute__((__always_inline__)) 83090075Sobrien_mm_setcsr (unsigned int __I) 83190075Sobrien{ 83290075Sobrien __builtin_ia32_ldmxcsr (__I); 83390075Sobrien} 83490075Sobrien 83590075Sobrien/* Set exception bits in the control register. */ 836169689Skanstatic __inline void __attribute__((__always_inline__)) 83790075Sobrien_MM_SET_EXCEPTION_STATE(unsigned int __mask) 83890075Sobrien{ 83990075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask); 84090075Sobrien} 84190075Sobrien 842169689Skanstatic __inline void __attribute__((__always_inline__)) 84390075Sobrien_MM_SET_EXCEPTION_MASK (unsigned int __mask) 84490075Sobrien{ 84590075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask); 84690075Sobrien} 84790075Sobrien 848169689Skanstatic __inline void __attribute__((__always_inline__)) 84990075Sobrien_MM_SET_ROUNDING_MODE (unsigned int __mode) 85090075Sobrien{ 85190075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode); 85290075Sobrien} 85390075Sobrien 854169689Skanstatic __inline void __attribute__((__always_inline__)) 85590075Sobrien_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode) 85690075Sobrien{ 85790075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode); 85890075Sobrien} 85990075Sobrien 860169689Skan/* Create a vector with element 0 as F and the rest zero. */ 861169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 862169689Skan_mm_set_ss (float __F) 863169689Skan{ 864169689Skan return __extension__ (__m128)(__v4sf){ __F, 0, 0, 0 }; 865169689Skan} 866169689Skan 867169689Skan/* Create a vector with all four elements equal to F. 
*/ 868169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 869169689Skan_mm_set1_ps (float __F) 870169689Skan{ 871169689Skan return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F }; 872169689Skan} 873169689Skan 874169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 875169689Skan_mm_set_ps1 (float __F) 876169689Skan{ 877169689Skan return _mm_set1_ps (__F); 878169689Skan} 879169689Skan 88090075Sobrien/* Create a vector with element 0 as *P and the rest zero. */ 881169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 882117395Skan_mm_load_ss (float const *__P) 88390075Sobrien{ 884169689Skan return _mm_set_ss (*__P); 88590075Sobrien} 88690075Sobrien 88790075Sobrien/* Create a vector with all four elements equal to *P. */ 888169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 889117395Skan_mm_load1_ps (float const *__P) 89090075Sobrien{ 891169689Skan return _mm_set1_ps (*__P); 89290075Sobrien} 89390075Sobrien 894169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 895117395Skan_mm_load_ps1 (float const *__P) 89690075Sobrien{ 89790075Sobrien return _mm_load1_ps (__P); 89890075Sobrien} 89990075Sobrien 90090075Sobrien/* Load four SPFP values from P. The address must be 16-byte aligned. */ 901169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 902117395Skan_mm_load_ps (float const *__P) 90390075Sobrien{ 904169689Skan return (__m128) *(__v4sf *)__P; 90590075Sobrien} 90690075Sobrien 90790075Sobrien/* Load four SPFP values from P. The address need not be 16-byte aligned. */ 908169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 909117395Skan_mm_loadu_ps (float const *__P) 91090075Sobrien{ 91190075Sobrien return (__m128) __builtin_ia32_loadups (__P); 91290075Sobrien} 91390075Sobrien 91490075Sobrien/* Load four SPFP values in reverse order. The address must be aligned. 
*/ 915169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 916117395Skan_mm_loadr_ps (float const *__P) 91790075Sobrien{ 918169689Skan __v4sf __tmp = *(__v4sf *)__P; 91990075Sobrien return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3)); 92090075Sobrien} 92190075Sobrien 922169689Skan/* Create the vector [Z Y X W]. */ 923169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 924169689Skan_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W) 92590075Sobrien{ 926169689Skan return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z }; 92790075Sobrien} 92890075Sobrien 929169689Skan/* Create the vector [W X Y Z]. */ 930169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 931169689Skan_mm_setr_ps (float __Z, float __Y, float __X, float __W) 93290075Sobrien{ 933169689Skan return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W }; 93490075Sobrien} 93590075Sobrien 936169689Skan/* Stores the lower SPFP value. */ 937169689Skanstatic __inline void __attribute__((__always_inline__)) 938169689Skan_mm_store_ss (float *__P, __m128 __A) 93990075Sobrien{ 940169689Skan *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0); 94190075Sobrien} 94290075Sobrien 943169689Skanstatic __inline float __attribute__((__always_inline__)) 944169689Skan_mm_cvtss_f32 (__m128 __A) 94590075Sobrien{ 946169689Skan return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0); 94790075Sobrien} 94890075Sobrien 949169689Skan/* Store four SPFP values. The address must be 16-byte aligned. */ 950169689Skanstatic __inline void __attribute__((__always_inline__)) 951169689Skan_mm_store_ps (float *__P, __m128 __A) 95290075Sobrien{ 953169689Skan *(__v4sf *)__P = (__v4sf)__A; 95490075Sobrien} 95590075Sobrien 956169689Skan/* Store four SPFP values. The address need not be 16-byte aligned. 
*/ 957169689Skanstatic __inline void __attribute__((__always_inline__)) 958169689Skan_mm_storeu_ps (float *__P, __m128 __A) 95990075Sobrien{ 960169689Skan __builtin_ia32_storeups (__P, (__v4sf)__A); 96190075Sobrien} 96290075Sobrien 96390075Sobrien/* Store the lower SPFP value across four words. */ 964169689Skanstatic __inline void __attribute__((__always_inline__)) 96590075Sobrien_mm_store1_ps (float *__P, __m128 __A) 96690075Sobrien{ 96790075Sobrien __v4sf __va = (__v4sf)__A; 96890075Sobrien __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0)); 969169689Skan _mm_storeu_ps (__P, __tmp); 97090075Sobrien} 97190075Sobrien 972169689Skanstatic __inline void __attribute__((__always_inline__)) 97390075Sobrien_mm_store_ps1 (float *__P, __m128 __A) 97490075Sobrien{ 97590075Sobrien _mm_store1_ps (__P, __A); 97690075Sobrien} 97790075Sobrien 978117395Skan/* Store four SPFP values in reverse order. The address must be aligned. */ 979169689Skanstatic __inline void __attribute__((__always_inline__)) 98090075Sobrien_mm_storer_ps (float *__P, __m128 __A) 98190075Sobrien{ 98290075Sobrien __v4sf __va = (__v4sf)__A; 98390075Sobrien __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3)); 984169689Skan _mm_store_ps (__P, __tmp); 98590075Sobrien} 98690075Sobrien 98790075Sobrien/* Sets the low SPFP value of A from the low value of B. */ 988169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 98990075Sobrien_mm_move_ss (__m128 __A, __m128 __B) 99090075Sobrien{ 99190075Sobrien return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B); 99290075Sobrien} 99390075Sobrien 99490075Sobrien/* Extracts one of the four words of A. The selector N must be immediate. 
*/ 99590075Sobrien#if 0 996169689Skanstatic __inline int __attribute__((__always_inline__)) 997169689Skan_mm_extract_pi16 (__m64 const __A, int const __N) 99890075Sobrien{ 999169689Skan return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N); 100090075Sobrien} 1001122180Skan 1002169689Skanstatic __inline int __attribute__((__always_inline__)) 1003169689Skan_m_pextrw (__m64 const __A, int const __N) 1004122180Skan{ 1005122180Skan return _mm_extract_pi16 (__A, __N); 1006122180Skan} 100790075Sobrien#else 1008169689Skan#define _mm_extract_pi16(A, N) __builtin_ia32_vec_ext_v4hi ((__v4hi)(A), (N)) 1009122180Skan#define _m_pextrw(A, N) _mm_extract_pi16((A), (N)) 101090075Sobrien#endif 101190075Sobrien 101290075Sobrien/* Inserts word D into one of four words of A. The selector N must be 101390075Sobrien immediate. */ 101490075Sobrien#if 0 1015169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1016169689Skan_mm_insert_pi16 (__m64 const __A, int const __D, int const __N) 101790075Sobrien{ 1018169689Skan return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N); 101990075Sobrien} 1020122180Skan 1021169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1022169689Skan_m_pinsrw (__m64 const __A, int const __D, int const __N) 1023122180Skan{ 1024122180Skan return _mm_insert_pi16 (__A, __D, __N); 1025122180Skan} 102690075Sobrien#else 102790075Sobrien#define _mm_insert_pi16(A, D, N) \ 1028169689Skan ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(A), (D), (N))) 1029122180Skan#define _m_pinsrw(A, D, N) _mm_insert_pi16((A), (D), (N)) 103090075Sobrien#endif 103190075Sobrien 103290075Sobrien/* Compute the element-wise maximum of signed 16-bit values. 
*/ 1033169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 103490075Sobrien_mm_max_pi16 (__m64 __A, __m64 __B) 103590075Sobrien{ 103690075Sobrien return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B); 103790075Sobrien} 103890075Sobrien 1039169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1040122180Skan_m_pmaxsw (__m64 __A, __m64 __B) 1041122180Skan{ 1042122180Skan return _mm_max_pi16 (__A, __B); 1043122180Skan} 1044122180Skan 104590075Sobrien/* Compute the element-wise maximum of unsigned 8-bit values. */ 1046169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 104790075Sobrien_mm_max_pu8 (__m64 __A, __m64 __B) 104890075Sobrien{ 104990075Sobrien return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B); 105090075Sobrien} 105190075Sobrien 1052169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1053122180Skan_m_pmaxub (__m64 __A, __m64 __B) 1054122180Skan{ 1055122180Skan return _mm_max_pu8 (__A, __B); 1056122180Skan} 1057122180Skan 105890075Sobrien/* Compute the element-wise minimum of signed 16-bit values. */ 1059169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 106090075Sobrien_mm_min_pi16 (__m64 __A, __m64 __B) 106190075Sobrien{ 106290075Sobrien return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B); 106390075Sobrien} 106490075Sobrien 1065169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1066122180Skan_m_pminsw (__m64 __A, __m64 __B) 1067122180Skan{ 1068122180Skan return _mm_min_pi16 (__A, __B); 1069122180Skan} 1070122180Skan 107190075Sobrien/* Compute the element-wise minimum of unsigned 8-bit values. 
*/ 1072169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 107390075Sobrien_mm_min_pu8 (__m64 __A, __m64 __B) 107490075Sobrien{ 107590075Sobrien return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B); 107690075Sobrien} 107790075Sobrien 1078169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1079122180Skan_m_pminub (__m64 __A, __m64 __B) 1080122180Skan{ 1081122180Skan return _mm_min_pu8 (__A, __B); 1082122180Skan} 1083122180Skan 108490075Sobrien/* Create an 8-bit mask of the signs of 8-bit values. */ 1085169689Skanstatic __inline int __attribute__((__always_inline__)) 108690075Sobrien_mm_movemask_pi8 (__m64 __A) 108790075Sobrien{ 108890075Sobrien return __builtin_ia32_pmovmskb ((__v8qi)__A); 108990075Sobrien} 109090075Sobrien 1091169689Skanstatic __inline int __attribute__((__always_inline__)) 1092122180Skan_m_pmovmskb (__m64 __A) 1093122180Skan{ 1094122180Skan return _mm_movemask_pi8 (__A); 1095122180Skan} 1096122180Skan 109790075Sobrien/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values 109890075Sobrien in B and produce the high 16 bits of the 32-bit results. */ 1099169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 110090075Sobrien_mm_mulhi_pu16 (__m64 __A, __m64 __B) 110190075Sobrien{ 110290075Sobrien return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B); 110390075Sobrien} 110490075Sobrien 1105169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1106122180Skan_m_pmulhuw (__m64 __A, __m64 __B) 1107122180Skan{ 1108122180Skan return _mm_mulhi_pu16 (__A, __B); 1109122180Skan} 1110122180Skan 111190075Sobrien/* Return a combination of the four 16-bit values in A. The selector 111290075Sobrien must be an immediate. 
*/ 111390075Sobrien#if 0 1114169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 111590075Sobrien_mm_shuffle_pi16 (__m64 __A, int __N) 111690075Sobrien{ 111790075Sobrien return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N); 111890075Sobrien} 1119122180Skan 1120169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1121122180Skan_m_pshufw (__m64 __A, int __N) 1122122180Skan{ 1123122180Skan return _mm_shuffle_pi16 (__A, __N); 1124122180Skan} 112590075Sobrien#else 112690075Sobrien#define _mm_shuffle_pi16(A, N) \ 112790075Sobrien ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N))) 1128122180Skan#define _m_pshufw(A, N) _mm_shuffle_pi16 ((A), (N)) 112990075Sobrien#endif 113090075Sobrien 113190075Sobrien/* Conditionally store byte elements of A into P. The high bit of each 113290075Sobrien byte in the selector N determines whether the corresponding byte from 113390075Sobrien A is stored. */ 1134169689Skanstatic __inline void __attribute__((__always_inline__)) 113590075Sobrien_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P) 113690075Sobrien{ 113790075Sobrien __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P); 113890075Sobrien} 113990075Sobrien 1140169689Skanstatic __inline void __attribute__((__always_inline__)) 1141122180Skan_m_maskmovq (__m64 __A, __m64 __N, char *__P) 1142122180Skan{ 1143122180Skan _mm_maskmove_si64 (__A, __N, __P); 1144122180Skan} 1145122180Skan 114690075Sobrien/* Compute the rounded averages of the unsigned 8-bit values in A and B. 
*/ 1147169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 114890075Sobrien_mm_avg_pu8 (__m64 __A, __m64 __B) 114990075Sobrien{ 115090075Sobrien return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B); 115190075Sobrien} 115290075Sobrien 1153169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1154122180Skan_m_pavgb (__m64 __A, __m64 __B) 1155122180Skan{ 1156122180Skan return _mm_avg_pu8 (__A, __B); 1157122180Skan} 1158122180Skan 115990075Sobrien/* Compute the rounded averages of the unsigned 16-bit values in A and B. */ 1160169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 116190075Sobrien_mm_avg_pu16 (__m64 __A, __m64 __B) 116290075Sobrien{ 116390075Sobrien return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B); 116490075Sobrien} 116590075Sobrien 1166169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1167122180Skan_m_pavgw (__m64 __A, __m64 __B) 1168122180Skan{ 1169122180Skan return _mm_avg_pu16 (__A, __B); 1170122180Skan} 1171122180Skan 117290075Sobrien/* Compute the sum of the absolute differences of the unsigned 8-bit 117390075Sobrien values in A and B. Return the value in the lower 16-bit word; the 117490075Sobrien upper words are cleared. */ 1175169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 117690075Sobrien_mm_sad_pu8 (__m64 __A, __m64 __B) 117790075Sobrien{ 117890075Sobrien return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B); 117990075Sobrien} 118090075Sobrien 1181169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1182122180Skan_m_psadbw (__m64 __A, __m64 __B) 1183122180Skan{ 1184122180Skan return _mm_sad_pu8 (__A, __B); 1185122180Skan} 1186122180Skan 118790075Sobrien/* Loads one cache line from address P to a location "closer" to the 118890075Sobrien processor. The selector I specifies the type of prefetch operation. 
*/ 118990075Sobrien#if 0 1190169689Skanstatic __inline void __attribute__((__always_inline__)) 119190075Sobrien_mm_prefetch (void *__P, enum _mm_hint __I) 119290075Sobrien{ 119390075Sobrien __builtin_prefetch (__P, 0, __I); 119490075Sobrien} 119590075Sobrien#else 119690075Sobrien#define _mm_prefetch(P, I) \ 119790075Sobrien __builtin_prefetch ((P), 0, (I)) 119890075Sobrien#endif 119990075Sobrien 120090075Sobrien/* Stores the data in A to the address P without polluting the caches. */ 1201169689Skanstatic __inline void __attribute__((__always_inline__)) 120290075Sobrien_mm_stream_pi (__m64 *__P, __m64 __A) 120390075Sobrien{ 1204117395Skan __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A); 120590075Sobrien} 120690075Sobrien 120790075Sobrien/* Likewise. The address must be 16-byte aligned. */ 1208169689Skanstatic __inline void __attribute__((__always_inline__)) 120990075Sobrien_mm_stream_ps (float *__P, __m128 __A) 121090075Sobrien{ 121190075Sobrien __builtin_ia32_movntps (__P, (__v4sf)__A); 121290075Sobrien} 121390075Sobrien 1214132718Skan/* Guarantees that every preceding store is globally visible before 121590075Sobrien any subsequent store. */ 1216169689Skanstatic __inline void __attribute__((__always_inline__)) 121790075Sobrien_mm_sfence (void) 121890075Sobrien{ 121990075Sobrien __builtin_ia32_sfence (); 122090075Sobrien} 122190075Sobrien 122290075Sobrien/* The execution of the next instruction is delayed by an implementation 122390075Sobrien specific amount of time. The instruction does not modify the 122490075Sobrien architectural state. */ 1225169689Skanstatic __inline void __attribute__((__always_inline__)) 122690075Sobrien_mm_pause (void) 122790075Sobrien{ 122890075Sobrien __asm__ __volatile__ ("rep; nop" : : ); 122990075Sobrien} 123090075Sobrien 123190075Sobrien/* Transpose the 4x4 matrix composed of row[0-3]. 
*/ 123290075Sobrien#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 123390075Sobriendo { \ 123490075Sobrien __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ 1235169689Skan __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1); \ 1236169689Skan __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3); \ 1237169689Skan __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1); \ 1238169689Skan __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3); \ 1239169689Skan (row0) = __builtin_ia32_movlhps (__t0, __t1); \ 1240169689Skan (row1) = __builtin_ia32_movhlps (__t1, __t0); \ 1241169689Skan (row2) = __builtin_ia32_movlhps (__t2, __t3); \ 1242169689Skan (row3) = __builtin_ia32_movhlps (__t3, __t2); \ 124390075Sobrien} while (0) 124490075Sobrien 1245122180Skan/* For backward source compatibility. */ 1246219639Smm#ifdef __SSE2__ 1247122180Skan#include <emmintrin.h> 1248219639Smm#endif 1249117395Skan 1250117395Skan#endif /* __SSE__ */ 125190075Sobrien#endif /* _XMMINTRIN_H_INCLUDED */ 1252