xmmintrin.h revision 122180
1122180Skan/* Copyright (C) 2002, 2003 Free Software Foundation, Inc. 290075Sobrien 390075Sobrien This file is part of GNU CC. 490075Sobrien 590075Sobrien GNU CC is free software; you can redistribute it and/or modify 690075Sobrien it under the terms of the GNU General Public License as published by 790075Sobrien the Free Software Foundation; either version 2, or (at your option) 890075Sobrien any later version. 990075Sobrien 1090075Sobrien GNU CC is distributed in the hope that it will be useful, 1190075Sobrien but WITHOUT ANY WARRANTY; without even the implied warranty of 1290075Sobrien MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 1390075Sobrien GNU General Public License for more details. 1490075Sobrien 1590075Sobrien You should have received a copy of the GNU General Public License 1690075Sobrien along with GNU CC; see the file COPYING. If not, write to 1790075Sobrien the Free Software Foundation, 59 Temple Place - Suite 330, 1890075Sobrien Boston, MA 02111-1307, USA. */ 1990075Sobrien 2090075Sobrien/* As a special exception, if you include this header file into source 2190075Sobrien files compiled by GCC, this header file does not by itself cause 2290075Sobrien the resulting executable to be covered by the GNU General Public 2390075Sobrien License. This exception does not however invalidate any other 2490075Sobrien reasons why the executable file might be covered by the GNU General 2590075Sobrien Public License. */ 2690075Sobrien 2790075Sobrien/* Implemented from the specification included in the Intel C++ Compiler 28122180Skan User Guide and Reference, version 8.0. */ 2990075Sobrien 3090075Sobrien#ifndef _XMMINTRIN_H_INCLUDED 3190075Sobrien#define _XMMINTRIN_H_INCLUDED 3290075Sobrien 33117395Skan#ifndef __SSE__ 34117395Skan# error "SSE instruction set not enabled" 35117395Skan#else 36117395Skan 3790075Sobrien/* We need type definitions from the MMX header file. */ 3890075Sobrien#include <mmintrin.h> 3990075Sobrien 4090075Sobrien/* The data type indended for user use. */ 4190075Sobrientypedef int __m128 __attribute__ ((__mode__(__V4SF__))); 4290075Sobrien 4390075Sobrien/* Internal data types for implementing the instrinsics. */ 4490075Sobrientypedef int __v4sf __attribute__ ((__mode__(__V4SF__))); 4590075Sobrientypedef int __v4si __attribute__ ((__mode__(__V4SI__))); 4690075Sobrien 4790075Sobrien/* Create a selector for use with the SHUFPS instruction. */ 4890075Sobrien#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ 4990075Sobrien (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) 5090075Sobrien 5190075Sobrien/* Constants for use with _mm_prefetch. */ 5290075Sobrienenum _mm_hint 5390075Sobrien{ 5490075Sobrien _MM_HINT_T0 = 3, 5590075Sobrien _MM_HINT_T1 = 2, 5690075Sobrien _MM_HINT_T2 = 1, 5790075Sobrien _MM_HINT_NTA = 0 5890075Sobrien}; 5990075Sobrien 6090075Sobrien/* Bits in the MXCSR. */ 6190075Sobrien#define _MM_EXCEPT_MASK 0x003f 6290075Sobrien#define _MM_EXCEPT_INVALID 0x0001 6390075Sobrien#define _MM_EXCEPT_DENORM 0x0002 6490075Sobrien#define _MM_EXCEPT_DIV_ZERO 0x0004 6590075Sobrien#define _MM_EXCEPT_OVERFLOW 0x0008 6690075Sobrien#define _MM_EXCEPT_UNDERFLOW 0x0010 6790075Sobrien#define _MM_EXCEPT_INEXACT 0x0020 6890075Sobrien 6990075Sobrien#define _MM_MASK_MASK 0x1f80 7090075Sobrien#define _MM_MASK_INVALID 0x0080 7190075Sobrien#define _MM_MASK_DENORM 0x0100 7290075Sobrien#define _MM_MASK_DIV_ZERO 0x0200 7390075Sobrien#define _MM_MASK_OVERFLOW 0x0400 7490075Sobrien#define _MM_MASK_UNDERFLOW 0x0800 7590075Sobrien#define _MM_MASK_INEXACT 0x1000 7690075Sobrien 7790075Sobrien#define _MM_ROUND_MASK 0x6000 7890075Sobrien#define _MM_ROUND_NEAREST 0x0000 7990075Sobrien#define _MM_ROUND_DOWN 0x2000 8090075Sobrien#define _MM_ROUND_UP 0x4000 8190075Sobrien#define _MM_ROUND_TOWARD_ZERO 0x6000 8290075Sobrien 8390075Sobrien#define _MM_FLUSH_ZERO_MASK 0x8000 8490075Sobrien#define _MM_FLUSH_ZERO_ON 0x8000 8590075Sobrien#define _MM_FLUSH_ZERO_OFF 0x0000 8690075Sobrien 8790075Sobrien/* Perform the respective operation on the lower SPFP (single-precision 8890075Sobrien floating-point) values of A and B; the upper three SPFP values are 8990075Sobrien passed through from A. */ 9090075Sobrien 9190075Sobrienstatic __inline __m128 9290075Sobrien_mm_add_ss (__m128 __A, __m128 __B) 9390075Sobrien{ 9490075Sobrien return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B); 9590075Sobrien} 9690075Sobrien 9790075Sobrienstatic __inline __m128 9890075Sobrien_mm_sub_ss (__m128 __A, __m128 __B) 9990075Sobrien{ 10090075Sobrien return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B); 10190075Sobrien} 10290075Sobrien 10390075Sobrienstatic __inline __m128 10490075Sobrien_mm_mul_ss (__m128 __A, __m128 __B) 10590075Sobrien{ 10690075Sobrien return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B); 10790075Sobrien} 10890075Sobrien 10990075Sobrienstatic __inline __m128 11090075Sobrien_mm_div_ss (__m128 __A, __m128 __B) 11190075Sobrien{ 11290075Sobrien return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B); 11390075Sobrien} 11490075Sobrien 11590075Sobrienstatic __inline __m128 11690075Sobrien_mm_sqrt_ss (__m128 __A) 11790075Sobrien{ 11890075Sobrien return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A); 11990075Sobrien} 12090075Sobrien 12190075Sobrienstatic __inline __m128 12290075Sobrien_mm_rcp_ss (__m128 __A) 12390075Sobrien{ 12490075Sobrien return (__m128) __builtin_ia32_rcpss ((__v4sf)__A); 12590075Sobrien} 12690075Sobrien 12790075Sobrienstatic __inline __m128 12890075Sobrien_mm_rsqrt_ss (__m128 __A) 12990075Sobrien{ 13090075Sobrien return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A); 13190075Sobrien} 13290075Sobrien 13390075Sobrienstatic __inline __m128 13490075Sobrien_mm_min_ss (__m128 __A, __m128 __B) 13590075Sobrien{ 13690075Sobrien return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B); 13790075Sobrien} 13890075Sobrien 13990075Sobrienstatic __inline __m128 14090075Sobrien_mm_max_ss (__m128 __A, __m128 __B) 14190075Sobrien{ 14290075Sobrien return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B); 14390075Sobrien} 14490075Sobrien 14590075Sobrien/* Perform the respective operation on the four SPFP values in A and B. */ 14690075Sobrien 14790075Sobrienstatic __inline __m128 14890075Sobrien_mm_add_ps (__m128 __A, __m128 __B) 14990075Sobrien{ 15090075Sobrien return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B); 15190075Sobrien} 15290075Sobrien 15390075Sobrienstatic __inline __m128 15490075Sobrien_mm_sub_ps (__m128 __A, __m128 __B) 15590075Sobrien{ 15690075Sobrien return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B); 15790075Sobrien} 15890075Sobrien 15990075Sobrienstatic __inline __m128 16090075Sobrien_mm_mul_ps (__m128 __A, __m128 __B) 16190075Sobrien{ 16290075Sobrien return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B); 16390075Sobrien} 16490075Sobrien 16590075Sobrienstatic __inline __m128 16690075Sobrien_mm_div_ps (__m128 __A, __m128 __B) 16790075Sobrien{ 16890075Sobrien return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B); 16990075Sobrien} 17090075Sobrien 17190075Sobrienstatic __inline __m128 17290075Sobrien_mm_sqrt_ps (__m128 __A) 17390075Sobrien{ 17490075Sobrien return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A); 17590075Sobrien} 17690075Sobrien 17790075Sobrienstatic __inline __m128 17890075Sobrien_mm_rcp_ps (__m128 __A) 17990075Sobrien{ 18090075Sobrien return (__m128) __builtin_ia32_rcpps ((__v4sf)__A); 18190075Sobrien} 18290075Sobrien 18390075Sobrienstatic __inline __m128 18490075Sobrien_mm_rsqrt_ps (__m128 __A) 18590075Sobrien{ 18690075Sobrien return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A); 18790075Sobrien} 18890075Sobrien 18990075Sobrienstatic __inline __m128 19090075Sobrien_mm_min_ps (__m128 __A, __m128 __B) 19190075Sobrien{ 19290075Sobrien return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B); 19390075Sobrien} 19490075Sobrien 19590075Sobrienstatic __inline __m128 19690075Sobrien_mm_max_ps (__m128 __A, __m128 __B) 19790075Sobrien{ 19890075Sobrien return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B); 19990075Sobrien} 20090075Sobrien 20190075Sobrien/* Perform logical bit-wise operations on 128-bit values. */ 20290075Sobrien 20390075Sobrienstatic __inline __m128 20490075Sobrien_mm_and_ps (__m128 __A, __m128 __B) 20590075Sobrien{ 20690075Sobrien return __builtin_ia32_andps (__A, __B); 20790075Sobrien} 20890075Sobrien 20990075Sobrienstatic __inline __m128 21090075Sobrien_mm_andnot_ps (__m128 __A, __m128 __B) 21190075Sobrien{ 21290075Sobrien return __builtin_ia32_andnps (__A, __B); 21390075Sobrien} 21490075Sobrien 21590075Sobrienstatic __inline __m128 21690075Sobrien_mm_or_ps (__m128 __A, __m128 __B) 21790075Sobrien{ 21890075Sobrien return __builtin_ia32_orps (__A, __B); 21990075Sobrien} 22090075Sobrien 22190075Sobrienstatic __inline __m128 22290075Sobrien_mm_xor_ps (__m128 __A, __m128 __B) 22390075Sobrien{ 22490075Sobrien return __builtin_ia32_xorps (__A, __B); 22590075Sobrien} 22690075Sobrien 22790075Sobrien/* Perform a comparison on the lower SPFP values of A and B. If the 22890075Sobrien comparison is true, place a mask of all ones in the result, otherwise a 22990075Sobrien mask of zeros. The upper three SPFP values are passed through from A. */ 23090075Sobrien 23190075Sobrienstatic __inline __m128 23290075Sobrien_mm_cmpeq_ss (__m128 __A, __m128 __B) 23390075Sobrien{ 23490075Sobrien return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B); 23590075Sobrien} 23690075Sobrien 23790075Sobrienstatic __inline __m128 23890075Sobrien_mm_cmplt_ss (__m128 __A, __m128 __B) 23990075Sobrien{ 24090075Sobrien return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B); 24190075Sobrien} 24290075Sobrien 24390075Sobrienstatic __inline __m128 24490075Sobrien_mm_cmple_ss (__m128 __A, __m128 __B) 24590075Sobrien{ 24690075Sobrien return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B); 24790075Sobrien} 24890075Sobrien 24990075Sobrienstatic __inline __m128 25090075Sobrien_mm_cmpgt_ss (__m128 __A, __m128 __B) 25190075Sobrien{ 252107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 253107590Sobrien (__v4sf) 254107590Sobrien __builtin_ia32_cmpltss ((__v4sf) __B, 255107590Sobrien (__v4sf) 256107590Sobrien __A)); 25790075Sobrien} 25890075Sobrien 25990075Sobrienstatic __inline __m128 26090075Sobrien_mm_cmpge_ss (__m128 __A, __m128 __B) 26190075Sobrien{ 262107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 263107590Sobrien (__v4sf) 264107590Sobrien __builtin_ia32_cmpless ((__v4sf) __B, 265107590Sobrien (__v4sf) 266107590Sobrien __A)); 26790075Sobrien} 26890075Sobrien 26990075Sobrienstatic __inline __m128 27090075Sobrien_mm_cmpneq_ss (__m128 __A, __m128 __B) 27190075Sobrien{ 27290075Sobrien return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B); 27390075Sobrien} 27490075Sobrien 27590075Sobrienstatic __inline __m128 27690075Sobrien_mm_cmpnlt_ss (__m128 __A, __m128 __B) 27790075Sobrien{ 27890075Sobrien return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B); 27990075Sobrien} 28090075Sobrien 28190075Sobrienstatic __inline __m128 28290075Sobrien_mm_cmpnle_ss (__m128 __A, __m128 __B) 28390075Sobrien{ 28490075Sobrien return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B); 28590075Sobrien} 28690075Sobrien 28790075Sobrienstatic __inline __m128 28890075Sobrien_mm_cmpngt_ss (__m128 __A, __m128 __B) 28990075Sobrien{ 290107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 291107590Sobrien (__v4sf) 292107590Sobrien __builtin_ia32_cmpnltss ((__v4sf) __B, 293107590Sobrien (__v4sf) 294107590Sobrien __A)); 29590075Sobrien} 29690075Sobrien 29790075Sobrienstatic __inline __m128 29890075Sobrien_mm_cmpnge_ss (__m128 __A, __m128 __B) 29990075Sobrien{ 300107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 301107590Sobrien (__v4sf) 302107590Sobrien __builtin_ia32_cmpnless ((__v4sf) __B, 303107590Sobrien (__v4sf) 304107590Sobrien __A)); 30590075Sobrien} 30690075Sobrien 30790075Sobrienstatic __inline __m128 30890075Sobrien_mm_cmpord_ss (__m128 __A, __m128 __B) 30990075Sobrien{ 31090075Sobrien return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B); 31190075Sobrien} 31290075Sobrien 31390075Sobrienstatic __inline __m128 31490075Sobrien_mm_cmpunord_ss (__m128 __A, __m128 __B) 31590075Sobrien{ 31690075Sobrien return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B); 31790075Sobrien} 31890075Sobrien 31990075Sobrien/* Perform a comparison on the four SPFP values of A and B. For each 32090075Sobrien element, if the comparison is true, place a mask of all ones in the 32190075Sobrien result, otherwise a mask of zeros. */ 32290075Sobrien 32390075Sobrienstatic __inline __m128 32490075Sobrien_mm_cmpeq_ps (__m128 __A, __m128 __B) 32590075Sobrien{ 32690075Sobrien return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B); 32790075Sobrien} 32890075Sobrien 32990075Sobrienstatic __inline __m128 33090075Sobrien_mm_cmplt_ps (__m128 __A, __m128 __B) 33190075Sobrien{ 33290075Sobrien return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B); 33390075Sobrien} 33490075Sobrien 33590075Sobrienstatic __inline __m128 33690075Sobrien_mm_cmple_ps (__m128 __A, __m128 __B) 33790075Sobrien{ 33890075Sobrien return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B); 33990075Sobrien} 34090075Sobrien 34190075Sobrienstatic __inline __m128 34290075Sobrien_mm_cmpgt_ps (__m128 __A, __m128 __B) 34390075Sobrien{ 34490075Sobrien return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B); 34590075Sobrien} 34690075Sobrien 34790075Sobrienstatic __inline __m128 34890075Sobrien_mm_cmpge_ps (__m128 __A, __m128 __B) 34990075Sobrien{ 35090075Sobrien return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B); 35190075Sobrien} 35290075Sobrien 35390075Sobrienstatic __inline __m128 35490075Sobrien_mm_cmpneq_ps (__m128 __A, __m128 __B) 35590075Sobrien{ 35690075Sobrien return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B); 35790075Sobrien} 35890075Sobrien 35990075Sobrienstatic __inline __m128 36090075Sobrien_mm_cmpnlt_ps (__m128 __A, __m128 __B) 36190075Sobrien{ 36290075Sobrien return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B); 36390075Sobrien} 36490075Sobrien 36590075Sobrienstatic __inline __m128 36690075Sobrien_mm_cmpnle_ps (__m128 __A, __m128 __B) 36790075Sobrien{ 36890075Sobrien return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B); 36990075Sobrien} 37090075Sobrien 37190075Sobrienstatic __inline __m128 37290075Sobrien_mm_cmpngt_ps (__m128 __A, __m128 __B) 37390075Sobrien{ 37490075Sobrien return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B); 37590075Sobrien} 37690075Sobrien 37790075Sobrienstatic __inline __m128 37890075Sobrien_mm_cmpnge_ps (__m128 __A, __m128 __B) 37990075Sobrien{ 38090075Sobrien return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B); 38190075Sobrien} 38290075Sobrien 38390075Sobrienstatic __inline __m128 38490075Sobrien_mm_cmpord_ps (__m128 __A, __m128 __B) 38590075Sobrien{ 38690075Sobrien return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B); 38790075Sobrien} 38890075Sobrien 38990075Sobrienstatic __inline __m128 39090075Sobrien_mm_cmpunord_ps (__m128 __A, __m128 __B) 39190075Sobrien{ 39290075Sobrien return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B); 39390075Sobrien} 39490075Sobrien 39590075Sobrien/* Compare the lower SPFP values of A and B and return 1 if true 39690075Sobrien and 0 if false. */ 39790075Sobrien 39890075Sobrienstatic __inline int 39990075Sobrien_mm_comieq_ss (__m128 __A, __m128 __B) 40090075Sobrien{ 40190075Sobrien return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B); 40290075Sobrien} 40390075Sobrien 40490075Sobrienstatic __inline int 40590075Sobrien_mm_comilt_ss (__m128 __A, __m128 __B) 40690075Sobrien{ 40790075Sobrien return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B); 40890075Sobrien} 40990075Sobrien 41090075Sobrienstatic __inline int 41190075Sobrien_mm_comile_ss (__m128 __A, __m128 __B) 41290075Sobrien{ 41390075Sobrien return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B); 41490075Sobrien} 41590075Sobrien 41690075Sobrienstatic __inline int 41790075Sobrien_mm_comigt_ss (__m128 __A, __m128 __B) 41890075Sobrien{ 41990075Sobrien return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B); 42090075Sobrien} 42190075Sobrien 42290075Sobrienstatic __inline int 42390075Sobrien_mm_comige_ss (__m128 __A, __m128 __B) 42490075Sobrien{ 42590075Sobrien return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B); 42690075Sobrien} 42790075Sobrien 42890075Sobrienstatic __inline int 42990075Sobrien_mm_comineq_ss (__m128 __A, __m128 __B) 43090075Sobrien{ 43190075Sobrien return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B); 43290075Sobrien} 43390075Sobrien 43490075Sobrienstatic __inline int 43590075Sobrien_mm_ucomieq_ss (__m128 __A, __m128 __B) 43690075Sobrien{ 43790075Sobrien return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B); 43890075Sobrien} 43990075Sobrien 44090075Sobrienstatic __inline int 44190075Sobrien_mm_ucomilt_ss (__m128 __A, __m128 __B) 44290075Sobrien{ 44390075Sobrien return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B); 44490075Sobrien} 44590075Sobrien 44690075Sobrienstatic __inline int 44790075Sobrien_mm_ucomile_ss (__m128 __A, __m128 __B) 44890075Sobrien{ 44990075Sobrien return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B); 45090075Sobrien} 45190075Sobrien 45290075Sobrienstatic __inline int 45390075Sobrien_mm_ucomigt_ss (__m128 __A, __m128 __B) 45490075Sobrien{ 45590075Sobrien return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B); 45690075Sobrien} 45790075Sobrien 45890075Sobrienstatic __inline int 45990075Sobrien_mm_ucomige_ss (__m128 __A, __m128 __B) 46090075Sobrien{ 46190075Sobrien return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B); 46290075Sobrien} 46390075Sobrien 46490075Sobrienstatic __inline int 46590075Sobrien_mm_ucomineq_ss (__m128 __A, __m128 __B) 46690075Sobrien{ 46790075Sobrien return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B); 46890075Sobrien} 46990075Sobrien 47090075Sobrien/* Convert the lower SPFP value to a 32-bit integer according to the current 47190075Sobrien rounding mode. */ 47290075Sobrienstatic __inline int 47390075Sobrien_mm_cvtss_si32 (__m128 __A) 47490075Sobrien{ 47590075Sobrien return __builtin_ia32_cvtss2si ((__v4sf) __A); 47690075Sobrien} 47790075Sobrien 478122180Skanstatic __inline int 479122180Skan_mm_cvt_ss2si (__m128 __A) 480122180Skan{ 481122180Skan return _mm_cvtss_si32 (__A); 482122180Skan} 483122180Skan 484117395Skan#ifdef __x86_64__ 485117395Skan/* Convert the lower SPFP value to a 32-bit integer according to the current 486117395Skan rounding mode. */ 487117395Skanstatic __inline long long 488117395Skan_mm_cvtss_si64x (__m128 __A) 489117395Skan{ 490117395Skan return __builtin_ia32_cvtss2si64 ((__v4sf) __A); 491117395Skan} 492117395Skan#endif 493117395Skan 49490075Sobrien/* Convert the two lower SPFP values to 32-bit integers according to the 49590075Sobrien current rounding mode. Return the integers in packed form. */ 49690075Sobrienstatic __inline __m64 49790075Sobrien_mm_cvtps_pi32 (__m128 __A) 49890075Sobrien{ 49990075Sobrien return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A); 50090075Sobrien} 50190075Sobrien 502122180Skanstatic __inline __m64 503122180Skan_mm_cvt_ps2pi (__m128 __A) 504122180Skan{ 505122180Skan return _mm_cvtps_pi32 (__A); 506122180Skan} 507122180Skan 50890075Sobrien/* Truncate the lower SPFP value to a 32-bit integer. */ 50990075Sobrienstatic __inline int 51090075Sobrien_mm_cvttss_si32 (__m128 __A) 51190075Sobrien{ 51290075Sobrien return __builtin_ia32_cvttss2si ((__v4sf) __A); 51390075Sobrien} 51490075Sobrien 515122180Skanstatic __inline int 516122180Skan_mm_cvtt_ss2si (__m128 __A) 517122180Skan{ 518122180Skan return _mm_cvttss_si32 (__A); 519122180Skan} 520122180Skan 521117395Skan#ifdef __x86_64__ 522117395Skan/* Truncate the lower SPFP value to a 32-bit integer. */ 523117395Skanstatic __inline long long 524117395Skan_mm_cvttss_si64x (__m128 __A) 525117395Skan{ 526117395Skan return __builtin_ia32_cvttss2si64 ((__v4sf) __A); 527117395Skan} 528117395Skan#endif 529117395Skan 53090075Sobrien/* Truncate the two lower SPFP values to 32-bit integers. Return the 53190075Sobrien integers in packed form. */ 53290075Sobrienstatic __inline __m64 53390075Sobrien_mm_cvttps_pi32 (__m128 __A) 53490075Sobrien{ 53590075Sobrien return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A); 53690075Sobrien} 53790075Sobrien 538122180Skanstatic __inline __m64 539122180Skan_mm_cvtt_ps2pi (__m128 __A) 540122180Skan{ 541122180Skan return _mm_cvttps_pi32 (__A); 542122180Skan} 543122180Skan 54490075Sobrien/* Convert B to a SPFP value and insert it as element zero in A. */ 54590075Sobrienstatic __inline __m128 54690075Sobrien_mm_cvtsi32_ss (__m128 __A, int __B) 54790075Sobrien{ 54890075Sobrien return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B); 54990075Sobrien} 55090075Sobrien 551122180Skanstatic __inline __m128 552122180Skan_mm_cvt_si2ss (__m128 __A, int __B) 553122180Skan{ 554122180Skan return _mm_cvtsi32_ss (__A, __B); 555122180Skan} 556122180Skan 557117395Skan#ifdef __x86_64__ 558117395Skan/* Convert B to a SPFP value and insert it as element zero in A. */ 559117395Skanstatic __inline __m128 560117395Skan_mm_cvtsi64x_ss (__m128 __A, long long __B) 561117395Skan{ 562117395Skan return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B); 563117395Skan} 564117395Skan#endif 565117395Skan 56690075Sobrien/* Convert the two 32-bit values in B to SPFP form and insert them 56790075Sobrien as the two lower elements in A. */ 56890075Sobrienstatic __inline __m128 56990075Sobrien_mm_cvtpi32_ps (__m128 __A, __m64 __B) 57090075Sobrien{ 57190075Sobrien return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B); 57290075Sobrien} 57390075Sobrien 574122180Skanstatic __inline __m128 575122180Skan_mm_cvt_pi2ps (__m128 __A, __m64 __B) 576122180Skan{ 577122180Skan return _mm_cvtpi32_ps (__A, __B); 578122180Skan} 579122180Skan 58090075Sobrien/* Convert the four signed 16-bit values in A to SPFP form. */ 58190075Sobrienstatic __inline __m128 58290075Sobrien_mm_cvtpi16_ps (__m64 __A) 58390075Sobrien{ 58490075Sobrien __v4hi __sign; 58590075Sobrien __v2si __hisi, __losi; 58690075Sobrien __v4sf __r; 58790075Sobrien 58890075Sobrien /* This comparison against zero gives us a mask that can be used to 58990075Sobrien fill in the missing sign bits in the unpack operations below, so 59090075Sobrien that we get signed values after unpacking. */ 59190075Sobrien __sign = (__v4hi) __builtin_ia32_mmx_zero (); 59290075Sobrien __sign = __builtin_ia32_pcmpgtw (__sign, (__v4hi)__A); 59390075Sobrien 59490075Sobrien /* Convert the four words to doublewords. */ 59590075Sobrien __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign); 59690075Sobrien __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign); 59790075Sobrien 59890075Sobrien /* Convert the doublewords to floating point two at a time. */ 59990075Sobrien __r = (__v4sf) __builtin_ia32_setzerops (); 60090075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __hisi); 60190075Sobrien __r = __builtin_ia32_movlhps (__r, __r); 60290075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __losi); 60390075Sobrien 60490075Sobrien return (__m128) __r; 60590075Sobrien} 60690075Sobrien 60790075Sobrien/* Convert the four unsigned 16-bit values in A to SPFP form. */ 60890075Sobrienstatic __inline __m128 60990075Sobrien_mm_cvtpu16_ps (__m64 __A) 61090075Sobrien{ 61190075Sobrien __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero (); 61290075Sobrien __v2si __hisi, __losi; 61390075Sobrien __v4sf __r; 61490075Sobrien 61590075Sobrien /* Convert the four words to doublewords. */ 61690075Sobrien __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __zero); 61790075Sobrien __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __zero); 61890075Sobrien 61990075Sobrien /* Convert the doublewords to floating point two at a time. */ 62090075Sobrien __r = (__v4sf) __builtin_ia32_setzerops (); 62190075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __hisi); 62290075Sobrien __r = __builtin_ia32_movlhps (__r, __r); 62390075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __losi); 62490075Sobrien 62590075Sobrien return (__m128) __r; 62690075Sobrien} 62790075Sobrien 62890075Sobrien/* Convert the low four signed 8-bit values in A to SPFP form. */ 62990075Sobrienstatic __inline __m128 63090075Sobrien_mm_cvtpi8_ps (__m64 __A) 63190075Sobrien{ 63290075Sobrien __v8qi __sign; 63390075Sobrien 63490075Sobrien /* This comparison against zero gives us a mask that can be used to 63590075Sobrien fill in the missing sign bits in the unpack operations below, so 63690075Sobrien that we get signed values after unpacking. */ 63790075Sobrien __sign = (__v8qi) __builtin_ia32_mmx_zero (); 63890075Sobrien __sign = __builtin_ia32_pcmpgtb (__sign, (__v8qi)__A); 63990075Sobrien 64090075Sobrien /* Convert the four low bytes to words. */ 64190075Sobrien __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign); 64290075Sobrien 64390075Sobrien return _mm_cvtpi16_ps(__A); 64490075Sobrien} 64590075Sobrien 64690075Sobrien/* Convert the low four unsigned 8-bit values in A to SPFP form. */ 64790075Sobrienstatic __inline __m128 64890075Sobrien_mm_cvtpu8_ps(__m64 __A) 64990075Sobrien{ 65090075Sobrien __v8qi __zero = (__v8qi) __builtin_ia32_mmx_zero (); 65190075Sobrien __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __zero); 65290075Sobrien return _mm_cvtpu16_ps(__A); 65390075Sobrien} 65490075Sobrien 65590075Sobrien/* Convert the four signed 32-bit values in A and B to SPFP form. */ 65690075Sobrienstatic __inline __m128 65790075Sobrien_mm_cvtpi32x2_ps(__m64 __A, __m64 __B) 65890075Sobrien{ 65990075Sobrien __v4sf __zero = (__v4sf) __builtin_ia32_setzerops (); 66090075Sobrien __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A); 66190075Sobrien __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B); 66290075Sobrien return (__m128) __builtin_ia32_movlhps (__sfa, __sfb); 66390075Sobrien} 66490075Sobrien 66590075Sobrien/* Convert the four SPFP values in A to four signed 16-bit integers. */ 66690075Sobrienstatic __inline __m64 66790075Sobrien_mm_cvtps_pi16(__m128 __A) 66890075Sobrien{ 66990075Sobrien __v4sf __hisf = (__v4sf)__A; 67090075Sobrien __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf); 67190075Sobrien __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf); 67290075Sobrien __v2si __losi = __builtin_ia32_cvtps2pi (__losf); 673117395Skan return (__m64) __builtin_ia32_packssdw (__hisi, __losi); 67490075Sobrien} 67590075Sobrien 67690075Sobrien/* Convert the four SPFP values in A to four signed 8-bit integers. */ 67790075Sobrienstatic __inline __m64 67890075Sobrien_mm_cvtps_pi8(__m128 __A) 67990075Sobrien{ 68090075Sobrien __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A); 68190075Sobrien __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero (); 68290075Sobrien return (__m64) __builtin_ia32_packsswb (__tmp, __zero); 68390075Sobrien} 68490075Sobrien 68590075Sobrien/* Selects four specific SPFP values from A and B based on MASK. */ 68690075Sobrien#if 0 68790075Sobrienstatic __inline __m128 68890075Sobrien_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask) 68990075Sobrien{ 69090075Sobrien return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask); 69190075Sobrien} 69290075Sobrien#else 69390075Sobrien#define _mm_shuffle_ps(A, B, MASK) \ 69490075Sobrien ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK))) 69590075Sobrien#endif 69690075Sobrien 69790075Sobrien 69890075Sobrien/* Selects and interleaves the upper two SPFP values from A and B. */ 69990075Sobrienstatic __inline __m128 70090075Sobrien_mm_unpackhi_ps (__m128 __A, __m128 __B) 70190075Sobrien{ 70290075Sobrien return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B); 70390075Sobrien} 70490075Sobrien 70590075Sobrien/* Selects and interleaves the lower two SPFP values from A and B. */ 70690075Sobrienstatic __inline __m128 70790075Sobrien_mm_unpacklo_ps (__m128 __A, __m128 __B) 70890075Sobrien{ 70990075Sobrien return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B); 71090075Sobrien} 71190075Sobrien 71290075Sobrien/* Sets the upper two SPFP values with 64-bits of data loaded from P; 71390075Sobrien the lower two values are passed through from A. */ 71490075Sobrienstatic __inline __m128 715117395Skan_mm_loadh_pi (__m128 __A, __m64 const *__P) 71690075Sobrien{ 71790075Sobrien return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P); 71890075Sobrien} 71990075Sobrien 72090075Sobrien/* Stores the upper two SPFP values of A into P. */ 72190075Sobrienstatic __inline void 72290075Sobrien_mm_storeh_pi (__m64 *__P, __m128 __A) 72390075Sobrien{ 72490075Sobrien __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A); 72590075Sobrien} 72690075Sobrien 72790075Sobrien/* Moves the upper two values of B into the lower two values of A. */ 72890075Sobrienstatic __inline __m128 72990075Sobrien_mm_movehl_ps (__m128 __A, __m128 __B) 73090075Sobrien{ 73190075Sobrien return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B); 73290075Sobrien} 73390075Sobrien 73490075Sobrien/* Moves the lower two values of B into the upper two values of A. */ 73590075Sobrienstatic __inline __m128 73690075Sobrien_mm_movelh_ps (__m128 __A, __m128 __B) 73790075Sobrien{ 73890075Sobrien return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B); 73990075Sobrien} 74090075Sobrien 74190075Sobrien/* Sets the lower two SPFP values with 64-bits of data loaded from P; 74290075Sobrien the upper two values are passed through from A. */ 74390075Sobrienstatic __inline __m128 744117395Skan_mm_loadl_pi (__m128 __A, __m64 const *__P) 74590075Sobrien{ 74690075Sobrien return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P); 74790075Sobrien} 74890075Sobrien 74990075Sobrien/* Stores the lower two SPFP values of A into P. */ 75090075Sobrienstatic __inline void 75190075Sobrien_mm_storel_pi (__m64 *__P, __m128 __A) 75290075Sobrien{ 75390075Sobrien __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A); 75490075Sobrien} 75590075Sobrien 75690075Sobrien/* Creates a 4-bit mask from the most significant bits of the SPFP values. */ 75790075Sobrienstatic __inline int 75890075Sobrien_mm_movemask_ps (__m128 __A) 75990075Sobrien{ 76090075Sobrien return __builtin_ia32_movmskps ((__v4sf)__A); 76190075Sobrien} 76290075Sobrien 76390075Sobrien/* Return the contents of the control register. */ 76490075Sobrienstatic __inline unsigned int 76590075Sobrien_mm_getcsr (void) 76690075Sobrien{ 76790075Sobrien return __builtin_ia32_stmxcsr (); 76890075Sobrien} 76990075Sobrien 77090075Sobrien/* Read exception bits from the control register. */ 77190075Sobrienstatic __inline unsigned int 77290075Sobrien_MM_GET_EXCEPTION_STATE (void) 77390075Sobrien{ 77490075Sobrien return _mm_getcsr() & _MM_EXCEPT_MASK; 77590075Sobrien} 77690075Sobrien 77790075Sobrienstatic __inline unsigned int 77890075Sobrien_MM_GET_EXCEPTION_MASK (void) 77990075Sobrien{ 78090075Sobrien return _mm_getcsr() & _MM_MASK_MASK; 78190075Sobrien} 78290075Sobrien 78390075Sobrienstatic __inline unsigned int 78490075Sobrien_MM_GET_ROUNDING_MODE (void) 78590075Sobrien{ 78690075Sobrien return _mm_getcsr() & _MM_ROUND_MASK; 78790075Sobrien} 78890075Sobrien 78990075Sobrienstatic __inline unsigned int 79090075Sobrien_MM_GET_FLUSH_ZERO_MODE (void) 79190075Sobrien{ 79290075Sobrien return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; 79390075Sobrien} 79490075Sobrien 79590075Sobrien/* Set the control register to I. */ 79690075Sobrienstatic __inline void 79790075Sobrien_mm_setcsr (unsigned int __I) 79890075Sobrien{ 79990075Sobrien __builtin_ia32_ldmxcsr (__I); 80090075Sobrien} 80190075Sobrien 80290075Sobrien/* Set exception bits in the control register. */ 80390075Sobrienstatic __inline void 80490075Sobrien_MM_SET_EXCEPTION_STATE(unsigned int __mask) 80590075Sobrien{ 80690075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask); 80790075Sobrien} 80890075Sobrien 80990075Sobrienstatic __inline void 81090075Sobrien_MM_SET_EXCEPTION_MASK (unsigned int __mask) 81190075Sobrien{ 81290075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask); 81390075Sobrien} 81490075Sobrien 81590075Sobrienstatic __inline void 81690075Sobrien_MM_SET_ROUNDING_MODE (unsigned int __mode) 81790075Sobrien{ 81890075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode); 81990075Sobrien} 82090075Sobrien 82190075Sobrienstatic __inline void 82290075Sobrien_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode) 82390075Sobrien{ 82490075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode); 82590075Sobrien} 82690075Sobrien 82790075Sobrien/* Create a vector with element 0 as *P and the rest zero. */ 82890075Sobrienstatic __inline __m128 829117395Skan_mm_load_ss (float const *__P) 83090075Sobrien{ 83190075Sobrien return (__m128) __builtin_ia32_loadss (__P); 83290075Sobrien} 83390075Sobrien 83490075Sobrien/* Create a vector with all four elements equal to *P. */ 83590075Sobrienstatic __inline __m128 836117395Skan_mm_load1_ps (float const *__P) 83790075Sobrien{ 83890075Sobrien __v4sf __tmp = __builtin_ia32_loadss (__P); 83990075Sobrien return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0)); 84090075Sobrien} 84190075Sobrien 84290075Sobrienstatic __inline __m128 843117395Skan_mm_load_ps1 (float const *__P) 84490075Sobrien{ 84590075Sobrien return _mm_load1_ps (__P); 84690075Sobrien} 84790075Sobrien 84890075Sobrien/* Load four SPFP values from P. The address must be 16-byte aligned. */ 84990075Sobrienstatic __inline __m128 850117395Skan_mm_load_ps (float const *__P) 85190075Sobrien{ 85290075Sobrien return (__m128) __builtin_ia32_loadaps (__P); 85390075Sobrien} 85490075Sobrien 85590075Sobrien/* Load four SPFP values from P. The address need not be 16-byte aligned. */ 85690075Sobrienstatic __inline __m128 857117395Skan_mm_loadu_ps (float const *__P) 85890075Sobrien{ 85990075Sobrien return (__m128) __builtin_ia32_loadups (__P); 86090075Sobrien} 86190075Sobrien 86290075Sobrien/* Load four SPFP values in reverse order. The address must be aligned. */ 86390075Sobrienstatic __inline __m128 864117395Skan_mm_loadr_ps (float const *__P) 86590075Sobrien{ 86690075Sobrien __v4sf __tmp = __builtin_ia32_loadaps (__P); 86790075Sobrien return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3)); 86890075Sobrien} 86990075Sobrien 87090075Sobrien/* Create a vector with element 0 as F and the rest zero. */ 87190075Sobrienstatic __inline __m128 87290075Sobrien_mm_set_ss (float __F) 87390075Sobrien{ 87490075Sobrien return (__m128) __builtin_ia32_loadss (&__F); 87590075Sobrien} 87690075Sobrien 87790075Sobrien/* Create a vector with all four elements equal to F. */ 87890075Sobrienstatic __inline __m128 87990075Sobrien_mm_set1_ps (float __F) 88090075Sobrien{ 88190075Sobrien __v4sf __tmp = __builtin_ia32_loadss (&__F); 88290075Sobrien return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0)); 88390075Sobrien} 88490075Sobrien 88590075Sobrienstatic __inline __m128 88690075Sobrien_mm_set_ps1 (float __F) 88790075Sobrien{ 88890075Sobrien return _mm_set1_ps (__F); 88990075Sobrien} 89090075Sobrien 89190075Sobrien/* Create the vector [Z Y X W]. */ 89290075Sobrienstatic __inline __m128 89390075Sobrien_mm_set_ps (float __Z, float __Y, float __X, float __W) 89490075Sobrien{ 89590075Sobrien union { 89690075Sobrien float __a[4]; 89790075Sobrien __m128 __v; 89890075Sobrien } __u; 89990075Sobrien 90090075Sobrien __u.__a[0] = __W; 90190075Sobrien __u.__a[1] = __X; 90290075Sobrien __u.__a[2] = __Y; 90390075Sobrien __u.__a[3] = __Z; 90490075Sobrien 90590075Sobrien return __u.__v; 90690075Sobrien} 90790075Sobrien 90890075Sobrien/* Create the vector [W X Y Z]. */ 90990075Sobrienstatic __inline __m128 91090075Sobrien_mm_setr_ps (float __Z, float __Y, float __X, float __W) 91190075Sobrien{ 91290075Sobrien return _mm_set_ps (__W, __X, __Y, __Z); 91390075Sobrien} 91490075Sobrien 91590075Sobrien/* Create a vector of zeros. */ 91690075Sobrienstatic __inline __m128 91790075Sobrien_mm_setzero_ps (void) 91890075Sobrien{ 91990075Sobrien return (__m128) __builtin_ia32_setzerops (); 92090075Sobrien} 92190075Sobrien 92290075Sobrien/* Stores the lower SPFP value. */ 92390075Sobrienstatic __inline void 92490075Sobrien_mm_store_ss (float *__P, __m128 __A) 92590075Sobrien{ 92690075Sobrien __builtin_ia32_storess (__P, (__v4sf)__A); 92790075Sobrien} 92890075Sobrien 92990075Sobrien/* Store the lower SPFP value across four words. */ 93090075Sobrienstatic __inline void 93190075Sobrien_mm_store1_ps (float *__P, __m128 __A) 93290075Sobrien{ 93390075Sobrien __v4sf __va = (__v4sf)__A; 93490075Sobrien __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0)); 93590075Sobrien __builtin_ia32_storeaps (__P, __tmp); 93690075Sobrien} 93790075Sobrien 93890075Sobrienstatic __inline void 93990075Sobrien_mm_store_ps1 (float *__P, __m128 __A) 94090075Sobrien{ 94190075Sobrien _mm_store1_ps (__P, __A); 94290075Sobrien} 94390075Sobrien 94490075Sobrien/* Store four SPFP values. The address must be 16-byte aligned. */ 94590075Sobrienstatic __inline void 94690075Sobrien_mm_store_ps (float *__P, __m128 __A) 94790075Sobrien{ 94890075Sobrien __builtin_ia32_storeaps (__P, (__v4sf)__A); 94990075Sobrien} 95090075Sobrien 95190075Sobrien/* Store four SPFP values. The address need not be 16-byte aligned. */ 95290075Sobrienstatic __inline void 95390075Sobrien_mm_storeu_ps (float *__P, __m128 __A) 95490075Sobrien{ 95590075Sobrien __builtin_ia32_storeups (__P, (__v4sf)__A); 95690075Sobrien} 95790075Sobrien 958117395Skan/* Store four SPFP values in reverse order. The address must be aligned. */ 95990075Sobrienstatic __inline void 96090075Sobrien_mm_storer_ps (float *__P, __m128 __A) 96190075Sobrien{ 96290075Sobrien __v4sf __va = (__v4sf)__A; 96390075Sobrien __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3)); 96490075Sobrien __builtin_ia32_storeaps (__P, __tmp); 96590075Sobrien} 96690075Sobrien 96790075Sobrien/* Sets the low SPFP value of A from the low value of B. */ 96890075Sobrienstatic __inline __m128 96990075Sobrien_mm_move_ss (__m128 __A, __m128 __B) 97090075Sobrien{ 97190075Sobrien return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B); 97290075Sobrien} 97390075Sobrien 97490075Sobrien/* Extracts one of the four words of A. The selector N must be immediate. */ 97590075Sobrien#if 0 97690075Sobrienstatic __inline int 97790075Sobrien_mm_extract_pi16 (__m64 __A, int __N) 97890075Sobrien{ 97990075Sobrien return __builtin_ia32_pextrw ((__v4hi)__A, __N); 98090075Sobrien} 981122180Skan 982122180Skanstatic __inline int 983122180Skan_m_pextrw (__m64 __A, int __N) 984122180Skan{ 985122180Skan return _mm_extract_pi16 (__A, __N); 986122180Skan} 98790075Sobrien#else 98890075Sobrien#define _mm_extract_pi16(A, N) \ 98990075Sobrien __builtin_ia32_pextrw ((__v4hi)(A), (N)) 990122180Skan#define _m_pextrw(A, N) _mm_extract_pi16((A), (N)) 99190075Sobrien#endif 99290075Sobrien 99390075Sobrien/* Inserts word D into one of four words of A. The selector N must be 99490075Sobrien immediate. */ 99590075Sobrien#if 0 99690075Sobrienstatic __inline __m64 99790075Sobrien_mm_insert_pi16 (__m64 __A, int __D, int __N) 99890075Sobrien{ 99990075Sobrien return (__m64)__builtin_ia32_pinsrw ((__v4hi)__A, __D, __N); 100090075Sobrien} 1001122180Skan 1002122180Skanstatic __inline __m64 1003122180Skan_m_pinsrw (__m64 __A, int __D, int __N) 1004122180Skan{ 1005122180Skan return _mm_insert_pi16 (__A, __D, __N); 1006122180Skan} 100790075Sobrien#else 100890075Sobrien#define _mm_insert_pi16(A, D, N) \ 100990075Sobrien ((__m64) __builtin_ia32_pinsrw ((__v4hi)(A), (D), (N))) 1010122180Skan#define _m_pinsrw(A, D, N) _mm_insert_pi16((A), (D), (N)) 101190075Sobrien#endif 101290075Sobrien 101390075Sobrien/* Compute the element-wise maximum of signed 16-bit values. */ 101490075Sobrienstatic __inline __m64 101590075Sobrien_mm_max_pi16 (__m64 __A, __m64 __B) 101690075Sobrien{ 101790075Sobrien return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B); 101890075Sobrien} 101990075Sobrien 1020122180Skanstatic __inline __m64 1021122180Skan_m_pmaxsw (__m64 __A, __m64 __B) 1022122180Skan{ 1023122180Skan return _mm_max_pi16 (__A, __B); 1024122180Skan} 1025122180Skan 102690075Sobrien/* Compute the element-wise maximum of unsigned 8-bit values. */ 102790075Sobrienstatic __inline __m64 102890075Sobrien_mm_max_pu8 (__m64 __A, __m64 __B) 102990075Sobrien{ 103090075Sobrien return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B); 103190075Sobrien} 103290075Sobrien 1033122180Skanstatic __inline __m64 1034122180Skan_m_pmaxub (__m64 __A, __m64 __B) 1035122180Skan{ 1036122180Skan return _mm_max_pu8 (__A, __B); 1037122180Skan} 1038122180Skan 103990075Sobrien/* Compute the element-wise minimum of signed 16-bit values. */ 104090075Sobrienstatic __inline __m64 104190075Sobrien_mm_min_pi16 (__m64 __A, __m64 __B) 104290075Sobrien{ 104390075Sobrien return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B); 104490075Sobrien} 104590075Sobrien 1046122180Skanstatic __inline __m64 1047122180Skan_m_pminsw (__m64 __A, __m64 __B) 1048122180Skan{ 1049122180Skan return _mm_min_pi16 (__A, __B); 1050122180Skan} 1051122180Skan 105290075Sobrien/* Compute the element-wise minimum of unsigned 8-bit values. */ 105390075Sobrienstatic __inline __m64 105490075Sobrien_mm_min_pu8 (__m64 __A, __m64 __B) 105590075Sobrien{ 105690075Sobrien return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B); 105790075Sobrien} 105890075Sobrien 1059122180Skanstatic __inline __m64 1060122180Skan_m_pminub (__m64 __A, __m64 __B) 1061122180Skan{ 1062122180Skan return _mm_min_pu8 (__A, __B); 1063122180Skan} 1064122180Skan 106590075Sobrien/* Create an 8-bit mask of the signs of 8-bit values. */ 106690075Sobrienstatic __inline int 106790075Sobrien_mm_movemask_pi8 (__m64 __A) 106890075Sobrien{ 106990075Sobrien return __builtin_ia32_pmovmskb ((__v8qi)__A); 107090075Sobrien} 107190075Sobrien 1072122180Skanstatic __inline int 1073122180Skan_m_pmovmskb (__m64 __A) 1074122180Skan{ 1075122180Skan return _mm_movemask_pi8 (__A); 1076122180Skan} 1077122180Skan 107890075Sobrien/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values 107990075Sobrien in B and produce the high 16 bits of the 32-bit results. */ 108090075Sobrienstatic __inline __m64 108190075Sobrien_mm_mulhi_pu16 (__m64 __A, __m64 __B) 108290075Sobrien{ 108390075Sobrien return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B); 108490075Sobrien} 108590075Sobrien 1086122180Skanstatic __inline __m64 1087122180Skan_m_pmulhuw (__m64 __A, __m64 __B) 1088122180Skan{ 1089122180Skan return _mm_mulhi_pu16 (__A, __B); 1090122180Skan} 1091122180Skan 109290075Sobrien/* Return a combination of the four 16-bit values in A. The selector 109390075Sobrien must be an immediate. */ 109490075Sobrien#if 0 109590075Sobrienstatic __inline __m64 109690075Sobrien_mm_shuffle_pi16 (__m64 __A, int __N) 109790075Sobrien{ 109890075Sobrien return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N); 109990075Sobrien} 1100122180Skan 1101122180Skanstatic __inline __m64 1102122180Skan_m_pshufw (__m64 __A, int __N) 1103122180Skan{ 1104122180Skan return _mm_shuffle_pi16 (__A, __N); 1105122180Skan} 110690075Sobrien#else 110790075Sobrien#define _mm_shuffle_pi16(A, N) \ 110890075Sobrien ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N))) 1109122180Skan#define _m_pshufw(A, N) _mm_shuffle_pi16 ((A), (N)) 111090075Sobrien#endif 111190075Sobrien 111290075Sobrien/* Conditionally store byte elements of A into P. The high bit of each 111390075Sobrien byte in the selector N determines whether the corresponding byte from 111490075Sobrien A is stored. */ 111590075Sobrienstatic __inline void 111690075Sobrien_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P) 111790075Sobrien{ 111890075Sobrien __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P); 111990075Sobrien} 112090075Sobrien 1121122180Skanstatic __inline void 1122122180Skan_m_maskmovq (__m64 __A, __m64 __N, char *__P) 1123122180Skan{ 1124122180Skan _mm_maskmove_si64 (__A, __N, __P); 1125122180Skan} 1126122180Skan 112790075Sobrien/* Compute the rounded averages of the unsigned 8-bit values in A and B. */ 112890075Sobrienstatic __inline __m64 112990075Sobrien_mm_avg_pu8 (__m64 __A, __m64 __B) 113090075Sobrien{ 113190075Sobrien return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B); 113290075Sobrien} 113390075Sobrien 1134122180Skanstatic __inline __m64 1135122180Skan_m_pavgb (__m64 __A, __m64 __B) 1136122180Skan{ 1137122180Skan return _mm_avg_pu8 (__A, __B); 1138122180Skan} 1139122180Skan 114090075Sobrien/* Compute the rounded averages of the unsigned 16-bit values in A and B. */ 114190075Sobrienstatic __inline __m64 114290075Sobrien_mm_avg_pu16 (__m64 __A, __m64 __B) 114390075Sobrien{ 114490075Sobrien return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B); 114590075Sobrien} 114690075Sobrien 1147122180Skanstatic __inline __m64 1148122180Skan_m_pavgw (__m64 __A, __m64 __B) 1149122180Skan{ 1150122180Skan return _mm_avg_pu16 (__A, __B); 1151122180Skan} 1152122180Skan 115390075Sobrien/* Compute the sum of the absolute differences of the unsigned 8-bit 115490075Sobrien values in A and B. Return the value in the lower 16-bit word; the 115590075Sobrien upper words are cleared. */ 115690075Sobrienstatic __inline __m64 115790075Sobrien_mm_sad_pu8 (__m64 __A, __m64 __B) 115890075Sobrien{ 115990075Sobrien return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B); 116090075Sobrien} 116190075Sobrien 1162122180Skanstatic __inline __m64 1163122180Skan_m_psadbw (__m64 __A, __m64 __B) 1164122180Skan{ 1165122180Skan return _mm_sad_pu8 (__A, __B); 1166122180Skan} 1167122180Skan 116890075Sobrien/* Loads one cache line from address P to a location "closer" to the 116990075Sobrien processor. The selector I specifies the type of prefetch operation. */ 117090075Sobrien#if 0 117190075Sobrienstatic __inline void 117290075Sobrien_mm_prefetch (void *__P, enum _mm_hint __I) 117390075Sobrien{ 117490075Sobrien __builtin_prefetch (__P, 0, __I); 117590075Sobrien} 117690075Sobrien#else 117790075Sobrien#define _mm_prefetch(P, I) \ 117890075Sobrien __builtin_prefetch ((P), 0, (I)) 117990075Sobrien#endif 118090075Sobrien 118190075Sobrien/* Stores the data in A to the address P without polluting the caches. */ 118290075Sobrienstatic __inline void 118390075Sobrien_mm_stream_pi (__m64 *__P, __m64 __A) 118490075Sobrien{ 1185117395Skan __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A); 118690075Sobrien} 118790075Sobrien 118890075Sobrien/* Likewise. The address must be 16-byte aligned. */ 118990075Sobrienstatic __inline void 119090075Sobrien_mm_stream_ps (float *__P, __m128 __A) 119190075Sobrien{ 119290075Sobrien __builtin_ia32_movntps (__P, (__v4sf)__A); 119390075Sobrien} 119490075Sobrien 119590075Sobrien/* Guarantees that every preceeding store is globally visible before 119690075Sobrien any subsequent store. */ 119790075Sobrienstatic __inline void 119890075Sobrien_mm_sfence (void) 119990075Sobrien{ 120090075Sobrien __builtin_ia32_sfence (); 120190075Sobrien} 120290075Sobrien 120390075Sobrien/* The execution of the next instruction is delayed by an implementation 120490075Sobrien specific amount of time. The instruction does not modify the 120590075Sobrien architectural state. */ 120690075Sobrienstatic __inline void 120790075Sobrien_mm_pause (void) 120890075Sobrien{ 120990075Sobrien __asm__ __volatile__ ("rep; nop" : : ); 121090075Sobrien} 121190075Sobrien 121290075Sobrien/* Transpose the 4x4 matrix composed of row[0-3]. */ 121390075Sobrien#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 121490075Sobriendo { \ 121590075Sobrien __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ 121690075Sobrien __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44); \ 1217107590Sobrien __v4sf __t2 = __builtin_ia32_shufps (__r0, __r1, 0xEE); \ 1218107590Sobrien __v4sf __t1 = __builtin_ia32_shufps (__r2, __r3, 0x44); \ 121990075Sobrien __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE); \ 122090075Sobrien (row0) = __builtin_ia32_shufps (__t0, __t1, 0x88); \ 122190075Sobrien (row1) = __builtin_ia32_shufps (__t0, __t1, 0xDD); \ 122290075Sobrien (row2) = __builtin_ia32_shufps (__t2, __t3, 0x88); \ 122390075Sobrien (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD); \ 122490075Sobrien} while (0) 122590075Sobrien 1226122180Skan/* For backward source compatibility. */ 1227122180Skan#include <emmintrin.h> 1228117395Skan 1229117395Skan#endif /* __SSE__ */ 123090075Sobrien#endif /* _XMMINTRIN_H_INCLUDED */ 1231