xmmintrin.h revision 132718
1132718Skan/* Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc. 290075Sobrien 3132718Skan This file is part of GCC. 490075Sobrien 5132718Skan GCC is free software; you can redistribute it and/or modify 690075Sobrien it under the terms of the GNU General Public License as published by 790075Sobrien the Free Software Foundation; either version 2, or (at your option) 890075Sobrien any later version. 990075Sobrien 10132718Skan GCC is distributed in the hope that it will be useful, 1190075Sobrien but WITHOUT ANY WARRANTY; without even the implied warranty of 1290075Sobrien MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 1390075Sobrien GNU General Public License for more details. 1490075Sobrien 1590075Sobrien You should have received a copy of the GNU General Public License 16132718Skan along with GCC; see the file COPYING. If not, write to 1790075Sobrien the Free Software Foundation, 59 Temple Place - Suite 330, 1890075Sobrien Boston, MA 02111-1307, USA. */ 1990075Sobrien 2090075Sobrien/* As a special exception, if you include this header file into source 2190075Sobrien files compiled by GCC, this header file does not by itself cause 2290075Sobrien the resulting executable to be covered by the GNU General Public 2390075Sobrien License. This exception does not however invalidate any other 2490075Sobrien reasons why the executable file might be covered by the GNU General 2590075Sobrien Public License. */ 2690075Sobrien 2790075Sobrien/* Implemented from the specification included in the Intel C++ Compiler 28122180Skan User Guide and Reference, version 8.0. */ 2990075Sobrien 3090075Sobrien#ifndef _XMMINTRIN_H_INCLUDED 3190075Sobrien#define _XMMINTRIN_H_INCLUDED 3290075Sobrien 33117395Skan#ifndef __SSE__ 34117395Skan# error "SSE instruction set not enabled" 35117395Skan#else 36117395Skan 3790075Sobrien/* We need type definitions from the MMX header file. */ 3890075Sobrien#include <mmintrin.h> 3990075Sobrien 40132718Skan/* The data type intended for user use. */ 4190075Sobrientypedef int __m128 __attribute__ ((__mode__(__V4SF__))); 4290075Sobrien 43132718Skan/* Internal data types for implementing the intrinsics. */ 4490075Sobrientypedef int __v4sf __attribute__ ((__mode__(__V4SF__))); 4590075Sobrien 4690075Sobrien/* Create a selector for use with the SHUFPS instruction. */ 4790075Sobrien#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ 4890075Sobrien (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) 4990075Sobrien 5090075Sobrien/* Constants for use with _mm_prefetch. */ 5190075Sobrienenum _mm_hint 5290075Sobrien{ 5390075Sobrien _MM_HINT_T0 = 3, 5490075Sobrien _MM_HINT_T1 = 2, 5590075Sobrien _MM_HINT_T2 = 1, 5690075Sobrien _MM_HINT_NTA = 0 5790075Sobrien}; 5890075Sobrien 5990075Sobrien/* Bits in the MXCSR. */ 6090075Sobrien#define _MM_EXCEPT_MASK 0x003f 6190075Sobrien#define _MM_EXCEPT_INVALID 0x0001 6290075Sobrien#define _MM_EXCEPT_DENORM 0x0002 6390075Sobrien#define _MM_EXCEPT_DIV_ZERO 0x0004 6490075Sobrien#define _MM_EXCEPT_OVERFLOW 0x0008 6590075Sobrien#define _MM_EXCEPT_UNDERFLOW 0x0010 6690075Sobrien#define _MM_EXCEPT_INEXACT 0x0020 6790075Sobrien 6890075Sobrien#define _MM_MASK_MASK 0x1f80 6990075Sobrien#define _MM_MASK_INVALID 0x0080 7090075Sobrien#define _MM_MASK_DENORM 0x0100 7190075Sobrien#define _MM_MASK_DIV_ZERO 0x0200 7290075Sobrien#define _MM_MASK_OVERFLOW 0x0400 7390075Sobrien#define _MM_MASK_UNDERFLOW 0x0800 7490075Sobrien#define _MM_MASK_INEXACT 0x1000 7590075Sobrien 7690075Sobrien#define _MM_ROUND_MASK 0x6000 7790075Sobrien#define _MM_ROUND_NEAREST 0x0000 7890075Sobrien#define _MM_ROUND_DOWN 0x2000 7990075Sobrien#define _MM_ROUND_UP 0x4000 8090075Sobrien#define _MM_ROUND_TOWARD_ZERO 0x6000 8190075Sobrien 8290075Sobrien#define _MM_FLUSH_ZERO_MASK 0x8000 8390075Sobrien#define _MM_FLUSH_ZERO_ON 0x8000 8490075Sobrien#define _MM_FLUSH_ZERO_OFF 0x0000 8590075Sobrien 8690075Sobrien/* Perform the respective operation on the lower SPFP (single-precision 8790075Sobrien floating-point) values of A and B; the upper three SPFP values are 8890075Sobrien passed through from A. */ 8990075Sobrien 9090075Sobrienstatic __inline __m128 9190075Sobrien_mm_add_ss (__m128 __A, __m128 __B) 9290075Sobrien{ 9390075Sobrien return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B); 9490075Sobrien} 9590075Sobrien 9690075Sobrienstatic __inline __m128 9790075Sobrien_mm_sub_ss (__m128 __A, __m128 __B) 9890075Sobrien{ 9990075Sobrien return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B); 10090075Sobrien} 10190075Sobrien 10290075Sobrienstatic __inline __m128 10390075Sobrien_mm_mul_ss (__m128 __A, __m128 __B) 10490075Sobrien{ 10590075Sobrien return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B); 10690075Sobrien} 10790075Sobrien 10890075Sobrienstatic __inline __m128 10990075Sobrien_mm_div_ss (__m128 __A, __m128 __B) 11090075Sobrien{ 11190075Sobrien return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B); 11290075Sobrien} 11390075Sobrien 11490075Sobrienstatic __inline __m128 11590075Sobrien_mm_sqrt_ss (__m128 __A) 11690075Sobrien{ 11790075Sobrien return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A); 11890075Sobrien} 11990075Sobrien 12090075Sobrienstatic __inline __m128 12190075Sobrien_mm_rcp_ss (__m128 __A) 12290075Sobrien{ 12390075Sobrien return (__m128) __builtin_ia32_rcpss ((__v4sf)__A); 12490075Sobrien} 12590075Sobrien 12690075Sobrienstatic __inline __m128 12790075Sobrien_mm_rsqrt_ss (__m128 __A) 12890075Sobrien{ 12990075Sobrien return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A); 13090075Sobrien} 13190075Sobrien 13290075Sobrienstatic __inline __m128 13390075Sobrien_mm_min_ss (__m128 __A, __m128 __B) 13490075Sobrien{ 13590075Sobrien return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B); 13690075Sobrien} 13790075Sobrien 13890075Sobrienstatic __inline __m128 13990075Sobrien_mm_max_ss (__m128 __A, __m128 __B) 14090075Sobrien{ 14190075Sobrien return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B); 14290075Sobrien} 14390075Sobrien 14490075Sobrien/* Perform the respective operation on the four SPFP values in A and B. */ 14590075Sobrien 14690075Sobrienstatic __inline __m128 14790075Sobrien_mm_add_ps (__m128 __A, __m128 __B) 14890075Sobrien{ 14990075Sobrien return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B); 15090075Sobrien} 15190075Sobrien 15290075Sobrienstatic __inline __m128 15390075Sobrien_mm_sub_ps (__m128 __A, __m128 __B) 15490075Sobrien{ 15590075Sobrien return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B); 15690075Sobrien} 15790075Sobrien 15890075Sobrienstatic __inline __m128 15990075Sobrien_mm_mul_ps (__m128 __A, __m128 __B) 16090075Sobrien{ 16190075Sobrien return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B); 16290075Sobrien} 16390075Sobrien 16490075Sobrienstatic __inline __m128 16590075Sobrien_mm_div_ps (__m128 __A, __m128 __B) 16690075Sobrien{ 16790075Sobrien return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B); 16890075Sobrien} 16990075Sobrien 17090075Sobrienstatic __inline __m128 17190075Sobrien_mm_sqrt_ps (__m128 __A) 17290075Sobrien{ 17390075Sobrien return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A); 17490075Sobrien} 17590075Sobrien 17690075Sobrienstatic __inline __m128 17790075Sobrien_mm_rcp_ps (__m128 __A) 17890075Sobrien{ 17990075Sobrien return (__m128) __builtin_ia32_rcpps ((__v4sf)__A); 18090075Sobrien} 18190075Sobrien 18290075Sobrienstatic __inline __m128 18390075Sobrien_mm_rsqrt_ps (__m128 __A) 18490075Sobrien{ 18590075Sobrien return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A); 18690075Sobrien} 18790075Sobrien 18890075Sobrienstatic __inline __m128 18990075Sobrien_mm_min_ps (__m128 __A, __m128 __B) 19090075Sobrien{ 19190075Sobrien return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B); 19290075Sobrien} 19390075Sobrien 19490075Sobrienstatic __inline __m128 19590075Sobrien_mm_max_ps (__m128 __A, __m128 __B) 19690075Sobrien{ 19790075Sobrien return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B); 19890075Sobrien} 19990075Sobrien 20090075Sobrien/* Perform logical bit-wise operations on 128-bit values. */ 20190075Sobrien 20290075Sobrienstatic __inline __m128 20390075Sobrien_mm_and_ps (__m128 __A, __m128 __B) 20490075Sobrien{ 20590075Sobrien return __builtin_ia32_andps (__A, __B); 20690075Sobrien} 20790075Sobrien 20890075Sobrienstatic __inline __m128 20990075Sobrien_mm_andnot_ps (__m128 __A, __m128 __B) 21090075Sobrien{ 21190075Sobrien return __builtin_ia32_andnps (__A, __B); 21290075Sobrien} 21390075Sobrien 21490075Sobrienstatic __inline __m128 21590075Sobrien_mm_or_ps (__m128 __A, __m128 __B) 21690075Sobrien{ 21790075Sobrien return __builtin_ia32_orps (__A, __B); 21890075Sobrien} 21990075Sobrien 22090075Sobrienstatic __inline __m128 22190075Sobrien_mm_xor_ps (__m128 __A, __m128 __B) 22290075Sobrien{ 22390075Sobrien return __builtin_ia32_xorps (__A, __B); 22490075Sobrien} 22590075Sobrien 22690075Sobrien/* Perform a comparison on the lower SPFP values of A and B. If the 22790075Sobrien comparison is true, place a mask of all ones in the result, otherwise a 22890075Sobrien mask of zeros. The upper three SPFP values are passed through from A. */ 22990075Sobrien 23090075Sobrienstatic __inline __m128 23190075Sobrien_mm_cmpeq_ss (__m128 __A, __m128 __B) 23290075Sobrien{ 23390075Sobrien return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B); 23490075Sobrien} 23590075Sobrien 23690075Sobrienstatic __inline __m128 23790075Sobrien_mm_cmplt_ss (__m128 __A, __m128 __B) 23890075Sobrien{ 23990075Sobrien return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B); 24090075Sobrien} 24190075Sobrien 24290075Sobrienstatic __inline __m128 24390075Sobrien_mm_cmple_ss (__m128 __A, __m128 __B) 24490075Sobrien{ 24590075Sobrien return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B); 24690075Sobrien} 24790075Sobrien 24890075Sobrienstatic __inline __m128 24990075Sobrien_mm_cmpgt_ss (__m128 __A, __m128 __B) 25090075Sobrien{ 251107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 252107590Sobrien (__v4sf) 253107590Sobrien __builtin_ia32_cmpltss ((__v4sf) __B, 254107590Sobrien (__v4sf) 255107590Sobrien __A)); 25690075Sobrien} 25790075Sobrien 25890075Sobrienstatic __inline __m128 25990075Sobrien_mm_cmpge_ss (__m128 __A, __m128 __B) 26090075Sobrien{ 261107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 262107590Sobrien (__v4sf) 263107590Sobrien __builtin_ia32_cmpless ((__v4sf) __B, 264107590Sobrien (__v4sf) 265107590Sobrien __A)); 26690075Sobrien} 26790075Sobrien 26890075Sobrienstatic __inline __m128 26990075Sobrien_mm_cmpneq_ss (__m128 __A, __m128 __B) 27090075Sobrien{ 27190075Sobrien return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B); 27290075Sobrien} 27390075Sobrien 27490075Sobrienstatic __inline __m128 27590075Sobrien_mm_cmpnlt_ss (__m128 __A, __m128 __B) 27690075Sobrien{ 27790075Sobrien return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B); 27890075Sobrien} 27990075Sobrien 28090075Sobrienstatic __inline __m128 28190075Sobrien_mm_cmpnle_ss (__m128 __A, __m128 __B) 28290075Sobrien{ 28390075Sobrien return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B); 28490075Sobrien} 28590075Sobrien 28690075Sobrienstatic __inline __m128 28790075Sobrien_mm_cmpngt_ss (__m128 __A, __m128 __B) 28890075Sobrien{ 289107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 290107590Sobrien (__v4sf) 291107590Sobrien __builtin_ia32_cmpnltss ((__v4sf) __B, 292107590Sobrien (__v4sf) 293107590Sobrien __A)); 29490075Sobrien} 29590075Sobrien 29690075Sobrienstatic __inline __m128 29790075Sobrien_mm_cmpnge_ss (__m128 __A, __m128 __B) 29890075Sobrien{ 299107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 300107590Sobrien (__v4sf) 301107590Sobrien __builtin_ia32_cmpnless ((__v4sf) __B, 302107590Sobrien (__v4sf) 303107590Sobrien __A)); 30490075Sobrien} 30590075Sobrien 30690075Sobrienstatic __inline __m128 30790075Sobrien_mm_cmpord_ss (__m128 __A, __m128 __B) 30890075Sobrien{ 30990075Sobrien return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B); 31090075Sobrien} 31190075Sobrien 31290075Sobrienstatic __inline __m128 31390075Sobrien_mm_cmpunord_ss (__m128 __A, __m128 __B) 31490075Sobrien{ 31590075Sobrien return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B); 31690075Sobrien} 31790075Sobrien 31890075Sobrien/* Perform a comparison on the four SPFP values of A and B. For each 31990075Sobrien element, if the comparison is true, place a mask of all ones in the 32090075Sobrien result, otherwise a mask of zeros. */ 32190075Sobrien 32290075Sobrienstatic __inline __m128 32390075Sobrien_mm_cmpeq_ps (__m128 __A, __m128 __B) 32490075Sobrien{ 32590075Sobrien return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B); 32690075Sobrien} 32790075Sobrien 32890075Sobrienstatic __inline __m128 32990075Sobrien_mm_cmplt_ps (__m128 __A, __m128 __B) 33090075Sobrien{ 33190075Sobrien return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B); 33290075Sobrien} 33390075Sobrien 33490075Sobrienstatic __inline __m128 33590075Sobrien_mm_cmple_ps (__m128 __A, __m128 __B) 33690075Sobrien{ 33790075Sobrien return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B); 33890075Sobrien} 33990075Sobrien 34090075Sobrienstatic __inline __m128 34190075Sobrien_mm_cmpgt_ps (__m128 __A, __m128 __B) 34290075Sobrien{ 34390075Sobrien return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B); 34490075Sobrien} 34590075Sobrien 34690075Sobrienstatic __inline __m128 34790075Sobrien_mm_cmpge_ps (__m128 __A, __m128 __B) 34890075Sobrien{ 34990075Sobrien return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B); 35090075Sobrien} 35190075Sobrien 35290075Sobrienstatic __inline __m128 35390075Sobrien_mm_cmpneq_ps (__m128 __A, __m128 __B) 35490075Sobrien{ 35590075Sobrien return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B); 35690075Sobrien} 35790075Sobrien 35890075Sobrienstatic __inline __m128 35990075Sobrien_mm_cmpnlt_ps (__m128 __A, __m128 __B) 36090075Sobrien{ 36190075Sobrien return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B); 36290075Sobrien} 36390075Sobrien 36490075Sobrienstatic __inline __m128 36590075Sobrien_mm_cmpnle_ps (__m128 __A, __m128 __B) 36690075Sobrien{ 36790075Sobrien return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B); 36890075Sobrien} 36990075Sobrien 37090075Sobrienstatic __inline __m128 37190075Sobrien_mm_cmpngt_ps (__m128 __A, __m128 __B) 37290075Sobrien{ 37390075Sobrien return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B); 37490075Sobrien} 37590075Sobrien 37690075Sobrienstatic __inline __m128 37790075Sobrien_mm_cmpnge_ps (__m128 __A, __m128 __B) 37890075Sobrien{ 37990075Sobrien return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B); 38090075Sobrien} 38190075Sobrien 38290075Sobrienstatic __inline __m128 38390075Sobrien_mm_cmpord_ps (__m128 __A, __m128 __B) 38490075Sobrien{ 38590075Sobrien return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B); 38690075Sobrien} 38790075Sobrien 38890075Sobrienstatic __inline __m128 38990075Sobrien_mm_cmpunord_ps (__m128 __A, __m128 __B) 39090075Sobrien{ 39190075Sobrien return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B); 39290075Sobrien} 39390075Sobrien 39490075Sobrien/* Compare the lower SPFP values of A and B and return 1 if true 39590075Sobrien and 0 if false. */ 39690075Sobrien 39790075Sobrienstatic __inline int 39890075Sobrien_mm_comieq_ss (__m128 __A, __m128 __B) 39990075Sobrien{ 40090075Sobrien return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B); 40190075Sobrien} 40290075Sobrien 40390075Sobrienstatic __inline int 40490075Sobrien_mm_comilt_ss (__m128 __A, __m128 __B) 40590075Sobrien{ 40690075Sobrien return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B); 40790075Sobrien} 40890075Sobrien 40990075Sobrienstatic __inline int 41090075Sobrien_mm_comile_ss (__m128 __A, __m128 __B) 41190075Sobrien{ 41290075Sobrien return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B); 41390075Sobrien} 41490075Sobrien 41590075Sobrienstatic __inline int 41690075Sobrien_mm_comigt_ss (__m128 __A, __m128 __B) 41790075Sobrien{ 41890075Sobrien return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B); 41990075Sobrien} 42090075Sobrien 42190075Sobrienstatic __inline int 42290075Sobrien_mm_comige_ss (__m128 __A, __m128 __B) 42390075Sobrien{ 42490075Sobrien return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B); 42590075Sobrien} 42690075Sobrien 42790075Sobrienstatic __inline int 42890075Sobrien_mm_comineq_ss (__m128 __A, __m128 __B) 42990075Sobrien{ 43090075Sobrien return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B); 43190075Sobrien} 43290075Sobrien 43390075Sobrienstatic __inline int 43490075Sobrien_mm_ucomieq_ss (__m128 __A, __m128 __B) 43590075Sobrien{ 43690075Sobrien return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B); 43790075Sobrien} 43890075Sobrien 43990075Sobrienstatic __inline int 44090075Sobrien_mm_ucomilt_ss (__m128 __A, __m128 __B) 44190075Sobrien{ 44290075Sobrien return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B); 44390075Sobrien} 44490075Sobrien 44590075Sobrienstatic __inline int 44690075Sobrien_mm_ucomile_ss (__m128 __A, __m128 __B) 44790075Sobrien{ 44890075Sobrien return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B); 44990075Sobrien} 45090075Sobrien 45190075Sobrienstatic __inline int 45290075Sobrien_mm_ucomigt_ss (__m128 __A, __m128 __B) 45390075Sobrien{ 45490075Sobrien return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B); 45590075Sobrien} 45690075Sobrien 45790075Sobrienstatic __inline int 45890075Sobrien_mm_ucomige_ss (__m128 __A, __m128 __B) 45990075Sobrien{ 46090075Sobrien return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B); 46190075Sobrien} 46290075Sobrien 46390075Sobrienstatic __inline int 46490075Sobrien_mm_ucomineq_ss (__m128 __A, __m128 __B) 46590075Sobrien{ 46690075Sobrien return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B); 46790075Sobrien} 46890075Sobrien 46990075Sobrien/* Convert the lower SPFP value to a 32-bit integer according to the current 47090075Sobrien rounding mode. */ 47190075Sobrienstatic __inline int 47290075Sobrien_mm_cvtss_si32 (__m128 __A) 47390075Sobrien{ 47490075Sobrien return __builtin_ia32_cvtss2si ((__v4sf) __A); 47590075Sobrien} 47690075Sobrien 477122180Skanstatic __inline int 478122180Skan_mm_cvt_ss2si (__m128 __A) 479122180Skan{ 480122180Skan return _mm_cvtss_si32 (__A); 481122180Skan} 482122180Skan 483117395Skan#ifdef __x86_64__ 484117395Skan/* Convert the lower SPFP value to a 32-bit integer according to the current 485117395Skan rounding mode. */ 486117395Skanstatic __inline long long 487117395Skan_mm_cvtss_si64x (__m128 __A) 488117395Skan{ 489117395Skan return __builtin_ia32_cvtss2si64 ((__v4sf) __A); 490117395Skan} 491117395Skan#endif 492117395Skan 49390075Sobrien/* Convert the two lower SPFP values to 32-bit integers according to the 49490075Sobrien current rounding mode. Return the integers in packed form. */ 49590075Sobrienstatic __inline __m64 49690075Sobrien_mm_cvtps_pi32 (__m128 __A) 49790075Sobrien{ 49890075Sobrien return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A); 49990075Sobrien} 50090075Sobrien 501122180Skanstatic __inline __m64 502122180Skan_mm_cvt_ps2pi (__m128 __A) 503122180Skan{ 504122180Skan return _mm_cvtps_pi32 (__A); 505122180Skan} 506122180Skan 50790075Sobrien/* Truncate the lower SPFP value to a 32-bit integer. */ 50890075Sobrienstatic __inline int 50990075Sobrien_mm_cvttss_si32 (__m128 __A) 51090075Sobrien{ 51190075Sobrien return __builtin_ia32_cvttss2si ((__v4sf) __A); 51290075Sobrien} 51390075Sobrien 514122180Skanstatic __inline int 515122180Skan_mm_cvtt_ss2si (__m128 __A) 516122180Skan{ 517122180Skan return _mm_cvttss_si32 (__A); 518122180Skan} 519122180Skan 520117395Skan#ifdef __x86_64__ 521117395Skan/* Truncate the lower SPFP value to a 32-bit integer. */ 522117395Skanstatic __inline long long 523117395Skan_mm_cvttss_si64x (__m128 __A) 524117395Skan{ 525117395Skan return __builtin_ia32_cvttss2si64 ((__v4sf) __A); 526117395Skan} 527117395Skan#endif 528117395Skan 52990075Sobrien/* Truncate the two lower SPFP values to 32-bit integers. Return the 53090075Sobrien integers in packed form. */ 53190075Sobrienstatic __inline __m64 53290075Sobrien_mm_cvttps_pi32 (__m128 __A) 53390075Sobrien{ 53490075Sobrien return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A); 53590075Sobrien} 53690075Sobrien 537122180Skanstatic __inline __m64 538122180Skan_mm_cvtt_ps2pi (__m128 __A) 539122180Skan{ 540122180Skan return _mm_cvttps_pi32 (__A); 541122180Skan} 542122180Skan 54390075Sobrien/* Convert B to a SPFP value and insert it as element zero in A. */ 54490075Sobrienstatic __inline __m128 54590075Sobrien_mm_cvtsi32_ss (__m128 __A, int __B) 54690075Sobrien{ 54790075Sobrien return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B); 54890075Sobrien} 54990075Sobrien 550122180Skanstatic __inline __m128 551122180Skan_mm_cvt_si2ss (__m128 __A, int __B) 552122180Skan{ 553122180Skan return _mm_cvtsi32_ss (__A, __B); 554122180Skan} 555122180Skan 556117395Skan#ifdef __x86_64__ 557117395Skan/* Convert B to a SPFP value and insert it as element zero in A. */ 558117395Skanstatic __inline __m128 559117395Skan_mm_cvtsi64x_ss (__m128 __A, long long __B) 560117395Skan{ 561117395Skan return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B); 562117395Skan} 563117395Skan#endif 564117395Skan 56590075Sobrien/* Convert the two 32-bit values in B to SPFP form and insert them 56690075Sobrien as the two lower elements in A. */ 56790075Sobrienstatic __inline __m128 56890075Sobrien_mm_cvtpi32_ps (__m128 __A, __m64 __B) 56990075Sobrien{ 57090075Sobrien return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B); 57190075Sobrien} 57290075Sobrien 573122180Skanstatic __inline __m128 574122180Skan_mm_cvt_pi2ps (__m128 __A, __m64 __B) 575122180Skan{ 576122180Skan return _mm_cvtpi32_ps (__A, __B); 577122180Skan} 578122180Skan 57990075Sobrien/* Convert the four signed 16-bit values in A to SPFP form. */ 58090075Sobrienstatic __inline __m128 58190075Sobrien_mm_cvtpi16_ps (__m64 __A) 58290075Sobrien{ 58390075Sobrien __v4hi __sign; 58490075Sobrien __v2si __hisi, __losi; 58590075Sobrien __v4sf __r; 58690075Sobrien 58790075Sobrien /* This comparison against zero gives us a mask that can be used to 58890075Sobrien fill in the missing sign bits in the unpack operations below, so 58990075Sobrien that we get signed values after unpacking. */ 59090075Sobrien __sign = (__v4hi) __builtin_ia32_mmx_zero (); 59190075Sobrien __sign = __builtin_ia32_pcmpgtw (__sign, (__v4hi)__A); 59290075Sobrien 59390075Sobrien /* Convert the four words to doublewords. */ 59490075Sobrien __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign); 59590075Sobrien __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign); 59690075Sobrien 59790075Sobrien /* Convert the doublewords to floating point two at a time. */ 59890075Sobrien __r = (__v4sf) __builtin_ia32_setzerops (); 59990075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __hisi); 60090075Sobrien __r = __builtin_ia32_movlhps (__r, __r); 60190075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __losi); 60290075Sobrien 60390075Sobrien return (__m128) __r; 60490075Sobrien} 60590075Sobrien 60690075Sobrien/* Convert the four unsigned 16-bit values in A to SPFP form. */ 60790075Sobrienstatic __inline __m128 60890075Sobrien_mm_cvtpu16_ps (__m64 __A) 60990075Sobrien{ 61090075Sobrien __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero (); 61190075Sobrien __v2si __hisi, __losi; 61290075Sobrien __v4sf __r; 61390075Sobrien 61490075Sobrien /* Convert the four words to doublewords. */ 61590075Sobrien __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __zero); 61690075Sobrien __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __zero); 61790075Sobrien 61890075Sobrien /* Convert the doublewords to floating point two at a time. */ 61990075Sobrien __r = (__v4sf) __builtin_ia32_setzerops (); 62090075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __hisi); 62190075Sobrien __r = __builtin_ia32_movlhps (__r, __r); 62290075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __losi); 62390075Sobrien 62490075Sobrien return (__m128) __r; 62590075Sobrien} 62690075Sobrien 62790075Sobrien/* Convert the low four signed 8-bit values in A to SPFP form. */ 62890075Sobrienstatic __inline __m128 62990075Sobrien_mm_cvtpi8_ps (__m64 __A) 63090075Sobrien{ 63190075Sobrien __v8qi __sign; 63290075Sobrien 63390075Sobrien /* This comparison against zero gives us a mask that can be used to 63490075Sobrien fill in the missing sign bits in the unpack operations below, so 63590075Sobrien that we get signed values after unpacking. */ 63690075Sobrien __sign = (__v8qi) __builtin_ia32_mmx_zero (); 63790075Sobrien __sign = __builtin_ia32_pcmpgtb (__sign, (__v8qi)__A); 63890075Sobrien 63990075Sobrien /* Convert the four low bytes to words. */ 64090075Sobrien __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign); 64190075Sobrien 64290075Sobrien return _mm_cvtpi16_ps(__A); 64390075Sobrien} 64490075Sobrien 64590075Sobrien/* Convert the low four unsigned 8-bit values in A to SPFP form. */ 64690075Sobrienstatic __inline __m128 64790075Sobrien_mm_cvtpu8_ps(__m64 __A) 64890075Sobrien{ 64990075Sobrien __v8qi __zero = (__v8qi) __builtin_ia32_mmx_zero (); 65090075Sobrien __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __zero); 65190075Sobrien return _mm_cvtpu16_ps(__A); 65290075Sobrien} 65390075Sobrien 65490075Sobrien/* Convert the four signed 32-bit values in A and B to SPFP form. */ 65590075Sobrienstatic __inline __m128 65690075Sobrien_mm_cvtpi32x2_ps(__m64 __A, __m64 __B) 65790075Sobrien{ 65890075Sobrien __v4sf __zero = (__v4sf) __builtin_ia32_setzerops (); 65990075Sobrien __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A); 66090075Sobrien __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B); 66190075Sobrien return (__m128) __builtin_ia32_movlhps (__sfa, __sfb); 66290075Sobrien} 66390075Sobrien 66490075Sobrien/* Convert the four SPFP values in A to four signed 16-bit integers. */ 66590075Sobrienstatic __inline __m64 66690075Sobrien_mm_cvtps_pi16(__m128 __A) 66790075Sobrien{ 66890075Sobrien __v4sf __hisf = (__v4sf)__A; 66990075Sobrien __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf); 67090075Sobrien __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf); 67190075Sobrien __v2si __losi = __builtin_ia32_cvtps2pi (__losf); 672117395Skan return (__m64) __builtin_ia32_packssdw (__hisi, __losi); 67390075Sobrien} 67490075Sobrien 67590075Sobrien/* Convert the four SPFP values in A to four signed 8-bit integers. */ 67690075Sobrienstatic __inline __m64 67790075Sobrien_mm_cvtps_pi8(__m128 __A) 67890075Sobrien{ 67990075Sobrien __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A); 68090075Sobrien __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero (); 68190075Sobrien return (__m64) __builtin_ia32_packsswb (__tmp, __zero); 68290075Sobrien} 68390075Sobrien 68490075Sobrien/* Selects four specific SPFP values from A and B based on MASK. */ 68590075Sobrien#if 0 68690075Sobrienstatic __inline __m128 68790075Sobrien_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask) 68890075Sobrien{ 68990075Sobrien return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask); 69090075Sobrien} 69190075Sobrien#else 69290075Sobrien#define _mm_shuffle_ps(A, B, MASK) \ 69390075Sobrien ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK))) 69490075Sobrien#endif 69590075Sobrien 69690075Sobrien 69790075Sobrien/* Selects and interleaves the upper two SPFP values from A and B. */ 69890075Sobrienstatic __inline __m128 69990075Sobrien_mm_unpackhi_ps (__m128 __A, __m128 __B) 70090075Sobrien{ 70190075Sobrien return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B); 70290075Sobrien} 70390075Sobrien 70490075Sobrien/* Selects and interleaves the lower two SPFP values from A and B. */ 70590075Sobrienstatic __inline __m128 70690075Sobrien_mm_unpacklo_ps (__m128 __A, __m128 __B) 70790075Sobrien{ 70890075Sobrien return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B); 70990075Sobrien} 71090075Sobrien 71190075Sobrien/* Sets the upper two SPFP values with 64-bits of data loaded from P; 71290075Sobrien the lower two values are passed through from A. */ 71390075Sobrienstatic __inline __m128 714117395Skan_mm_loadh_pi (__m128 __A, __m64 const *__P) 71590075Sobrien{ 71690075Sobrien return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P); 71790075Sobrien} 71890075Sobrien 71990075Sobrien/* Stores the upper two SPFP values of A into P. */ 72090075Sobrienstatic __inline void 72190075Sobrien_mm_storeh_pi (__m64 *__P, __m128 __A) 72290075Sobrien{ 72390075Sobrien __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A); 72490075Sobrien} 72590075Sobrien 72690075Sobrien/* Moves the upper two values of B into the lower two values of A. */ 72790075Sobrienstatic __inline __m128 72890075Sobrien_mm_movehl_ps (__m128 __A, __m128 __B) 72990075Sobrien{ 73090075Sobrien return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B); 73190075Sobrien} 73290075Sobrien 73390075Sobrien/* Moves the lower two values of B into the upper two values of A. */ 73490075Sobrienstatic __inline __m128 73590075Sobrien_mm_movelh_ps (__m128 __A, __m128 __B) 73690075Sobrien{ 73790075Sobrien return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B); 73890075Sobrien} 73990075Sobrien 74090075Sobrien/* Sets the lower two SPFP values with 64-bits of data loaded from P; 74190075Sobrien the upper two values are passed through from A. */ 74290075Sobrienstatic __inline __m128 743117395Skan_mm_loadl_pi (__m128 __A, __m64 const *__P) 74490075Sobrien{ 74590075Sobrien return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P); 74690075Sobrien} 74790075Sobrien 74890075Sobrien/* Stores the lower two SPFP values of A into P. */ 74990075Sobrienstatic __inline void 75090075Sobrien_mm_storel_pi (__m64 *__P, __m128 __A) 75190075Sobrien{ 75290075Sobrien __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A); 75390075Sobrien} 75490075Sobrien 75590075Sobrien/* Creates a 4-bit mask from the most significant bits of the SPFP values. */ 75690075Sobrienstatic __inline int 75790075Sobrien_mm_movemask_ps (__m128 __A) 75890075Sobrien{ 75990075Sobrien return __builtin_ia32_movmskps ((__v4sf)__A); 76090075Sobrien} 76190075Sobrien 76290075Sobrien/* Return the contents of the control register. */ 76390075Sobrienstatic __inline unsigned int 76490075Sobrien_mm_getcsr (void) 76590075Sobrien{ 76690075Sobrien return __builtin_ia32_stmxcsr (); 76790075Sobrien} 76890075Sobrien 76990075Sobrien/* Read exception bits from the control register. */ 77090075Sobrienstatic __inline unsigned int 77190075Sobrien_MM_GET_EXCEPTION_STATE (void) 77290075Sobrien{ 77390075Sobrien return _mm_getcsr() & _MM_EXCEPT_MASK; 77490075Sobrien} 77590075Sobrien 77690075Sobrienstatic __inline unsigned int 77790075Sobrien_MM_GET_EXCEPTION_MASK (void) 77890075Sobrien{ 77990075Sobrien return _mm_getcsr() & _MM_MASK_MASK; 78090075Sobrien} 78190075Sobrien 78290075Sobrienstatic __inline unsigned int 78390075Sobrien_MM_GET_ROUNDING_MODE (void) 78490075Sobrien{ 78590075Sobrien return _mm_getcsr() & _MM_ROUND_MASK; 78690075Sobrien} 78790075Sobrien 78890075Sobrienstatic __inline unsigned int 78990075Sobrien_MM_GET_FLUSH_ZERO_MODE (void) 79090075Sobrien{ 79190075Sobrien return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; 79290075Sobrien} 79390075Sobrien 79490075Sobrien/* Set the control register to I. */ 79590075Sobrienstatic __inline void 79690075Sobrien_mm_setcsr (unsigned int __I) 79790075Sobrien{ 79890075Sobrien __builtin_ia32_ldmxcsr (__I); 79990075Sobrien} 80090075Sobrien 80190075Sobrien/* Set exception bits in the control register. */ 80290075Sobrienstatic __inline void 80390075Sobrien_MM_SET_EXCEPTION_STATE(unsigned int __mask) 80490075Sobrien{ 80590075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask); 80690075Sobrien} 80790075Sobrien 80890075Sobrienstatic __inline void 80990075Sobrien_MM_SET_EXCEPTION_MASK (unsigned int __mask) 81090075Sobrien{ 81190075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask); 81290075Sobrien} 81390075Sobrien 81490075Sobrienstatic __inline void 81590075Sobrien_MM_SET_ROUNDING_MODE (unsigned int __mode) 81690075Sobrien{ 81790075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode); 81890075Sobrien} 81990075Sobrien 82090075Sobrienstatic __inline void 82190075Sobrien_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode) 82290075Sobrien{ 82390075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode); 82490075Sobrien} 82590075Sobrien 82690075Sobrien/* Create a vector with element 0 as *P and the rest zero. */ 82790075Sobrienstatic __inline __m128 828117395Skan_mm_load_ss (float const *__P) 82990075Sobrien{ 83090075Sobrien return (__m128) __builtin_ia32_loadss (__P); 83190075Sobrien} 83290075Sobrien 83390075Sobrien/* Create a vector with all four elements equal to *P. */ 83490075Sobrienstatic __inline __m128 835117395Skan_mm_load1_ps (float const *__P) 83690075Sobrien{ 83790075Sobrien __v4sf __tmp = __builtin_ia32_loadss (__P); 83890075Sobrien return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0)); 83990075Sobrien} 84090075Sobrien 84190075Sobrienstatic __inline __m128 842117395Skan_mm_load_ps1 (float const *__P) 84390075Sobrien{ 84490075Sobrien return _mm_load1_ps (__P); 84590075Sobrien} 84690075Sobrien 84790075Sobrien/* Load four SPFP values from P. The address must be 16-byte aligned. */ 84890075Sobrienstatic __inline __m128 849117395Skan_mm_load_ps (float const *__P) 85090075Sobrien{ 85190075Sobrien return (__m128) __builtin_ia32_loadaps (__P); 85290075Sobrien} 85390075Sobrien 85490075Sobrien/* Load four SPFP values from P. The address need not be 16-byte aligned. */ 85590075Sobrienstatic __inline __m128 856117395Skan_mm_loadu_ps (float const *__P) 85790075Sobrien{ 85890075Sobrien return (__m128) __builtin_ia32_loadups (__P); 85990075Sobrien} 86090075Sobrien 86190075Sobrien/* Load four SPFP values in reverse order. The address must be aligned. */ 86290075Sobrienstatic __inline __m128 863117395Skan_mm_loadr_ps (float const *__P) 86490075Sobrien{ 86590075Sobrien __v4sf __tmp = __builtin_ia32_loadaps (__P); 86690075Sobrien return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3)); 86790075Sobrien} 86890075Sobrien 86990075Sobrien/* Create a vector with element 0 as F and the rest zero. */ 87090075Sobrienstatic __inline __m128 87190075Sobrien_mm_set_ss (float __F) 87290075Sobrien{ 87390075Sobrien return (__m128) __builtin_ia32_loadss (&__F); 87490075Sobrien} 87590075Sobrien 87690075Sobrien/* Create a vector with all four elements equal to F. */ 87790075Sobrienstatic __inline __m128 87890075Sobrien_mm_set1_ps (float __F) 87990075Sobrien{ 88090075Sobrien __v4sf __tmp = __builtin_ia32_loadss (&__F); 88190075Sobrien return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0)); 88290075Sobrien} 88390075Sobrien 88490075Sobrienstatic __inline __m128 88590075Sobrien_mm_set_ps1 (float __F) 88690075Sobrien{ 88790075Sobrien return _mm_set1_ps (__F); 88890075Sobrien} 88990075Sobrien 89090075Sobrien/* Create the vector [Z Y X W]. */ 89190075Sobrienstatic __inline __m128 892132718Skan_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W) 89390075Sobrien{ 894132718Skan return (__v4sf) {__W, __X, __Y, __Z}; 89590075Sobrien} 89690075Sobrien 89790075Sobrien/* Create the vector [W X Y Z]. */ 89890075Sobrienstatic __inline __m128 89990075Sobrien_mm_setr_ps (float __Z, float __Y, float __X, float __W) 90090075Sobrien{ 90190075Sobrien return _mm_set_ps (__W, __X, __Y, __Z); 90290075Sobrien} 90390075Sobrien 90490075Sobrien/* Create a vector of zeros. */ 90590075Sobrienstatic __inline __m128 90690075Sobrien_mm_setzero_ps (void) 90790075Sobrien{ 90890075Sobrien return (__m128) __builtin_ia32_setzerops (); 90990075Sobrien} 91090075Sobrien 91190075Sobrien/* Stores the lower SPFP value. */ 91290075Sobrienstatic __inline void 91390075Sobrien_mm_store_ss (float *__P, __m128 __A) 91490075Sobrien{ 91590075Sobrien __builtin_ia32_storess (__P, (__v4sf)__A); 91690075Sobrien} 91790075Sobrien 91890075Sobrien/* Store the lower SPFP value across four words. */ 91990075Sobrienstatic __inline void 92090075Sobrien_mm_store1_ps (float *__P, __m128 __A) 92190075Sobrien{ 92290075Sobrien __v4sf __va = (__v4sf)__A; 92390075Sobrien __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0)); 92490075Sobrien __builtin_ia32_storeaps (__P, __tmp); 92590075Sobrien} 92690075Sobrien 92790075Sobrienstatic __inline void 92890075Sobrien_mm_store_ps1 (float *__P, __m128 __A) 92990075Sobrien{ 93090075Sobrien _mm_store1_ps (__P, __A); 93190075Sobrien} 93290075Sobrien 93390075Sobrien/* Store four SPFP values. The address must be 16-byte aligned. */ 93490075Sobrienstatic __inline void 93590075Sobrien_mm_store_ps (float *__P, __m128 __A) 93690075Sobrien{ 93790075Sobrien __builtin_ia32_storeaps (__P, (__v4sf)__A); 93890075Sobrien} 93990075Sobrien 94090075Sobrien/* Store four SPFP values. The address need not be 16-byte aligned. */ 94190075Sobrienstatic __inline void 94290075Sobrien_mm_storeu_ps (float *__P, __m128 __A) 94390075Sobrien{ 94490075Sobrien __builtin_ia32_storeups (__P, (__v4sf)__A); 94590075Sobrien} 94690075Sobrien 947117395Skan/* Store four SPFP values in reverse order. The address must be aligned. */ 94890075Sobrienstatic __inline void 94990075Sobrien_mm_storer_ps (float *__P, __m128 __A) 95090075Sobrien{ 95190075Sobrien __v4sf __va = (__v4sf)__A; 95290075Sobrien __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3)); 95390075Sobrien __builtin_ia32_storeaps (__P, __tmp); 95490075Sobrien} 95590075Sobrien 95690075Sobrien/* Sets the low SPFP value of A from the low value of B. */ 95790075Sobrienstatic __inline __m128 95890075Sobrien_mm_move_ss (__m128 __A, __m128 __B) 95990075Sobrien{ 96090075Sobrien return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B); 96190075Sobrien} 96290075Sobrien 96390075Sobrien/* Extracts one of the four words of A. The selector N must be immediate. */ 96490075Sobrien#if 0 96590075Sobrienstatic __inline int 96690075Sobrien_mm_extract_pi16 (__m64 __A, int __N) 96790075Sobrien{ 96890075Sobrien return __builtin_ia32_pextrw ((__v4hi)__A, __N); 96990075Sobrien} 970122180Skan 971122180Skanstatic __inline int 972122180Skan_m_pextrw (__m64 __A, int __N) 973122180Skan{ 974122180Skan return _mm_extract_pi16 (__A, __N); 975122180Skan} 97690075Sobrien#else 97790075Sobrien#define _mm_extract_pi16(A, N) \ 97890075Sobrien __builtin_ia32_pextrw ((__v4hi)(A), (N)) 979122180Skan#define _m_pextrw(A, N) _mm_extract_pi16((A), (N)) 98090075Sobrien#endif 98190075Sobrien 98290075Sobrien/* Inserts word D into one of four words of A. The selector N must be 98390075Sobrien immediate. */ 98490075Sobrien#if 0 98590075Sobrienstatic __inline __m64 98690075Sobrien_mm_insert_pi16 (__m64 __A, int __D, int __N) 98790075Sobrien{ 98890075Sobrien return (__m64)__builtin_ia32_pinsrw ((__v4hi)__A, __D, __N); 98990075Sobrien} 990122180Skan 991122180Skanstatic __inline __m64 992122180Skan_m_pinsrw (__m64 __A, int __D, int __N) 993122180Skan{ 994122180Skan return _mm_insert_pi16 (__A, __D, __N); 995122180Skan} 99690075Sobrien#else 99790075Sobrien#define _mm_insert_pi16(A, D, N) \ 99890075Sobrien ((__m64) __builtin_ia32_pinsrw ((__v4hi)(A), (D), (N))) 999122180Skan#define _m_pinsrw(A, D, N) _mm_insert_pi16((A), (D), (N)) 100090075Sobrien#endif 100190075Sobrien 100290075Sobrien/* Compute the element-wise maximum of signed 16-bit values. */ 100390075Sobrienstatic __inline __m64 100490075Sobrien_mm_max_pi16 (__m64 __A, __m64 __B) 100590075Sobrien{ 100690075Sobrien return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B); 100790075Sobrien} 100890075Sobrien 1009122180Skanstatic __inline __m64 1010122180Skan_m_pmaxsw (__m64 __A, __m64 __B) 1011122180Skan{ 1012122180Skan return _mm_max_pi16 (__A, __B); 1013122180Skan} 1014122180Skan 101590075Sobrien/* Compute the element-wise maximum of unsigned 8-bit values. */ 101690075Sobrienstatic __inline __m64 101790075Sobrien_mm_max_pu8 (__m64 __A, __m64 __B) 101890075Sobrien{ 101990075Sobrien return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B); 102090075Sobrien} 102190075Sobrien 1022122180Skanstatic __inline __m64 1023122180Skan_m_pmaxub (__m64 __A, __m64 __B) 1024122180Skan{ 1025122180Skan return _mm_max_pu8 (__A, __B); 1026122180Skan} 1027122180Skan 102890075Sobrien/* Compute the element-wise minimum of signed 16-bit values. */ 102990075Sobrienstatic __inline __m64 103090075Sobrien_mm_min_pi16 (__m64 __A, __m64 __B) 103190075Sobrien{ 103290075Sobrien return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B); 103390075Sobrien} 103490075Sobrien 1035122180Skanstatic __inline __m64 1036122180Skan_m_pminsw (__m64 __A, __m64 __B) 1037122180Skan{ 1038122180Skan return _mm_min_pi16 (__A, __B); 1039122180Skan} 1040122180Skan 104190075Sobrien/* Compute the element-wise minimum of unsigned 8-bit values. */ 104290075Sobrienstatic __inline __m64 104390075Sobrien_mm_min_pu8 (__m64 __A, __m64 __B) 104490075Sobrien{ 104590075Sobrien return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B); 104690075Sobrien} 104790075Sobrien 1048122180Skanstatic __inline __m64 1049122180Skan_m_pminub (__m64 __A, __m64 __B) 1050122180Skan{ 1051122180Skan return _mm_min_pu8 (__A, __B); 1052122180Skan} 1053122180Skan 105490075Sobrien/* Create an 8-bit mask of the signs of 8-bit values. */ 105590075Sobrienstatic __inline int 105690075Sobrien_mm_movemask_pi8 (__m64 __A) 105790075Sobrien{ 105890075Sobrien return __builtin_ia32_pmovmskb ((__v8qi)__A); 105990075Sobrien} 106090075Sobrien 1061122180Skanstatic __inline int 1062122180Skan_m_pmovmskb (__m64 __A) 1063122180Skan{ 1064122180Skan return _mm_movemask_pi8 (__A); 1065122180Skan} 1066122180Skan 106790075Sobrien/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values 106890075Sobrien in B and produce the high 16 bits of the 32-bit results. */ 106990075Sobrienstatic __inline __m64 107090075Sobrien_mm_mulhi_pu16 (__m64 __A, __m64 __B) 107190075Sobrien{ 107290075Sobrien return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B); 107390075Sobrien} 107490075Sobrien 1075122180Skanstatic __inline __m64 1076122180Skan_m_pmulhuw (__m64 __A, __m64 __B) 1077122180Skan{ 1078122180Skan return _mm_mulhi_pu16 (__A, __B); 1079122180Skan} 1080122180Skan 108190075Sobrien/* Return a combination of the four 16-bit values in A. The selector 108290075Sobrien must be an immediate. */ 108390075Sobrien#if 0 108490075Sobrienstatic __inline __m64 108590075Sobrien_mm_shuffle_pi16 (__m64 __A, int __N) 108690075Sobrien{ 108790075Sobrien return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N); 108890075Sobrien} 1089122180Skan 1090122180Skanstatic __inline __m64 1091122180Skan_m_pshufw (__m64 __A, int __N) 1092122180Skan{ 1093122180Skan return _mm_shuffle_pi16 (__A, __N); 1094122180Skan} 109590075Sobrien#else 109690075Sobrien#define _mm_shuffle_pi16(A, N) \ 109790075Sobrien ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N))) 1098122180Skan#define _m_pshufw(A, N) _mm_shuffle_pi16 ((A), (N)) 109990075Sobrien#endif 110090075Sobrien 110190075Sobrien/* Conditionally store byte elements of A into P. The high bit of each 110290075Sobrien byte in the selector N determines whether the corresponding byte from 110390075Sobrien A is stored. */ 110490075Sobrienstatic __inline void 110590075Sobrien_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P) 110690075Sobrien{ 110790075Sobrien __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P); 110890075Sobrien} 110990075Sobrien 1110122180Skanstatic __inline void 1111122180Skan_m_maskmovq (__m64 __A, __m64 __N, char *__P) 1112122180Skan{ 1113122180Skan _mm_maskmove_si64 (__A, __N, __P); 1114122180Skan} 1115122180Skan 111690075Sobrien/* Compute the rounded averages of the unsigned 8-bit values in A and B. */ 111790075Sobrienstatic __inline __m64 111890075Sobrien_mm_avg_pu8 (__m64 __A, __m64 __B) 111990075Sobrien{ 112090075Sobrien return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B); 112190075Sobrien} 112290075Sobrien 1123122180Skanstatic __inline __m64 1124122180Skan_m_pavgb (__m64 __A, __m64 __B) 1125122180Skan{ 1126122180Skan return _mm_avg_pu8 (__A, __B); 1127122180Skan} 1128122180Skan 112990075Sobrien/* Compute the rounded averages of the unsigned 16-bit values in A and B. */ 113090075Sobrienstatic __inline __m64 113190075Sobrien_mm_avg_pu16 (__m64 __A, __m64 __B) 113290075Sobrien{ 113390075Sobrien return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B); 113490075Sobrien} 113590075Sobrien 1136122180Skanstatic __inline __m64 1137122180Skan_m_pavgw (__m64 __A, __m64 __B) 1138122180Skan{ 1139122180Skan return _mm_avg_pu16 (__A, __B); 1140122180Skan} 1141122180Skan 114290075Sobrien/* Compute the sum of the absolute differences of the unsigned 8-bit 114390075Sobrien values in A and B. Return the value in the lower 16-bit word; the 114490075Sobrien upper words are cleared. */ 114590075Sobrienstatic __inline __m64 114690075Sobrien_mm_sad_pu8 (__m64 __A, __m64 __B) 114790075Sobrien{ 114890075Sobrien return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B); 114990075Sobrien} 115090075Sobrien 1151122180Skanstatic __inline __m64 1152122180Skan_m_psadbw (__m64 __A, __m64 __B) 1153122180Skan{ 1154122180Skan return _mm_sad_pu8 (__A, __B); 1155122180Skan} 1156122180Skan 115790075Sobrien/* Loads one cache line from address P to a location "closer" to the 115890075Sobrien processor. The selector I specifies the type of prefetch operation. */ 115990075Sobrien#if 0 116090075Sobrienstatic __inline void 116190075Sobrien_mm_prefetch (void *__P, enum _mm_hint __I) 116290075Sobrien{ 116390075Sobrien __builtin_prefetch (__P, 0, __I); 116490075Sobrien} 116590075Sobrien#else 116690075Sobrien#define _mm_prefetch(P, I) \ 116790075Sobrien __builtin_prefetch ((P), 0, (I)) 116890075Sobrien#endif 116990075Sobrien 117090075Sobrien/* Stores the data in A to the address P without polluting the caches. */ 117190075Sobrienstatic __inline void 117290075Sobrien_mm_stream_pi (__m64 *__P, __m64 __A) 117390075Sobrien{ 1174117395Skan __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A); 117590075Sobrien} 117690075Sobrien 117790075Sobrien/* Likewise. The address must be 16-byte aligned. */ 117890075Sobrienstatic __inline void 117990075Sobrien_mm_stream_ps (float *__P, __m128 __A) 118090075Sobrien{ 118190075Sobrien __builtin_ia32_movntps (__P, (__v4sf)__A); 118290075Sobrien} 118390075Sobrien 1184132718Skan/* Guarantees that every preceding store is globally visible before 118590075Sobrien any subsequent store. */ 118690075Sobrienstatic __inline void 118790075Sobrien_mm_sfence (void) 118890075Sobrien{ 118990075Sobrien __builtin_ia32_sfence (); 119090075Sobrien} 119190075Sobrien 119290075Sobrien/* The execution of the next instruction is delayed by an implementation 119390075Sobrien specific amount of time. The instruction does not modify the 119490075Sobrien architectural state. */ 119590075Sobrienstatic __inline void 119690075Sobrien_mm_pause (void) 119790075Sobrien{ 119890075Sobrien __asm__ __volatile__ ("rep; nop" : : ); 119990075Sobrien} 120090075Sobrien 120190075Sobrien/* Transpose the 4x4 matrix composed of row[0-3]. */ 120290075Sobrien#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 120390075Sobriendo { \ 120490075Sobrien __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ 120590075Sobrien __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44); \ 1206107590Sobrien __v4sf __t2 = __builtin_ia32_shufps (__r0, __r1, 0xEE); \ 1207107590Sobrien __v4sf __t1 = __builtin_ia32_shufps (__r2, __r3, 0x44); \ 120890075Sobrien __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE); \ 120990075Sobrien (row0) = __builtin_ia32_shufps (__t0, __t1, 0x88); \ 121090075Sobrien (row1) = __builtin_ia32_shufps (__t0, __t1, 0xDD); \ 121190075Sobrien (row2) = __builtin_ia32_shufps (__t2, __t3, 0x88); \ 121290075Sobrien (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD); \ 121390075Sobrien} while (0) 121490075Sobrien 1215122180Skan/* For backward source compatibility. */ 1216122180Skan#include <emmintrin.h> 1217117395Skan 1218117395Skan#endif /* __SSE__ */ 121990075Sobrien#endif /* _XMMINTRIN_H_INCLUDED */ 1220