xmmintrin.h revision 242182
1169689Skan/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 2169689Skan Free Software Foundation, Inc. 390075Sobrien 4132718Skan This file is part of GCC. 590075Sobrien 6132718Skan GCC is free software; you can redistribute it and/or modify 790075Sobrien it under the terms of the GNU General Public License as published by 890075Sobrien the Free Software Foundation; either version 2, or (at your option) 990075Sobrien any later version. 1090075Sobrien 11132718Skan GCC is distributed in the hope that it will be useful, 1290075Sobrien but WITHOUT ANY WARRANTY; without even the implied warranty of 1390075Sobrien MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 1490075Sobrien GNU General Public License for more details. 1590075Sobrien 1690075Sobrien You should have received a copy of the GNU General Public License 17132718Skan along with GCC; see the file COPYING. If not, write to 18169689Skan the Free Software Foundation, 51 Franklin Street, Fifth Floor, 19169689Skan Boston, MA 02110-1301, USA. */ 2090075Sobrien 2190075Sobrien/* As a special exception, if you include this header file into source 2290075Sobrien files compiled by GCC, this header file does not by itself cause 2390075Sobrien the resulting executable to be covered by the GNU General Public 2490075Sobrien License. This exception does not however invalidate any other 2590075Sobrien reasons why the executable file might be covered by the GNU General 2690075Sobrien Public License. */ 2790075Sobrien 2890075Sobrien/* Implemented from the specification included in the Intel C++ Compiler 29169689Skan User Guide and Reference, version 9.0. */ 3090075Sobrien 3190075Sobrien#ifndef _XMMINTRIN_H_INCLUDED 3290075Sobrien#define _XMMINTRIN_H_INCLUDED 3390075Sobrien 34117395Skan#ifndef __SSE__ 35117395Skan# error "SSE instruction set not enabled" 36117395Skan#else 37117395Skan 3890075Sobrien/* We need type definitions from the MMX header file. */ 3990075Sobrien#include <mmintrin.h> 4090075Sobrien 41169689Skan/* Get _mm_malloc () and _mm_free (). */ 42242182Skan#if __STDC_HOSTED__ 43169689Skan#include <mm_malloc.h> 44242182Skan#endif 4590075Sobrien 46169689Skan/* The Intel API is flexible enough that we must allow aliasing with other 47169689Skan vector types, and their scalar components. */ 48169689Skantypedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); 49169689Skan 50132718Skan/* Internal data types for implementing the intrinsics. */ 51169689Skantypedef float __v4sf __attribute__ ((__vector_size__ (16))); 5290075Sobrien 5390075Sobrien/* Create a selector for use with the SHUFPS instruction. */ 5490075Sobrien#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ 5590075Sobrien (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) 5690075Sobrien 5790075Sobrien/* Constants for use with _mm_prefetch. */ 5890075Sobrienenum _mm_hint 5990075Sobrien{ 6090075Sobrien _MM_HINT_T0 = 3, 6190075Sobrien _MM_HINT_T1 = 2, 6290075Sobrien _MM_HINT_T2 = 1, 6390075Sobrien _MM_HINT_NTA = 0 6490075Sobrien}; 6590075Sobrien 6690075Sobrien/* Bits in the MXCSR. */ 6790075Sobrien#define _MM_EXCEPT_MASK 0x003f 6890075Sobrien#define _MM_EXCEPT_INVALID 0x0001 6990075Sobrien#define _MM_EXCEPT_DENORM 0x0002 7090075Sobrien#define _MM_EXCEPT_DIV_ZERO 0x0004 7190075Sobrien#define _MM_EXCEPT_OVERFLOW 0x0008 7290075Sobrien#define _MM_EXCEPT_UNDERFLOW 0x0010 7390075Sobrien#define _MM_EXCEPT_INEXACT 0x0020 7490075Sobrien 7590075Sobrien#define _MM_MASK_MASK 0x1f80 7690075Sobrien#define _MM_MASK_INVALID 0x0080 7790075Sobrien#define _MM_MASK_DENORM 0x0100 7890075Sobrien#define _MM_MASK_DIV_ZERO 0x0200 7990075Sobrien#define _MM_MASK_OVERFLOW 0x0400 8090075Sobrien#define _MM_MASK_UNDERFLOW 0x0800 8190075Sobrien#define _MM_MASK_INEXACT 0x1000 8290075Sobrien 8390075Sobrien#define _MM_ROUND_MASK 0x6000 8490075Sobrien#define _MM_ROUND_NEAREST 0x0000 8590075Sobrien#define _MM_ROUND_DOWN 0x2000 8690075Sobrien#define _MM_ROUND_UP 0x4000 8790075Sobrien#define _MM_ROUND_TOWARD_ZERO 0x6000 8890075Sobrien 8990075Sobrien#define _MM_FLUSH_ZERO_MASK 0x8000 9090075Sobrien#define _MM_FLUSH_ZERO_ON 0x8000 9190075Sobrien#define _MM_FLUSH_ZERO_OFF 0x0000 9290075Sobrien 93169689Skan/* Create a vector of zeros. */ 94169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 95169689Skan_mm_setzero_ps (void) 96169689Skan{ 97169689Skan return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f }; 98169689Skan} 99169689Skan 10090075Sobrien/* Perform the respective operation on the lower SPFP (single-precision 10190075Sobrien floating-point) values of A and B; the upper three SPFP values are 10290075Sobrien passed through from A. */ 10390075Sobrien 104169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 10590075Sobrien_mm_add_ss (__m128 __A, __m128 __B) 10690075Sobrien{ 10790075Sobrien return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B); 10890075Sobrien} 10990075Sobrien 110169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 11190075Sobrien_mm_sub_ss (__m128 __A, __m128 __B) 11290075Sobrien{ 11390075Sobrien return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B); 11490075Sobrien} 11590075Sobrien 116169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 11790075Sobrien_mm_mul_ss (__m128 __A, __m128 __B) 11890075Sobrien{ 11990075Sobrien return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B); 12090075Sobrien} 12190075Sobrien 122169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 12390075Sobrien_mm_div_ss (__m128 __A, __m128 __B) 12490075Sobrien{ 12590075Sobrien return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B); 12690075Sobrien} 12790075Sobrien 128169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 12990075Sobrien_mm_sqrt_ss (__m128 __A) 13090075Sobrien{ 13190075Sobrien return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A); 13290075Sobrien} 13390075Sobrien 134169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 13590075Sobrien_mm_rcp_ss (__m128 __A) 13690075Sobrien{ 13790075Sobrien return (__m128) __builtin_ia32_rcpss ((__v4sf)__A); 13890075Sobrien} 13990075Sobrien 140169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 14190075Sobrien_mm_rsqrt_ss (__m128 __A) 14290075Sobrien{ 14390075Sobrien return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A); 14490075Sobrien} 14590075Sobrien 146169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 14790075Sobrien_mm_min_ss (__m128 __A, __m128 __B) 14890075Sobrien{ 14990075Sobrien return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B); 15090075Sobrien} 15190075Sobrien 152169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 15390075Sobrien_mm_max_ss (__m128 __A, __m128 __B) 15490075Sobrien{ 15590075Sobrien return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B); 15690075Sobrien} 15790075Sobrien 15890075Sobrien/* Perform the respective operation on the four SPFP values in A and B. */ 15990075Sobrien 160169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 16190075Sobrien_mm_add_ps (__m128 __A, __m128 __B) 16290075Sobrien{ 16390075Sobrien return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B); 16490075Sobrien} 16590075Sobrien 166169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 16790075Sobrien_mm_sub_ps (__m128 __A, __m128 __B) 16890075Sobrien{ 16990075Sobrien return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B); 17090075Sobrien} 17190075Sobrien 172169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 17390075Sobrien_mm_mul_ps (__m128 __A, __m128 __B) 17490075Sobrien{ 17590075Sobrien return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B); 17690075Sobrien} 17790075Sobrien 178169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 17990075Sobrien_mm_div_ps (__m128 __A, __m128 __B) 18090075Sobrien{ 18190075Sobrien return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B); 18290075Sobrien} 18390075Sobrien 184169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 18590075Sobrien_mm_sqrt_ps (__m128 __A) 18690075Sobrien{ 18790075Sobrien return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A); 18890075Sobrien} 18990075Sobrien 190169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 19190075Sobrien_mm_rcp_ps (__m128 __A) 19290075Sobrien{ 19390075Sobrien return (__m128) __builtin_ia32_rcpps ((__v4sf)__A); 19490075Sobrien} 19590075Sobrien 196169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 19790075Sobrien_mm_rsqrt_ps (__m128 __A) 19890075Sobrien{ 19990075Sobrien return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A); 20090075Sobrien} 20190075Sobrien 202169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 20390075Sobrien_mm_min_ps (__m128 __A, __m128 __B) 20490075Sobrien{ 20590075Sobrien return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B); 20690075Sobrien} 20790075Sobrien 208169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 20990075Sobrien_mm_max_ps (__m128 __A, __m128 __B) 21090075Sobrien{ 21190075Sobrien return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B); 21290075Sobrien} 21390075Sobrien 21490075Sobrien/* Perform logical bit-wise operations on 128-bit values. */ 21590075Sobrien 216169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 21790075Sobrien_mm_and_ps (__m128 __A, __m128 __B) 21890075Sobrien{ 21990075Sobrien return __builtin_ia32_andps (__A, __B); 22090075Sobrien} 22190075Sobrien 222169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 22390075Sobrien_mm_andnot_ps (__m128 __A, __m128 __B) 22490075Sobrien{ 22590075Sobrien return __builtin_ia32_andnps (__A, __B); 22690075Sobrien} 22790075Sobrien 228169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 22990075Sobrien_mm_or_ps (__m128 __A, __m128 __B) 23090075Sobrien{ 23190075Sobrien return __builtin_ia32_orps (__A, __B); 23290075Sobrien} 23390075Sobrien 234169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 23590075Sobrien_mm_xor_ps (__m128 __A, __m128 __B) 23690075Sobrien{ 23790075Sobrien return __builtin_ia32_xorps (__A, __B); 23890075Sobrien} 23990075Sobrien 24090075Sobrien/* Perform a comparison on the lower SPFP values of A and B. If the 24190075Sobrien comparison is true, place a mask of all ones in the result, otherwise a 24290075Sobrien mask of zeros. The upper three SPFP values are passed through from A. */ 24390075Sobrien 244169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 24590075Sobrien_mm_cmpeq_ss (__m128 __A, __m128 __B) 24690075Sobrien{ 24790075Sobrien return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B); 24890075Sobrien} 24990075Sobrien 250169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 25190075Sobrien_mm_cmplt_ss (__m128 __A, __m128 __B) 25290075Sobrien{ 25390075Sobrien return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B); 25490075Sobrien} 25590075Sobrien 256169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 25790075Sobrien_mm_cmple_ss (__m128 __A, __m128 __B) 25890075Sobrien{ 25990075Sobrien return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B); 26090075Sobrien} 26190075Sobrien 262169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 26390075Sobrien_mm_cmpgt_ss (__m128 __A, __m128 __B) 26490075Sobrien{ 265107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 266107590Sobrien (__v4sf) 267107590Sobrien __builtin_ia32_cmpltss ((__v4sf) __B, 268107590Sobrien (__v4sf) 269107590Sobrien __A)); 27090075Sobrien} 27190075Sobrien 272169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 27390075Sobrien_mm_cmpge_ss (__m128 __A, __m128 __B) 27490075Sobrien{ 275107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 276107590Sobrien (__v4sf) 277107590Sobrien __builtin_ia32_cmpless ((__v4sf) __B, 278107590Sobrien (__v4sf) 279107590Sobrien __A)); 28090075Sobrien} 28190075Sobrien 282169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 28390075Sobrien_mm_cmpneq_ss (__m128 __A, __m128 __B) 28490075Sobrien{ 28590075Sobrien return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B); 28690075Sobrien} 28790075Sobrien 288169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 28990075Sobrien_mm_cmpnlt_ss (__m128 __A, __m128 __B) 29090075Sobrien{ 29190075Sobrien return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B); 29290075Sobrien} 29390075Sobrien 294169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 29590075Sobrien_mm_cmpnle_ss (__m128 __A, __m128 __B) 29690075Sobrien{ 29790075Sobrien return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B); 29890075Sobrien} 29990075Sobrien 300169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 30190075Sobrien_mm_cmpngt_ss (__m128 __A, __m128 __B) 30290075Sobrien{ 303107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 304107590Sobrien (__v4sf) 305107590Sobrien __builtin_ia32_cmpnltss ((__v4sf) __B, 306107590Sobrien (__v4sf) 307107590Sobrien __A)); 30890075Sobrien} 30990075Sobrien 310169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 31190075Sobrien_mm_cmpnge_ss (__m128 __A, __m128 __B) 31290075Sobrien{ 313107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 314107590Sobrien (__v4sf) 315107590Sobrien __builtin_ia32_cmpnless ((__v4sf) __B, 316107590Sobrien (__v4sf) 317107590Sobrien __A)); 31890075Sobrien} 31990075Sobrien 320169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 32190075Sobrien_mm_cmpord_ss (__m128 __A, __m128 __B) 32290075Sobrien{ 32390075Sobrien return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B); 32490075Sobrien} 32590075Sobrien 326169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 32790075Sobrien_mm_cmpunord_ss (__m128 __A, __m128 __B) 32890075Sobrien{ 32990075Sobrien return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B); 33090075Sobrien} 33190075Sobrien 33290075Sobrien/* Perform a comparison on the four SPFP values of A and B. For each 33390075Sobrien element, if the comparison is true, place a mask of all ones in the 33490075Sobrien result, otherwise a mask of zeros. */ 33590075Sobrien 336169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 33790075Sobrien_mm_cmpeq_ps (__m128 __A, __m128 __B) 33890075Sobrien{ 33990075Sobrien return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B); 34090075Sobrien} 34190075Sobrien 342169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 34390075Sobrien_mm_cmplt_ps (__m128 __A, __m128 __B) 34490075Sobrien{ 34590075Sobrien return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B); 34690075Sobrien} 34790075Sobrien 348169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 34990075Sobrien_mm_cmple_ps (__m128 __A, __m128 __B) 35090075Sobrien{ 35190075Sobrien return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B); 35290075Sobrien} 35390075Sobrien 354169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 35590075Sobrien_mm_cmpgt_ps (__m128 __A, __m128 __B) 35690075Sobrien{ 35790075Sobrien return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B); 35890075Sobrien} 35990075Sobrien 360169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 36190075Sobrien_mm_cmpge_ps (__m128 __A, __m128 __B) 36290075Sobrien{ 36390075Sobrien return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B); 36490075Sobrien} 36590075Sobrien 366169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 36790075Sobrien_mm_cmpneq_ps (__m128 __A, __m128 __B) 36890075Sobrien{ 36990075Sobrien return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B); 37090075Sobrien} 37190075Sobrien 372169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 37390075Sobrien_mm_cmpnlt_ps (__m128 __A, __m128 __B) 37490075Sobrien{ 37590075Sobrien return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B); 37690075Sobrien} 37790075Sobrien 378169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 37990075Sobrien_mm_cmpnle_ps (__m128 __A, __m128 __B) 38090075Sobrien{ 38190075Sobrien return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B); 38290075Sobrien} 38390075Sobrien 384169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 38590075Sobrien_mm_cmpngt_ps (__m128 __A, __m128 __B) 38690075Sobrien{ 38790075Sobrien return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B); 38890075Sobrien} 38990075Sobrien 390169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 39190075Sobrien_mm_cmpnge_ps (__m128 __A, __m128 __B) 39290075Sobrien{ 39390075Sobrien return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B); 39490075Sobrien} 39590075Sobrien 396169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 39790075Sobrien_mm_cmpord_ps (__m128 __A, __m128 __B) 39890075Sobrien{ 39990075Sobrien return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B); 40090075Sobrien} 40190075Sobrien 402169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 40390075Sobrien_mm_cmpunord_ps (__m128 __A, __m128 __B) 40490075Sobrien{ 40590075Sobrien return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B); 40690075Sobrien} 40790075Sobrien 40890075Sobrien/* Compare the lower SPFP values of A and B and return 1 if true 40990075Sobrien and 0 if false. */ 41090075Sobrien 411169689Skanstatic __inline int __attribute__((__always_inline__)) 41290075Sobrien_mm_comieq_ss (__m128 __A, __m128 __B) 41390075Sobrien{ 41490075Sobrien return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B); 41590075Sobrien} 41690075Sobrien 417169689Skanstatic __inline int __attribute__((__always_inline__)) 41890075Sobrien_mm_comilt_ss (__m128 __A, __m128 __B) 41990075Sobrien{ 42090075Sobrien return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B); 42190075Sobrien} 42290075Sobrien 423169689Skanstatic __inline int __attribute__((__always_inline__)) 42490075Sobrien_mm_comile_ss (__m128 __A, __m128 __B) 42590075Sobrien{ 42690075Sobrien return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B); 42790075Sobrien} 42890075Sobrien 429169689Skanstatic __inline int __attribute__((__always_inline__)) 43090075Sobrien_mm_comigt_ss (__m128 __A, __m128 __B) 43190075Sobrien{ 43290075Sobrien return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B); 43390075Sobrien} 43490075Sobrien 435169689Skanstatic __inline int __attribute__((__always_inline__)) 43690075Sobrien_mm_comige_ss (__m128 __A, __m128 __B) 43790075Sobrien{ 43890075Sobrien return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B); 43990075Sobrien} 44090075Sobrien 441169689Skanstatic __inline int __attribute__((__always_inline__)) 44290075Sobrien_mm_comineq_ss (__m128 __A, __m128 __B) 44390075Sobrien{ 44490075Sobrien return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B); 44590075Sobrien} 44690075Sobrien 447169689Skanstatic __inline int __attribute__((__always_inline__)) 44890075Sobrien_mm_ucomieq_ss (__m128 __A, __m128 __B) 44990075Sobrien{ 45090075Sobrien return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B); 45190075Sobrien} 45290075Sobrien 453169689Skanstatic __inline int __attribute__((__always_inline__)) 45490075Sobrien_mm_ucomilt_ss (__m128 __A, __m128 __B) 45590075Sobrien{ 45690075Sobrien return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B); 45790075Sobrien} 45890075Sobrien 459169689Skanstatic __inline int __attribute__((__always_inline__)) 46090075Sobrien_mm_ucomile_ss (__m128 __A, __m128 __B) 46190075Sobrien{ 46290075Sobrien return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B); 46390075Sobrien} 46490075Sobrien 465169689Skanstatic __inline int __attribute__((__always_inline__)) 46690075Sobrien_mm_ucomigt_ss (__m128 __A, __m128 __B) 46790075Sobrien{ 46890075Sobrien return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B); 46990075Sobrien} 47090075Sobrien 471169689Skanstatic __inline int __attribute__((__always_inline__)) 47290075Sobrien_mm_ucomige_ss (__m128 __A, __m128 __B) 47390075Sobrien{ 47490075Sobrien return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B); 47590075Sobrien} 47690075Sobrien 477169689Skanstatic __inline int __attribute__((__always_inline__)) 47890075Sobrien_mm_ucomineq_ss (__m128 __A, __m128 __B) 47990075Sobrien{ 48090075Sobrien return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B); 48190075Sobrien} 48290075Sobrien 48390075Sobrien/* Convert the lower SPFP value to a 32-bit integer according to the current 48490075Sobrien rounding mode. */ 485169689Skanstatic __inline int __attribute__((__always_inline__)) 48690075Sobrien_mm_cvtss_si32 (__m128 __A) 48790075Sobrien{ 48890075Sobrien return __builtin_ia32_cvtss2si ((__v4sf) __A); 48990075Sobrien} 49090075Sobrien 491169689Skanstatic __inline int __attribute__((__always_inline__)) 492122180Skan_mm_cvt_ss2si (__m128 __A) 493122180Skan{ 494122180Skan return _mm_cvtss_si32 (__A); 495122180Skan} 496122180Skan 497117395Skan#ifdef __x86_64__ 498169689Skan/* Convert the lower SPFP value to a 32-bit integer according to the 499169689Skan current rounding mode. */ 500169689Skan 501169689Skan/* Intel intrinsic. */ 502169689Skanstatic __inline long long __attribute__((__always_inline__)) 503169689Skan_mm_cvtss_si64 (__m128 __A) 504169689Skan{ 505169689Skan return __builtin_ia32_cvtss2si64 ((__v4sf) __A); 506169689Skan} 507169689Skan 508169689Skan/* Microsoft intrinsic. */ 509169689Skanstatic __inline long long __attribute__((__always_inline__)) 510117395Skan_mm_cvtss_si64x (__m128 __A) 511117395Skan{ 512117395Skan return __builtin_ia32_cvtss2si64 ((__v4sf) __A); 513117395Skan} 514117395Skan#endif 515117395Skan 51690075Sobrien/* Convert the two lower SPFP values to 32-bit integers according to the 51790075Sobrien current rounding mode. Return the integers in packed form. */ 518169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 51990075Sobrien_mm_cvtps_pi32 (__m128 __A) 52090075Sobrien{ 52190075Sobrien return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A); 52290075Sobrien} 52390075Sobrien 524169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 525122180Skan_mm_cvt_ps2pi (__m128 __A) 526122180Skan{ 527122180Skan return _mm_cvtps_pi32 (__A); 528122180Skan} 529122180Skan 53090075Sobrien/* Truncate the lower SPFP value to a 32-bit integer. */ 531169689Skanstatic __inline int __attribute__((__always_inline__)) 53290075Sobrien_mm_cvttss_si32 (__m128 __A) 53390075Sobrien{ 53490075Sobrien return __builtin_ia32_cvttss2si ((__v4sf) __A); 53590075Sobrien} 53690075Sobrien 537169689Skanstatic __inline int __attribute__((__always_inline__)) 538122180Skan_mm_cvtt_ss2si (__m128 __A) 539122180Skan{ 540122180Skan return _mm_cvttss_si32 (__A); 541122180Skan} 542122180Skan 543117395Skan#ifdef __x86_64__ 544117395Skan/* Truncate the lower SPFP value to a 32-bit integer. */ 545169689Skan 546169689Skan/* Intel intrinsic. */ 547169689Skanstatic __inline long long __attribute__((__always_inline__)) 548169689Skan_mm_cvttss_si64 (__m128 __A) 549169689Skan{ 550169689Skan return __builtin_ia32_cvttss2si64 ((__v4sf) __A); 551169689Skan} 552169689Skan 553169689Skan/* Microsoft intrinsic. */ 554169689Skanstatic __inline long long __attribute__((__always_inline__)) 555117395Skan_mm_cvttss_si64x (__m128 __A) 556117395Skan{ 557117395Skan return __builtin_ia32_cvttss2si64 ((__v4sf) __A); 558117395Skan} 559117395Skan#endif 560117395Skan 56190075Sobrien/* Truncate the two lower SPFP values to 32-bit integers. Return the 56290075Sobrien integers in packed form. */ 563169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 56490075Sobrien_mm_cvttps_pi32 (__m128 __A) 56590075Sobrien{ 56690075Sobrien return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A); 56790075Sobrien} 56890075Sobrien 569169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 570122180Skan_mm_cvtt_ps2pi (__m128 __A) 571122180Skan{ 572122180Skan return _mm_cvttps_pi32 (__A); 573122180Skan} 574122180Skan 57590075Sobrien/* Convert B to a SPFP value and insert it as element zero in A. */ 576169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 57790075Sobrien_mm_cvtsi32_ss (__m128 __A, int __B) 57890075Sobrien{ 57990075Sobrien return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B); 58090075Sobrien} 58190075Sobrien 582169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 583122180Skan_mm_cvt_si2ss (__m128 __A, int __B) 584122180Skan{ 585122180Skan return _mm_cvtsi32_ss (__A, __B); 586122180Skan} 587122180Skan 588117395Skan#ifdef __x86_64__ 589117395Skan/* Convert B to a SPFP value and insert it as element zero in A. */ 590169689Skan 591169689Skan/* Intel intrinsic. */ 592169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 593169689Skan_mm_cvtsi64_ss (__m128 __A, long long __B) 594169689Skan{ 595169689Skan return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B); 596169689Skan} 597169689Skan 598169689Skan/* Microsoft intrinsic. */ 599169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 600117395Skan_mm_cvtsi64x_ss (__m128 __A, long long __B) 601117395Skan{ 602117395Skan return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B); 603117395Skan} 604117395Skan#endif 605117395Skan 60690075Sobrien/* Convert the two 32-bit values in B to SPFP form and insert them 60790075Sobrien as the two lower elements in A. */ 608169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 60990075Sobrien_mm_cvtpi32_ps (__m128 __A, __m64 __B) 61090075Sobrien{ 61190075Sobrien return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B); 61290075Sobrien} 61390075Sobrien 614169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 615122180Skan_mm_cvt_pi2ps (__m128 __A, __m64 __B) 616122180Skan{ 617122180Skan return _mm_cvtpi32_ps (__A, __B); 618122180Skan} 619122180Skan 62090075Sobrien/* Convert the four signed 16-bit values in A to SPFP form. */ 621169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 62290075Sobrien_mm_cvtpi16_ps (__m64 __A) 62390075Sobrien{ 62490075Sobrien __v4hi __sign; 62590075Sobrien __v2si __hisi, __losi; 62690075Sobrien __v4sf __r; 62790075Sobrien 62890075Sobrien /* This comparison against zero gives us a mask that can be used to 62990075Sobrien fill in the missing sign bits in the unpack operations below, so 63090075Sobrien that we get signed values after unpacking. */ 631169689Skan __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A); 63290075Sobrien 63390075Sobrien /* Convert the four words to doublewords. */ 63490075Sobrien __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign); 63590075Sobrien __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign); 63690075Sobrien 63790075Sobrien /* Convert the doublewords to floating point two at a time. */ 638169689Skan __r = (__v4sf) _mm_setzero_ps (); 63990075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __hisi); 64090075Sobrien __r = __builtin_ia32_movlhps (__r, __r); 64190075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __losi); 64290075Sobrien 64390075Sobrien return (__m128) __r; 64490075Sobrien} 64590075Sobrien 64690075Sobrien/* Convert the four unsigned 16-bit values in A to SPFP form. */ 647169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 64890075Sobrien_mm_cvtpu16_ps (__m64 __A) 64990075Sobrien{ 65090075Sobrien __v2si __hisi, __losi; 65190075Sobrien __v4sf __r; 65290075Sobrien 65390075Sobrien /* Convert the four words to doublewords. */ 654169689Skan __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL); 655169689Skan __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL); 65690075Sobrien 65790075Sobrien /* Convert the doublewords to floating point two at a time. */ 658169689Skan __r = (__v4sf) _mm_setzero_ps (); 65990075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __hisi); 66090075Sobrien __r = __builtin_ia32_movlhps (__r, __r); 66190075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __losi); 66290075Sobrien 66390075Sobrien return (__m128) __r; 66490075Sobrien} 66590075Sobrien 66690075Sobrien/* Convert the low four signed 8-bit values in A to SPFP form. */ 667169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 66890075Sobrien_mm_cvtpi8_ps (__m64 __A) 66990075Sobrien{ 67090075Sobrien __v8qi __sign; 67190075Sobrien 67290075Sobrien /* This comparison against zero gives us a mask that can be used to 67390075Sobrien fill in the missing sign bits in the unpack operations below, so 67490075Sobrien that we get signed values after unpacking. */ 675169689Skan __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A); 67690075Sobrien 67790075Sobrien /* Convert the four low bytes to words. */ 67890075Sobrien __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign); 67990075Sobrien 68090075Sobrien return _mm_cvtpi16_ps(__A); 68190075Sobrien} 68290075Sobrien 68390075Sobrien/* Convert the low four unsigned 8-bit values in A to SPFP form. */ 684169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 68590075Sobrien_mm_cvtpu8_ps(__m64 __A) 68690075Sobrien{ 687169689Skan __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL); 68890075Sobrien return _mm_cvtpu16_ps(__A); 68990075Sobrien} 69090075Sobrien 69190075Sobrien/* Convert the four signed 32-bit values in A and B to SPFP form. */ 692169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 69390075Sobrien_mm_cvtpi32x2_ps(__m64 __A, __m64 __B) 69490075Sobrien{ 695169689Skan __v4sf __zero = (__v4sf) _mm_setzero_ps (); 69690075Sobrien __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A); 69790075Sobrien __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B); 69890075Sobrien return (__m128) __builtin_ia32_movlhps (__sfa, __sfb); 69990075Sobrien} 70090075Sobrien 70190075Sobrien/* Convert the four SPFP values in A to four signed 16-bit integers. */ 702169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 70390075Sobrien_mm_cvtps_pi16(__m128 __A) 70490075Sobrien{ 70590075Sobrien __v4sf __hisf = (__v4sf)__A; 70690075Sobrien __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf); 70790075Sobrien __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf); 70890075Sobrien __v2si __losi = __builtin_ia32_cvtps2pi (__losf); 709117395Skan return (__m64) __builtin_ia32_packssdw (__hisi, __losi); 71090075Sobrien} 71190075Sobrien 71290075Sobrien/* Convert the four SPFP values in A to four signed 8-bit integers. */ 713169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 71490075Sobrien_mm_cvtps_pi8(__m128 __A) 71590075Sobrien{ 71690075Sobrien __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A); 717169689Skan return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL); 71890075Sobrien} 71990075Sobrien 72090075Sobrien/* Selects four specific SPFP values from A and B based on MASK. */ 72190075Sobrien#if 0 722169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 72390075Sobrien_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask) 72490075Sobrien{ 72590075Sobrien return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask); 72690075Sobrien} 72790075Sobrien#else 72890075Sobrien#define _mm_shuffle_ps(A, B, MASK) \ 72990075Sobrien ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK))) 73090075Sobrien#endif 73190075Sobrien 73290075Sobrien 73390075Sobrien/* Selects and interleaves the upper two SPFP values from A and B. */ 734169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 73590075Sobrien_mm_unpackhi_ps (__m128 __A, __m128 __B) 73690075Sobrien{ 73790075Sobrien return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B); 73890075Sobrien} 73990075Sobrien 74090075Sobrien/* Selects and interleaves the lower two SPFP values from A and B. */ 741169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 74290075Sobrien_mm_unpacklo_ps (__m128 __A, __m128 __B) 74390075Sobrien{ 74490075Sobrien return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B); 74590075Sobrien} 74690075Sobrien 74790075Sobrien/* Sets the upper two SPFP values with 64-bits of data loaded from P; 74890075Sobrien the lower two values are passed through from A. */ 749169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 750117395Skan_mm_loadh_pi (__m128 __A, __m64 const *__P) 75190075Sobrien{ 75290075Sobrien return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P); 75390075Sobrien} 75490075Sobrien 75590075Sobrien/* Stores the upper two SPFP values of A into P. */ 756169689Skanstatic __inline void __attribute__((__always_inline__)) 75790075Sobrien_mm_storeh_pi (__m64 *__P, __m128 __A) 75890075Sobrien{ 75990075Sobrien __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A); 76090075Sobrien} 76190075Sobrien 76290075Sobrien/* Moves the upper two values of B into the lower two values of A. */ 763169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 76490075Sobrien_mm_movehl_ps (__m128 __A, __m128 __B) 76590075Sobrien{ 76690075Sobrien return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B); 76790075Sobrien} 76890075Sobrien 76990075Sobrien/* Moves the lower two values of B into the upper two values of A. */ 770169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 77190075Sobrien_mm_movelh_ps (__m128 __A, __m128 __B) 77290075Sobrien{ 77390075Sobrien return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B); 77490075Sobrien} 77590075Sobrien 77690075Sobrien/* Sets the lower two SPFP values with 64-bits of data loaded from P; 77790075Sobrien the upper two values are passed through from A. */ 778169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 779117395Skan_mm_loadl_pi (__m128 __A, __m64 const *__P) 78090075Sobrien{ 78190075Sobrien return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P); 78290075Sobrien} 78390075Sobrien 78490075Sobrien/* Stores the lower two SPFP values of A into P. */ 785169689Skanstatic __inline void __attribute__((__always_inline__)) 78690075Sobrien_mm_storel_pi (__m64 *__P, __m128 __A) 78790075Sobrien{ 78890075Sobrien __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A); 78990075Sobrien} 79090075Sobrien 79190075Sobrien/* Creates a 4-bit mask from the most significant bits of the SPFP values. */ 792169689Skanstatic __inline int __attribute__((__always_inline__)) 79390075Sobrien_mm_movemask_ps (__m128 __A) 79490075Sobrien{ 79590075Sobrien return __builtin_ia32_movmskps ((__v4sf)__A); 79690075Sobrien} 79790075Sobrien 79890075Sobrien/* Return the contents of the control register. */ 799169689Skanstatic __inline unsigned int __attribute__((__always_inline__)) 80090075Sobrien_mm_getcsr (void) 80190075Sobrien{ 80290075Sobrien return __builtin_ia32_stmxcsr (); 80390075Sobrien} 80490075Sobrien 80590075Sobrien/* Read exception bits from the control register. */ 806169689Skanstatic __inline unsigned int __attribute__((__always_inline__)) 80790075Sobrien_MM_GET_EXCEPTION_STATE (void) 80890075Sobrien{ 80990075Sobrien return _mm_getcsr() & _MM_EXCEPT_MASK; 81090075Sobrien} 81190075Sobrien 812169689Skanstatic __inline unsigned int __attribute__((__always_inline__)) 81390075Sobrien_MM_GET_EXCEPTION_MASK (void) 81490075Sobrien{ 81590075Sobrien return _mm_getcsr() & _MM_MASK_MASK; 81690075Sobrien} 81790075Sobrien 818169689Skanstatic __inline unsigned int __attribute__((__always_inline__)) 81990075Sobrien_MM_GET_ROUNDING_MODE (void) 82090075Sobrien{ 82190075Sobrien return _mm_getcsr() & _MM_ROUND_MASK; 82290075Sobrien} 82390075Sobrien 824169689Skanstatic __inline unsigned int __attribute__((__always_inline__)) 82590075Sobrien_MM_GET_FLUSH_ZERO_MODE (void) 82690075Sobrien{ 82790075Sobrien return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; 82890075Sobrien} 82990075Sobrien 83090075Sobrien/* Set the control register to I. */ 831169689Skanstatic __inline void __attribute__((__always_inline__)) 83290075Sobrien_mm_setcsr (unsigned int __I) 83390075Sobrien{ 83490075Sobrien __builtin_ia32_ldmxcsr (__I); 83590075Sobrien} 83690075Sobrien 83790075Sobrien/* Set exception bits in the control register. */ 838169689Skanstatic __inline void __attribute__((__always_inline__)) 83990075Sobrien_MM_SET_EXCEPTION_STATE(unsigned int __mask) 84090075Sobrien{ 84190075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask); 84290075Sobrien} 84390075Sobrien 844169689Skanstatic __inline void __attribute__((__always_inline__)) 84590075Sobrien_MM_SET_EXCEPTION_MASK (unsigned int __mask) 84690075Sobrien{ 84790075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask); 84890075Sobrien} 84990075Sobrien 850169689Skanstatic __inline void __attribute__((__always_inline__)) 85190075Sobrien_MM_SET_ROUNDING_MODE (unsigned int __mode) 85290075Sobrien{ 85390075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode); 85490075Sobrien} 85590075Sobrien 856169689Skanstatic __inline void __attribute__((__always_inline__)) 85790075Sobrien_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode) 85890075Sobrien{ 85990075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode); 86090075Sobrien} 86190075Sobrien 862169689Skan/* Create a vector with element 0 as F and the rest zero. */ 863169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 864169689Skan_mm_set_ss (float __F) 865169689Skan{ 866169689Skan return __extension__ (__m128)(__v4sf){ __F, 0, 0, 0 }; 867169689Skan} 868169689Skan 869169689Skan/* Create a vector with all four elements equal to F. */ 870169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 871169689Skan_mm_set1_ps (float __F) 872169689Skan{ 873169689Skan return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F }; 874169689Skan} 875169689Skan 876169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 877169689Skan_mm_set_ps1 (float __F) 878169689Skan{ 879169689Skan return _mm_set1_ps (__F); 880169689Skan} 881169689Skan 88290075Sobrien/* Create a vector with element 0 as *P and the rest zero. */ 883169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 884117395Skan_mm_load_ss (float const *__P) 88590075Sobrien{ 886169689Skan return _mm_set_ss (*__P); 88790075Sobrien} 88890075Sobrien 88990075Sobrien/* Create a vector with all four elements equal to *P. */ 890169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 891117395Skan_mm_load1_ps (float const *__P) 89290075Sobrien{ 893169689Skan return _mm_set1_ps (*__P); 89490075Sobrien} 89590075Sobrien 896169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 897117395Skan_mm_load_ps1 (float const *__P) 89890075Sobrien{ 89990075Sobrien return _mm_load1_ps (__P); 90090075Sobrien} 90190075Sobrien 90290075Sobrien/* Load four SPFP values from P. The address must be 16-byte aligned. */ 903169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 904117395Skan_mm_load_ps (float const *__P) 90590075Sobrien{ 906169689Skan return (__m128) *(__v4sf *)__P; 90790075Sobrien} 90890075Sobrien 90990075Sobrien/* Load four SPFP values from P. The address need not be 16-byte aligned. */ 910169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 911117395Skan_mm_loadu_ps (float const *__P) 91290075Sobrien{ 91390075Sobrien return (__m128) __builtin_ia32_loadups (__P); 91490075Sobrien} 91590075Sobrien 91690075Sobrien/* Load four SPFP values in reverse order. The address must be aligned. */ 917169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 918117395Skan_mm_loadr_ps (float const *__P) 91990075Sobrien{ 920169689Skan __v4sf __tmp = *(__v4sf *)__P; 92190075Sobrien return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3)); 92290075Sobrien} 92390075Sobrien 924169689Skan/* Create the vector [Z Y X W]. */ 925169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 926169689Skan_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W) 92790075Sobrien{ 928169689Skan return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z }; 92990075Sobrien} 93090075Sobrien 931169689Skan/* Create the vector [W X Y Z]. */ 932169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 933169689Skan_mm_setr_ps (float __Z, float __Y, float __X, float __W) 93490075Sobrien{ 935169689Skan return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W }; 93690075Sobrien} 93790075Sobrien 938169689Skan/* Stores the lower SPFP value. */ 939169689Skanstatic __inline void __attribute__((__always_inline__)) 940169689Skan_mm_store_ss (float *__P, __m128 __A) 94190075Sobrien{ 942169689Skan *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0); 94390075Sobrien} 94490075Sobrien 945169689Skanstatic __inline float __attribute__((__always_inline__)) 946169689Skan_mm_cvtss_f32 (__m128 __A) 94790075Sobrien{ 948169689Skan return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0); 94990075Sobrien} 95090075Sobrien 951169689Skan/* Store four SPFP values. The address must be 16-byte aligned. */ 952169689Skanstatic __inline void __attribute__((__always_inline__)) 953169689Skan_mm_store_ps (float *__P, __m128 __A) 95490075Sobrien{ 955169689Skan *(__v4sf *)__P = (__v4sf)__A; 95690075Sobrien} 95790075Sobrien 958169689Skan/* Store four SPFP values. The address need not be 16-byte aligned. */ 959169689Skanstatic __inline void __attribute__((__always_inline__)) 960169689Skan_mm_storeu_ps (float *__P, __m128 __A) 96190075Sobrien{ 962169689Skan __builtin_ia32_storeups (__P, (__v4sf)__A); 96390075Sobrien} 96490075Sobrien 96590075Sobrien/* Store the lower SPFP value across four words. */ 966169689Skanstatic __inline void __attribute__((__always_inline__)) 96790075Sobrien_mm_store1_ps (float *__P, __m128 __A) 96890075Sobrien{ 96990075Sobrien __v4sf __va = (__v4sf)__A; 97090075Sobrien __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0)); 971169689Skan _mm_storeu_ps (__P, __tmp); 97290075Sobrien} 97390075Sobrien 974169689Skanstatic __inline void __attribute__((__always_inline__)) 97590075Sobrien_mm_store_ps1 (float *__P, __m128 __A) 97690075Sobrien{ 97790075Sobrien _mm_store1_ps (__P, __A); 97890075Sobrien} 97990075Sobrien 980117395Skan/* Store four SPFP values in reverse order. The address must be aligned. */ 981169689Skanstatic __inline void __attribute__((__always_inline__)) 98290075Sobrien_mm_storer_ps (float *__P, __m128 __A) 98390075Sobrien{ 98490075Sobrien __v4sf __va = (__v4sf)__A; 98590075Sobrien __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3)); 986169689Skan _mm_store_ps (__P, __tmp); 98790075Sobrien} 98890075Sobrien 98990075Sobrien/* Sets the low SPFP value of A from the low value of B. */ 990169689Skanstatic __inline __m128 __attribute__((__always_inline__)) 99190075Sobrien_mm_move_ss (__m128 __A, __m128 __B) 99290075Sobrien{ 99390075Sobrien return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B); 99490075Sobrien} 99590075Sobrien 99690075Sobrien/* Extracts one of the four words of A. The selector N must be immediate. */ 99790075Sobrien#if 0 998169689Skanstatic __inline int __attribute__((__always_inline__)) 999169689Skan_mm_extract_pi16 (__m64 const __A, int const __N) 100090075Sobrien{ 1001169689Skan return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N); 100290075Sobrien} 1003122180Skan 1004169689Skanstatic __inline int __attribute__((__always_inline__)) 1005169689Skan_m_pextrw (__m64 const __A, int const __N) 1006122180Skan{ 1007122180Skan return _mm_extract_pi16 (__A, __N); 1008122180Skan} 100990075Sobrien#else 1010169689Skan#define _mm_extract_pi16(A, N) __builtin_ia32_vec_ext_v4hi ((__v4hi)(A), (N)) 1011122180Skan#define _m_pextrw(A, N) _mm_extract_pi16((A), (N)) 101290075Sobrien#endif 101390075Sobrien 101490075Sobrien/* Inserts word D into one of four words of A. The selector N must be 101590075Sobrien immediate. */ 101690075Sobrien#if 0 1017169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1018169689Skan_mm_insert_pi16 (__m64 const __A, int const __D, int const __N) 101990075Sobrien{ 1020169689Skan return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N); 102190075Sobrien} 1022122180Skan 1023169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1024169689Skan_m_pinsrw (__m64 const __A, int const __D, int const __N) 1025122180Skan{ 1026122180Skan return _mm_insert_pi16 (__A, __D, __N); 1027122180Skan} 102890075Sobrien#else 102990075Sobrien#define _mm_insert_pi16(A, D, N) \ 1030169689Skan ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(A), (D), (N))) 1031122180Skan#define _m_pinsrw(A, D, N) _mm_insert_pi16((A), (D), (N)) 103290075Sobrien#endif 103390075Sobrien 103490075Sobrien/* Compute the element-wise maximum of signed 16-bit values. */ 1035169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 103690075Sobrien_mm_max_pi16 (__m64 __A, __m64 __B) 103790075Sobrien{ 103890075Sobrien return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B); 103990075Sobrien} 104090075Sobrien 1041169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1042122180Skan_m_pmaxsw (__m64 __A, __m64 __B) 1043122180Skan{ 1044122180Skan return _mm_max_pi16 (__A, __B); 1045122180Skan} 1046122180Skan 104790075Sobrien/* Compute the element-wise maximum of unsigned 8-bit values. */ 1048169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 104990075Sobrien_mm_max_pu8 (__m64 __A, __m64 __B) 105090075Sobrien{ 105190075Sobrien return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B); 105290075Sobrien} 105390075Sobrien 1054169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1055122180Skan_m_pmaxub (__m64 __A, __m64 __B) 1056122180Skan{ 1057122180Skan return _mm_max_pu8 (__A, __B); 1058122180Skan} 1059122180Skan 106090075Sobrien/* Compute the element-wise minimum of signed 16-bit values. */ 1061169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 106290075Sobrien_mm_min_pi16 (__m64 __A, __m64 __B) 106390075Sobrien{ 106490075Sobrien return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B); 106590075Sobrien} 106690075Sobrien 1067169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1068122180Skan_m_pminsw (__m64 __A, __m64 __B) 1069122180Skan{ 1070122180Skan return _mm_min_pi16 (__A, __B); 1071122180Skan} 1072122180Skan 107390075Sobrien/* Compute the element-wise minimum of unsigned 8-bit values. */ 1074169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 107590075Sobrien_mm_min_pu8 (__m64 __A, __m64 __B) 107690075Sobrien{ 107790075Sobrien return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B); 107890075Sobrien} 107990075Sobrien 1080169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1081122180Skan_m_pminub (__m64 __A, __m64 __B) 1082122180Skan{ 1083122180Skan return _mm_min_pu8 (__A, __B); 1084122180Skan} 1085122180Skan 108690075Sobrien/* Create an 8-bit mask of the signs of 8-bit values. */ 1087169689Skanstatic __inline int __attribute__((__always_inline__)) 108890075Sobrien_mm_movemask_pi8 (__m64 __A) 108990075Sobrien{ 109090075Sobrien return __builtin_ia32_pmovmskb ((__v8qi)__A); 109190075Sobrien} 109290075Sobrien 1093169689Skanstatic __inline int __attribute__((__always_inline__)) 1094122180Skan_m_pmovmskb (__m64 __A) 1095122180Skan{ 1096122180Skan return _mm_movemask_pi8 (__A); 1097122180Skan} 1098122180Skan 109990075Sobrien/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values 110090075Sobrien in B and produce the high 16 bits of the 32-bit results. */ 1101169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 110290075Sobrien_mm_mulhi_pu16 (__m64 __A, __m64 __B) 110390075Sobrien{ 110490075Sobrien return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B); 110590075Sobrien} 110690075Sobrien 1107169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1108122180Skan_m_pmulhuw (__m64 __A, __m64 __B) 1109122180Skan{ 1110122180Skan return _mm_mulhi_pu16 (__A, __B); 1111122180Skan} 1112122180Skan 111390075Sobrien/* Return a combination of the four 16-bit values in A. The selector 111490075Sobrien must be an immediate. */ 111590075Sobrien#if 0 1116169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 111790075Sobrien_mm_shuffle_pi16 (__m64 __A, int __N) 111890075Sobrien{ 111990075Sobrien return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N); 112090075Sobrien} 1121122180Skan 1122169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1123122180Skan_m_pshufw (__m64 __A, int __N) 1124122180Skan{ 1125122180Skan return _mm_shuffle_pi16 (__A, __N); 1126122180Skan} 112790075Sobrien#else 112890075Sobrien#define _mm_shuffle_pi16(A, N) \ 112990075Sobrien ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N))) 1130122180Skan#define _m_pshufw(A, N) _mm_shuffle_pi16 ((A), (N)) 113190075Sobrien#endif 113290075Sobrien 113390075Sobrien/* Conditionally store byte elements of A into P. The high bit of each 113490075Sobrien byte in the selector N determines whether the corresponding byte from 113590075Sobrien A is stored. */ 1136169689Skanstatic __inline void __attribute__((__always_inline__)) 113790075Sobrien_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P) 113890075Sobrien{ 113990075Sobrien __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P); 114090075Sobrien} 114190075Sobrien 1142169689Skanstatic __inline void __attribute__((__always_inline__)) 1143122180Skan_m_maskmovq (__m64 __A, __m64 __N, char *__P) 1144122180Skan{ 1145122180Skan _mm_maskmove_si64 (__A, __N, __P); 1146122180Skan} 1147122180Skan 114890075Sobrien/* Compute the rounded averages of the unsigned 8-bit values in A and B. */ 1149169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 115090075Sobrien_mm_avg_pu8 (__m64 __A, __m64 __B) 115190075Sobrien{ 115290075Sobrien return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B); 115390075Sobrien} 115490075Sobrien 1155169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1156122180Skan_m_pavgb (__m64 __A, __m64 __B) 1157122180Skan{ 1158122180Skan return _mm_avg_pu8 (__A, __B); 1159122180Skan} 1160122180Skan 116190075Sobrien/* Compute the rounded averages of the unsigned 16-bit values in A and B. */ 1162169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 116390075Sobrien_mm_avg_pu16 (__m64 __A, __m64 __B) 116490075Sobrien{ 116590075Sobrien return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B); 116690075Sobrien} 116790075Sobrien 1168169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1169122180Skan_m_pavgw (__m64 __A, __m64 __B) 1170122180Skan{ 1171122180Skan return _mm_avg_pu16 (__A, __B); 1172122180Skan} 1173122180Skan 117490075Sobrien/* Compute the sum of the absolute differences of the unsigned 8-bit 117590075Sobrien values in A and B. Return the value in the lower 16-bit word; the 117690075Sobrien upper words are cleared. */ 1177169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 117890075Sobrien_mm_sad_pu8 (__m64 __A, __m64 __B) 117990075Sobrien{ 118090075Sobrien return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B); 118190075Sobrien} 118290075Sobrien 1183169689Skanstatic __inline __m64 __attribute__((__always_inline__)) 1184122180Skan_m_psadbw (__m64 __A, __m64 __B) 1185122180Skan{ 1186122180Skan return _mm_sad_pu8 (__A, __B); 1187122180Skan} 1188122180Skan 118990075Sobrien/* Loads one cache line from address P to a location "closer" to the 119090075Sobrien processor. The selector I specifies the type of prefetch operation. */ 119190075Sobrien#if 0 1192169689Skanstatic __inline void __attribute__((__always_inline__)) 119390075Sobrien_mm_prefetch (void *__P, enum _mm_hint __I) 119490075Sobrien{ 119590075Sobrien __builtin_prefetch (__P, 0, __I); 119690075Sobrien} 119790075Sobrien#else 119890075Sobrien#define _mm_prefetch(P, I) \ 119990075Sobrien __builtin_prefetch ((P), 0, (I)) 120090075Sobrien#endif 120190075Sobrien 120290075Sobrien/* Stores the data in A to the address P without polluting the caches. */ 1203169689Skanstatic __inline void __attribute__((__always_inline__)) 120490075Sobrien_mm_stream_pi (__m64 *__P, __m64 __A) 120590075Sobrien{ 1206117395Skan __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A); 120790075Sobrien} 120890075Sobrien 120990075Sobrien/* Likewise. The address must be 16-byte aligned. */ 1210169689Skanstatic __inline void __attribute__((__always_inline__)) 121190075Sobrien_mm_stream_ps (float *__P, __m128 __A) 121290075Sobrien{ 121390075Sobrien __builtin_ia32_movntps (__P, (__v4sf)__A); 121490075Sobrien} 121590075Sobrien 1216132718Skan/* Guarantees that every preceding store is globally visible before 121790075Sobrien any subsequent store. */ 1218169689Skanstatic __inline void __attribute__((__always_inline__)) 121990075Sobrien_mm_sfence (void) 122090075Sobrien{ 122190075Sobrien __builtin_ia32_sfence (); 122290075Sobrien} 122390075Sobrien 122490075Sobrien/* The execution of the next instruction is delayed by an implementation 122590075Sobrien specific amount of time. The instruction does not modify the 122690075Sobrien architectural state. */ 1227169689Skanstatic __inline void __attribute__((__always_inline__)) 122890075Sobrien_mm_pause (void) 122990075Sobrien{ 123090075Sobrien __asm__ __volatile__ ("rep; nop" : : ); 123190075Sobrien} 123290075Sobrien 123390075Sobrien/* Transpose the 4x4 matrix composed of row[0-3]. */ 123490075Sobrien#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 123590075Sobriendo { \ 123690075Sobrien __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ 1237169689Skan __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1); \ 1238169689Skan __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3); \ 1239169689Skan __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1); \ 1240169689Skan __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3); \ 1241169689Skan (row0) = __builtin_ia32_movlhps (__t0, __t1); \ 1242169689Skan (row1) = __builtin_ia32_movhlps (__t1, __t0); \ 1243169689Skan (row2) = __builtin_ia32_movlhps (__t2, __t3); \ 1244169689Skan (row3) = __builtin_ia32_movhlps (__t3, __t2); \ 124590075Sobrien} while (0) 124690075Sobrien 1247122180Skan/* For backward source compatibility. */ 1248219639Smm#ifdef __SSE2__ 1249122180Skan#include <emmintrin.h> 1250219639Smm#endif 1251117395Skan 1252117395Skan#endif /* __SSE__ */ 125390075Sobrien#endif /* _XMMINTRIN_H_INCLUDED */ 1254