xmmintrin.h revision 107590
190075Sobrien/* Copyright (C) 2002 Free Software Foundation, Inc. 290075Sobrien 390075Sobrien This file is part of GNU CC. 490075Sobrien 590075Sobrien GNU CC is free software; you can redistribute it and/or modify 690075Sobrien it under the terms of the GNU General Public License as published by 790075Sobrien the Free Software Foundation; either version 2, or (at your option) 890075Sobrien any later version. 990075Sobrien 1090075Sobrien GNU CC is distributed in the hope that it will be useful, 1190075Sobrien but WITHOUT ANY WARRANTY; without even the implied warranty of 1290075Sobrien MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 1390075Sobrien GNU General Public License for more details. 1490075Sobrien 1590075Sobrien You should have received a copy of the GNU General Public License 1690075Sobrien along with GNU CC; see the file COPYING. If not, write to 1790075Sobrien the Free Software Foundation, 59 Temple Place - Suite 330, 1890075Sobrien Boston, MA 02111-1307, USA. */ 1990075Sobrien 2090075Sobrien/* As a special exception, if you include this header file into source 2190075Sobrien files compiled by GCC, this header file does not by itself cause 2290075Sobrien the resulting executable to be covered by the GNU General Public 2390075Sobrien License. This exception does not however invalidate any other 2490075Sobrien reasons why the executable file might be covered by the GNU General 2590075Sobrien Public License. */ 2690075Sobrien 2790075Sobrien/* Implemented from the specification included in the Intel C++ Compiler 2890075Sobrien User Guide and Reference, version 5.0. */ 2990075Sobrien 3090075Sobrien#ifndef _XMMINTRIN_H_INCLUDED 3190075Sobrien#define _XMMINTRIN_H_INCLUDED 3290075Sobrien 3390075Sobrien/* We need type definitions from the MMX header file. */ 3490075Sobrien#include <mmintrin.h> 3590075Sobrien 3690075Sobrien/* The data type indended for user use. */ 3790075Sobrientypedef int __m128 __attribute__ ((__mode__(__V4SF__))); 3890075Sobrien 3990075Sobrien/* Internal data types for implementing the instrinsics. */ 4090075Sobrientypedef int __v4sf __attribute__ ((__mode__(__V4SF__))); 4190075Sobrientypedef int __v4si __attribute__ ((__mode__(__V4SI__))); 4290075Sobrien 4390075Sobrien/* Create a selector for use with the SHUFPS instruction. */ 4490075Sobrien#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ 4590075Sobrien (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) 4690075Sobrien 4790075Sobrien/* Constants for use with _mm_prefetch. */ 4890075Sobrienenum _mm_hint 4990075Sobrien{ 5090075Sobrien _MM_HINT_T0 = 3, 5190075Sobrien _MM_HINT_T1 = 2, 5290075Sobrien _MM_HINT_T2 = 1, 5390075Sobrien _MM_HINT_NTA = 0 5490075Sobrien}; 5590075Sobrien 5690075Sobrien/* Bits in the MXCSR. */ 5790075Sobrien#define _MM_EXCEPT_MASK 0x003f 5890075Sobrien#define _MM_EXCEPT_INVALID 0x0001 5990075Sobrien#define _MM_EXCEPT_DENORM 0x0002 6090075Sobrien#define _MM_EXCEPT_DIV_ZERO 0x0004 6190075Sobrien#define _MM_EXCEPT_OVERFLOW 0x0008 6290075Sobrien#define _MM_EXCEPT_UNDERFLOW 0x0010 6390075Sobrien#define _MM_EXCEPT_INEXACT 0x0020 6490075Sobrien 6590075Sobrien#define _MM_MASK_MASK 0x1f80 6690075Sobrien#define _MM_MASK_INVALID 0x0080 6790075Sobrien#define _MM_MASK_DENORM 0x0100 6890075Sobrien#define _MM_MASK_DIV_ZERO 0x0200 6990075Sobrien#define _MM_MASK_OVERFLOW 0x0400 7090075Sobrien#define _MM_MASK_UNDERFLOW 0x0800 7190075Sobrien#define _MM_MASK_INEXACT 0x1000 7290075Sobrien 7390075Sobrien#define _MM_ROUND_MASK 0x6000 7490075Sobrien#define _MM_ROUND_NEAREST 0x0000 7590075Sobrien#define _MM_ROUND_DOWN 0x2000 7690075Sobrien#define _MM_ROUND_UP 0x4000 7790075Sobrien#define _MM_ROUND_TOWARD_ZERO 0x6000 7890075Sobrien 7990075Sobrien#define _MM_FLUSH_ZERO_MASK 0x8000 8090075Sobrien#define _MM_FLUSH_ZERO_ON 0x8000 8190075Sobrien#define _MM_FLUSH_ZERO_OFF 0x0000 8290075Sobrien 8390075Sobrien/* Perform the respective operation on the lower SPFP (single-precision 8490075Sobrien floating-point) values of A and B; the upper three SPFP values are 8590075Sobrien passed through from A. */ 8690075Sobrien 8790075Sobrienstatic __inline __m128 8890075Sobrien_mm_add_ss (__m128 __A, __m128 __B) 8990075Sobrien{ 9090075Sobrien return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B); 9190075Sobrien} 9290075Sobrien 9390075Sobrienstatic __inline __m128 9490075Sobrien_mm_sub_ss (__m128 __A, __m128 __B) 9590075Sobrien{ 9690075Sobrien return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B); 9790075Sobrien} 9890075Sobrien 9990075Sobrienstatic __inline __m128 10090075Sobrien_mm_mul_ss (__m128 __A, __m128 __B) 10190075Sobrien{ 10290075Sobrien return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B); 10390075Sobrien} 10490075Sobrien 10590075Sobrienstatic __inline __m128 10690075Sobrien_mm_div_ss (__m128 __A, __m128 __B) 10790075Sobrien{ 10890075Sobrien return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B); 10990075Sobrien} 11090075Sobrien 11190075Sobrienstatic __inline __m128 11290075Sobrien_mm_sqrt_ss (__m128 __A) 11390075Sobrien{ 11490075Sobrien return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A); 11590075Sobrien} 11690075Sobrien 11790075Sobrienstatic __inline __m128 11890075Sobrien_mm_rcp_ss (__m128 __A) 11990075Sobrien{ 12090075Sobrien return (__m128) __builtin_ia32_rcpss ((__v4sf)__A); 12190075Sobrien} 12290075Sobrien 12390075Sobrienstatic __inline __m128 12490075Sobrien_mm_rsqrt_ss (__m128 __A) 12590075Sobrien{ 12690075Sobrien return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A); 12790075Sobrien} 12890075Sobrien 12990075Sobrienstatic __inline __m128 13090075Sobrien_mm_min_ss (__m128 __A, __m128 __B) 13190075Sobrien{ 13290075Sobrien return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B); 13390075Sobrien} 13490075Sobrien 13590075Sobrienstatic __inline __m128 13690075Sobrien_mm_max_ss (__m128 __A, __m128 __B) 13790075Sobrien{ 13890075Sobrien return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B); 13990075Sobrien} 14090075Sobrien 14190075Sobrien/* Perform the respective operation on the four SPFP values in A and B. */ 14290075Sobrien 14390075Sobrienstatic __inline __m128 14490075Sobrien_mm_add_ps (__m128 __A, __m128 __B) 14590075Sobrien{ 14690075Sobrien return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B); 14790075Sobrien} 14890075Sobrien 14990075Sobrienstatic __inline __m128 15090075Sobrien_mm_sub_ps (__m128 __A, __m128 __B) 15190075Sobrien{ 15290075Sobrien return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B); 15390075Sobrien} 15490075Sobrien 15590075Sobrienstatic __inline __m128 15690075Sobrien_mm_mul_ps (__m128 __A, __m128 __B) 15790075Sobrien{ 15890075Sobrien return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B); 15990075Sobrien} 16090075Sobrien 16190075Sobrienstatic __inline __m128 16290075Sobrien_mm_div_ps (__m128 __A, __m128 __B) 16390075Sobrien{ 16490075Sobrien return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B); 16590075Sobrien} 16690075Sobrien 16790075Sobrienstatic __inline __m128 16890075Sobrien_mm_sqrt_ps (__m128 __A) 16990075Sobrien{ 17090075Sobrien return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A); 17190075Sobrien} 17290075Sobrien 17390075Sobrienstatic __inline __m128 17490075Sobrien_mm_rcp_ps (__m128 __A) 17590075Sobrien{ 17690075Sobrien return (__m128) __builtin_ia32_rcpps ((__v4sf)__A); 17790075Sobrien} 17890075Sobrien 17990075Sobrienstatic __inline __m128 18090075Sobrien_mm_rsqrt_ps (__m128 __A) 18190075Sobrien{ 18290075Sobrien return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A); 18390075Sobrien} 18490075Sobrien 18590075Sobrienstatic __inline __m128 18690075Sobrien_mm_min_ps (__m128 __A, __m128 __B) 18790075Sobrien{ 18890075Sobrien return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B); 18990075Sobrien} 19090075Sobrien 19190075Sobrienstatic __inline __m128 19290075Sobrien_mm_max_ps (__m128 __A, __m128 __B) 19390075Sobrien{ 19490075Sobrien return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B); 19590075Sobrien} 19690075Sobrien 19790075Sobrien/* Perform logical bit-wise operations on 128-bit values. */ 19890075Sobrien 19990075Sobrienstatic __inline __m128 20090075Sobrien_mm_and_ps (__m128 __A, __m128 __B) 20190075Sobrien{ 20290075Sobrien return __builtin_ia32_andps (__A, __B); 20390075Sobrien} 20490075Sobrien 20590075Sobrienstatic __inline __m128 20690075Sobrien_mm_andnot_ps (__m128 __A, __m128 __B) 20790075Sobrien{ 20890075Sobrien return __builtin_ia32_andnps (__A, __B); 20990075Sobrien} 21090075Sobrien 21190075Sobrienstatic __inline __m128 21290075Sobrien_mm_or_ps (__m128 __A, __m128 __B) 21390075Sobrien{ 21490075Sobrien return __builtin_ia32_orps (__A, __B); 21590075Sobrien} 21690075Sobrien 21790075Sobrienstatic __inline __m128 21890075Sobrien_mm_xor_ps (__m128 __A, __m128 __B) 21990075Sobrien{ 22090075Sobrien return __builtin_ia32_xorps (__A, __B); 22190075Sobrien} 22290075Sobrien 22390075Sobrien/* Perform a comparison on the lower SPFP values of A and B. If the 22490075Sobrien comparison is true, place a mask of all ones in the result, otherwise a 22590075Sobrien mask of zeros. The upper three SPFP values are passed through from A. */ 22690075Sobrien 22790075Sobrienstatic __inline __m128 22890075Sobrien_mm_cmpeq_ss (__m128 __A, __m128 __B) 22990075Sobrien{ 23090075Sobrien return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B); 23190075Sobrien} 23290075Sobrien 23390075Sobrienstatic __inline __m128 23490075Sobrien_mm_cmplt_ss (__m128 __A, __m128 __B) 23590075Sobrien{ 23690075Sobrien return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B); 23790075Sobrien} 23890075Sobrien 23990075Sobrienstatic __inline __m128 24090075Sobrien_mm_cmple_ss (__m128 __A, __m128 __B) 24190075Sobrien{ 24290075Sobrien return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B); 24390075Sobrien} 24490075Sobrien 24590075Sobrienstatic __inline __m128 24690075Sobrien_mm_cmpgt_ss (__m128 __A, __m128 __B) 24790075Sobrien{ 248107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 249107590Sobrien (__v4sf) 250107590Sobrien __builtin_ia32_cmpltss ((__v4sf) __B, 251107590Sobrien (__v4sf) 252107590Sobrien __A)); 25390075Sobrien} 25490075Sobrien 25590075Sobrienstatic __inline __m128 25690075Sobrien_mm_cmpge_ss (__m128 __A, __m128 __B) 25790075Sobrien{ 258107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 259107590Sobrien (__v4sf) 260107590Sobrien __builtin_ia32_cmpless ((__v4sf) __B, 261107590Sobrien (__v4sf) 262107590Sobrien __A)); 26390075Sobrien} 26490075Sobrien 26590075Sobrienstatic __inline __m128 26690075Sobrien_mm_cmpneq_ss (__m128 __A, __m128 __B) 26790075Sobrien{ 26890075Sobrien return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B); 26990075Sobrien} 27090075Sobrien 27190075Sobrienstatic __inline __m128 27290075Sobrien_mm_cmpnlt_ss (__m128 __A, __m128 __B) 27390075Sobrien{ 27490075Sobrien return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B); 27590075Sobrien} 27690075Sobrien 27790075Sobrienstatic __inline __m128 27890075Sobrien_mm_cmpnle_ss (__m128 __A, __m128 __B) 27990075Sobrien{ 28090075Sobrien return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B); 28190075Sobrien} 28290075Sobrien 28390075Sobrienstatic __inline __m128 28490075Sobrien_mm_cmpngt_ss (__m128 __A, __m128 __B) 28590075Sobrien{ 286107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 287107590Sobrien (__v4sf) 288107590Sobrien __builtin_ia32_cmpnltss ((__v4sf) __B, 289107590Sobrien (__v4sf) 290107590Sobrien __A)); 29190075Sobrien} 29290075Sobrien 29390075Sobrienstatic __inline __m128 29490075Sobrien_mm_cmpnge_ss (__m128 __A, __m128 __B) 29590075Sobrien{ 296107590Sobrien return (__m128) __builtin_ia32_movss ((__v4sf) __A, 297107590Sobrien (__v4sf) 298107590Sobrien __builtin_ia32_cmpnless ((__v4sf) __B, 299107590Sobrien (__v4sf) 300107590Sobrien __A)); 30190075Sobrien} 30290075Sobrien 30390075Sobrienstatic __inline __m128 30490075Sobrien_mm_cmpord_ss (__m128 __A, __m128 __B) 30590075Sobrien{ 30690075Sobrien return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B); 30790075Sobrien} 30890075Sobrien 30990075Sobrienstatic __inline __m128 31090075Sobrien_mm_cmpunord_ss (__m128 __A, __m128 __B) 31190075Sobrien{ 31290075Sobrien return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B); 31390075Sobrien} 31490075Sobrien 31590075Sobrien/* Perform a comparison on the four SPFP values of A and B. For each 31690075Sobrien element, if the comparison is true, place a mask of all ones in the 31790075Sobrien result, otherwise a mask of zeros. */ 31890075Sobrien 31990075Sobrienstatic __inline __m128 32090075Sobrien_mm_cmpeq_ps (__m128 __A, __m128 __B) 32190075Sobrien{ 32290075Sobrien return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B); 32390075Sobrien} 32490075Sobrien 32590075Sobrienstatic __inline __m128 32690075Sobrien_mm_cmplt_ps (__m128 __A, __m128 __B) 32790075Sobrien{ 32890075Sobrien return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B); 32990075Sobrien} 33090075Sobrien 33190075Sobrienstatic __inline __m128 33290075Sobrien_mm_cmple_ps (__m128 __A, __m128 __B) 33390075Sobrien{ 33490075Sobrien return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B); 33590075Sobrien} 33690075Sobrien 33790075Sobrienstatic __inline __m128 33890075Sobrien_mm_cmpgt_ps (__m128 __A, __m128 __B) 33990075Sobrien{ 34090075Sobrien return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B); 34190075Sobrien} 34290075Sobrien 34390075Sobrienstatic __inline __m128 34490075Sobrien_mm_cmpge_ps (__m128 __A, __m128 __B) 34590075Sobrien{ 34690075Sobrien return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B); 34790075Sobrien} 34890075Sobrien 34990075Sobrienstatic __inline __m128 35090075Sobrien_mm_cmpneq_ps (__m128 __A, __m128 __B) 35190075Sobrien{ 35290075Sobrien return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B); 35390075Sobrien} 35490075Sobrien 35590075Sobrienstatic __inline __m128 35690075Sobrien_mm_cmpnlt_ps (__m128 __A, __m128 __B) 35790075Sobrien{ 35890075Sobrien return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B); 35990075Sobrien} 36090075Sobrien 36190075Sobrienstatic __inline __m128 36290075Sobrien_mm_cmpnle_ps (__m128 __A, __m128 __B) 36390075Sobrien{ 36490075Sobrien return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B); 36590075Sobrien} 36690075Sobrien 36790075Sobrienstatic __inline __m128 36890075Sobrien_mm_cmpngt_ps (__m128 __A, __m128 __B) 36990075Sobrien{ 37090075Sobrien return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B); 37190075Sobrien} 37290075Sobrien 37390075Sobrienstatic __inline __m128 37490075Sobrien_mm_cmpnge_ps (__m128 __A, __m128 __B) 37590075Sobrien{ 37690075Sobrien return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B); 37790075Sobrien} 37890075Sobrien 37990075Sobrienstatic __inline __m128 38090075Sobrien_mm_cmpord_ps (__m128 __A, __m128 __B) 38190075Sobrien{ 38290075Sobrien return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B); 38390075Sobrien} 38490075Sobrien 38590075Sobrienstatic __inline __m128 38690075Sobrien_mm_cmpunord_ps (__m128 __A, __m128 __B) 38790075Sobrien{ 38890075Sobrien return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B); 38990075Sobrien} 39090075Sobrien 39190075Sobrien/* Compare the lower SPFP values of A and B and return 1 if true 39290075Sobrien and 0 if false. */ 39390075Sobrien 39490075Sobrienstatic __inline int 39590075Sobrien_mm_comieq_ss (__m128 __A, __m128 __B) 39690075Sobrien{ 39790075Sobrien return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B); 39890075Sobrien} 39990075Sobrien 40090075Sobrienstatic __inline int 40190075Sobrien_mm_comilt_ss (__m128 __A, __m128 __B) 40290075Sobrien{ 40390075Sobrien return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B); 40490075Sobrien} 40590075Sobrien 40690075Sobrienstatic __inline int 40790075Sobrien_mm_comile_ss (__m128 __A, __m128 __B) 40890075Sobrien{ 40990075Sobrien return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B); 41090075Sobrien} 41190075Sobrien 41290075Sobrienstatic __inline int 41390075Sobrien_mm_comigt_ss (__m128 __A, __m128 __B) 41490075Sobrien{ 41590075Sobrien return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B); 41690075Sobrien} 41790075Sobrien 41890075Sobrienstatic __inline int 41990075Sobrien_mm_comige_ss (__m128 __A, __m128 __B) 42090075Sobrien{ 42190075Sobrien return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B); 42290075Sobrien} 42390075Sobrien 42490075Sobrienstatic __inline int 42590075Sobrien_mm_comineq_ss (__m128 __A, __m128 __B) 42690075Sobrien{ 42790075Sobrien return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B); 42890075Sobrien} 42990075Sobrien 43090075Sobrienstatic __inline int 43190075Sobrien_mm_ucomieq_ss (__m128 __A, __m128 __B) 43290075Sobrien{ 43390075Sobrien return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B); 43490075Sobrien} 43590075Sobrien 43690075Sobrienstatic __inline int 43790075Sobrien_mm_ucomilt_ss (__m128 __A, __m128 __B) 43890075Sobrien{ 43990075Sobrien return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B); 44090075Sobrien} 44190075Sobrien 44290075Sobrienstatic __inline int 44390075Sobrien_mm_ucomile_ss (__m128 __A, __m128 __B) 44490075Sobrien{ 44590075Sobrien return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B); 44690075Sobrien} 44790075Sobrien 44890075Sobrienstatic __inline int 44990075Sobrien_mm_ucomigt_ss (__m128 __A, __m128 __B) 45090075Sobrien{ 45190075Sobrien return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B); 45290075Sobrien} 45390075Sobrien 45490075Sobrienstatic __inline int 45590075Sobrien_mm_ucomige_ss (__m128 __A, __m128 __B) 45690075Sobrien{ 45790075Sobrien return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B); 45890075Sobrien} 45990075Sobrien 46090075Sobrienstatic __inline int 46190075Sobrien_mm_ucomineq_ss (__m128 __A, __m128 __B) 46290075Sobrien{ 46390075Sobrien return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B); 46490075Sobrien} 46590075Sobrien 46690075Sobrien/* Convert the lower SPFP value to a 32-bit integer according to the current 46790075Sobrien rounding mode. */ 46890075Sobrienstatic __inline int 46990075Sobrien_mm_cvtss_si32 (__m128 __A) 47090075Sobrien{ 47190075Sobrien return __builtin_ia32_cvtss2si ((__v4sf) __A); 47290075Sobrien} 47390075Sobrien 47490075Sobrien/* Convert the two lower SPFP values to 32-bit integers according to the 47590075Sobrien current rounding mode. Return the integers in packed form. */ 47690075Sobrienstatic __inline __m64 47790075Sobrien_mm_cvtps_pi32 (__m128 __A) 47890075Sobrien{ 47990075Sobrien return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A); 48090075Sobrien} 48190075Sobrien 48290075Sobrien/* Truncate the lower SPFP value to a 32-bit integer. */ 48390075Sobrienstatic __inline int 48490075Sobrien_mm_cvttss_si32 (__m128 __A) 48590075Sobrien{ 48690075Sobrien return __builtin_ia32_cvttss2si ((__v4sf) __A); 48790075Sobrien} 48890075Sobrien 48990075Sobrien/* Truncate the two lower SPFP values to 32-bit integers. Return the 49090075Sobrien integers in packed form. */ 49190075Sobrienstatic __inline __m64 49290075Sobrien_mm_cvttps_pi32 (__m128 __A) 49390075Sobrien{ 49490075Sobrien return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A); 49590075Sobrien} 49690075Sobrien 49790075Sobrien/* Convert B to a SPFP value and insert it as element zero in A. */ 49890075Sobrienstatic __inline __m128 49990075Sobrien_mm_cvtsi32_ss (__m128 __A, int __B) 50090075Sobrien{ 50190075Sobrien return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B); 50290075Sobrien} 50390075Sobrien 50490075Sobrien/* Convert the two 32-bit values in B to SPFP form and insert them 50590075Sobrien as the two lower elements in A. */ 50690075Sobrienstatic __inline __m128 50790075Sobrien_mm_cvtpi32_ps (__m128 __A, __m64 __B) 50890075Sobrien{ 50990075Sobrien return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B); 51090075Sobrien} 51190075Sobrien 51290075Sobrien/* Convert the four signed 16-bit values in A to SPFP form. */ 51390075Sobrienstatic __inline __m128 51490075Sobrien_mm_cvtpi16_ps (__m64 __A) 51590075Sobrien{ 51690075Sobrien __v4hi __sign; 51790075Sobrien __v2si __hisi, __losi; 51890075Sobrien __v4sf __r; 51990075Sobrien 52090075Sobrien /* This comparison against zero gives us a mask that can be used to 52190075Sobrien fill in the missing sign bits in the unpack operations below, so 52290075Sobrien that we get signed values after unpacking. */ 52390075Sobrien __sign = (__v4hi) __builtin_ia32_mmx_zero (); 52490075Sobrien __sign = __builtin_ia32_pcmpgtw (__sign, (__v4hi)__A); 52590075Sobrien 52690075Sobrien /* Convert the four words to doublewords. */ 52790075Sobrien __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign); 52890075Sobrien __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign); 52990075Sobrien 53090075Sobrien /* Convert the doublewords to floating point two at a time. */ 53190075Sobrien __r = (__v4sf) __builtin_ia32_setzerops (); 53290075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __hisi); 53390075Sobrien __r = __builtin_ia32_movlhps (__r, __r); 53490075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __losi); 53590075Sobrien 53690075Sobrien return (__m128) __r; 53790075Sobrien} 53890075Sobrien 53990075Sobrien/* Convert the four unsigned 16-bit values in A to SPFP form. */ 54090075Sobrienstatic __inline __m128 54190075Sobrien_mm_cvtpu16_ps (__m64 __A) 54290075Sobrien{ 54390075Sobrien __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero (); 54490075Sobrien __v2si __hisi, __losi; 54590075Sobrien __v4sf __r; 54690075Sobrien 54790075Sobrien /* Convert the four words to doublewords. */ 54890075Sobrien __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __zero); 54990075Sobrien __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __zero); 55090075Sobrien 55190075Sobrien /* Convert the doublewords to floating point two at a time. */ 55290075Sobrien __r = (__v4sf) __builtin_ia32_setzerops (); 55390075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __hisi); 55490075Sobrien __r = __builtin_ia32_movlhps (__r, __r); 55590075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __losi); 55690075Sobrien 55790075Sobrien return (__m128) __r; 55890075Sobrien} 55990075Sobrien 56090075Sobrien/* Convert the low four signed 8-bit values in A to SPFP form. */ 56190075Sobrienstatic __inline __m128 56290075Sobrien_mm_cvtpi8_ps (__m64 __A) 56390075Sobrien{ 56490075Sobrien __v8qi __sign; 56590075Sobrien 56690075Sobrien /* This comparison against zero gives us a mask that can be used to 56790075Sobrien fill in the missing sign bits in the unpack operations below, so 56890075Sobrien that we get signed values after unpacking. */ 56990075Sobrien __sign = (__v8qi) __builtin_ia32_mmx_zero (); 57090075Sobrien __sign = __builtin_ia32_pcmpgtb (__sign, (__v8qi)__A); 57190075Sobrien 57290075Sobrien /* Convert the four low bytes to words. */ 57390075Sobrien __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign); 57490075Sobrien 57590075Sobrien return _mm_cvtpi16_ps(__A); 57690075Sobrien} 57790075Sobrien 57890075Sobrien/* Convert the low four unsigned 8-bit values in A to SPFP form. */ 57990075Sobrienstatic __inline __m128 58090075Sobrien_mm_cvtpu8_ps(__m64 __A) 58190075Sobrien{ 58290075Sobrien __v8qi __zero = (__v8qi) __builtin_ia32_mmx_zero (); 58390075Sobrien __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __zero); 58490075Sobrien return _mm_cvtpu16_ps(__A); 58590075Sobrien} 58690075Sobrien 58790075Sobrien/* Convert the four signed 32-bit values in A and B to SPFP form. */ 58890075Sobrienstatic __inline __m128 58990075Sobrien_mm_cvtpi32x2_ps(__m64 __A, __m64 __B) 59090075Sobrien{ 59190075Sobrien __v4sf __zero = (__v4sf) __builtin_ia32_setzerops (); 59290075Sobrien __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A); 59390075Sobrien __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B); 59490075Sobrien return (__m128) __builtin_ia32_movlhps (__sfa, __sfb); 59590075Sobrien} 59690075Sobrien 59790075Sobrien/* Convert the four SPFP values in A to four signed 16-bit integers. */ 59890075Sobrienstatic __inline __m64 59990075Sobrien_mm_cvtps_pi16(__m128 __A) 60090075Sobrien{ 60190075Sobrien __v4sf __hisf = (__v4sf)__A; 60290075Sobrien __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf); 60390075Sobrien __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf); 60490075Sobrien __v2si __losi = __builtin_ia32_cvtps2pi (__losf); 60590075Sobrien return (__m64) __builtin_ia32_packssdw (__losi, __hisi); 60690075Sobrien} 60790075Sobrien 60890075Sobrien/* Convert the four SPFP values in A to four signed 8-bit integers. */ 60990075Sobrienstatic __inline __m64 61090075Sobrien_mm_cvtps_pi8(__m128 __A) 61190075Sobrien{ 61290075Sobrien __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A); 61390075Sobrien __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero (); 61490075Sobrien return (__m64) __builtin_ia32_packsswb (__tmp, __zero); 61590075Sobrien} 61690075Sobrien 61790075Sobrien/* Selects four specific SPFP values from A and B based on MASK. */ 61890075Sobrien#if 0 61990075Sobrienstatic __inline __m128 62090075Sobrien_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask) 62190075Sobrien{ 62290075Sobrien return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask); 62390075Sobrien} 62490075Sobrien#else 62590075Sobrien#define _mm_shuffle_ps(A, B, MASK) \ 62690075Sobrien ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK))) 62790075Sobrien#endif 62890075Sobrien 62990075Sobrien 63090075Sobrien/* Selects and interleaves the upper two SPFP values from A and B. */ 63190075Sobrienstatic __inline __m128 63290075Sobrien_mm_unpackhi_ps (__m128 __A, __m128 __B) 63390075Sobrien{ 63490075Sobrien return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B); 63590075Sobrien} 63690075Sobrien 63790075Sobrien/* Selects and interleaves the lower two SPFP values from A and B. */ 63890075Sobrienstatic __inline __m128 63990075Sobrien_mm_unpacklo_ps (__m128 __A, __m128 __B) 64090075Sobrien{ 64190075Sobrien return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B); 64290075Sobrien} 64390075Sobrien 64490075Sobrien/* Sets the upper two SPFP values with 64-bits of data loaded from P; 64590075Sobrien the lower two values are passed through from A. */ 64690075Sobrienstatic __inline __m128 64790075Sobrien_mm_loadh_pi (__m128 __A, __m64 *__P) 64890075Sobrien{ 64990075Sobrien return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P); 65090075Sobrien} 65190075Sobrien 65290075Sobrien/* Stores the upper two SPFP values of A into P. */ 65390075Sobrienstatic __inline void 65490075Sobrien_mm_storeh_pi (__m64 *__P, __m128 __A) 65590075Sobrien{ 65690075Sobrien __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A); 65790075Sobrien} 65890075Sobrien 65990075Sobrien/* Moves the upper two values of B into the lower two values of A. */ 66090075Sobrienstatic __inline __m128 66190075Sobrien_mm_movehl_ps (__m128 __A, __m128 __B) 66290075Sobrien{ 66390075Sobrien return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B); 66490075Sobrien} 66590075Sobrien 66690075Sobrien/* Moves the lower two values of B into the upper two values of A. */ 66790075Sobrienstatic __inline __m128 66890075Sobrien_mm_movelh_ps (__m128 __A, __m128 __B) 66990075Sobrien{ 67090075Sobrien return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B); 67190075Sobrien} 67290075Sobrien 67390075Sobrien/* Sets the lower two SPFP values with 64-bits of data loaded from P; 67490075Sobrien the upper two values are passed through from A. */ 67590075Sobrienstatic __inline __m128 67690075Sobrien_mm_loadl_pi (__m128 __A, __m64 *__P) 67790075Sobrien{ 67890075Sobrien return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P); 67990075Sobrien} 68090075Sobrien 68190075Sobrien/* Stores the lower two SPFP values of A into P. */ 68290075Sobrienstatic __inline void 68390075Sobrien_mm_storel_pi (__m64 *__P, __m128 __A) 68490075Sobrien{ 68590075Sobrien __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A); 68690075Sobrien} 68790075Sobrien 68890075Sobrien/* Creates a 4-bit mask from the most significant bits of the SPFP values. */ 68990075Sobrienstatic __inline int 69090075Sobrien_mm_movemask_ps (__m128 __A) 69190075Sobrien{ 69290075Sobrien return __builtin_ia32_movmskps ((__v4sf)__A); 69390075Sobrien} 69490075Sobrien 69590075Sobrien/* Return the contents of the control register. */ 69690075Sobrienstatic __inline unsigned int 69790075Sobrien_mm_getcsr (void) 69890075Sobrien{ 69990075Sobrien return __builtin_ia32_stmxcsr (); 70090075Sobrien} 70190075Sobrien 70290075Sobrien/* Read exception bits from the control register. */ 70390075Sobrienstatic __inline unsigned int 70490075Sobrien_MM_GET_EXCEPTION_STATE (void) 70590075Sobrien{ 70690075Sobrien return _mm_getcsr() & _MM_EXCEPT_MASK; 70790075Sobrien} 70890075Sobrien 70990075Sobrienstatic __inline unsigned int 71090075Sobrien_MM_GET_EXCEPTION_MASK (void) 71190075Sobrien{ 71290075Sobrien return _mm_getcsr() & _MM_MASK_MASK; 71390075Sobrien} 71490075Sobrien 71590075Sobrienstatic __inline unsigned int 71690075Sobrien_MM_GET_ROUNDING_MODE (void) 71790075Sobrien{ 71890075Sobrien return _mm_getcsr() & _MM_ROUND_MASK; 71990075Sobrien} 72090075Sobrien 72190075Sobrienstatic __inline unsigned int 72290075Sobrien_MM_GET_FLUSH_ZERO_MODE (void) 72390075Sobrien{ 72490075Sobrien return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; 72590075Sobrien} 72690075Sobrien 72790075Sobrien/* Set the control register to I. */ 72890075Sobrienstatic __inline void 72990075Sobrien_mm_setcsr (unsigned int __I) 73090075Sobrien{ 73190075Sobrien __builtin_ia32_ldmxcsr (__I); 73290075Sobrien} 73390075Sobrien 73490075Sobrien/* Set exception bits in the control register. */ 73590075Sobrienstatic __inline void 73690075Sobrien_MM_SET_EXCEPTION_STATE(unsigned int __mask) 73790075Sobrien{ 73890075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask); 73990075Sobrien} 74090075Sobrien 74190075Sobrienstatic __inline void 74290075Sobrien_MM_SET_EXCEPTION_MASK (unsigned int __mask) 74390075Sobrien{ 74490075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask); 74590075Sobrien} 74690075Sobrien 74790075Sobrienstatic __inline void 74890075Sobrien_MM_SET_ROUNDING_MODE (unsigned int __mode) 74990075Sobrien{ 75090075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode); 75190075Sobrien} 75290075Sobrien 75390075Sobrienstatic __inline void 75490075Sobrien_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode) 75590075Sobrien{ 75690075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode); 75790075Sobrien} 75890075Sobrien 75990075Sobrien/* Create a vector with element 0 as *P and the rest zero. */ 76090075Sobrienstatic __inline __m128 76190075Sobrien_mm_load_ss (float *__P) 76290075Sobrien{ 76390075Sobrien return (__m128) __builtin_ia32_loadss (__P); 76490075Sobrien} 76590075Sobrien 76690075Sobrien/* Create a vector with all four elements equal to *P. */ 76790075Sobrienstatic __inline __m128 76890075Sobrien_mm_load1_ps (float *__P) 76990075Sobrien{ 77090075Sobrien __v4sf __tmp = __builtin_ia32_loadss (__P); 77190075Sobrien return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0)); 77290075Sobrien} 77390075Sobrien 77490075Sobrienstatic __inline __m128 77590075Sobrien_mm_load_ps1 (float *__P) 77690075Sobrien{ 77790075Sobrien return _mm_load1_ps (__P); 77890075Sobrien} 77990075Sobrien 78090075Sobrien/* Load four SPFP values from P. The address must be 16-byte aligned. */ 78190075Sobrienstatic __inline __m128 78290075Sobrien_mm_load_ps (float *__P) 78390075Sobrien{ 78490075Sobrien return (__m128) __builtin_ia32_loadaps (__P); 78590075Sobrien} 78690075Sobrien 78790075Sobrien/* Load four SPFP values from P. The address need not be 16-byte aligned. */ 78890075Sobrienstatic __inline __m128 78990075Sobrien_mm_loadu_ps (float *__P) 79090075Sobrien{ 79190075Sobrien return (__m128) __builtin_ia32_loadups (__P); 79290075Sobrien} 79390075Sobrien 79490075Sobrien/* Load four SPFP values in reverse order. The address must be aligned. */ 79590075Sobrienstatic __inline __m128 79690075Sobrien_mm_loadr_ps (float *__P) 79790075Sobrien{ 79890075Sobrien __v4sf __tmp = __builtin_ia32_loadaps (__P); 79990075Sobrien return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3)); 80090075Sobrien} 80190075Sobrien 80290075Sobrien/* Create a vector with element 0 as F and the rest zero. */ 80390075Sobrienstatic __inline __m128 80490075Sobrien_mm_set_ss (float __F) 80590075Sobrien{ 80690075Sobrien return (__m128) __builtin_ia32_loadss (&__F); 80790075Sobrien} 80890075Sobrien 80990075Sobrien/* Create a vector with all four elements equal to F. */ 81090075Sobrienstatic __inline __m128 81190075Sobrien_mm_set1_ps (float __F) 81290075Sobrien{ 81390075Sobrien __v4sf __tmp = __builtin_ia32_loadss (&__F); 81490075Sobrien return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0)); 81590075Sobrien} 81690075Sobrien 81790075Sobrienstatic __inline __m128 81890075Sobrien_mm_set_ps1 (float __F) 81990075Sobrien{ 82090075Sobrien return _mm_set1_ps (__F); 82190075Sobrien} 82290075Sobrien 82390075Sobrien/* Create the vector [Z Y X W]. */ 82490075Sobrienstatic __inline __m128 82590075Sobrien_mm_set_ps (float __Z, float __Y, float __X, float __W) 82690075Sobrien{ 82790075Sobrien union { 82890075Sobrien float __a[4]; 82990075Sobrien __m128 __v; 83090075Sobrien } __u; 83190075Sobrien 83290075Sobrien __u.__a[0] = __W; 83390075Sobrien __u.__a[1] = __X; 83490075Sobrien __u.__a[2] = __Y; 83590075Sobrien __u.__a[3] = __Z; 83690075Sobrien 83790075Sobrien return __u.__v; 83890075Sobrien} 83990075Sobrien 84090075Sobrien/* Create the vector [W X Y Z]. */ 84190075Sobrienstatic __inline __m128 84290075Sobrien_mm_setr_ps (float __Z, float __Y, float __X, float __W) 84390075Sobrien{ 84490075Sobrien return _mm_set_ps (__W, __X, __Y, __Z); 84590075Sobrien} 84690075Sobrien 84790075Sobrien/* Create a vector of zeros. */ 84890075Sobrienstatic __inline __m128 84990075Sobrien_mm_setzero_ps (void) 85090075Sobrien{ 85190075Sobrien return (__m128) __builtin_ia32_setzerops (); 85290075Sobrien} 85390075Sobrien 85490075Sobrien/* Stores the lower SPFP value. */ 85590075Sobrienstatic __inline void 85690075Sobrien_mm_store_ss (float *__P, __m128 __A) 85790075Sobrien{ 85890075Sobrien __builtin_ia32_storess (__P, (__v4sf)__A); 85990075Sobrien} 86090075Sobrien 86190075Sobrien/* Store the lower SPFP value across four words. */ 86290075Sobrienstatic __inline void 86390075Sobrien_mm_store1_ps (float *__P, __m128 __A) 86490075Sobrien{ 86590075Sobrien __v4sf __va = (__v4sf)__A; 86690075Sobrien __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0)); 86790075Sobrien __builtin_ia32_storeaps (__P, __tmp); 86890075Sobrien} 86990075Sobrien 87090075Sobrienstatic __inline void 87190075Sobrien_mm_store_ps1 (float *__P, __m128 __A) 87290075Sobrien{ 87390075Sobrien _mm_store1_ps (__P, __A); 87490075Sobrien} 87590075Sobrien 87690075Sobrien/* Store four SPFP values. The address must be 16-byte aligned. */ 87790075Sobrienstatic __inline void 87890075Sobrien_mm_store_ps (float *__P, __m128 __A) 87990075Sobrien{ 88090075Sobrien __builtin_ia32_storeaps (__P, (__v4sf)__A); 88190075Sobrien} 88290075Sobrien 88390075Sobrien/* Store four SPFP values. The address need not be 16-byte aligned. */ 88490075Sobrienstatic __inline void 88590075Sobrien_mm_storeu_ps (float *__P, __m128 __A) 88690075Sobrien{ 88790075Sobrien __builtin_ia32_storeups (__P, (__v4sf)__A); 88890075Sobrien} 88990075Sobrien 89090075Sobrien/* Store four SPFP values in reverse order. The addres must be aligned. */ 89190075Sobrienstatic __inline void 89290075Sobrien_mm_storer_ps (float *__P, __m128 __A) 89390075Sobrien{ 89490075Sobrien __v4sf __va = (__v4sf)__A; 89590075Sobrien __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3)); 89690075Sobrien __builtin_ia32_storeaps (__P, __tmp); 89790075Sobrien} 89890075Sobrien 89990075Sobrien/* Sets the low SPFP value of A from the low value of B. */ 90090075Sobrienstatic __inline __m128 90190075Sobrien_mm_move_ss (__m128 __A, __m128 __B) 90290075Sobrien{ 90390075Sobrien return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B); 90490075Sobrien} 90590075Sobrien 90690075Sobrien/* Extracts one of the four words of A. The selector N must be immediate. */ 90790075Sobrien#if 0 90890075Sobrienstatic __inline int 90990075Sobrien_mm_extract_pi16 (__m64 __A, int __N) 91090075Sobrien{ 91190075Sobrien return __builtin_ia32_pextrw ((__v4hi)__A, __N); 91290075Sobrien} 91390075Sobrien#else 91490075Sobrien#define _mm_extract_pi16(A, N) \ 91590075Sobrien __builtin_ia32_pextrw ((__v4hi)(A), (N)) 91690075Sobrien#endif 91790075Sobrien 91890075Sobrien/* Inserts word D into one of four words of A. The selector N must be 91990075Sobrien immediate. */ 92090075Sobrien#if 0 92190075Sobrienstatic __inline __m64 92290075Sobrien_mm_insert_pi16 (__m64 __A, int __D, int __N) 92390075Sobrien{ 92490075Sobrien return (__m64)__builtin_ia32_pinsrw ((__v4hi)__A, __D, __N); 92590075Sobrien} 92690075Sobrien#else 92790075Sobrien#define _mm_insert_pi16(A, D, N) \ 92890075Sobrien ((__m64) __builtin_ia32_pinsrw ((__v4hi)(A), (D), (N))) 92990075Sobrien#endif 93090075Sobrien 93190075Sobrien/* Compute the element-wise maximum of signed 16-bit values. */ 93290075Sobrienstatic __inline __m64 93390075Sobrien_mm_max_pi16 (__m64 __A, __m64 __B) 93490075Sobrien{ 93590075Sobrien return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B); 93690075Sobrien} 93790075Sobrien 93890075Sobrien/* Compute the element-wise maximum of unsigned 8-bit values. */ 93990075Sobrienstatic __inline __m64 94090075Sobrien_mm_max_pu8 (__m64 __A, __m64 __B) 94190075Sobrien{ 94290075Sobrien return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B); 94390075Sobrien} 94490075Sobrien 94590075Sobrien/* Compute the element-wise minimum of signed 16-bit values. */ 94690075Sobrienstatic __inline __m64 94790075Sobrien_mm_min_pi16 (__m64 __A, __m64 __B) 94890075Sobrien{ 94990075Sobrien return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B); 95090075Sobrien} 95190075Sobrien 95290075Sobrien/* Compute the element-wise minimum of unsigned 8-bit values. */ 95390075Sobrienstatic __inline __m64 95490075Sobrien_mm_min_pu8 (__m64 __A, __m64 __B) 95590075Sobrien{ 95690075Sobrien return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B); 95790075Sobrien} 95890075Sobrien 95990075Sobrien/* Create an 8-bit mask of the signs of 8-bit values. */ 96090075Sobrienstatic __inline int 96190075Sobrien_mm_movemask_pi8 (__m64 __A) 96290075Sobrien{ 96390075Sobrien return __builtin_ia32_pmovmskb ((__v8qi)__A); 96490075Sobrien} 96590075Sobrien 96690075Sobrien/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values 96790075Sobrien in B and produce the high 16 bits of the 32-bit results. */ 96890075Sobrienstatic __inline __m64 96990075Sobrien_mm_mulhi_pu16 (__m64 __A, __m64 __B) 97090075Sobrien{ 97190075Sobrien return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B); 97290075Sobrien} 97390075Sobrien 97490075Sobrien/* Return a combination of the four 16-bit values in A. The selector 97590075Sobrien must be an immediate. */ 97690075Sobrien#if 0 97790075Sobrienstatic __inline __m64 97890075Sobrien_mm_shuffle_pi16 (__m64 __A, int __N) 97990075Sobrien{ 98090075Sobrien return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N); 98190075Sobrien} 98290075Sobrien#else 98390075Sobrien#define _mm_shuffle_pi16(A, N) \ 98490075Sobrien ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N))) 98590075Sobrien#endif 98690075Sobrien 98790075Sobrien/* Conditionally store byte elements of A into P. The high bit of each 98890075Sobrien byte in the selector N determines whether the corresponding byte from 98990075Sobrien A is stored. */ 99090075Sobrienstatic __inline void 99190075Sobrien_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P) 99290075Sobrien{ 99390075Sobrien __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P); 99490075Sobrien} 99590075Sobrien 99690075Sobrien/* Compute the rounded averages of the unsigned 8-bit values in A and B. */ 99790075Sobrienstatic __inline __m64 99890075Sobrien_mm_avg_pu8 (__m64 __A, __m64 __B) 99990075Sobrien{ 100090075Sobrien return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B); 100190075Sobrien} 100290075Sobrien 100390075Sobrien/* Compute the rounded averages of the unsigned 16-bit values in A and B. */ 100490075Sobrienstatic __inline __m64 100590075Sobrien_mm_avg_pu16 (__m64 __A, __m64 __B) 100690075Sobrien{ 100790075Sobrien return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B); 100890075Sobrien} 100990075Sobrien 101090075Sobrien/* Compute the sum of the absolute differences of the unsigned 8-bit 101190075Sobrien values in A and B. Return the value in the lower 16-bit word; the 101290075Sobrien upper words are cleared. */ 101390075Sobrienstatic __inline __m64 101490075Sobrien_mm_sad_pu8 (__m64 __A, __m64 __B) 101590075Sobrien{ 101690075Sobrien return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B); 101790075Sobrien} 101890075Sobrien 101990075Sobrien/* Loads one cache line from address P to a location "closer" to the 102090075Sobrien processor. The selector I specifies the type of prefetch operation. */ 102190075Sobrien#if 0 102290075Sobrienstatic __inline void 102390075Sobrien_mm_prefetch (void *__P, enum _mm_hint __I) 102490075Sobrien{ 102590075Sobrien __builtin_prefetch (__P, 0, __I); 102690075Sobrien} 102790075Sobrien#else 102890075Sobrien#define _mm_prefetch(P, I) \ 102990075Sobrien __builtin_prefetch ((P), 0, (I)) 103090075Sobrien#endif 103190075Sobrien 103290075Sobrien/* Stores the data in A to the address P without polluting the caches. */ 103390075Sobrienstatic __inline void 103490075Sobrien_mm_stream_pi (__m64 *__P, __m64 __A) 103590075Sobrien{ 1036107590Sobrien __builtin_ia32_movntq (__P, (long long)__A); 103790075Sobrien} 103890075Sobrien 103990075Sobrien/* Likewise. The address must be 16-byte aligned. */ 104090075Sobrienstatic __inline void 104190075Sobrien_mm_stream_ps (float *__P, __m128 __A) 104290075Sobrien{ 104390075Sobrien __builtin_ia32_movntps (__P, (__v4sf)__A); 104490075Sobrien} 104590075Sobrien 104690075Sobrien/* Guarantees that every preceeding store is globally visible before 104790075Sobrien any subsequent store. */ 104890075Sobrienstatic __inline void 104990075Sobrien_mm_sfence (void) 105090075Sobrien{ 105190075Sobrien __builtin_ia32_sfence (); 105290075Sobrien} 105390075Sobrien 105490075Sobrien/* The execution of the next instruction is delayed by an implementation 105590075Sobrien specific amount of time. The instruction does not modify the 105690075Sobrien architectural state. */ 105790075Sobrienstatic __inline void 105890075Sobrien_mm_pause (void) 105990075Sobrien{ 106090075Sobrien __asm__ __volatile__ ("rep; nop" : : ); 106190075Sobrien} 106290075Sobrien 106390075Sobrien/* Transpose the 4x4 matrix composed of row[0-3]. */ 106490075Sobrien#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 106590075Sobriendo { \ 106690075Sobrien __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ 106790075Sobrien __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44); \ 1068107590Sobrien __v4sf __t2 = __builtin_ia32_shufps (__r0, __r1, 0xEE); \ 1069107590Sobrien __v4sf __t1 = __builtin_ia32_shufps (__r2, __r3, 0x44); \ 107090075Sobrien __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE); \ 107190075Sobrien (row0) = __builtin_ia32_shufps (__t0, __t1, 0x88); \ 107290075Sobrien (row1) = __builtin_ia32_shufps (__t0, __t1, 0xDD); \ 107390075Sobrien (row2) = __builtin_ia32_shufps (__t2, __t3, 0x88); \ 107490075Sobrien (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD); \ 107590075Sobrien} while (0) 107690075Sobrien 107790075Sobrien#endif /* _XMMINTRIN_H_INCLUDED */ 1078