xmmintrin.h revision 90075
190075Sobrien/* Copyright (C) 2002 Free Software Foundation, Inc. 290075Sobrien 390075Sobrien This file is part of GNU CC. 490075Sobrien 590075Sobrien GNU CC is free software; you can redistribute it and/or modify 690075Sobrien it under the terms of the GNU General Public License as published by 790075Sobrien the Free Software Foundation; either version 2, or (at your option) 890075Sobrien any later version. 990075Sobrien 1090075Sobrien GNU CC is distributed in the hope that it will be useful, 1190075Sobrien but WITHOUT ANY WARRANTY; without even the implied warranty of 1290075Sobrien MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 1390075Sobrien GNU General Public License for more details. 1490075Sobrien 1590075Sobrien You should have received a copy of the GNU General Public License 1690075Sobrien along with GNU CC; see the file COPYING. If not, write to 1790075Sobrien the Free Software Foundation, 59 Temple Place - Suite 330, 1890075Sobrien Boston, MA 02111-1307, USA. */ 1990075Sobrien 2090075Sobrien/* As a special exception, if you include this header file into source 2190075Sobrien files compiled by GCC, this header file does not by itself cause 2290075Sobrien the resulting executable to be covered by the GNU General Public 2390075Sobrien License. This exception does not however invalidate any other 2490075Sobrien reasons why the executable file might be covered by the GNU General 2590075Sobrien Public License. */ 2690075Sobrien 2790075Sobrien/* Implemented from the specification included in the Intel C++ Compiler 2890075Sobrien User Guide and Reference, version 5.0. */ 2990075Sobrien 3090075Sobrien#ifndef _XMMINTRIN_H_INCLUDED 3190075Sobrien#define _XMMINTRIN_H_INCLUDED 3290075Sobrien 3390075Sobrien/* We need type definitions from the MMX header file. */ 3490075Sobrien#include <mmintrin.h> 3590075Sobrien 3690075Sobrien/* The data type indended for user use. */ 3790075Sobrientypedef int __m128 __attribute__ ((__mode__(__V4SF__))); 3890075Sobrien 3990075Sobrien/* Internal data types for implementing the instrinsics. */ 4090075Sobrientypedef int __v4sf __attribute__ ((__mode__(__V4SF__))); 4190075Sobrientypedef int __v4si __attribute__ ((__mode__(__V4SI__))); 4290075Sobrien 4390075Sobrien/* Create a selector for use with the SHUFPS instruction. */ 4490075Sobrien#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ 4590075Sobrien (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) 4690075Sobrien 4790075Sobrien/* Constants for use with _mm_prefetch. */ 4890075Sobrienenum _mm_hint 4990075Sobrien{ 5090075Sobrien _MM_HINT_T0 = 3, 5190075Sobrien _MM_HINT_T1 = 2, 5290075Sobrien _MM_HINT_T2 = 1, 5390075Sobrien _MM_HINT_NTA = 0 5490075Sobrien}; 5590075Sobrien 5690075Sobrien/* Bits in the MXCSR. */ 5790075Sobrien#define _MM_EXCEPT_MASK 0x003f 5890075Sobrien#define _MM_EXCEPT_INVALID 0x0001 5990075Sobrien#define _MM_EXCEPT_DENORM 0x0002 6090075Sobrien#define _MM_EXCEPT_DIV_ZERO 0x0004 6190075Sobrien#define _MM_EXCEPT_OVERFLOW 0x0008 6290075Sobrien#define _MM_EXCEPT_UNDERFLOW 0x0010 6390075Sobrien#define _MM_EXCEPT_INEXACT 0x0020 6490075Sobrien 6590075Sobrien#define _MM_MASK_MASK 0x1f80 6690075Sobrien#define _MM_MASK_INVALID 0x0080 6790075Sobrien#define _MM_MASK_DENORM 0x0100 6890075Sobrien#define _MM_MASK_DIV_ZERO 0x0200 6990075Sobrien#define _MM_MASK_OVERFLOW 0x0400 7090075Sobrien#define _MM_MASK_UNDERFLOW 0x0800 7190075Sobrien#define _MM_MASK_INEXACT 0x1000 7290075Sobrien 7390075Sobrien#define _MM_ROUND_MASK 0x6000 7490075Sobrien#define _MM_ROUND_NEAREST 0x0000 7590075Sobrien#define _MM_ROUND_DOWN 0x2000 7690075Sobrien#define _MM_ROUND_UP 0x4000 7790075Sobrien#define _MM_ROUND_TOWARD_ZERO 0x6000 7890075Sobrien 7990075Sobrien#define _MM_FLUSH_ZERO_MASK 0x8000 8090075Sobrien#define _MM_FLUSH_ZERO_ON 0x8000 8190075Sobrien#define _MM_FLUSH_ZERO_OFF 0x0000 8290075Sobrien 8390075Sobrien/* Perform the respective operation on the lower SPFP (single-precision 8490075Sobrien floating-point) values of A and B; the upper three SPFP values are 8590075Sobrien passed through from A. */ 8690075Sobrien 8790075Sobrienstatic __inline __m128 8890075Sobrien_mm_add_ss (__m128 __A, __m128 __B) 8990075Sobrien{ 9090075Sobrien return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B); 9190075Sobrien} 9290075Sobrien 9390075Sobrienstatic __inline __m128 9490075Sobrien_mm_sub_ss (__m128 __A, __m128 __B) 9590075Sobrien{ 9690075Sobrien return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B); 9790075Sobrien} 9890075Sobrien 9990075Sobrienstatic __inline __m128 10090075Sobrien_mm_mul_ss (__m128 __A, __m128 __B) 10190075Sobrien{ 10290075Sobrien return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B); 10390075Sobrien} 10490075Sobrien 10590075Sobrienstatic __inline __m128 10690075Sobrien_mm_div_ss (__m128 __A, __m128 __B) 10790075Sobrien{ 10890075Sobrien return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B); 10990075Sobrien} 11090075Sobrien 11190075Sobrienstatic __inline __m128 11290075Sobrien_mm_sqrt_ss (__m128 __A) 11390075Sobrien{ 11490075Sobrien return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A); 11590075Sobrien} 11690075Sobrien 11790075Sobrienstatic __inline __m128 11890075Sobrien_mm_rcp_ss (__m128 __A) 11990075Sobrien{ 12090075Sobrien return (__m128) __builtin_ia32_rcpss ((__v4sf)__A); 12190075Sobrien} 12290075Sobrien 12390075Sobrienstatic __inline __m128 12490075Sobrien_mm_rsqrt_ss (__m128 __A) 12590075Sobrien{ 12690075Sobrien return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A); 12790075Sobrien} 12890075Sobrien 12990075Sobrienstatic __inline __m128 13090075Sobrien_mm_min_ss (__m128 __A, __m128 __B) 13190075Sobrien{ 13290075Sobrien return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B); 13390075Sobrien} 13490075Sobrien 13590075Sobrienstatic __inline __m128 13690075Sobrien_mm_max_ss (__m128 __A, __m128 __B) 13790075Sobrien{ 13890075Sobrien return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B); 13990075Sobrien} 14090075Sobrien 14190075Sobrien/* Perform the respective operation on the four SPFP values in A and B. */ 14290075Sobrien 14390075Sobrienstatic __inline __m128 14490075Sobrien_mm_add_ps (__m128 __A, __m128 __B) 14590075Sobrien{ 14690075Sobrien return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B); 14790075Sobrien} 14890075Sobrien 14990075Sobrienstatic __inline __m128 15090075Sobrien_mm_sub_ps (__m128 __A, __m128 __B) 15190075Sobrien{ 15290075Sobrien return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B); 15390075Sobrien} 15490075Sobrien 15590075Sobrienstatic __inline __m128 15690075Sobrien_mm_mul_ps (__m128 __A, __m128 __B) 15790075Sobrien{ 15890075Sobrien return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B); 15990075Sobrien} 16090075Sobrien 16190075Sobrienstatic __inline __m128 16290075Sobrien_mm_div_ps (__m128 __A, __m128 __B) 16390075Sobrien{ 16490075Sobrien return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B); 16590075Sobrien} 16690075Sobrien 16790075Sobrienstatic __inline __m128 16890075Sobrien_mm_sqrt_ps (__m128 __A) 16990075Sobrien{ 17090075Sobrien return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A); 17190075Sobrien} 17290075Sobrien 17390075Sobrienstatic __inline __m128 17490075Sobrien_mm_rcp_ps (__m128 __A) 17590075Sobrien{ 17690075Sobrien return (__m128) __builtin_ia32_rcpps ((__v4sf)__A); 17790075Sobrien} 17890075Sobrien 17990075Sobrienstatic __inline __m128 18090075Sobrien_mm_rsqrt_ps (__m128 __A) 18190075Sobrien{ 18290075Sobrien return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A); 18390075Sobrien} 18490075Sobrien 18590075Sobrienstatic __inline __m128 18690075Sobrien_mm_min_ps (__m128 __A, __m128 __B) 18790075Sobrien{ 18890075Sobrien return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B); 18990075Sobrien} 19090075Sobrien 19190075Sobrienstatic __inline __m128 19290075Sobrien_mm_max_ps (__m128 __A, __m128 __B) 19390075Sobrien{ 19490075Sobrien return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B); 19590075Sobrien} 19690075Sobrien 19790075Sobrien/* Perform logical bit-wise operations on 128-bit values. */ 19890075Sobrien 19990075Sobrienstatic __inline __m128 20090075Sobrien_mm_and_ps (__m128 __A, __m128 __B) 20190075Sobrien{ 20290075Sobrien return __builtin_ia32_andps (__A, __B); 20390075Sobrien} 20490075Sobrien 20590075Sobrienstatic __inline __m128 20690075Sobrien_mm_andnot_ps (__m128 __A, __m128 __B) 20790075Sobrien{ 20890075Sobrien return __builtin_ia32_andnps (__A, __B); 20990075Sobrien} 21090075Sobrien 21190075Sobrienstatic __inline __m128 21290075Sobrien_mm_or_ps (__m128 __A, __m128 __B) 21390075Sobrien{ 21490075Sobrien return __builtin_ia32_orps (__A, __B); 21590075Sobrien} 21690075Sobrien 21790075Sobrienstatic __inline __m128 21890075Sobrien_mm_xor_ps (__m128 __A, __m128 __B) 21990075Sobrien{ 22090075Sobrien return __builtin_ia32_xorps (__A, __B); 22190075Sobrien} 22290075Sobrien 22390075Sobrien/* Perform a comparison on the lower SPFP values of A and B. If the 22490075Sobrien comparison is true, place a mask of all ones in the result, otherwise a 22590075Sobrien mask of zeros. The upper three SPFP values are passed through from A. */ 22690075Sobrien 22790075Sobrienstatic __inline __m128 22890075Sobrien_mm_cmpeq_ss (__m128 __A, __m128 __B) 22990075Sobrien{ 23090075Sobrien return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B); 23190075Sobrien} 23290075Sobrien 23390075Sobrienstatic __inline __m128 23490075Sobrien_mm_cmplt_ss (__m128 __A, __m128 __B) 23590075Sobrien{ 23690075Sobrien return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B); 23790075Sobrien} 23890075Sobrien 23990075Sobrienstatic __inline __m128 24090075Sobrien_mm_cmple_ss (__m128 __A, __m128 __B) 24190075Sobrien{ 24290075Sobrien return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B); 24390075Sobrien} 24490075Sobrien 24590075Sobrienstatic __inline __m128 24690075Sobrien_mm_cmpgt_ss (__m128 __A, __m128 __B) 24790075Sobrien{ 24890075Sobrien return (__m128) __builtin_ia32_cmpgtss ((__v4sf)__A, (__v4sf)__B); 24990075Sobrien} 25090075Sobrien 25190075Sobrienstatic __inline __m128 25290075Sobrien_mm_cmpge_ss (__m128 __A, __m128 __B) 25390075Sobrien{ 25490075Sobrien return (__m128) __builtin_ia32_cmpgess ((__v4sf)__A, (__v4sf)__B); 25590075Sobrien} 25690075Sobrien 25790075Sobrienstatic __inline __m128 25890075Sobrien_mm_cmpneq_ss (__m128 __A, __m128 __B) 25990075Sobrien{ 26090075Sobrien return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B); 26190075Sobrien} 26290075Sobrien 26390075Sobrienstatic __inline __m128 26490075Sobrien_mm_cmpnlt_ss (__m128 __A, __m128 __B) 26590075Sobrien{ 26690075Sobrien return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B); 26790075Sobrien} 26890075Sobrien 26990075Sobrienstatic __inline __m128 27090075Sobrien_mm_cmpnle_ss (__m128 __A, __m128 __B) 27190075Sobrien{ 27290075Sobrien return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B); 27390075Sobrien} 27490075Sobrien 27590075Sobrienstatic __inline __m128 27690075Sobrien_mm_cmpngt_ss (__m128 __A, __m128 __B) 27790075Sobrien{ 27890075Sobrien return (__m128) __builtin_ia32_cmpngtss ((__v4sf)__A, (__v4sf)__B); 27990075Sobrien} 28090075Sobrien 28190075Sobrienstatic __inline __m128 28290075Sobrien_mm_cmpnge_ss (__m128 __A, __m128 __B) 28390075Sobrien{ 28490075Sobrien return (__m128) __builtin_ia32_cmpngess ((__v4sf)__A, (__v4sf)__B); 28590075Sobrien} 28690075Sobrien 28790075Sobrienstatic __inline __m128 28890075Sobrien_mm_cmpord_ss (__m128 __A, __m128 __B) 28990075Sobrien{ 29090075Sobrien return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B); 29190075Sobrien} 29290075Sobrien 29390075Sobrienstatic __inline __m128 29490075Sobrien_mm_cmpunord_ss (__m128 __A, __m128 __B) 29590075Sobrien{ 29690075Sobrien return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B); 29790075Sobrien} 29890075Sobrien 29990075Sobrien/* Perform a comparison on the four SPFP values of A and B. For each 30090075Sobrien element, if the comparison is true, place a mask of all ones in the 30190075Sobrien result, otherwise a mask of zeros. */ 30290075Sobrien 30390075Sobrienstatic __inline __m128 30490075Sobrien_mm_cmpeq_ps (__m128 __A, __m128 __B) 30590075Sobrien{ 30690075Sobrien return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B); 30790075Sobrien} 30890075Sobrien 30990075Sobrienstatic __inline __m128 31090075Sobrien_mm_cmplt_ps (__m128 __A, __m128 __B) 31190075Sobrien{ 31290075Sobrien return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B); 31390075Sobrien} 31490075Sobrien 31590075Sobrienstatic __inline __m128 31690075Sobrien_mm_cmple_ps (__m128 __A, __m128 __B) 31790075Sobrien{ 31890075Sobrien return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B); 31990075Sobrien} 32090075Sobrien 32190075Sobrienstatic __inline __m128 32290075Sobrien_mm_cmpgt_ps (__m128 __A, __m128 __B) 32390075Sobrien{ 32490075Sobrien return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B); 32590075Sobrien} 32690075Sobrien 32790075Sobrienstatic __inline __m128 32890075Sobrien_mm_cmpge_ps (__m128 __A, __m128 __B) 32990075Sobrien{ 33090075Sobrien return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B); 33190075Sobrien} 33290075Sobrien 33390075Sobrienstatic __inline __m128 33490075Sobrien_mm_cmpneq_ps (__m128 __A, __m128 __B) 33590075Sobrien{ 33690075Sobrien return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B); 33790075Sobrien} 33890075Sobrien 33990075Sobrienstatic __inline __m128 34090075Sobrien_mm_cmpnlt_ps (__m128 __A, __m128 __B) 34190075Sobrien{ 34290075Sobrien return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B); 34390075Sobrien} 34490075Sobrien 34590075Sobrienstatic __inline __m128 34690075Sobrien_mm_cmpnle_ps (__m128 __A, __m128 __B) 34790075Sobrien{ 34890075Sobrien return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B); 34990075Sobrien} 35090075Sobrien 35190075Sobrienstatic __inline __m128 35290075Sobrien_mm_cmpngt_ps (__m128 __A, __m128 __B) 35390075Sobrien{ 35490075Sobrien return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B); 35590075Sobrien} 35690075Sobrien 35790075Sobrienstatic __inline __m128 35890075Sobrien_mm_cmpnge_ps (__m128 __A, __m128 __B) 35990075Sobrien{ 36090075Sobrien return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B); 36190075Sobrien} 36290075Sobrien 36390075Sobrienstatic __inline __m128 36490075Sobrien_mm_cmpord_ps (__m128 __A, __m128 __B) 36590075Sobrien{ 36690075Sobrien return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B); 36790075Sobrien} 36890075Sobrien 36990075Sobrienstatic __inline __m128 37090075Sobrien_mm_cmpunord_ps (__m128 __A, __m128 __B) 37190075Sobrien{ 37290075Sobrien return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B); 37390075Sobrien} 37490075Sobrien 37590075Sobrien/* Compare the lower SPFP values of A and B and return 1 if true 37690075Sobrien and 0 if false. */ 37790075Sobrien 37890075Sobrienstatic __inline int 37990075Sobrien_mm_comieq_ss (__m128 __A, __m128 __B) 38090075Sobrien{ 38190075Sobrien return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B); 38290075Sobrien} 38390075Sobrien 38490075Sobrienstatic __inline int 38590075Sobrien_mm_comilt_ss (__m128 __A, __m128 __B) 38690075Sobrien{ 38790075Sobrien return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B); 38890075Sobrien} 38990075Sobrien 39090075Sobrienstatic __inline int 39190075Sobrien_mm_comile_ss (__m128 __A, __m128 __B) 39290075Sobrien{ 39390075Sobrien return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B); 39490075Sobrien} 39590075Sobrien 39690075Sobrienstatic __inline int 39790075Sobrien_mm_comigt_ss (__m128 __A, __m128 __B) 39890075Sobrien{ 39990075Sobrien return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B); 40090075Sobrien} 40190075Sobrien 40290075Sobrienstatic __inline int 40390075Sobrien_mm_comige_ss (__m128 __A, __m128 __B) 40490075Sobrien{ 40590075Sobrien return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B); 40690075Sobrien} 40790075Sobrien 40890075Sobrienstatic __inline int 40990075Sobrien_mm_comineq_ss (__m128 __A, __m128 __B) 41090075Sobrien{ 41190075Sobrien return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B); 41290075Sobrien} 41390075Sobrien 41490075Sobrienstatic __inline int 41590075Sobrien_mm_ucomieq_ss (__m128 __A, __m128 __B) 41690075Sobrien{ 41790075Sobrien return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B); 41890075Sobrien} 41990075Sobrien 42090075Sobrienstatic __inline int 42190075Sobrien_mm_ucomilt_ss (__m128 __A, __m128 __B) 42290075Sobrien{ 42390075Sobrien return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B); 42490075Sobrien} 42590075Sobrien 42690075Sobrienstatic __inline int 42790075Sobrien_mm_ucomile_ss (__m128 __A, __m128 __B) 42890075Sobrien{ 42990075Sobrien return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B); 43090075Sobrien} 43190075Sobrien 43290075Sobrienstatic __inline int 43390075Sobrien_mm_ucomigt_ss (__m128 __A, __m128 __B) 43490075Sobrien{ 43590075Sobrien return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B); 43690075Sobrien} 43790075Sobrien 43890075Sobrienstatic __inline int 43990075Sobrien_mm_ucomige_ss (__m128 __A, __m128 __B) 44090075Sobrien{ 44190075Sobrien return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B); 44290075Sobrien} 44390075Sobrien 44490075Sobrienstatic __inline int 44590075Sobrien_mm_ucomineq_ss (__m128 __A, __m128 __B) 44690075Sobrien{ 44790075Sobrien return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B); 44890075Sobrien} 44990075Sobrien 45090075Sobrien/* Convert the lower SPFP value to a 32-bit integer according to the current 45190075Sobrien rounding mode. */ 45290075Sobrienstatic __inline int 45390075Sobrien_mm_cvtss_si32 (__m128 __A) 45490075Sobrien{ 45590075Sobrien return __builtin_ia32_cvtss2si ((__v4sf) __A); 45690075Sobrien} 45790075Sobrien 45890075Sobrien/* Convert the two lower SPFP values to 32-bit integers according to the 45990075Sobrien current rounding mode. Return the integers in packed form. */ 46090075Sobrienstatic __inline __m64 46190075Sobrien_mm_cvtps_pi32 (__m128 __A) 46290075Sobrien{ 46390075Sobrien return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A); 46490075Sobrien} 46590075Sobrien 46690075Sobrien/* Truncate the lower SPFP value to a 32-bit integer. */ 46790075Sobrienstatic __inline int 46890075Sobrien_mm_cvttss_si32 (__m128 __A) 46990075Sobrien{ 47090075Sobrien return __builtin_ia32_cvttss2si ((__v4sf) __A); 47190075Sobrien} 47290075Sobrien 47390075Sobrien/* Truncate the two lower SPFP values to 32-bit integers. Return the 47490075Sobrien integers in packed form. */ 47590075Sobrienstatic __inline __m64 47690075Sobrien_mm_cvttps_pi32 (__m128 __A) 47790075Sobrien{ 47890075Sobrien return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A); 47990075Sobrien} 48090075Sobrien 48190075Sobrien/* Convert B to a SPFP value and insert it as element zero in A. */ 48290075Sobrienstatic __inline __m128 48390075Sobrien_mm_cvtsi32_ss (__m128 __A, int __B) 48490075Sobrien{ 48590075Sobrien return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B); 48690075Sobrien} 48790075Sobrien 48890075Sobrien/* Convert the two 32-bit values in B to SPFP form and insert them 48990075Sobrien as the two lower elements in A. */ 49090075Sobrienstatic __inline __m128 49190075Sobrien_mm_cvtpi32_ps (__m128 __A, __m64 __B) 49290075Sobrien{ 49390075Sobrien return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B); 49490075Sobrien} 49590075Sobrien 49690075Sobrien/* Convert the four signed 16-bit values in A to SPFP form. */ 49790075Sobrienstatic __inline __m128 49890075Sobrien_mm_cvtpi16_ps (__m64 __A) 49990075Sobrien{ 50090075Sobrien __v4hi __sign; 50190075Sobrien __v2si __hisi, __losi; 50290075Sobrien __v4sf __r; 50390075Sobrien 50490075Sobrien /* This comparison against zero gives us a mask that can be used to 50590075Sobrien fill in the missing sign bits in the unpack operations below, so 50690075Sobrien that we get signed values after unpacking. */ 50790075Sobrien __sign = (__v4hi) __builtin_ia32_mmx_zero (); 50890075Sobrien __sign = __builtin_ia32_pcmpgtw (__sign, (__v4hi)__A); 50990075Sobrien 51090075Sobrien /* Convert the four words to doublewords. */ 51190075Sobrien __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign); 51290075Sobrien __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign); 51390075Sobrien 51490075Sobrien /* Convert the doublewords to floating point two at a time. */ 51590075Sobrien __r = (__v4sf) __builtin_ia32_setzerops (); 51690075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __hisi); 51790075Sobrien __r = __builtin_ia32_movlhps (__r, __r); 51890075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __losi); 51990075Sobrien 52090075Sobrien return (__m128) __r; 52190075Sobrien} 52290075Sobrien 52390075Sobrien/* Convert the four unsigned 16-bit values in A to SPFP form. */ 52490075Sobrienstatic __inline __m128 52590075Sobrien_mm_cvtpu16_ps (__m64 __A) 52690075Sobrien{ 52790075Sobrien __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero (); 52890075Sobrien __v2si __hisi, __losi; 52990075Sobrien __v4sf __r; 53090075Sobrien 53190075Sobrien /* Convert the four words to doublewords. */ 53290075Sobrien __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __zero); 53390075Sobrien __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __zero); 53490075Sobrien 53590075Sobrien /* Convert the doublewords to floating point two at a time. */ 53690075Sobrien __r = (__v4sf) __builtin_ia32_setzerops (); 53790075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __hisi); 53890075Sobrien __r = __builtin_ia32_movlhps (__r, __r); 53990075Sobrien __r = __builtin_ia32_cvtpi2ps (__r, __losi); 54090075Sobrien 54190075Sobrien return (__m128) __r; 54290075Sobrien} 54390075Sobrien 54490075Sobrien/* Convert the low four signed 8-bit values in A to SPFP form. */ 54590075Sobrienstatic __inline __m128 54690075Sobrien_mm_cvtpi8_ps (__m64 __A) 54790075Sobrien{ 54890075Sobrien __v8qi __sign; 54990075Sobrien 55090075Sobrien /* This comparison against zero gives us a mask that can be used to 55190075Sobrien fill in the missing sign bits in the unpack operations below, so 55290075Sobrien that we get signed values after unpacking. */ 55390075Sobrien __sign = (__v8qi) __builtin_ia32_mmx_zero (); 55490075Sobrien __sign = __builtin_ia32_pcmpgtb (__sign, (__v8qi)__A); 55590075Sobrien 55690075Sobrien /* Convert the four low bytes to words. */ 55790075Sobrien __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign); 55890075Sobrien 55990075Sobrien return _mm_cvtpi16_ps(__A); 56090075Sobrien} 56190075Sobrien 56290075Sobrien/* Convert the low four unsigned 8-bit values in A to SPFP form. */ 56390075Sobrienstatic __inline __m128 56490075Sobrien_mm_cvtpu8_ps(__m64 __A) 56590075Sobrien{ 56690075Sobrien __v8qi __zero = (__v8qi) __builtin_ia32_mmx_zero (); 56790075Sobrien __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __zero); 56890075Sobrien return _mm_cvtpu16_ps(__A); 56990075Sobrien} 57090075Sobrien 57190075Sobrien/* Convert the four signed 32-bit values in A and B to SPFP form. */ 57290075Sobrienstatic __inline __m128 57390075Sobrien_mm_cvtpi32x2_ps(__m64 __A, __m64 __B) 57490075Sobrien{ 57590075Sobrien __v4sf __zero = (__v4sf) __builtin_ia32_setzerops (); 57690075Sobrien __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A); 57790075Sobrien __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B); 57890075Sobrien return (__m128) __builtin_ia32_movlhps (__sfa, __sfb); 57990075Sobrien} 58090075Sobrien 58190075Sobrien/* Convert the four SPFP values in A to four signed 16-bit integers. */ 58290075Sobrienstatic __inline __m64 58390075Sobrien_mm_cvtps_pi16(__m128 __A) 58490075Sobrien{ 58590075Sobrien __v4sf __hisf = (__v4sf)__A; 58690075Sobrien __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf); 58790075Sobrien __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf); 58890075Sobrien __v2si __losi = __builtin_ia32_cvtps2pi (__losf); 58990075Sobrien return (__m64) __builtin_ia32_packssdw (__losi, __hisi); 59090075Sobrien} 59190075Sobrien 59290075Sobrien/* Convert the four SPFP values in A to four signed 8-bit integers. */ 59390075Sobrienstatic __inline __m64 59490075Sobrien_mm_cvtps_pi8(__m128 __A) 59590075Sobrien{ 59690075Sobrien __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A); 59790075Sobrien __v4hi __zero = (__v4hi) __builtin_ia32_mmx_zero (); 59890075Sobrien return (__m64) __builtin_ia32_packsswb (__tmp, __zero); 59990075Sobrien} 60090075Sobrien 60190075Sobrien/* Selects four specific SPFP values from A and B based on MASK. */ 60290075Sobrien#if 0 60390075Sobrienstatic __inline __m128 60490075Sobrien_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask) 60590075Sobrien{ 60690075Sobrien return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask); 60790075Sobrien} 60890075Sobrien#else 60990075Sobrien#define _mm_shuffle_ps(A, B, MASK) \ 61090075Sobrien ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK))) 61190075Sobrien#endif 61290075Sobrien 61390075Sobrien 61490075Sobrien/* Selects and interleaves the upper two SPFP values from A and B. */ 61590075Sobrienstatic __inline __m128 61690075Sobrien_mm_unpackhi_ps (__m128 __A, __m128 __B) 61790075Sobrien{ 61890075Sobrien return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B); 61990075Sobrien} 62090075Sobrien 62190075Sobrien/* Selects and interleaves the lower two SPFP values from A and B. */ 62290075Sobrienstatic __inline __m128 62390075Sobrien_mm_unpacklo_ps (__m128 __A, __m128 __B) 62490075Sobrien{ 62590075Sobrien return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B); 62690075Sobrien} 62790075Sobrien 62890075Sobrien/* Sets the upper two SPFP values with 64-bits of data loaded from P; 62990075Sobrien the lower two values are passed through from A. */ 63090075Sobrienstatic __inline __m128 63190075Sobrien_mm_loadh_pi (__m128 __A, __m64 *__P) 63290075Sobrien{ 63390075Sobrien return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P); 63490075Sobrien} 63590075Sobrien 63690075Sobrien/* Stores the upper two SPFP values of A into P. */ 63790075Sobrienstatic __inline void 63890075Sobrien_mm_storeh_pi (__m64 *__P, __m128 __A) 63990075Sobrien{ 64090075Sobrien __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A); 64190075Sobrien} 64290075Sobrien 64390075Sobrien/* Moves the upper two values of B into the lower two values of A. */ 64490075Sobrienstatic __inline __m128 64590075Sobrien_mm_movehl_ps (__m128 __A, __m128 __B) 64690075Sobrien{ 64790075Sobrien return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B); 64890075Sobrien} 64990075Sobrien 65090075Sobrien/* Moves the lower two values of B into the upper two values of A. */ 65190075Sobrienstatic __inline __m128 65290075Sobrien_mm_movelh_ps (__m128 __A, __m128 __B) 65390075Sobrien{ 65490075Sobrien return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B); 65590075Sobrien} 65690075Sobrien 65790075Sobrien/* Sets the lower two SPFP values with 64-bits of data loaded from P; 65890075Sobrien the upper two values are passed through from A. */ 65990075Sobrienstatic __inline __m128 66090075Sobrien_mm_loadl_pi (__m128 __A, __m64 *__P) 66190075Sobrien{ 66290075Sobrien return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P); 66390075Sobrien} 66490075Sobrien 66590075Sobrien/* Stores the lower two SPFP values of A into P. */ 66690075Sobrienstatic __inline void 66790075Sobrien_mm_storel_pi (__m64 *__P, __m128 __A) 66890075Sobrien{ 66990075Sobrien __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A); 67090075Sobrien} 67190075Sobrien 67290075Sobrien/* Creates a 4-bit mask from the most significant bits of the SPFP values. */ 67390075Sobrienstatic __inline int 67490075Sobrien_mm_movemask_ps (__m128 __A) 67590075Sobrien{ 67690075Sobrien return __builtin_ia32_movmskps ((__v4sf)__A); 67790075Sobrien} 67890075Sobrien 67990075Sobrien/* Return the contents of the control register. */ 68090075Sobrienstatic __inline unsigned int 68190075Sobrien_mm_getcsr (void) 68290075Sobrien{ 68390075Sobrien return __builtin_ia32_stmxcsr (); 68490075Sobrien} 68590075Sobrien 68690075Sobrien/* Read exception bits from the control register. */ 68790075Sobrienstatic __inline unsigned int 68890075Sobrien_MM_GET_EXCEPTION_STATE (void) 68990075Sobrien{ 69090075Sobrien return _mm_getcsr() & _MM_EXCEPT_MASK; 69190075Sobrien} 69290075Sobrien 69390075Sobrienstatic __inline unsigned int 69490075Sobrien_MM_GET_EXCEPTION_MASK (void) 69590075Sobrien{ 69690075Sobrien return _mm_getcsr() & _MM_MASK_MASK; 69790075Sobrien} 69890075Sobrien 69990075Sobrienstatic __inline unsigned int 70090075Sobrien_MM_GET_ROUNDING_MODE (void) 70190075Sobrien{ 70290075Sobrien return _mm_getcsr() & _MM_ROUND_MASK; 70390075Sobrien} 70490075Sobrien 70590075Sobrienstatic __inline unsigned int 70690075Sobrien_MM_GET_FLUSH_ZERO_MODE (void) 70790075Sobrien{ 70890075Sobrien return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; 70990075Sobrien} 71090075Sobrien 71190075Sobrien/* Set the control register to I. */ 71290075Sobrienstatic __inline void 71390075Sobrien_mm_setcsr (unsigned int __I) 71490075Sobrien{ 71590075Sobrien __builtin_ia32_ldmxcsr (__I); 71690075Sobrien} 71790075Sobrien 71890075Sobrien/* Set exception bits in the control register. */ 71990075Sobrienstatic __inline void 72090075Sobrien_MM_SET_EXCEPTION_STATE(unsigned int __mask) 72190075Sobrien{ 72290075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask); 72390075Sobrien} 72490075Sobrien 72590075Sobrienstatic __inline void 72690075Sobrien_MM_SET_EXCEPTION_MASK (unsigned int __mask) 72790075Sobrien{ 72890075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask); 72990075Sobrien} 73090075Sobrien 73190075Sobrienstatic __inline void 73290075Sobrien_MM_SET_ROUNDING_MODE (unsigned int __mode) 73390075Sobrien{ 73490075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode); 73590075Sobrien} 73690075Sobrien 73790075Sobrienstatic __inline void 73890075Sobrien_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode) 73990075Sobrien{ 74090075Sobrien _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode); 74190075Sobrien} 74290075Sobrien 74390075Sobrien/* Create a vector with element 0 as *P and the rest zero. */ 74490075Sobrienstatic __inline __m128 74590075Sobrien_mm_load_ss (float *__P) 74690075Sobrien{ 74790075Sobrien return (__m128) __builtin_ia32_loadss (__P); 74890075Sobrien} 74990075Sobrien 75090075Sobrien/* Create a vector with all four elements equal to *P. */ 75190075Sobrienstatic __inline __m128 75290075Sobrien_mm_load1_ps (float *__P) 75390075Sobrien{ 75490075Sobrien __v4sf __tmp = __builtin_ia32_loadss (__P); 75590075Sobrien return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0)); 75690075Sobrien} 75790075Sobrien 75890075Sobrienstatic __inline __m128 75990075Sobrien_mm_load_ps1 (float *__P) 76090075Sobrien{ 76190075Sobrien return _mm_load1_ps (__P); 76290075Sobrien} 76390075Sobrien 76490075Sobrien/* Load four SPFP values from P. The address must be 16-byte aligned. */ 76590075Sobrienstatic __inline __m128 76690075Sobrien_mm_load_ps (float *__P) 76790075Sobrien{ 76890075Sobrien return (__m128) __builtin_ia32_loadaps (__P); 76990075Sobrien} 77090075Sobrien 77190075Sobrien/* Load four SPFP values from P. The address need not be 16-byte aligned. */ 77290075Sobrienstatic __inline __m128 77390075Sobrien_mm_loadu_ps (float *__P) 77490075Sobrien{ 77590075Sobrien return (__m128) __builtin_ia32_loadups (__P); 77690075Sobrien} 77790075Sobrien 77890075Sobrien/* Load four SPFP values in reverse order. The address must be aligned. */ 77990075Sobrienstatic __inline __m128 78090075Sobrien_mm_loadr_ps (float *__P) 78190075Sobrien{ 78290075Sobrien __v4sf __tmp = __builtin_ia32_loadaps (__P); 78390075Sobrien return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3)); 78490075Sobrien} 78590075Sobrien 78690075Sobrien/* Create a vector with element 0 as F and the rest zero. */ 78790075Sobrienstatic __inline __m128 78890075Sobrien_mm_set_ss (float __F) 78990075Sobrien{ 79090075Sobrien return (__m128) __builtin_ia32_loadss (&__F); 79190075Sobrien} 79290075Sobrien 79390075Sobrien/* Create a vector with all four elements equal to F. */ 79490075Sobrienstatic __inline __m128 79590075Sobrien_mm_set1_ps (float __F) 79690075Sobrien{ 79790075Sobrien __v4sf __tmp = __builtin_ia32_loadss (&__F); 79890075Sobrien return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,0,0,0)); 79990075Sobrien} 80090075Sobrien 80190075Sobrienstatic __inline __m128 80290075Sobrien_mm_set_ps1 (float __F) 80390075Sobrien{ 80490075Sobrien return _mm_set1_ps (__F); 80590075Sobrien} 80690075Sobrien 80790075Sobrien/* Create the vector [Z Y X W]. */ 80890075Sobrienstatic __inline __m128 80990075Sobrien_mm_set_ps (float __Z, float __Y, float __X, float __W) 81090075Sobrien{ 81190075Sobrien union { 81290075Sobrien float __a[4]; 81390075Sobrien __m128 __v; 81490075Sobrien } __u; 81590075Sobrien 81690075Sobrien __u.__a[0] = __W; 81790075Sobrien __u.__a[1] = __X; 81890075Sobrien __u.__a[2] = __Y; 81990075Sobrien __u.__a[3] = __Z; 82090075Sobrien 82190075Sobrien return __u.__v; 82290075Sobrien} 82390075Sobrien 82490075Sobrien/* Create the vector [W X Y Z]. */ 82590075Sobrienstatic __inline __m128 82690075Sobrien_mm_setr_ps (float __Z, float __Y, float __X, float __W) 82790075Sobrien{ 82890075Sobrien return _mm_set_ps (__W, __X, __Y, __Z); 82990075Sobrien} 83090075Sobrien 83190075Sobrien/* Create a vector of zeros. */ 83290075Sobrienstatic __inline __m128 83390075Sobrien_mm_setzero_ps (void) 83490075Sobrien{ 83590075Sobrien return (__m128) __builtin_ia32_setzerops (); 83690075Sobrien} 83790075Sobrien 83890075Sobrien/* Stores the lower SPFP value. */ 83990075Sobrienstatic __inline void 84090075Sobrien_mm_store_ss (float *__P, __m128 __A) 84190075Sobrien{ 84290075Sobrien __builtin_ia32_storess (__P, (__v4sf)__A); 84390075Sobrien} 84490075Sobrien 84590075Sobrien/* Store the lower SPFP value across four words. */ 84690075Sobrienstatic __inline void 84790075Sobrien_mm_store1_ps (float *__P, __m128 __A) 84890075Sobrien{ 84990075Sobrien __v4sf __va = (__v4sf)__A; 85090075Sobrien __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0)); 85190075Sobrien __builtin_ia32_storeaps (__P, __tmp); 85290075Sobrien} 85390075Sobrien 85490075Sobrienstatic __inline void 85590075Sobrien_mm_store_ps1 (float *__P, __m128 __A) 85690075Sobrien{ 85790075Sobrien _mm_store1_ps (__P, __A); 85890075Sobrien} 85990075Sobrien 86090075Sobrien/* Store four SPFP values. The address must be 16-byte aligned. */ 86190075Sobrienstatic __inline void 86290075Sobrien_mm_store_ps (float *__P, __m128 __A) 86390075Sobrien{ 86490075Sobrien __builtin_ia32_storeaps (__P, (__v4sf)__A); 86590075Sobrien} 86690075Sobrien 86790075Sobrien/* Store four SPFP values. The address need not be 16-byte aligned. */ 86890075Sobrienstatic __inline void 86990075Sobrien_mm_storeu_ps (float *__P, __m128 __A) 87090075Sobrien{ 87190075Sobrien __builtin_ia32_storeups (__P, (__v4sf)__A); 87290075Sobrien} 87390075Sobrien 87490075Sobrien/* Store four SPFP values in reverse order. The addres must be aligned. */ 87590075Sobrienstatic __inline void 87690075Sobrien_mm_storer_ps (float *__P, __m128 __A) 87790075Sobrien{ 87890075Sobrien __v4sf __va = (__v4sf)__A; 87990075Sobrien __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3)); 88090075Sobrien __builtin_ia32_storeaps (__P, __tmp); 88190075Sobrien} 88290075Sobrien 88390075Sobrien/* Sets the low SPFP value of A from the low value of B. */ 88490075Sobrienstatic __inline __m128 88590075Sobrien_mm_move_ss (__m128 __A, __m128 __B) 88690075Sobrien{ 88790075Sobrien return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B); 88890075Sobrien} 88990075Sobrien 89090075Sobrien/* Extracts one of the four words of A. The selector N must be immediate. */ 89190075Sobrien#if 0 89290075Sobrienstatic __inline int 89390075Sobrien_mm_extract_pi16 (__m64 __A, int __N) 89490075Sobrien{ 89590075Sobrien return __builtin_ia32_pextrw ((__v4hi)__A, __N); 89690075Sobrien} 89790075Sobrien#else 89890075Sobrien#define _mm_extract_pi16(A, N) \ 89990075Sobrien __builtin_ia32_pextrw ((__v4hi)(A), (N)) 90090075Sobrien#endif 90190075Sobrien 90290075Sobrien/* Inserts word D into one of four words of A. The selector N must be 90390075Sobrien immediate. */ 90490075Sobrien#if 0 90590075Sobrienstatic __inline __m64 90690075Sobrien_mm_insert_pi16 (__m64 __A, int __D, int __N) 90790075Sobrien{ 90890075Sobrien return (__m64)__builtin_ia32_pinsrw ((__v4hi)__A, __D, __N); 90990075Sobrien} 91090075Sobrien#else 91190075Sobrien#define _mm_insert_pi16(A, D, N) \ 91290075Sobrien ((__m64) __builtin_ia32_pinsrw ((__v4hi)(A), (D), (N))) 91390075Sobrien#endif 91490075Sobrien 91590075Sobrien/* Compute the element-wise maximum of signed 16-bit values. */ 91690075Sobrienstatic __inline __m64 91790075Sobrien_mm_max_pi16 (__m64 __A, __m64 __B) 91890075Sobrien{ 91990075Sobrien return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B); 92090075Sobrien} 92190075Sobrien 92290075Sobrien/* Compute the element-wise maximum of unsigned 8-bit values. */ 92390075Sobrienstatic __inline __m64 92490075Sobrien_mm_max_pu8 (__m64 __A, __m64 __B) 92590075Sobrien{ 92690075Sobrien return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B); 92790075Sobrien} 92890075Sobrien 92990075Sobrien/* Compute the element-wise minimum of signed 16-bit values. */ 93090075Sobrienstatic __inline __m64 93190075Sobrien_mm_min_pi16 (__m64 __A, __m64 __B) 93290075Sobrien{ 93390075Sobrien return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B); 93490075Sobrien} 93590075Sobrien 93690075Sobrien/* Compute the element-wise minimum of unsigned 8-bit values. */ 93790075Sobrienstatic __inline __m64 93890075Sobrien_mm_min_pu8 (__m64 __A, __m64 __B) 93990075Sobrien{ 94090075Sobrien return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B); 94190075Sobrien} 94290075Sobrien 94390075Sobrien/* Create an 8-bit mask of the signs of 8-bit values. */ 94490075Sobrienstatic __inline int 94590075Sobrien_mm_movemask_pi8 (__m64 __A) 94690075Sobrien{ 94790075Sobrien return __builtin_ia32_pmovmskb ((__v8qi)__A); 94890075Sobrien} 94990075Sobrien 95090075Sobrien/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values 95190075Sobrien in B and produce the high 16 bits of the 32-bit results. */ 95290075Sobrienstatic __inline __m64 95390075Sobrien_mm_mulhi_pu16 (__m64 __A, __m64 __B) 95490075Sobrien{ 95590075Sobrien return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B); 95690075Sobrien} 95790075Sobrien 95890075Sobrien/* Return a combination of the four 16-bit values in A. The selector 95990075Sobrien must be an immediate. */ 96090075Sobrien#if 0 96190075Sobrienstatic __inline __m64 96290075Sobrien_mm_shuffle_pi16 (__m64 __A, int __N) 96390075Sobrien{ 96490075Sobrien return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N); 96590075Sobrien} 96690075Sobrien#else 96790075Sobrien#define _mm_shuffle_pi16(A, N) \ 96890075Sobrien ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N))) 96990075Sobrien#endif 97090075Sobrien 97190075Sobrien/* Conditionally store byte elements of A into P. The high bit of each 97290075Sobrien byte in the selector N determines whether the corresponding byte from 97390075Sobrien A is stored. */ 97490075Sobrienstatic __inline void 97590075Sobrien_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P) 97690075Sobrien{ 97790075Sobrien __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P); 97890075Sobrien} 97990075Sobrien 98090075Sobrien/* Compute the rounded averages of the unsigned 8-bit values in A and B. */ 98190075Sobrienstatic __inline __m64 98290075Sobrien_mm_avg_pu8 (__m64 __A, __m64 __B) 98390075Sobrien{ 98490075Sobrien return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B); 98590075Sobrien} 98690075Sobrien 98790075Sobrien/* Compute the rounded averages of the unsigned 16-bit values in A and B. */ 98890075Sobrienstatic __inline __m64 98990075Sobrien_mm_avg_pu16 (__m64 __A, __m64 __B) 99090075Sobrien{ 99190075Sobrien return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B); 99290075Sobrien} 99390075Sobrien 99490075Sobrien/* Compute the sum of the absolute differences of the unsigned 8-bit 99590075Sobrien values in A and B. Return the value in the lower 16-bit word; the 99690075Sobrien upper words are cleared. */ 99790075Sobrienstatic __inline __m64 99890075Sobrien_mm_sad_pu8 (__m64 __A, __m64 __B) 99990075Sobrien{ 100090075Sobrien return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B); 100190075Sobrien} 100290075Sobrien 100390075Sobrien/* Loads one cache line from address P to a location "closer" to the 100490075Sobrien processor. The selector I specifies the type of prefetch operation. */ 100590075Sobrien#if 0 100690075Sobrienstatic __inline void 100790075Sobrien_mm_prefetch (void *__P, enum _mm_hint __I) 100890075Sobrien{ 100990075Sobrien __builtin_prefetch (__P, 0, __I); 101090075Sobrien} 101190075Sobrien#else 101290075Sobrien#define _mm_prefetch(P, I) \ 101390075Sobrien __builtin_prefetch ((P), 0, (I)) 101490075Sobrien#endif 101590075Sobrien 101690075Sobrien/* Stores the data in A to the address P without polluting the caches. */ 101790075Sobrienstatic __inline void 101890075Sobrien_mm_stream_pi (__m64 *__P, __m64 __A) 101990075Sobrien{ 102090075Sobrien __builtin_ia32_movntq (__P, __A); 102190075Sobrien} 102290075Sobrien 102390075Sobrien/* Likewise. The address must be 16-byte aligned. */ 102490075Sobrienstatic __inline void 102590075Sobrien_mm_stream_ps (float *__P, __m128 __A) 102690075Sobrien{ 102790075Sobrien __builtin_ia32_movntps (__P, (__v4sf)__A); 102890075Sobrien} 102990075Sobrien 103090075Sobrien/* Guarantees that every preceeding store is globally visible before 103190075Sobrien any subsequent store. */ 103290075Sobrienstatic __inline void 103390075Sobrien_mm_sfence (void) 103490075Sobrien{ 103590075Sobrien __builtin_ia32_sfence (); 103690075Sobrien} 103790075Sobrien 103890075Sobrien/* The execution of the next instruction is delayed by an implementation 103990075Sobrien specific amount of time. The instruction does not modify the 104090075Sobrien architectural state. */ 104190075Sobrienstatic __inline void 104290075Sobrien_mm_pause (void) 104390075Sobrien{ 104490075Sobrien __asm__ __volatile__ ("rep; nop" : : ); 104590075Sobrien} 104690075Sobrien 104790075Sobrien/* Transpose the 4x4 matrix composed of row[0-3]. */ 104890075Sobrien#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 104990075Sobriendo { \ 105090075Sobrien __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ 105190075Sobrien __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44); \ 105290075Sobrien __v4sf __t1 = __builtin_ia32_shufps (__r0, __r1, 0xEE); \ 105390075Sobrien __v4sf __t2 = __builtin_ia32_shufps (__r2, __r3, 0x44); \ 105490075Sobrien __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE); \ 105590075Sobrien (row0) = __builtin_ia32_shufps (__t0, __t1, 0x88); \ 105690075Sobrien (row1) = __builtin_ia32_shufps (__t0, __t1, 0xDD); \ 105790075Sobrien (row2) = __builtin_ia32_shufps (__t2, __t3, 0x88); \ 105890075Sobrien (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD); \ 105990075Sobrien} while (0) 106090075Sobrien 106190075Sobrien#endif /* _XMMINTRIN_H_INCLUDED */ 1062