/* xmmintrin.h revision 341825 */
1193326Sed/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2193326Sed * 3193326Sed * Permission is hereby granted, free of charge, to any person obtaining a copy 4193326Sed * of this software and associated documentation files (the "Software"), to deal 5193326Sed * in the Software without restriction, including without limitation the rights 6193326Sed * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7193326Sed * copies of the Software, and to permit persons to whom the Software is 8193326Sed * furnished to do so, subject to the following conditions: 9193326Sed * 10193326Sed * The above copyright notice and this permission notice shall be included in 11193326Sed * all copies or substantial portions of the Software. 12193326Sed * 13193326Sed * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14193326Sed * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15193326Sed * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16193326Sed * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17193326Sed * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18193326Sed * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19193326Sed * THE SOFTWARE. 
20193326Sed * 21193326Sed *===-----------------------------------------------------------------------=== 22193326Sed */ 23296417Sdim 24193326Sed#ifndef __XMMINTRIN_H 25193326Sed#define __XMMINTRIN_H 26193326Sed 27193326Sed#include <mmintrin.h> 28193326Sed 29205408Srdivackytypedef int __v4si __attribute__((__vector_size__(16))); 30193326Sedtypedef float __v4sf __attribute__((__vector_size__(16))); 31193326Sedtypedef float __m128 __attribute__((__vector_size__(16))); 32193326Sed 33309124Sdim/* Unsigned types */ 34309124Sdimtypedef unsigned int __v4su __attribute__((__vector_size__(16))); 35309124Sdim 36276479Sdim/* This header should only be included in a hosted environment as it depends on 37276479Sdim * a standard library to provide allocation routines. */ 38218893Sdim#if __STDC_HOSTED__ 39193326Sed#include <mm_malloc.h> 40218893Sdim#endif 41193326Sed 42288943Sdim/* Define the default attributes for the functions in this file. */ 43341825Sdim#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128))) 44341825Sdim#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64))) 45288943Sdim 46341825Sdim/// Adds the 32-bit float values in the low-order bits of the operands. 47309124Sdim/// 48309124Sdim/// \headerfile <x86intrin.h> 49309124Sdim/// 50314564Sdim/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions. 51309124Sdim/// 52309124Sdim/// \param __a 53309124Sdim/// A 128-bit vector of [4 x float] containing one of the source operands. 54309124Sdim/// The lower 32 bits of this operand are used in the calculation. 55309124Sdim/// \param __b 56309124Sdim/// A 128-bit vector of [4 x float] containing one of the source operands. 57309124Sdim/// The lower 32 bits of this operand are used in the calculation. 
58309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum 59309124Sdim/// of the lower 32 bits of both operands. The upper 96 bits are copied from 60309124Sdim/// the upper 96 bits of the first source operand. 61288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 62249423Sdim_mm_add_ss(__m128 __a, __m128 __b) 63193326Sed{ 64249423Sdim __a[0] += __b[0]; 65249423Sdim return __a; 66193326Sed} 67193326Sed 68341825Sdim/// Adds two 128-bit vectors of [4 x float], and returns the results of 69309124Sdim/// the addition. 70309124Sdim/// 71309124Sdim/// \headerfile <x86intrin.h> 72309124Sdim/// 73314564Sdim/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions. 74309124Sdim/// 75309124Sdim/// \param __a 76309124Sdim/// A 128-bit vector of [4 x float] containing one of the source operands. 77309124Sdim/// \param __b 78309124Sdim/// A 128-bit vector of [4 x float] containing one of the source operands. 79309124Sdim/// \returns A 128-bit vector of [4 x float] containing the sums of both 80309124Sdim/// operands. 81288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 82249423Sdim_mm_add_ps(__m128 __a, __m128 __b) 83193326Sed{ 84309124Sdim return (__m128)((__v4sf)__a + (__v4sf)__b); 85193326Sed} 86193326Sed 87341825Sdim/// Subtracts the 32-bit float value in the low-order bits of the second 88309124Sdim/// operand from the corresponding value in the first operand. 89309124Sdim/// 90309124Sdim/// \headerfile <x86intrin.h> 91309124Sdim/// 92314564Sdim/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions. 93309124Sdim/// 94309124Sdim/// \param __a 95309124Sdim/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits 96309124Sdim/// of this operand are used in the calculation. 97309124Sdim/// \param __b 98309124Sdim/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32 99309124Sdim/// bits of this operand are used in the calculation. 
100309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 101309124Sdim/// difference of the lower 32 bits of both operands. The upper 96 bits are 102309124Sdim/// copied from the upper 96 bits of the first source operand. 103288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 104249423Sdim_mm_sub_ss(__m128 __a, __m128 __b) 105193326Sed{ 106249423Sdim __a[0] -= __b[0]; 107249423Sdim return __a; 108193326Sed} 109193326Sed 110341825Sdim/// Subtracts each of the values of the second operand from the first 111309124Sdim/// operand, both of which are 128-bit vectors of [4 x float] and returns 112309124Sdim/// the results of the subtraction. 113309124Sdim/// 114309124Sdim/// \headerfile <x86intrin.h> 115309124Sdim/// 116314564Sdim/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions. 117309124Sdim/// 118309124Sdim/// \param __a 119309124Sdim/// A 128-bit vector of [4 x float] containing the minuend. 120309124Sdim/// \param __b 121309124Sdim/// A 128-bit vector of [4 x float] containing the subtrahend. 122309124Sdim/// \returns A 128-bit vector of [4 x float] containing the differences between 123309124Sdim/// both operands. 124288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 125249423Sdim_mm_sub_ps(__m128 __a, __m128 __b) 126193326Sed{ 127309124Sdim return (__m128)((__v4sf)__a - (__v4sf)__b); 128193326Sed} 129193326Sed 130341825Sdim/// Multiplies two 32-bit float values in the low-order bits of the 131309124Sdim/// operands. 132309124Sdim/// 133309124Sdim/// \headerfile <x86intrin.h> 134309124Sdim/// 135314564Sdim/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions. 136309124Sdim/// 137309124Sdim/// \param __a 138309124Sdim/// A 128-bit vector of [4 x float] containing one of the source operands. 139309124Sdim/// The lower 32 bits of this operand are used in the calculation. 140309124Sdim/// \param __b 141309124Sdim/// A 128-bit vector of [4 x float] containing one of the source operands. 
142309124Sdim/// The lower 32 bits of this operand are used in the calculation. 143309124Sdim/// \returns A 128-bit vector of [4 x float] containing the product of the lower 144309124Sdim/// 32 bits of both operands. The upper 96 bits are copied from the upper 96 145309124Sdim/// bits of the first source operand. 146288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 147249423Sdim_mm_mul_ss(__m128 __a, __m128 __b) 148193326Sed{ 149249423Sdim __a[0] *= __b[0]; 150249423Sdim return __a; 151193326Sed} 152193326Sed 153341825Sdim/// Multiplies two 128-bit vectors of [4 x float] and returns the 154309124Sdim/// results of the multiplication. 155309124Sdim/// 156309124Sdim/// \headerfile <x86intrin.h> 157309124Sdim/// 158314564Sdim/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions. 159309124Sdim/// 160309124Sdim/// \param __a 161309124Sdim/// A 128-bit vector of [4 x float] containing one of the source operands. 162309124Sdim/// \param __b 163309124Sdim/// A 128-bit vector of [4 x float] containing one of the source operands. 164309124Sdim/// \returns A 128-bit vector of [4 x float] containing the products of both 165309124Sdim/// operands. 166288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 167249423Sdim_mm_mul_ps(__m128 __a, __m128 __b) 168193326Sed{ 169309124Sdim return (__m128)((__v4sf)__a * (__v4sf)__b); 170193326Sed} 171193326Sed 172341825Sdim/// Divides the value in the low-order 32 bits of the first operand by 173309124Sdim/// the corresponding value in the second operand. 174309124Sdim/// 175309124Sdim/// \headerfile <x86intrin.h> 176309124Sdim/// 177314564Sdim/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions. 178309124Sdim/// 179309124Sdim/// \param __a 180309124Sdim/// A 128-bit vector of [4 x float] containing the dividend. The lower 32 181309124Sdim/// bits of this operand are used in the calculation. 182309124Sdim/// \param __b 183309124Sdim/// A 128-bit vector of [4 x float] containing the divisor. 
The lower 32 bits 184309124Sdim/// of this operand are used in the calculation. 185309124Sdim/// \returns A 128-bit vector of [4 x float] containing the quotients of the 186309124Sdim/// lower 32 bits of both operands. The upper 96 bits are copied from the 187309124Sdim/// upper 96 bits of the first source operand. 188288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 189249423Sdim_mm_div_ss(__m128 __a, __m128 __b) 190193326Sed{ 191249423Sdim __a[0] /= __b[0]; 192249423Sdim return __a; 193193326Sed} 194193326Sed 195341825Sdim/// Divides two 128-bit vectors of [4 x float]. 196309124Sdim/// 197309124Sdim/// \headerfile <x86intrin.h> 198309124Sdim/// 199314564Sdim/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions. 200309124Sdim/// 201309124Sdim/// \param __a 202309124Sdim/// A 128-bit vector of [4 x float] containing the dividend. 203309124Sdim/// \param __b 204309124Sdim/// A 128-bit vector of [4 x float] containing the divisor. 205309124Sdim/// \returns A 128-bit vector of [4 x float] containing the quotients of both 206309124Sdim/// operands. 207288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 208249423Sdim_mm_div_ps(__m128 __a, __m128 __b) 209193326Sed{ 210309124Sdim return (__m128)((__v4sf)__a / (__v4sf)__b); 211193326Sed} 212193326Sed 213341825Sdim/// Calculates the square root of the value stored in the low-order bits 214309124Sdim/// of a 128-bit vector of [4 x float]. 215309124Sdim/// 216309124Sdim/// \headerfile <x86intrin.h> 217309124Sdim/// 218314564Sdim/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions. 219309124Sdim/// 220309124Sdim/// \param __a 221309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 222309124Sdim/// used in the calculation. 223309124Sdim/// \returns A 128-bit vector of [4 x float] containing the square root of the 224309124Sdim/// value in the low-order bits of the operand. 
225288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 226249423Sdim_mm_sqrt_ss(__m128 __a) 227193326Sed{ 228341825Sdim return (__m128)__builtin_ia32_sqrtss((__v4sf)__a); 229193326Sed} 230193326Sed 231341825Sdim/// Calculates the square roots of the values stored in a 128-bit vector 232309124Sdim/// of [4 x float]. 233309124Sdim/// 234309124Sdim/// \headerfile <x86intrin.h> 235309124Sdim/// 236314564Sdim/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions. 237309124Sdim/// 238309124Sdim/// \param __a 239309124Sdim/// A 128-bit vector of [4 x float]. 240309124Sdim/// \returns A 128-bit vector of [4 x float] containing the square roots of the 241309124Sdim/// values in the operand. 242288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 243249423Sdim_mm_sqrt_ps(__m128 __a) 244193326Sed{ 245309124Sdim return __builtin_ia32_sqrtps((__v4sf)__a); 246193326Sed} 247193326Sed 248341825Sdim/// Calculates the approximate reciprocal of the value stored in the 249309124Sdim/// low-order bits of a 128-bit vector of [4 x float]. 250309124Sdim/// 251309124Sdim/// \headerfile <x86intrin.h> 252309124Sdim/// 253314564Sdim/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions. 254309124Sdim/// 255309124Sdim/// \param __a 256309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 257309124Sdim/// used in the calculation. 258309124Sdim/// \returns A 128-bit vector of [4 x float] containing the approximate 259309124Sdim/// reciprocal of the value in the low-order bits of the operand. 260288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 261249423Sdim_mm_rcp_ss(__m128 __a) 262193326Sed{ 263341825Sdim return (__m128)__builtin_ia32_rcpss((__v4sf)__a); 264193326Sed} 265193326Sed 266341825Sdim/// Calculates the approximate reciprocals of the values stored in a 267309124Sdim/// 128-bit vector of [4 x float]. 
268309124Sdim/// 269309124Sdim/// \headerfile <x86intrin.h> 270309124Sdim/// 271314564Sdim/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions. 272309124Sdim/// 273309124Sdim/// \param __a 274309124Sdim/// A 128-bit vector of [4 x float]. 275309124Sdim/// \returns A 128-bit vector of [4 x float] containing the approximate 276309124Sdim/// reciprocals of the values in the operand. 277288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 278249423Sdim_mm_rcp_ps(__m128 __a) 279193326Sed{ 280341825Sdim return (__m128)__builtin_ia32_rcpps((__v4sf)__a); 281193326Sed} 282193326Sed 283341825Sdim/// Calculates the approximate reciprocal of the square root of the value 284309124Sdim/// stored in the low-order bits of a 128-bit vector of [4 x float]. 285309124Sdim/// 286309124Sdim/// \headerfile <x86intrin.h> 287309124Sdim/// 288314564Sdim/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions. 289309124Sdim/// 290309124Sdim/// \param __a 291309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 292309124Sdim/// used in the calculation. 293309124Sdim/// \returns A 128-bit vector of [4 x float] containing the approximate 294309124Sdim/// reciprocal of the square root of the value in the low-order bits of the 295309124Sdim/// operand. 296288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 297249423Sdim_mm_rsqrt_ss(__m128 __a) 298193326Sed{ 299341825Sdim return __builtin_ia32_rsqrtss((__v4sf)__a); 300193326Sed} 301193326Sed 302341825Sdim/// Calculates the approximate reciprocals of the square roots of the 303309124Sdim/// values stored in a 128-bit vector of [4 x float]. 304309124Sdim/// 305309124Sdim/// \headerfile <x86intrin.h> 306309124Sdim/// 307314564Sdim/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions. 308309124Sdim/// 309309124Sdim/// \param __a 310309124Sdim/// A 128-bit vector of [4 x float]. 
311309124Sdim/// \returns A 128-bit vector of [4 x float] containing the approximate 312309124Sdim/// reciprocals of the square roots of the values in the operand. 313288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 314249423Sdim_mm_rsqrt_ps(__m128 __a) 315193326Sed{ 316309124Sdim return __builtin_ia32_rsqrtps((__v4sf)__a); 317193326Sed} 318193326Sed 319341825Sdim/// Compares two 32-bit float values in the low-order bits of both 320309124Sdim/// operands and returns the lesser value in the low-order bits of the 321309124Sdim/// vector of [4 x float]. 322309124Sdim/// 323309124Sdim/// \headerfile <x86intrin.h> 324309124Sdim/// 325314564Sdim/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions. 326309124Sdim/// 327309124Sdim/// \param __a 328309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 329309124Sdim/// 32 bits of this operand are used in the comparison. 330309124Sdim/// \param __b 331309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 332309124Sdim/// 32 bits of this operand are used in the comparison. 333309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 334309124Sdim/// minimum value between both operands. The upper 96 bits are copied from 335309124Sdim/// the upper 96 bits of the first source operand. 336288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 337249423Sdim_mm_min_ss(__m128 __a, __m128 __b) 338193326Sed{ 339309124Sdim return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b); 340193326Sed} 341193326Sed 342341825Sdim/// Compares two 128-bit vectors of [4 x float] and returns the lesser 343314564Sdim/// of each pair of values. 344309124Sdim/// 345309124Sdim/// \headerfile <x86intrin.h> 346309124Sdim/// 347314564Sdim/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions. 348309124Sdim/// 349309124Sdim/// \param __a 350309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. 
351309124Sdim/// \param __b 352309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. 353309124Sdim/// \returns A 128-bit vector of [4 x float] containing the minimum values 354309124Sdim/// between both operands. 355288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 356249423Sdim_mm_min_ps(__m128 __a, __m128 __b) 357193326Sed{ 358309124Sdim return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b); 359193326Sed} 360193326Sed 361341825Sdim/// Compares two 32-bit float values in the low-order bits of both 362314564Sdim/// operands and returns the greater value in the low-order bits of a 128-bit 363314564Sdim/// vector of [4 x float]. 364309124Sdim/// 365309124Sdim/// \headerfile <x86intrin.h> 366309124Sdim/// 367314564Sdim/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions. 368309124Sdim/// 369309124Sdim/// \param __a 370309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 371309124Sdim/// 32 bits of this operand are used in the comparison. 372309124Sdim/// \param __b 373309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 374309124Sdim/// 32 bits of this operand are used in the comparison. 375309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 376309124Sdim/// maximum value between both operands. The upper 96 bits are copied from 377309124Sdim/// the upper 96 bits of the first source operand. 378288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 379249423Sdim_mm_max_ss(__m128 __a, __m128 __b) 380193326Sed{ 381309124Sdim return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b); 382193326Sed} 383193326Sed 384341825Sdim/// Compares two 128-bit vectors of [4 x float] and returns the greater 385309124Sdim/// of each pair of values. 386309124Sdim/// 387309124Sdim/// \headerfile <x86intrin.h> 388309124Sdim/// 389314564Sdim/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions. 
390309124Sdim/// 391309124Sdim/// \param __a 392309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. 393309124Sdim/// \param __b 394309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. 395309124Sdim/// \returns A 128-bit vector of [4 x float] containing the maximum values 396309124Sdim/// between both operands. 397288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 398249423Sdim_mm_max_ps(__m128 __a, __m128 __b) 399193326Sed{ 400309124Sdim return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b); 401193326Sed} 402193326Sed 403341825Sdim/// Performs a bitwise AND of two 128-bit vectors of [4 x float]. 404309124Sdim/// 405309124Sdim/// \headerfile <x86intrin.h> 406309124Sdim/// 407314564Sdim/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions. 408309124Sdim/// 409309124Sdim/// \param __a 410309124Sdim/// A 128-bit vector containing one of the source operands. 411309124Sdim/// \param __b 412309124Sdim/// A 128-bit vector containing one of the source operands. 413309124Sdim/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the 414309124Sdim/// values between both operands. 415288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 416249423Sdim_mm_and_ps(__m128 __a, __m128 __b) 417193326Sed{ 418309124Sdim return (__m128)((__v4su)__a & (__v4su)__b); 419193326Sed} 420193326Sed 421341825Sdim/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using 422309124Sdim/// the one's complement of the values contained in the first source 423309124Sdim/// operand. 424309124Sdim/// 425309124Sdim/// \headerfile <x86intrin.h> 426309124Sdim/// 427314564Sdim/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions. 428309124Sdim/// 429309124Sdim/// \param __a 430309124Sdim/// A 128-bit vector of [4 x float] containing the first source operand. The 431309124Sdim/// one's complement of this value is used in the bitwise AND. 
432309124Sdim/// \param __b 433309124Sdim/// A 128-bit vector of [4 x float] containing the second source operand. 434309124Sdim/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the 435309124Sdim/// one's complement of the first operand and the values in the second 436309124Sdim/// operand. 437288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 438249423Sdim_mm_andnot_ps(__m128 __a, __m128 __b) 439193326Sed{ 440309124Sdim return (__m128)(~(__v4su)__a & (__v4su)__b); 441193326Sed} 442193326Sed 443341825Sdim/// Performs a bitwise OR of two 128-bit vectors of [4 x float]. 444309124Sdim/// 445309124Sdim/// \headerfile <x86intrin.h> 446309124Sdim/// 447314564Sdim/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions. 448309124Sdim/// 449309124Sdim/// \param __a 450309124Sdim/// A 128-bit vector of [4 x float] containing one of the source operands. 451309124Sdim/// \param __b 452309124Sdim/// A 128-bit vector of [4 x float] containing one of the source operands. 453309124Sdim/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the 454309124Sdim/// values between both operands. 455288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 456249423Sdim_mm_or_ps(__m128 __a, __m128 __b) 457193326Sed{ 458309124Sdim return (__m128)((__v4su)__a | (__v4su)__b); 459193326Sed} 460193326Sed 461341825Sdim/// Performs a bitwise exclusive OR of two 128-bit vectors of 462309124Sdim/// [4 x float]. 463309124Sdim/// 464309124Sdim/// \headerfile <x86intrin.h> 465309124Sdim/// 466314564Sdim/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions. 467309124Sdim/// 468309124Sdim/// \param __a 469309124Sdim/// A 128-bit vector of [4 x float] containing one of the source operands. 470309124Sdim/// \param __b 471309124Sdim/// A 128-bit vector of [4 x float] containing one of the source operands. 
472309124Sdim/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR 473309124Sdim/// of the values between both operands. 474288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 475249423Sdim_mm_xor_ps(__m128 __a, __m128 __b) 476193326Sed{ 477309124Sdim return (__m128)((__v4su)__a ^ (__v4su)__b); 478193326Sed} 479193326Sed 480341825Sdim/// Compares two 32-bit float values in the low-order bits of both 481309124Sdim/// operands for equality and returns the result of the comparison in the 482309124Sdim/// low-order bits of a vector [4 x float]. 483309124Sdim/// 484309124Sdim/// \headerfile <x86intrin.h> 485309124Sdim/// 486314564Sdim/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions. 487309124Sdim/// 488309124Sdim/// \param __a 489309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 490309124Sdim/// 32 bits of this operand are used in the comparison. 491309124Sdim/// \param __b 492309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 493309124Sdim/// 32 bits of this operand are used in the comparison. 494309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results 495309124Sdim/// in the low-order bits. 496288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 497249423Sdim_mm_cmpeq_ss(__m128 __a, __m128 __b) 498193326Sed{ 499309124Sdim return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b); 500193326Sed} 501193326Sed 502341825Sdim/// Compares each of the corresponding 32-bit float values of the 503309124Sdim/// 128-bit vectors of [4 x float] for equality. 504309124Sdim/// 505309124Sdim/// \headerfile <x86intrin.h> 506309124Sdim/// 507314564Sdim/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions. 508309124Sdim/// 509309124Sdim/// \param __a 510309124Sdim/// A 128-bit vector of [4 x float]. 511309124Sdim/// \param __b 512309124Sdim/// A 128-bit vector of [4 x float]. 
513309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results. 514288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 515249423Sdim_mm_cmpeq_ps(__m128 __a, __m128 __b) 516193326Sed{ 517309124Sdim return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b); 518193326Sed} 519193326Sed 520341825Sdim/// Compares two 32-bit float values in the low-order bits of both 521309124Sdim/// operands to determine if the value in the first operand is less than the 522309124Sdim/// corresponding value in the second operand and returns the result of the 523309124Sdim/// comparison in the low-order bits of a vector of [4 x float]. 524309124Sdim/// 525309124Sdim/// \headerfile <x86intrin.h> 526309124Sdim/// 527314564Sdim/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions. 528309124Sdim/// 529309124Sdim/// \param __a 530309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 531309124Sdim/// 32 bits of this operand are used in the comparison. 532309124Sdim/// \param __b 533309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 534309124Sdim/// 32 bits of this operand are used in the comparison. 535309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results 536309124Sdim/// in the low-order bits. 537288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 538249423Sdim_mm_cmplt_ss(__m128 __a, __m128 __b) 539193326Sed{ 540309124Sdim return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b); 541193326Sed} 542193326Sed 543341825Sdim/// Compares each of the corresponding 32-bit float values of the 544309124Sdim/// 128-bit vectors of [4 x float] to determine if the values in the first 545309124Sdim/// operand are less than those in the second operand. 546309124Sdim/// 547309124Sdim/// \headerfile <x86intrin.h> 548309124Sdim/// 549314564Sdim/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions. 
550309124Sdim/// 551309124Sdim/// \param __a 552309124Sdim/// A 128-bit vector of [4 x float]. 553309124Sdim/// \param __b 554309124Sdim/// A 128-bit vector of [4 x float]. 555309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results. 556288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 557249423Sdim_mm_cmplt_ps(__m128 __a, __m128 __b) 558193326Sed{ 559309124Sdim return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b); 560193326Sed} 561193326Sed 562341825Sdim/// Compares two 32-bit float values in the low-order bits of both 563309124Sdim/// operands to determine if the value in the first operand is less than or 564309124Sdim/// equal to the corresponding value in the second operand and returns the 565309124Sdim/// result of the comparison in the low-order bits of a vector of 566309124Sdim/// [4 x float]. 567309124Sdim/// 568309124Sdim/// \headerfile <x86intrin.h> 569309124Sdim/// 570314564Sdim/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions. 571309124Sdim/// 572309124Sdim/// \param __a 573309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 574309124Sdim/// 32 bits of this operand are used in the comparison. 575309124Sdim/// \param __b 576309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 577309124Sdim/// 32 bits of this operand are used in the comparison. 578309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results 579309124Sdim/// in the low-order bits. 
580288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 581249423Sdim_mm_cmple_ss(__m128 __a, __m128 __b) 582193326Sed{ 583309124Sdim return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b); 584193326Sed} 585193326Sed 586341825Sdim/// Compares each of the corresponding 32-bit float values of the 587309124Sdim/// 128-bit vectors of [4 x float] to determine if the values in the first 588309124Sdim/// operand are less than or equal to those in the second operand. 589309124Sdim/// 590309124Sdim/// \headerfile <x86intrin.h> 591309124Sdim/// 592314564Sdim/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions. 593309124Sdim/// 594309124Sdim/// \param __a 595309124Sdim/// A 128-bit vector of [4 x float]. 596309124Sdim/// \param __b 597309124Sdim/// A 128-bit vector of [4 x float]. 598309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results. 599288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 600249423Sdim_mm_cmple_ps(__m128 __a, __m128 __b) 601193326Sed{ 602309124Sdim return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b); 603193326Sed} 604193326Sed 605341825Sdim/// Compares two 32-bit float values in the low-order bits of both 606309124Sdim/// operands to determine if the value in the first operand is greater than 607309124Sdim/// the corresponding value in the second operand and returns the result of 608309124Sdim/// the comparison in the low-order bits of a vector of [4 x float]. 609309124Sdim/// 610309124Sdim/// \headerfile <x86intrin.h> 611309124Sdim/// 612314564Sdim/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions. 613309124Sdim/// 614309124Sdim/// \param __a 615309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 616309124Sdim/// 32 bits of this operand are used in the comparison. 617309124Sdim/// \param __b 618309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. 
The lower 619309124Sdim/// 32 bits of this operand are used in the comparison. 620309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results 621309124Sdim/// in the low-order bits. 622288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 623249423Sdim_mm_cmpgt_ss(__m128 __a, __m128 __b) 624193326Sed{ 625309124Sdim return (__m128)__builtin_shufflevector((__v4sf)__a, 626309124Sdim (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a), 627261991Sdim 4, 1, 2, 3); 628193326Sed} 629193326Sed 630341825Sdim/// Compares each of the corresponding 32-bit float values of the 631309124Sdim/// 128-bit vectors of [4 x float] to determine if the values in the first 632309124Sdim/// operand are greater than those in the second operand. 633309124Sdim/// 634309124Sdim/// \headerfile <x86intrin.h> 635309124Sdim/// 636314564Sdim/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions. 637309124Sdim/// 638309124Sdim/// \param __a 639309124Sdim/// A 128-bit vector of [4 x float]. 640309124Sdim/// \param __b 641309124Sdim/// A 128-bit vector of [4 x float]. 642309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results. 643288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 644249423Sdim_mm_cmpgt_ps(__m128 __a, __m128 __b) 645193326Sed{ 646309124Sdim return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a); 647193326Sed} 648193326Sed 649341825Sdim/// Compares two 32-bit float values in the low-order bits of both 650309124Sdim/// operands to determine if the value in the first operand is greater than 651309124Sdim/// or equal to the corresponding value in the second operand and returns 652309124Sdim/// the result of the comparison in the low-order bits of a vector of 653309124Sdim/// [4 x float]. 654309124Sdim/// 655309124Sdim/// \headerfile <x86intrin.h> 656309124Sdim/// 657314564Sdim/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions. 
658309124Sdim/// 659309124Sdim/// \param __a 660309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 661309124Sdim/// 32 bits of this operand are used in the comparison. 662309124Sdim/// \param __b 663309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 664309124Sdim/// 32 bits of this operand are used in the comparison. 665309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results 666309124Sdim/// in the low-order bits. 667288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 668249423Sdim_mm_cmpge_ss(__m128 __a, __m128 __b) 669193326Sed{ 670309124Sdim return (__m128)__builtin_shufflevector((__v4sf)__a, 671309124Sdim (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a), 672261991Sdim 4, 1, 2, 3); 673193326Sed} 674193326Sed 675341825Sdim/// Compares each of the corresponding 32-bit float values of the 676309124Sdim/// 128-bit vectors of [4 x float] to determine if the values in the first 677309124Sdim/// operand are greater than or equal to those in the second operand. 678309124Sdim/// 679309124Sdim/// \headerfile <x86intrin.h> 680309124Sdim/// 681314564Sdim/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions. 682309124Sdim/// 683309124Sdim/// \param __a 684309124Sdim/// A 128-bit vector of [4 x float]. 685309124Sdim/// \param __b 686309124Sdim/// A 128-bit vector of [4 x float]. 687309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results. 688288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 689249423Sdim_mm_cmpge_ps(__m128 __a, __m128 __b) 690193326Sed{ 691309124Sdim return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a); 692193326Sed} 693193326Sed 694341825Sdim/// Compares two 32-bit float values in the low-order bits of both 695309124Sdim/// operands for inequality and returns the result of the comparison in the 696309124Sdim/// low-order bits of a vector of [4 x float]. 
697309124Sdim/// 698309124Sdim/// \headerfile <x86intrin.h> 699309124Sdim/// 700314564Sdim/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c> 701314564Sdim/// instructions. 702309124Sdim/// 703309124Sdim/// \param __a 704309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 705309124Sdim/// 32 bits of this operand are used in the comparison. 706309124Sdim/// \param __b 707309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 708309124Sdim/// 32 bits of this operand are used in the comparison. 709309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results 710309124Sdim/// in the low-order bits. 711288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 712249423Sdim_mm_cmpneq_ss(__m128 __a, __m128 __b) 713193326Sed{ 714309124Sdim return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b); 715193326Sed} 716193326Sed 717341825Sdim/// Compares each of the corresponding 32-bit float values of the 718309124Sdim/// 128-bit vectors of [4 x float] for inequality. 719309124Sdim/// 720309124Sdim/// \headerfile <x86intrin.h> 721309124Sdim/// 722314564Sdim/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c> 723314564Sdim/// instructions. 724309124Sdim/// 725309124Sdim/// \param __a 726309124Sdim/// A 128-bit vector of [4 x float]. 727309124Sdim/// \param __b 728309124Sdim/// A 128-bit vector of [4 x float]. 729309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
730288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 731249423Sdim_mm_cmpneq_ps(__m128 __a, __m128 __b) 732193326Sed{ 733309124Sdim return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b); 734193326Sed} 735193326Sed 736341825Sdim/// Compares two 32-bit float values in the low-order bits of both 737309124Sdim/// operands to determine if the value in the first operand is not less than 738309124Sdim/// the corresponding value in the second operand and returns the result of 739309124Sdim/// the comparison in the low-order bits of a vector of [4 x float]. 740309124Sdim/// 741309124Sdim/// \headerfile <x86intrin.h> 742309124Sdim/// 743314564Sdim/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c> 744314564Sdim/// instructions. 745309124Sdim/// 746309124Sdim/// \param __a 747309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 748309124Sdim/// 32 bits of this operand are used in the comparison. 749309124Sdim/// \param __b 750309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 751309124Sdim/// 32 bits of this operand are used in the comparison. 752309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results 753309124Sdim/// in the low-order bits. 754288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 755249423Sdim_mm_cmpnlt_ss(__m128 __a, __m128 __b) 756193326Sed{ 757309124Sdim return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b); 758193326Sed} 759193326Sed 760341825Sdim/// Compares each of the corresponding 32-bit float values of the 761309124Sdim/// 128-bit vectors of [4 x float] to determine if the values in the first 762309124Sdim/// operand are not less than those in the second operand. 763309124Sdim/// 764309124Sdim/// \headerfile <x86intrin.h> 765309124Sdim/// 766314564Sdim/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c> 767314564Sdim/// instructions. 
768309124Sdim/// 769309124Sdim/// \param __a 770309124Sdim/// A 128-bit vector of [4 x float]. 771309124Sdim/// \param __b 772309124Sdim/// A 128-bit vector of [4 x float]. 773309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results. 774288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 775249423Sdim_mm_cmpnlt_ps(__m128 __a, __m128 __b) 776193326Sed{ 777309124Sdim return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b); 778193326Sed} 779193326Sed 780341825Sdim/// Compares two 32-bit float values in the low-order bits of both 781309124Sdim/// operands to determine if the value in the first operand is not less than 782309124Sdim/// or equal to the corresponding value in the second operand and returns 783309124Sdim/// the result of the comparison in the low-order bits of a vector of 784309124Sdim/// [4 x float]. 785309124Sdim/// 786309124Sdim/// \headerfile <x86intrin.h> 787309124Sdim/// 788314564Sdim/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c> 789314564Sdim/// instructions. 790309124Sdim/// 791309124Sdim/// \param __a 792309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 793309124Sdim/// 32 bits of this operand are used in the comparison. 794309124Sdim/// \param __b 795309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 796309124Sdim/// 32 bits of this operand are used in the comparison. 797309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results 798309124Sdim/// in the low-order bits. 
799288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 800249423Sdim_mm_cmpnle_ss(__m128 __a, __m128 __b) 801193326Sed{ 802309124Sdim return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b); 803193326Sed} 804193326Sed 805341825Sdim/// Compares each of the corresponding 32-bit float values of the 806309124Sdim/// 128-bit vectors of [4 x float] to determine if the values in the first 807309124Sdim/// operand are not less than or equal to those in the second operand. 808309124Sdim/// 809309124Sdim/// \headerfile <x86intrin.h> 810309124Sdim/// 811314564Sdim/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c> 812314564Sdim/// instructions. 813309124Sdim/// 814309124Sdim/// \param __a 815309124Sdim/// A 128-bit vector of [4 x float]. 816309124Sdim/// \param __b 817309124Sdim/// A 128-bit vector of [4 x float]. 818309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results. 819288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 820249423Sdim_mm_cmpnle_ps(__m128 __a, __m128 __b) 821193326Sed{ 822309124Sdim return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b); 823193326Sed} 824193326Sed 825341825Sdim/// Compares two 32-bit float values in the low-order bits of both 826309124Sdim/// operands to determine if the value in the first operand is not greater 827309124Sdim/// than the corresponding value in the second operand and returns the 828309124Sdim/// result of the comparison in the low-order bits of a vector of 829309124Sdim/// [4 x float]. 830309124Sdim/// 831309124Sdim/// \headerfile <x86intrin.h> 832309124Sdim/// 833314564Sdim/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c> 834314564Sdim/// instructions. 835309124Sdim/// 836309124Sdim/// \param __a 837309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 838309124Sdim/// 32 bits of this operand are used in the comparison. 
839309124Sdim/// \param __b 840309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 841309124Sdim/// 32 bits of this operand are used in the comparison. 842309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results 843309124Sdim/// in the low-order bits. 844288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 845249423Sdim_mm_cmpngt_ss(__m128 __a, __m128 __b) 846193326Sed{ 847309124Sdim return (__m128)__builtin_shufflevector((__v4sf)__a, 848309124Sdim (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a), 849261991Sdim 4, 1, 2, 3); 850193326Sed} 851193326Sed 852341825Sdim/// Compares each of the corresponding 32-bit float values of the 853309124Sdim/// 128-bit vectors of [4 x float] to determine if the values in the first 854309124Sdim/// operand are not greater than those in the second operand. 855309124Sdim/// 856309124Sdim/// \headerfile <x86intrin.h> 857309124Sdim/// 858314564Sdim/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c> 859314564Sdim/// instructions. 860309124Sdim/// 861309124Sdim/// \param __a 862309124Sdim/// A 128-bit vector of [4 x float]. 863309124Sdim/// \param __b 864309124Sdim/// A 128-bit vector of [4 x float]. 865309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results. 866288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 867249423Sdim_mm_cmpngt_ps(__m128 __a, __m128 __b) 868193326Sed{ 869309124Sdim return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a); 870193326Sed} 871193326Sed 872341825Sdim/// Compares two 32-bit float values in the low-order bits of both 873309124Sdim/// operands to determine if the value in the first operand is not greater 874309124Sdim/// than or equal to the corresponding value in the second operand and 875309124Sdim/// returns the result of the comparison in the low-order bits of a vector 876309124Sdim/// of [4 x float]. 
877309124Sdim/// 878309124Sdim/// \headerfile <x86intrin.h> 879309124Sdim/// 880314564Sdim/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c> 881314564Sdim/// instructions. 882309124Sdim/// 883309124Sdim/// \param __a 884309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 885309124Sdim/// 32 bits of this operand are used in the comparison. 886309124Sdim/// \param __b 887309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 888309124Sdim/// 32 bits of this operand are used in the comparison. 889309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results 890309124Sdim/// in the low-order bits. 891288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 892249423Sdim_mm_cmpnge_ss(__m128 __a, __m128 __b) 893193326Sed{ 894309124Sdim return (__m128)__builtin_shufflevector((__v4sf)__a, 895309124Sdim (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a), 896261991Sdim 4, 1, 2, 3); 897193326Sed} 898193326Sed 899341825Sdim/// Compares each of the corresponding 32-bit float values of the 900309124Sdim/// 128-bit vectors of [4 x float] to determine if the values in the first 901309124Sdim/// operand are not greater than or equal to those in the second operand. 902309124Sdim/// 903309124Sdim/// \headerfile <x86intrin.h> 904309124Sdim/// 905314564Sdim/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c> 906314564Sdim/// instructions. 907309124Sdim/// 908309124Sdim/// \param __a 909309124Sdim/// A 128-bit vector of [4 x float]. 910309124Sdim/// \param __b 911309124Sdim/// A 128-bit vector of [4 x float]. 912309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
913288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 914249423Sdim_mm_cmpnge_ps(__m128 __a, __m128 __b) 915193326Sed{ 916309124Sdim return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a); 917193326Sed} 918193326Sed 919341825Sdim/// Compares two 32-bit float values in the low-order bits of both 920309124Sdim/// operands to determine if the value in the first operand is ordered with 921309124Sdim/// respect to the corresponding value in the second operand and returns the 922309124Sdim/// result of the comparison in the low-order bits of a vector of 923309124Sdim/// [4 x float]. 924309124Sdim/// 925309124Sdim/// \headerfile <x86intrin.h> 926309124Sdim/// 927314564Sdim/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c> 928314564Sdim/// instructions. 929309124Sdim/// 930309124Sdim/// \param __a 931309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 932309124Sdim/// 32 bits of this operand are used in the comparison. 933309124Sdim/// \param __b 934309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 935309124Sdim/// 32 bits of this operand are used in the comparison. 936309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results 937309124Sdim/// in the low-order bits. 938288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 939249423Sdim_mm_cmpord_ss(__m128 __a, __m128 __b) 940193326Sed{ 941309124Sdim return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b); 942193326Sed} 943193326Sed 944341825Sdim/// Compares each of the corresponding 32-bit float values of the 945309124Sdim/// 128-bit vectors of [4 x float] to determine if the values in the first 946309124Sdim/// operand are ordered with respect to those in the second operand. 947309124Sdim/// 948309124Sdim/// \headerfile <x86intrin.h> 949309124Sdim/// 950314564Sdim/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c> 951314564Sdim/// instructions. 
952309124Sdim/// 953309124Sdim/// \param __a 954309124Sdim/// A 128-bit vector of [4 x float]. 955309124Sdim/// \param __b 956309124Sdim/// A 128-bit vector of [4 x float]. 957309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results. 958288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 959249423Sdim_mm_cmpord_ps(__m128 __a, __m128 __b) 960193326Sed{ 961309124Sdim return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b); 962193326Sed} 963193326Sed 964341825Sdim/// Compares two 32-bit float values in the low-order bits of both 965309124Sdim/// operands to determine if the value in the first operand is unordered 966309124Sdim/// with respect to the corresponding value in the second operand and 967309124Sdim/// returns the result of the comparison in the low-order bits of a vector 968309124Sdim/// of [4 x float]. 969309124Sdim/// 970309124Sdim/// \headerfile <x86intrin.h> 971309124Sdim/// 972314564Sdim/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c> 973314564Sdim/// instructions. 974309124Sdim/// 975309124Sdim/// \param __a 976309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 977309124Sdim/// 32 bits of this operand are used in the comparison. 978309124Sdim/// \param __b 979309124Sdim/// A 128-bit vector of [4 x float] containing one of the operands. The lower 980309124Sdim/// 32 bits of this operand are used in the comparison. 981309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results 982309124Sdim/// in the low-order bits. 
983288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 984249423Sdim_mm_cmpunord_ss(__m128 __a, __m128 __b) 985193326Sed{ 986309124Sdim return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b); 987193326Sed} 988193326Sed 989341825Sdim/// Compares each of the corresponding 32-bit float values of the 990309124Sdim/// 128-bit vectors of [4 x float] to determine if the values in the first 991309124Sdim/// operand are unordered with respect to those in the second operand. 992309124Sdim/// 993309124Sdim/// \headerfile <x86intrin.h> 994309124Sdim/// 995314564Sdim/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c> 996314564Sdim/// instructions. 997309124Sdim/// 998309124Sdim/// \param __a 999309124Sdim/// A 128-bit vector of [4 x float]. 1000309124Sdim/// \param __b 1001309124Sdim/// A 128-bit vector of [4 x float]. 1002309124Sdim/// \returns A 128-bit vector of [4 x float] containing the comparison results. 1003288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1004249423Sdim_mm_cmpunord_ps(__m128 __a, __m128 __b) 1005193326Sed{ 1006309124Sdim return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b); 1007193326Sed} 1008193326Sed 1009341825Sdim/// Compares two 32-bit float values in the low-order bits of both 1010309124Sdim/// operands for equality and returns the result of the comparison. 1011309124Sdim/// 1012341825Sdim/// If either of the two lower 32-bit values is NaN, 0 is returned. 1013341825Sdim/// 1014309124Sdim/// \headerfile <x86intrin.h> 1015309124Sdim/// 1016314564Sdim/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> 1017314564Sdim/// instructions. 1018309124Sdim/// 1019309124Sdim/// \param __a 1020309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1021309124Sdim/// used in the comparison. 1022309124Sdim/// \param __b 1023309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1024309124Sdim/// used in the comparison. 
1025341825Sdim/// \returns An integer containing the comparison results. If either of the 1026341825Sdim/// two lower 32-bit values is NaN, 0 is returned. 1027288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS 1028249423Sdim_mm_comieq_ss(__m128 __a, __m128 __b) 1029193326Sed{ 1030309124Sdim return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b); 1031193326Sed} 1032193326Sed 1033341825Sdim/// Compares two 32-bit float values in the low-order bits of both 1034309124Sdim/// operands to determine if the first operand is less than the second 1035309124Sdim/// operand and returns the result of the comparison. 1036309124Sdim/// 1037341825Sdim/// If either of the two lower 32-bit values is NaN, 0 is returned. 1038341825Sdim/// 1039309124Sdim/// \headerfile <x86intrin.h> 1040309124Sdim/// 1041314564Sdim/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> 1042314564Sdim/// instructions. 1043309124Sdim/// 1044309124Sdim/// \param __a 1045309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1046309124Sdim/// used in the comparison. 1047309124Sdim/// \param __b 1048309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1049309124Sdim/// used in the comparison. 1050341825Sdim/// \returns An integer containing the comparison results. If either of the two 1051341825Sdim/// lower 32-bit values is NaN, 0 is returned. 1052288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS 1053249423Sdim_mm_comilt_ss(__m128 __a, __m128 __b) 1054193326Sed{ 1055309124Sdim return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b); 1056193326Sed} 1057193326Sed 1058341825Sdim/// Compares two 32-bit float values in the low-order bits of both 1059309124Sdim/// operands to determine if the first operand is less than or equal to the 1060309124Sdim/// second operand and returns the result of the comparison. 1061309124Sdim/// 1062341825Sdim/// If either of the two lower 32-bit values is NaN, 0 is returned. 
1063341825Sdim/// 1064309124Sdim/// \headerfile <x86intrin.h> 1065309124Sdim/// 1066314564Sdim/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. 1067309124Sdim/// 1068309124Sdim/// \param __a 1069309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1070309124Sdim/// used in the comparison. 1071309124Sdim/// \param __b 1072309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1073309124Sdim/// used in the comparison. 1074341825Sdim/// \returns An integer containing the comparison results. If either of the two 1075341825Sdim/// lower 32-bit values is NaN, 0 is returned. 1076288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS 1077249423Sdim_mm_comile_ss(__m128 __a, __m128 __b) 1078193326Sed{ 1079309124Sdim return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b); 1080193326Sed} 1081193326Sed 1082341825Sdim/// Compares two 32-bit float values in the low-order bits of both 1083309124Sdim/// operands to determine if the first operand is greater than the second 1084309124Sdim/// operand and returns the result of the comparison. 1085309124Sdim/// 1086341825Sdim/// If either of the two lower 32-bit values is NaN, 0 is returned. 1087341825Sdim/// 1088309124Sdim/// \headerfile <x86intrin.h> 1089309124Sdim/// 1090314564Sdim/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. 1091309124Sdim/// 1092309124Sdim/// \param __a 1093309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1094309124Sdim/// used in the comparison. 1095309124Sdim/// \param __b 1096309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1097309124Sdim/// used in the comparison. 1098341825Sdim/// \returns An integer containing the comparison results. If either of the 1099341825Sdim/// two lower 32-bit values is NaN, 0 is returned. 
1100288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS 1101249423Sdim_mm_comigt_ss(__m128 __a, __m128 __b) 1102193326Sed{ 1103309124Sdim return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b); 1104193326Sed} 1105193326Sed 1106341825Sdim/// Compares two 32-bit float values in the low-order bits of both 1107309124Sdim/// operands to determine if the first operand is greater than or equal to 1108309124Sdim/// the second operand and returns the result of the comparison. 1109309124Sdim/// 1110341825Sdim/// If either of the two lower 32-bit values is NaN, 0 is returned. 1111341825Sdim/// 1112309124Sdim/// \headerfile <x86intrin.h> 1113309124Sdim/// 1114314564Sdim/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. 1115309124Sdim/// 1116309124Sdim/// \param __a 1117309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1118309124Sdim/// used in the comparison. 1119309124Sdim/// \param __b 1120309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1121309124Sdim/// used in the comparison. 1122341825Sdim/// \returns An integer containing the comparison results. If either of the two 1123341825Sdim/// lower 32-bit values is NaN, 0 is returned. 1124288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS 1125249423Sdim_mm_comige_ss(__m128 __a, __m128 __b) 1126193326Sed{ 1127309124Sdim return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b); 1128193326Sed} 1129193326Sed 1130341825Sdim/// Compares two 32-bit float values in the low-order bits of both 1131309124Sdim/// operands to determine if the first operand is not equal to the second 1132309124Sdim/// operand and returns the result of the comparison. 1133309124Sdim/// 1134341825Sdim/// If either of the two lower 32-bit values is NaN, 1 is returned. 1135341825Sdim/// 1136309124Sdim/// \headerfile <x86intrin.h> 1137309124Sdim/// 1138314564Sdim/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. 
1139309124Sdim/// 1140309124Sdim/// \param __a 1141309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1142309124Sdim/// used in the comparison. 1143309124Sdim/// \param __b 1144309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1145309124Sdim/// used in the comparison. 1146341825Sdim/// \returns An integer containing the comparison results. If either of the 1147341825Sdim/// two lower 32-bit values is NaN, 1 is returned. 1148288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS 1149249423Sdim_mm_comineq_ss(__m128 __a, __m128 __b) 1150193326Sed{ 1151309124Sdim return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b); 1152193326Sed} 1153193326Sed 1154341825Sdim/// Performs an unordered comparison of two 32-bit float values using 1155309124Sdim/// the low-order bits of both operands to determine equality and returns 1156309124Sdim/// the result of the comparison. 1157309124Sdim/// 1158341825Sdim/// If either of the two lower 32-bit values is NaN, 0 is returned. 1159341825Sdim/// 1160309124Sdim/// \headerfile <x86intrin.h> 1161309124Sdim/// 1162314564Sdim/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1163309124Sdim/// 1164309124Sdim/// \param __a 1165309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1166309124Sdim/// used in the comparison. 1167309124Sdim/// \param __b 1168309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1169309124Sdim/// used in the comparison. 1170341825Sdim/// \returns An integer containing the comparison results. If either of the two 1171341825Sdim/// lower 32-bit values is NaN, 0 is returned. 
1172288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS 1173249423Sdim_mm_ucomieq_ss(__m128 __a, __m128 __b) 1174193326Sed{ 1175309124Sdim return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b); 1176193326Sed} 1177193326Sed 1178341825Sdim/// Performs an unordered comparison of two 32-bit float values using 1179309124Sdim/// the low-order bits of both operands to determine if the first operand is 1180309124Sdim/// less than the second operand and returns the result of the comparison. 1181309124Sdim/// 1182341825Sdim/// If either of the two lower 32-bit values is NaN, 0 is returned. 1183341825Sdim/// 1184309124Sdim/// \headerfile <x86intrin.h> 1185309124Sdim/// 1186314564Sdim/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1187309124Sdim/// 1188309124Sdim/// \param __a 1189309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1190309124Sdim/// used in the comparison. 1191309124Sdim/// \param __b 1192309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1193309124Sdim/// used in the comparison. 1194341825Sdim/// \returns An integer containing the comparison results. If either of the two 1195341825Sdim/// lower 32-bit values is NaN, 0 is returned. 1196288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS 1197249423Sdim_mm_ucomilt_ss(__m128 __a, __m128 __b) 1198193326Sed{ 1199309124Sdim return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b); 1200193326Sed} 1201193326Sed 1202341825Sdim/// Performs an unordered comparison of two 32-bit float values using 1203314564Sdim/// the low-order bits of both operands to determine if the first operand is 1204314564Sdim/// less than or equal to the second operand and returns the result of the 1205314564Sdim/// comparison. 1206309124Sdim/// 1207341825Sdim/// If either of the two lower 32-bit values is NaN, 0 is returned. 
1208341825Sdim/// 1209309124Sdim/// \headerfile <x86intrin.h> 1210309124Sdim/// 1211314564Sdim/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1212309124Sdim/// 1213309124Sdim/// \param __a 1214309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1215309124Sdim/// used in the comparison. 1216309124Sdim/// \param __b 1217309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1218309124Sdim/// used in the comparison. 1219341825Sdim/// \returns An integer containing the comparison results. If either of the two 1220341825Sdim/// lower 32-bit values is NaN, 0 is returned. 1221288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS 1222249423Sdim_mm_ucomile_ss(__m128 __a, __m128 __b) 1223193326Sed{ 1224309124Sdim return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b); 1225193326Sed} 1226193326Sed 1227341825Sdim/// Performs an unordered comparison of two 32-bit float values using 1228314564Sdim/// the low-order bits of both operands to determine if the first operand is 1229314564Sdim/// greater than the second operand and returns the result of the 1230309124Sdim/// comparison. 1231309124Sdim/// 1232341825Sdim/// If either of the two lower 32-bit values is NaN, 0 is returned. 1233341825Sdim/// 1234309124Sdim/// \headerfile <x86intrin.h> 1235309124Sdim/// 1236314564Sdim/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1237309124Sdim/// 1238309124Sdim/// \param __a 1239309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1240309124Sdim/// used in the comparison. 1241309124Sdim/// \param __b 1242309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1243309124Sdim/// used in the comparison. 1244341825Sdim/// \returns An integer containing the comparison results. If either of the two 1245341825Sdim/// lower 32-bit values is NaN, 0 is returned. 
1246288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS 1247249423Sdim_mm_ucomigt_ss(__m128 __a, __m128 __b) 1248193326Sed{ 1249309124Sdim return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b); 1250193326Sed} 1251193326Sed 1252341825Sdim/// Performs an unordered comparison of two 32-bit float values using 1253309124Sdim/// the low-order bits of both operands to determine if the first operand is 1254309124Sdim/// greater than or equal to the second operand and returns the result of 1255309124Sdim/// the comparison. 1256309124Sdim/// 1257341825Sdim/// If either of the two lower 32-bit values is NaN, 0 is returned. 1258341825Sdim/// 1259309124Sdim/// \headerfile <x86intrin.h> 1260309124Sdim/// 1261314564Sdim/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1262309124Sdim/// 1263309124Sdim/// \param __a 1264309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1265309124Sdim/// used in the comparison. 1266309124Sdim/// \param __b 1267309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1268309124Sdim/// used in the comparison. 1269341825Sdim/// \returns An integer containing the comparison results. If either of the two 1270341825Sdim/// lower 32-bit values is NaN, 0 is returned. 1271288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS 1272249423Sdim_mm_ucomige_ss(__m128 __a, __m128 __b) 1273193326Sed{ 1274309124Sdim return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b); 1275193326Sed} 1276193326Sed 1277341825Sdim/// Performs an unordered comparison of two 32-bit float values using 1278309124Sdim/// the low-order bits of both operands to determine inequality and returns 1279309124Sdim/// the result of the comparison. 1280309124Sdim/// 1281341825Sdim/// If either of the two lower 32-bit values is NaN, 1 is returned. 
1282341825Sdim/// 1283309124Sdim/// \headerfile <x86intrin.h> 1284309124Sdim/// 1285314564Sdim/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1286309124Sdim/// 1287309124Sdim/// \param __a 1288309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1289309124Sdim/// used in the comparison. 1290309124Sdim/// \param __b 1291309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1292309124Sdim/// used in the comparison. 1293341825Sdim/// \returns An integer containing the comparison results. If either of the two 1294341825Sdim/// lower 32-bit values is NaN, 1 is returned. 1295288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS 1296249423Sdim_mm_ucomineq_ss(__m128 __a, __m128 __b) 1297193326Sed{ 1298309124Sdim return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b); 1299193326Sed} 1300193326Sed 1301341825Sdim/// Converts a float value contained in the lower 32 bits of a vector of 1302309124Sdim/// [4 x float] into a 32-bit integer. 1303309124Sdim/// 1304309124Sdim/// \headerfile <x86intrin.h> 1305309124Sdim/// 1306314564Sdim/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c> 1307314564Sdim/// instructions. 1308309124Sdim/// 1309309124Sdim/// \param __a 1310309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1311309124Sdim/// used in the conversion. 1312309124Sdim/// \returns A 32-bit integer containing the converted value. 1313288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS 1314249423Sdim_mm_cvtss_si32(__m128 __a) 1315193326Sed{ 1316309124Sdim return __builtin_ia32_cvtss2si((__v4sf)__a); 1317193326Sed} 1318193326Sed 1319341825Sdim/// Converts a float value contained in the lower 32 bits of a vector of 1320309124Sdim/// [4 x float] into a 32-bit integer. 
1321309124Sdim/// 1322309124Sdim/// \headerfile <x86intrin.h> 1323309124Sdim/// 1324314564Sdim/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c> 1325314564Sdim/// instructions. 1326309124Sdim/// 1327309124Sdim/// \param __a 1328309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1329309124Sdim/// used in the conversion. 1330309124Sdim/// \returns A 32-bit integer containing the converted value. 1331288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS 1332249423Sdim_mm_cvt_ss2si(__m128 __a) 1333204643Srdivacky{ 1334249423Sdim return _mm_cvtss_si32(__a); 1335204643Srdivacky} 1336204643Srdivacky 1337193576Sed#ifdef __x86_64__ 1338193576Sed 1339341825Sdim/// Converts a float value contained in the lower 32 bits of a vector of 1340309124Sdim/// [4 x float] into a 64-bit integer. 1341309124Sdim/// 1342309124Sdim/// \headerfile <x86intrin.h> 1343309124Sdim/// 1344314564Sdim/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c> 1345314564Sdim/// instructions. 1346309124Sdim/// 1347309124Sdim/// \param __a 1348309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1349309124Sdim/// used in the conversion. 1350309124Sdim/// \returns A 64-bit integer containing the converted value. 1351288943Sdimstatic __inline__ long long __DEFAULT_FN_ATTRS 1352249423Sdim_mm_cvtss_si64(__m128 __a) 1353193326Sed{ 1354309124Sdim return __builtin_ia32_cvtss2si64((__v4sf)__a); 1355193326Sed} 1356193326Sed 1357193576Sed#endif 1358193576Sed 1359341825Sdim/// Converts two low-order float values in a 128-bit vector of 1360309124Sdim/// [4 x float] into a 64-bit vector of [2 x i32]. 1361309124Sdim/// 1362309124Sdim/// \headerfile <x86intrin.h> 1363309124Sdim/// 1364314564Sdim/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction. 1365309124Sdim/// 1366309124Sdim/// \param __a 1367309124Sdim/// A 128-bit vector of [4 x float]. 
1368309124Sdim/// \returns A 64-bit integer vector containing the converted values. 1369341825Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1370249423Sdim_mm_cvtps_pi32(__m128 __a) 1371193326Sed{ 1372309124Sdim return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a); 1373193326Sed} 1374193326Sed 1375341825Sdim/// Converts two low-order float values in a 128-bit vector of 1376309124Sdim/// [4 x float] into a 64-bit vector of [2 x i32]. 1377309124Sdim/// 1378309124Sdim/// \headerfile <x86intrin.h> 1379309124Sdim/// 1380314564Sdim/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction. 1381309124Sdim/// 1382309124Sdim/// \param __a 1383309124Sdim/// A 128-bit vector of [4 x float]. 1384309124Sdim/// \returns A 64-bit integer vector containing the converted values. 1385341825Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1386249423Sdim_mm_cvt_ps2pi(__m128 __a) 1387212904Sdim{ 1388249423Sdim return _mm_cvtps_pi32(__a); 1389212904Sdim} 1390212904Sdim 1391341825Sdim/// Converts a float value contained in the lower 32 bits of a vector of 1392309124Sdim/// [4 x float] into a 32-bit integer, truncating the result when it is 1393309124Sdim/// inexact. 1394309124Sdim/// 1395309124Sdim/// \headerfile <x86intrin.h> 1396309124Sdim/// 1397314564Sdim/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c> 1398314564Sdim/// instructions. 1399309124Sdim/// 1400309124Sdim/// \param __a 1401309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1402309124Sdim/// used in the conversion. 1403309124Sdim/// \returns A 32-bit integer containing the converted value. 
1404288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS 1405249423Sdim_mm_cvttss_si32(__m128 __a) 1406193326Sed{ 1407309124Sdim return __builtin_ia32_cvttss2si((__v4sf)__a); 1408193326Sed} 1409193326Sed 1410341825Sdim/// Converts a float value contained in the lower 32 bits of a vector of 1411309124Sdim/// [4 x float] into a 32-bit integer, truncating the result when it is 1412309124Sdim/// inexact. 1413309124Sdim/// 1414309124Sdim/// \headerfile <x86intrin.h> 1415309124Sdim/// 1416314564Sdim/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c> 1417314564Sdim/// instructions. 1418309124Sdim/// 1419309124Sdim/// \param __a 1420309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1421309124Sdim/// used in the conversion. 1422309124Sdim/// \returns A 32-bit integer containing the converted value. 1423288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS 1424249423Sdim_mm_cvtt_ss2si(__m128 __a) 1425204643Srdivacky{ 1426249423Sdim return _mm_cvttss_si32(__a); 1427204643Srdivacky} 1428204643Srdivacky 1429314564Sdim#ifdef __x86_64__ 1430341825Sdim/// Converts a float value contained in the lower 32 bits of a vector of 1431309124Sdim/// [4 x float] into a 64-bit integer, truncating the result when it is 1432309124Sdim/// inexact. 1433309124Sdim/// 1434309124Sdim/// \headerfile <x86intrin.h> 1435309124Sdim/// 1436314564Sdim/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c> 1437314564Sdim/// instructions. 1438309124Sdim/// 1439309124Sdim/// \param __a 1440309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1441309124Sdim/// used in the conversion. 1442309124Sdim/// \returns A 64-bit integer containing the converted value. 
1443288943Sdimstatic __inline__ long long __DEFAULT_FN_ATTRS 1444249423Sdim_mm_cvttss_si64(__m128 __a) 1445193326Sed{ 1446309124Sdim return __builtin_ia32_cvttss2si64((__v4sf)__a); 1447193326Sed} 1448314564Sdim#endif 1449193326Sed 1450341825Sdim/// Converts two low-order float values in a 128-bit vector of 1451309124Sdim/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result 1452309124Sdim/// when it is inexact. 1453309124Sdim/// 1454309124Sdim/// \headerfile <x86intrin.h> 1455309124Sdim/// 1456314564Sdim/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c> 1457314564Sdim/// instructions. 1458309124Sdim/// 1459309124Sdim/// \param __a 1460309124Sdim/// A 128-bit vector of [4 x float]. 1461309124Sdim/// \returns A 64-bit integer vector containing the converted values. 1462341825Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1463249423Sdim_mm_cvttps_pi32(__m128 __a) 1464193326Sed{ 1465309124Sdim return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a); 1466193326Sed} 1467193326Sed 1468341825Sdim/// Converts two low-order float values in a 128-bit vector of [4 x 1469309124Sdim/// float] into a 64-bit vector of [2 x i32], truncating the result when it 1470309124Sdim/// is inexact. 1471309124Sdim/// 1472309124Sdim/// \headerfile <x86intrin.h> 1473309124Sdim/// 1474314564Sdim/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction. 1475309124Sdim/// 1476309124Sdim/// \param __a 1477309124Sdim/// A 128-bit vector of [4 x float]. 1478309124Sdim/// \returns A 64-bit integer vector containing the converted values. 1479341825Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1480249423Sdim_mm_cvtt_ps2pi(__m128 __a) 1481212904Sdim{ 1482249423Sdim return _mm_cvttps_pi32(__a); 1483212904Sdim} 1484212904Sdim 1485341825Sdim/// Converts a 32-bit signed integer value into a floating point value 1486309124Sdim/// and writes it to the lower 32 bits of the destination. 
The remaining 1487309124Sdim/// higher order elements of the destination vector are copied from the 1488309124Sdim/// corresponding elements in the first operand. 1489309124Sdim/// 1490309124Sdim/// \headerfile <x86intrin.h> 1491309124Sdim/// 1492314564Sdim/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction. 1493309124Sdim/// 1494309124Sdim/// \param __a 1495309124Sdim/// A 128-bit vector of [4 x float]. 1496309124Sdim/// \param __b 1497309124Sdim/// A 32-bit signed integer operand containing the value to be converted. 1498309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 1499309124Sdim/// converted value of the second operand. The upper 96 bits are copied from 1500309124Sdim/// the upper 96 bits of the first operand. 1501288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1502249423Sdim_mm_cvtsi32_ss(__m128 __a, int __b) 1503193326Sed{ 1504249423Sdim __a[0] = __b; 1505249423Sdim return __a; 1506193326Sed} 1507193326Sed 1508341825Sdim/// Converts a 32-bit signed integer value into a floating point value 1509309124Sdim/// and writes it to the lower 32 bits of the destination. The remaining 1510309124Sdim/// higher order elements of the destination are copied from the 1511309124Sdim/// corresponding elements in the first operand. 1512309124Sdim/// 1513309124Sdim/// \headerfile <x86intrin.h> 1514309124Sdim/// 1515314564Sdim/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction. 1516309124Sdim/// 1517309124Sdim/// \param __a 1518309124Sdim/// A 128-bit vector of [4 x float]. 1519309124Sdim/// \param __b 1520309124Sdim/// A 32-bit signed integer operand containing the value to be converted. 1521309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 1522309124Sdim/// converted value of the second operand. The upper 96 bits are copied from 1523309124Sdim/// the upper 96 bits of the first operand. 
1524288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1525249423Sdim_mm_cvt_si2ss(__m128 __a, int __b) 1526212904Sdim{ 1527249423Sdim return _mm_cvtsi32_ss(__a, __b); 1528212904Sdim} 1529212904Sdim 1530193326Sed#ifdef __x86_64__ 1531193326Sed 1532341825Sdim/// Converts a 64-bit signed integer value into a floating point value 1533309124Sdim/// and writes it to the lower 32 bits of the destination. The remaining 1534309124Sdim/// higher order elements of the destination are copied from the 1535309124Sdim/// corresponding elements in the first operand. 1536309124Sdim/// 1537309124Sdim/// \headerfile <x86intrin.h> 1538309124Sdim/// 1539314564Sdim/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction. 1540309124Sdim/// 1541309124Sdim/// \param __a 1542309124Sdim/// A 128-bit vector of [4 x float]. 1543309124Sdim/// \param __b 1544309124Sdim/// A 64-bit signed integer operand containing the value to be converted. 1545309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 1546309124Sdim/// converted value of the second operand. The upper 96 bits are copied from 1547309124Sdim/// the upper 96 bits of the first operand. 1548288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1549249423Sdim_mm_cvtsi64_ss(__m128 __a, long long __b) 1550193326Sed{ 1551249423Sdim __a[0] = __b; 1552249423Sdim return __a; 1553193326Sed} 1554193326Sed 1555193326Sed#endif 1556193326Sed 1557341825Sdim/// Converts two elements of a 64-bit vector of [2 x i32] into two 1558309124Sdim/// floating point values and writes them to the lower 64-bits of the 1559309124Sdim/// destination. The remaining higher order elements of the destination are 1560309124Sdim/// copied from the corresponding elements in the first operand. 1561309124Sdim/// 1562309124Sdim/// \headerfile <x86intrin.h> 1563309124Sdim/// 1564314564Sdim/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction. 
1565309124Sdim/// 1566309124Sdim/// \param __a 1567309124Sdim/// A 128-bit vector of [4 x float]. 1568309124Sdim/// \param __b 1569309124Sdim/// A 64-bit vector of [2 x i32]. The elements in this vector are converted 1570309124Sdim/// and written to the corresponding low-order elements in the destination. 1571309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1572309124Sdim/// converted value of the second operand. The upper 64 bits are copied from 1573309124Sdim/// the upper 64 bits of the first operand. 1574341825Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 1575249423Sdim_mm_cvtpi32_ps(__m128 __a, __m64 __b) 1576193326Sed{ 1577309124Sdim return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b); 1578193326Sed} 1579193326Sed 1580341825Sdim/// Converts two elements of a 64-bit vector of [2 x i32] into two 1581309124Sdim/// floating point values and writes them to the lower 64-bits of the 1582309124Sdim/// destination. The remaining higher order elements of the destination are 1583309124Sdim/// copied from the corresponding elements in the first operand. 1584309124Sdim/// 1585309124Sdim/// \headerfile <x86intrin.h> 1586309124Sdim/// 1587314564Sdim/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction. 1588309124Sdim/// 1589309124Sdim/// \param __a 1590309124Sdim/// A 128-bit vector of [4 x float]. 1591309124Sdim/// \param __b 1592309124Sdim/// A 64-bit vector of [2 x i32]. The elements in this vector are converted 1593309124Sdim/// and written to the corresponding low-order elements in the destination. 1594309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1595309124Sdim/// converted value from the second operand. The upper 64 bits are copied 1596309124Sdim/// from the upper 64 bits of the first operand. 
1597341825Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 1598249423Sdim_mm_cvt_pi2ps(__m128 __a, __m64 __b) 1599212904Sdim{ 1600249423Sdim return _mm_cvtpi32_ps(__a, __b); 1601212904Sdim} 1602212904Sdim 1603341825Sdim/// Extracts a float value contained in the lower 32 bits of a vector of 1604309124Sdim/// [4 x float]. 1605309124Sdim/// 1606309124Sdim/// \headerfile <x86intrin.h> 1607309124Sdim/// 1608341825Sdim/// This intrinsic has no corresponding instruction. 1609309124Sdim/// 1610309124Sdim/// \param __a 1611309124Sdim/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1612309124Sdim/// used in the extraction. 1613309124Sdim/// \returns A 32-bit float containing the extracted value. 1614288943Sdimstatic __inline__ float __DEFAULT_FN_ATTRS 1615249423Sdim_mm_cvtss_f32(__m128 __a) 1616193326Sed{ 1617249423Sdim return __a[0]; 1618193326Sed} 1619193326Sed 1620341825Sdim/// Loads two packed float values from the address \a __p into the 1621309124Sdim/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits 1622309124Sdim/// are copied from the low-order bits of the first operand. 1623309124Sdim/// 1624309124Sdim/// \headerfile <x86intrin.h> 1625309124Sdim/// 1626314564Sdim/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 1627309124Sdim/// 1628309124Sdim/// \param __a 1629309124Sdim/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0] 1630309124Sdim/// of the destination. 1631309124Sdim/// \param __p 1632309124Sdim/// A pointer to two packed float values. Bits [63:0] are written to bits 1633309124Sdim/// [127:64] of the destination. 1634309124Sdim/// \returns A 128-bit vector of [4 x float] containing the moved values. 
1635288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1636249423Sdim_mm_loadh_pi(__m128 __a, const __m64 *__p) 1637193326Sed{ 1638226633Sdim typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8))); 1639226633Sdim struct __mm_loadh_pi_struct { 1640249423Sdim __mm_loadh_pi_v2f32 __u; 1641226633Sdim } __attribute__((__packed__, __may_alias__)); 1642249423Sdim __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u; 1643249423Sdim __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 1644249423Sdim return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5); 1645193326Sed} 1646193326Sed 1647341825Sdim/// Loads two packed float values from the address \a __p into the 1648314564Sdim/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits 1649314564Sdim/// are copied from the high-order bits of the first operand. 1650309124Sdim/// 1651309124Sdim/// \headerfile <x86intrin.h> 1652309124Sdim/// 1653314564Sdim/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 1654309124Sdim/// 1655309124Sdim/// \param __a 1656309124Sdim/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits 1657309124Sdim/// [127:64] of the destination. 1658309124Sdim/// \param __p 1659309124Sdim/// A pointer to two packed float values. Bits [63:0] are written to bits 1660309124Sdim/// [63:0] of the destination. 1661309124Sdim/// \returns A 128-bit vector of [4 x float] containing the moved values. 
1662288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1663249423Sdim_mm_loadl_pi(__m128 __a, const __m64 *__p) 1664193326Sed{ 1665226633Sdim typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8))); 1666226633Sdim struct __mm_loadl_pi_struct { 1667249423Sdim __mm_loadl_pi_v2f32 __u; 1668226633Sdim } __attribute__((__packed__, __may_alias__)); 1669249423Sdim __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u; 1670249423Sdim __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 1671249423Sdim return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3); 1672193326Sed} 1673193326Sed 1674341825Sdim/// Constructs a 128-bit floating-point vector of [4 x float]. The lower 1675309124Sdim/// 32 bits of the vector are initialized with the single-precision 1676309124Sdim/// floating-point value loaded from a specified memory location. The upper 1677309124Sdim/// 96 bits are set to zero. 1678309124Sdim/// 1679309124Sdim/// \headerfile <x86intrin.h> 1680309124Sdim/// 1681314564Sdim/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. 1682309124Sdim/// 1683309124Sdim/// \param __p 1684309124Sdim/// A pointer to a 32-bit memory location containing a single-precision 1685309124Sdim/// floating-point value. 1686309124Sdim/// \returns An initialized 128-bit floating-point vector of [4 x float]. The 1687309124Sdim/// lower 32 bits contain the value loaded from the memory location. The 1688309124Sdim/// upper 96 bits are set to zero. 
1689288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1690249423Sdim_mm_load_ss(const float *__p) 1691193326Sed{ 1692226633Sdim struct __mm_load_ss_struct { 1693249423Sdim float __u; 1694226633Sdim } __attribute__((__packed__, __may_alias__)); 1695249423Sdim float __u = ((struct __mm_load_ss_struct*)__p)->__u; 1696341825Sdim return __extension__ (__m128){ __u, 0, 0, 0 }; 1697193326Sed} 1698193326Sed 1699341825Sdim/// Loads a 32-bit float value and duplicates it to all four vector 1700309124Sdim/// elements of a 128-bit vector of [4 x float]. 1701309124Sdim/// 1702309124Sdim/// \headerfile <x86intrin.h> 1703309124Sdim/// 1704341825Sdim/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c> 1705309124Sdim/// instruction. 1706309124Sdim/// 1707309124Sdim/// \param __p 1708309124Sdim/// A pointer to a float value to be loaded and duplicated. 1709314564Sdim/// \returns A 128-bit vector of [4 x float] containing the loaded and 1710314564Sdim/// duplicated values. 1711288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1712249423Sdim_mm_load1_ps(const float *__p) 1713193326Sed{ 1714226633Sdim struct __mm_load1_ps_struct { 1715249423Sdim float __u; 1716226633Sdim } __attribute__((__packed__, __may_alias__)); 1717249423Sdim float __u = ((struct __mm_load1_ps_struct*)__p)->__u; 1718341825Sdim return __extension__ (__m128){ __u, __u, __u, __u }; 1719193326Sed} 1720193326Sed 1721193326Sed#define _mm_load_ps1(p) _mm_load1_ps(p) 1722193326Sed 1723341825Sdim/// Loads a 128-bit floating-point vector of [4 x float] from an aligned 1724309124Sdim/// memory location. 1725309124Sdim/// 1726309124Sdim/// \headerfile <x86intrin.h> 1727309124Sdim/// 1728314564Sdim/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. 1729309124Sdim/// 1730309124Sdim/// \param __p 1731309124Sdim/// A pointer to a 128-bit memory location. The address of the memory 1732309124Sdim/// location has to be 128-bit aligned. 
1733341825Sdim/// \returns A 128-bit vector of [4 x float] containing the loaded values. 1734288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1735249423Sdim_mm_load_ps(const float *__p) 1736193326Sed{ 1737249423Sdim return *(__m128*)__p; 1738193326Sed} 1739193326Sed 1740341825Sdim/// Loads a 128-bit floating-point vector of [4 x float] from an 1741309124Sdim/// unaligned memory location. 1742309124Sdim/// 1743309124Sdim/// \headerfile <x86intrin.h> 1744309124Sdim/// 1745314564Sdim/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. 1746309124Sdim/// 1747309124Sdim/// \param __p 1748309124Sdim/// A pointer to a 128-bit memory location. The address of the memory 1749309124Sdim/// location does not have to be aligned. 1750309124Sdim/// \returns A 128-bit vector of [4 x float] containing the loaded values. 1751288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1752249423Sdim_mm_loadu_ps(const float *__p) 1753193326Sed{ 1754223017Sdim struct __loadu_ps { 1755249423Sdim __m128 __v; 1756226633Sdim } __attribute__((__packed__, __may_alias__)); 1757249423Sdim return ((struct __loadu_ps*)__p)->__v; 1758193326Sed} 1759193326Sed 1760341825Sdim/// Loads four packed float values, in reverse order, from an aligned 1761309124Sdim/// memory location to 32-bit elements in a 128-bit vector of [4 x float]. 1762309124Sdim/// 1763309124Sdim/// \headerfile <x86intrin.h> 1764309124Sdim/// 1765314564Sdim/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c> 1766309124Sdim/// instruction. 1767309124Sdim/// 1768309124Sdim/// \param __p 1769309124Sdim/// A pointer to a 128-bit memory location. The address of the memory 1770309124Sdim/// location has to be 128-bit aligned. 1771309124Sdim/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded 1772309124Sdim/// in reverse order. 
1773288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1774249423Sdim_mm_loadr_ps(const float *__p) 1775193326Sed{ 1776249423Sdim __m128 __a = _mm_load_ps(__p); 1777309124Sdim return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); 1778193326Sed} 1779193326Sed 1780341825Sdim/// Create a 128-bit vector of [4 x float] with undefined values. 1781309124Sdim/// 1782309124Sdim/// \headerfile <x86intrin.h> 1783309124Sdim/// 1784309124Sdim/// This intrinsic has no corresponding instruction. 1785309124Sdim/// 1786309124Sdim/// \returns A 128-bit vector of [4 x float] containing undefined values. 1787288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1788309124Sdim_mm_undefined_ps(void) 1789296417Sdim{ 1790296417Sdim return (__m128)__builtin_ia32_undef128(); 1791296417Sdim} 1792296417Sdim 1793341825Sdim/// Constructs a 128-bit floating-point vector of [4 x float]. The lower 1794309124Sdim/// 32 bits of the vector are initialized with the specified single-precision 1795309124Sdim/// floating-point value. The upper 96 bits are set to zero. 1796309124Sdim/// 1797309124Sdim/// \headerfile <x86intrin.h> 1798309124Sdim/// 1799314564Sdim/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. 1800309124Sdim/// 1801309124Sdim/// \param __w 1802309124Sdim/// A single-precision floating-point value used to initialize the lower 32 1803309124Sdim/// bits of the result. 1804309124Sdim/// \returns An initialized 128-bit floating-point vector of [4 x float]. The 1805309124Sdim/// lower 32 bits contain the value provided in the source operand. The 1806309124Sdim/// upper 96 bits are set to zero. 
1807296417Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1808249423Sdim_mm_set_ss(float __w) 1809193326Sed{ 1810341825Sdim return __extension__ (__m128){ __w, 0, 0, 0 }; 1811193326Sed} 1812193326Sed 1813341825Sdim/// Constructs a 128-bit floating-point vector of [4 x float], with each 1814309124Sdim/// of the four single-precision floating-point vector elements set to the 1815309124Sdim/// specified single-precision floating-point value. 1816309124Sdim/// 1817309124Sdim/// \headerfile <x86intrin.h> 1818309124Sdim/// 1819314564Sdim/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction. 1820309124Sdim/// 1821309124Sdim/// \param __w 1822309124Sdim/// A single-precision floating-point value used to initialize each vector 1823309124Sdim/// element of the result. 1824309124Sdim/// \returns An initialized 128-bit floating-point vector of [4 x float]. 1825288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1826249423Sdim_mm_set1_ps(float __w) 1827193326Sed{ 1828341825Sdim return __extension__ (__m128){ __w, __w, __w, __w }; 1829193326Sed} 1830193326Sed 1831276479Sdim/* Microsoft specific. */ 1832341825Sdim/// Constructs a 128-bit floating-point vector of [4 x float], with each 1833309124Sdim/// of the four single-precision floating-point vector elements set to the 1834309124Sdim/// specified single-precision floating-point value. 1835309124Sdim/// 1836309124Sdim/// \headerfile <x86intrin.h> 1837309124Sdim/// 1838314564Sdim/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction. 1839309124Sdim/// 1840309124Sdim/// \param __w 1841309124Sdim/// A single-precision floating-point value used to initialize each vector 1842309124Sdim/// element of the result. 1843309124Sdim/// \returns An initialized 128-bit floating-point vector of [4 x float]. 
1844288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1845249423Sdim_mm_set_ps1(float __w) 1846193326Sed{ 1847249423Sdim return _mm_set1_ps(__w); 1848193326Sed} 1849193326Sed 1850341825Sdim/// Constructs a 128-bit floating-point vector of [4 x float] 1851309124Sdim/// initialized with the specified single-precision floating-point values. 1852309124Sdim/// 1853309124Sdim/// \headerfile <x86intrin.h> 1854309124Sdim/// 1855309124Sdim/// This intrinsic is a utility function and does not correspond to a specific 1856309124Sdim/// instruction. 1857309124Sdim/// 1858309124Sdim/// \param __z 1859309124Sdim/// A single-precision floating-point value used to initialize bits [127:96] 1860309124Sdim/// of the result. 1861309124Sdim/// \param __y 1862309124Sdim/// A single-precision floating-point value used to initialize bits [95:64] 1863309124Sdim/// of the result. 1864309124Sdim/// \param __x 1865309124Sdim/// A single-precision floating-point value used to initialize bits [63:32] 1866309124Sdim/// of the result. 1867309124Sdim/// \param __w 1868309124Sdim/// A single-precision floating-point value used to initialize bits [31:0] 1869309124Sdim/// of the result. 1870309124Sdim/// \returns An initialized 128-bit floating-point vector of [4 x float]. 1871288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1872249423Sdim_mm_set_ps(float __z, float __y, float __x, float __w) 1873193326Sed{ 1874341825Sdim return __extension__ (__m128){ __w, __x, __y, __z }; 1875193326Sed} 1876193326Sed 1877341825Sdim/// Constructs a 128-bit floating-point vector of [4 x float], 1878309124Sdim/// initialized in reverse order with the specified 32-bit single-precision 1879309124Sdim/// float-point values. 1880309124Sdim/// 1881309124Sdim/// \headerfile <x86intrin.h> 1882309124Sdim/// 1883309124Sdim/// This intrinsic is a utility function and does not correspond to a specific 1884309124Sdim/// instruction. 
1885309124Sdim/// 1886309124Sdim/// \param __z 1887309124Sdim/// A single-precision floating-point value used to initialize bits [31:0] 1888309124Sdim/// of the result. 1889309124Sdim/// \param __y 1890309124Sdim/// A single-precision floating-point value used to initialize bits [63:32] 1891309124Sdim/// of the result. 1892309124Sdim/// \param __x 1893309124Sdim/// A single-precision floating-point value used to initialize bits [95:64] 1894309124Sdim/// of the result. 1895309124Sdim/// \param __w 1896309124Sdim/// A single-precision floating-point value used to initialize bits [127:96] 1897309124Sdim/// of the result. 1898309124Sdim/// \returns An initialized 128-bit floating-point vector of [4 x float]. 1899288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1900249423Sdim_mm_setr_ps(float __z, float __y, float __x, float __w) 1901193326Sed{ 1902341825Sdim return __extension__ (__m128){ __z, __y, __x, __w }; 1903193326Sed} 1904193326Sed 1905341825Sdim/// Constructs a 128-bit floating-point vector of [4 x float] initialized 1906309124Sdim/// to zero. 1907309124Sdim/// 1908309124Sdim/// \headerfile <x86intrin.h> 1909309124Sdim/// 1910314564Sdim/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 1911309124Sdim/// 1912309124Sdim/// \returns An initialized 128-bit floating-point vector of [4 x float] with 1913309124Sdim/// all elements set to zero. 1914288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 1915193326Sed_mm_setzero_ps(void) 1916193326Sed{ 1917341825Sdim return __extension__ (__m128){ 0, 0, 0, 0 }; 1918193326Sed} 1919193326Sed 1920341825Sdim/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a 1921309124Sdim/// memory location. 1922309124Sdim/// 1923309124Sdim/// \headerfile <x86intrin.h> 1924309124Sdim/// 1925341825Sdim/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction. 1926309124Sdim/// 1927309124Sdim/// \param __p 1928309124Sdim/// A pointer to a 64-bit memory location. 
1929309124Sdim/// \param __a 1930309124Sdim/// A 128-bit vector of [4 x float] containing the values to be stored. 1931288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS 1932249423Sdim_mm_storeh_pi(__m64 *__p, __m128 __a) 1933193326Sed{ 1934309124Sdim __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a); 1935193326Sed} 1936193326Sed 1937341825Sdim/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a 1938309124Sdim/// memory location. 1939309124Sdim/// 1940309124Sdim/// \headerfile <x86intrin.h> 1941309124Sdim/// 1942314564Sdim/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction. 1943309124Sdim/// 1944309124Sdim/// \param __p 1945309124Sdim/// A pointer to a memory location that will receive the float values. 1946309124Sdim/// \param __a 1947309124Sdim/// A 128-bit vector of [4 x float] containing the values to be stored. 1948288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS 1949249423Sdim_mm_storel_pi(__m64 *__p, __m128 __a) 1950193326Sed{ 1951309124Sdim __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a); 1952193326Sed} 1953193326Sed 1954341825Sdim/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a 1955309124Sdim/// memory location. 1956309124Sdim/// 1957309124Sdim/// \headerfile <x86intrin.h> 1958309124Sdim/// 1959314564Sdim/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. 1960309124Sdim/// 1961309124Sdim/// \param __p 1962309124Sdim/// A pointer to a 32-bit memory location. 1963309124Sdim/// \param __a 1964309124Sdim/// A 128-bit vector of [4 x float] containing the value to be stored. 
1965288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS 1966249423Sdim_mm_store_ss(float *__p, __m128 __a) 1967193326Sed{ 1968226633Sdim struct __mm_store_ss_struct { 1969249423Sdim float __u; 1970226633Sdim } __attribute__((__packed__, __may_alias__)); 1971249423Sdim ((struct __mm_store_ss_struct*)__p)->__u = __a[0]; 1972193326Sed} 1973193326Sed 1974341825Sdim/// Stores a 128-bit vector of [4 x float] to an unaligned memory 1975314564Sdim/// location. 1976309124Sdim/// 1977309124Sdim/// \headerfile <x86intrin.h> 1978309124Sdim/// 1979314564Sdim/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. 1980309124Sdim/// 1981309124Sdim/// \param __p 1982309124Sdim/// A pointer to a 128-bit memory location. The address of the memory 1983309124Sdim/// location does not have to be aligned. 1984309124Sdim/// \param __a 1985309124Sdim/// A 128-bit vector of [4 x float] containing the values to be stored. 1986288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS 1987249423Sdim_mm_storeu_ps(float *__p, __m128 __a) 1988193326Sed{ 1989309124Sdim struct __storeu_ps { 1990309124Sdim __m128 __v; 1991309124Sdim } __attribute__((__packed__, __may_alias__)); 1992309124Sdim ((struct __storeu_ps*)__p)->__v = __a; 1993193326Sed} 1994193326Sed 1995341825Sdim/// Stores a 128-bit vector of [4 x float] into an aligned memory 1996314564Sdim/// location. 1997309124Sdim/// 1998309124Sdim/// \headerfile <x86intrin.h> 1999309124Sdim/// 2000314564Sdim/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. 2001309124Sdim/// 2002309124Sdim/// \param __p 2003314564Sdim/// A pointer to a 128-bit memory location. The address of the memory 2004314564Sdim/// location has to be 16-byte aligned. 2005309124Sdim/// \param __a 2006314564Sdim/// A 128-bit vector of [4 x float] containing the values to be stored. 
2007288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS 2008309124Sdim_mm_store_ps(float *__p, __m128 __a) 2009193326Sed{ 2010309124Sdim *(__m128*)__p = __a; 2011193326Sed} 2012193326Sed 2013341825Sdim/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into 2014309124Sdim/// four contiguous elements in an aligned memory location. 2015309124Sdim/// 2016309124Sdim/// \headerfile <x86intrin.h> 2017309124Sdim/// 2018314564Sdim/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c> 2019309124Sdim/// instruction. 2020309124Sdim/// 2021309124Sdim/// \param __p 2022309124Sdim/// A pointer to a 128-bit memory location. 2023309124Sdim/// \param __a 2024309124Sdim/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each 2025314564Sdim/// of the four contiguous elements pointed by \a __p. 2026288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS 2027309124Sdim_mm_store1_ps(float *__p, __m128 __a) 2028212904Sdim{ 2029309124Sdim __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0); 2030309124Sdim _mm_store_ps(__p, __a); 2031212904Sdim} 2032212904Sdim 2033341825Sdim/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into 2034314564Sdim/// four contiguous elements in an aligned memory location. 2035309124Sdim/// 2036309124Sdim/// \headerfile <x86intrin.h> 2037309124Sdim/// 2038314564Sdim/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c> 2039314564Sdim/// instruction. 2040309124Sdim/// 2041309124Sdim/// \param __p 2042314564Sdim/// A pointer to a 128-bit memory location. 2043309124Sdim/// \param __a 2044314564Sdim/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each 2045314564Sdim/// of the four contiguous elements pointed by \a __p. 
2046288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS 2047309124Sdim_mm_store_ps1(float *__p, __m128 __a) 2048193326Sed{ 2049341825Sdim _mm_store1_ps(__p, __a); 2050193326Sed} 2051193326Sed 2052341825Sdim/// Stores float values from a 128-bit vector of [4 x float] to an 2053309124Sdim/// aligned memory location in reverse order. 2054309124Sdim/// 2055309124Sdim/// \headerfile <x86intrin.h> 2056309124Sdim/// 2057314564Sdim/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c> 2058309124Sdim/// instruction. 2059309124Sdim/// 2060309124Sdim/// \param __p 2061309124Sdim/// A pointer to a 128-bit memory location. The address of the memory 2062309124Sdim/// location has to be 128-bit aligned. 2063309124Sdim/// \param __a 2064309124Sdim/// A 128-bit vector of [4 x float] containing the values to be stored. 2065288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS 2066249423Sdim_mm_storer_ps(float *__p, __m128 __a) 2067193326Sed{ 2068309124Sdim __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); 2069249423Sdim _mm_store_ps(__p, __a); 2070193326Sed} 2071193326Sed 2072327952Sdim#define _MM_HINT_ET0 7 2073327952Sdim#define _MM_HINT_ET1 6 2074327952Sdim#define _MM_HINT_T0 3 2075327952Sdim#define _MM_HINT_T1 2 2076327952Sdim#define _MM_HINT_T2 1 2077193326Sed#define _MM_HINT_NTA 0 2078193326Sed 2079276479Sdim#ifndef _MSC_VER 2080210299Sed/* FIXME: We have to #define this because "sel" must be a constant integer, and 2081193326Sed Sema doesn't do any form of constant propagation yet. */ 2082193326Sed 2083341825Sdim/// Loads one cache line of data from the specified address to a location 2084309124Sdim/// closer to the processor. 2085309124Sdim/// 2086309124Sdim/// \headerfile <x86intrin.h> 2087309124Sdim/// 2088309124Sdim/// \code 2089309124Sdim/// void _mm_prefetch(const void * a, const int sel); 2090309124Sdim/// \endcode 2091309124Sdim/// 2092314564Sdim/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction. 
2093309124Sdim/// 2094309124Sdim/// \param a 2095309124Sdim/// A pointer to a memory location containing a cache line of data. 2096309124Sdim/// \param sel 2097314564Sdim/// A predefined integer constant specifying the type of prefetch 2098314564Sdim/// operation: \n 2099314564Sdim/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The 2100314564Sdim/// PREFETCHNTA instruction will be generated. \n 2101309124Sdim/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will 2102314564Sdim/// be generated. \n 2103309124Sdim/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will 2104314564Sdim/// be generated. \n 2105309124Sdim/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will 2106321369Sdim/// be generated. 2107327952Sdim#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), \ 2108327952Sdim ((sel) >> 2) & 1, (sel) & 0x3)) 2109276479Sdim#endif 2110193326Sed 2111341825Sdim/// Stores a 64-bit integer in the specified aligned memory location. To 2112309124Sdim/// minimize caching, the data is flagged as non-temporal (unlikely to be 2113309124Sdim/// used again soon). 2114309124Sdim/// 2115309124Sdim/// \headerfile <x86intrin.h> 2116309124Sdim/// 2117314564Sdim/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction. 2118309124Sdim/// 2119309124Sdim/// \param __p 2120309124Sdim/// A pointer to an aligned memory location used to store the register value. 2121309124Sdim/// \param __a 2122309124Sdim/// A 64-bit integer containing the value to be stored. 2123341825Sdimstatic __inline__ void __DEFAULT_FN_ATTRS_MMX 2124249423Sdim_mm_stream_pi(__m64 *__p, __m64 __a) 2125193326Sed{ 2126249423Sdim __builtin_ia32_movntq(__p, __a); 2127193326Sed} 2128193326Sed 2129341825Sdim/// Moves packed float values from a 128-bit vector of [4 x float] to a 2130309124Sdim/// 128-bit aligned memory location. 
To minimize caching, the data is flagged 2131309124Sdim/// as non-temporal (unlikely to be used again soon). 2132309124Sdim/// 2133309124Sdim/// \headerfile <x86intrin.h> 2134309124Sdim/// 2135314564Sdim/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 2136309124Sdim/// 2137309124Sdim/// \param __p 2138309124Sdim/// A pointer to a 128-bit aligned memory location that will receive the 2139321369Sdim/// single-precision floating-point values. 2140309124Sdim/// \param __a 2141309124Sdim/// A 128-bit vector of [4 x float] containing the values to be moved. 2142288943Sdimstatic __inline__ void __DEFAULT_FN_ATTRS 2143249423Sdim_mm_stream_ps(float *__p, __m128 __a) 2144193326Sed{ 2145309124Sdim __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p); 2146193326Sed} 2147193326Sed 2148314564Sdim#if defined(__cplusplus) 2149314564Sdimextern "C" { 2150314564Sdim#endif 2151314564Sdim 2152341825Sdim/// Forces strong memory ordering (serialization) between store 2153309124Sdim/// instructions preceding this instruction and store instructions following 2154309124Sdim/// this instruction, ensuring the system completes all previous stores 2155309124Sdim/// before executing subsequent stores. 2156309124Sdim/// 2157309124Sdim/// \headerfile <x86intrin.h> 2158309124Sdim/// 2159314564Sdim/// This intrinsic corresponds to the <c> SFENCE </c> instruction. 2160309124Sdim/// 2161314564Sdimvoid _mm_sfence(void); 2162193326Sed 2163314564Sdim#if defined(__cplusplus) 2164314564Sdim} // extern "C" 2165314564Sdim#endif 2166314564Sdim 2167341825Sdim/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and 2168309124Sdim/// returns it, as specified by the immediate integer operand. 
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Each argument and the whole expansion are parenthesized so that compound
   argument expressions and surrounding operators bind as a call would. */
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__m64)(a), (int)(n)))

/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits are to be used
///    in the destination.
\n 2210314564Sdim/// 0: Bits [15:0] are copied to the destination. \n 2211314564Sdim/// 1: Bits [31:16] are copied to the destination. \n 2212314564Sdim/// 2: Bits [47:32] are copied to the destination. \n 2213314564Sdim/// 3: Bits [63:48] are copied to the destination. \n 2214309124Sdim/// The remaining bits in the destination are copied from the corresponding 2215314564Sdim/// bits in operand \a a. 2216309124Sdim/// \returns A 64-bit integer vector containing the copied packed data from the 2217309124Sdim/// operands. 2218341825Sdim#define _mm_insert_pi16(a, d, n) \ 2219341825Sdim (__m64)__builtin_ia32_vec_set_v4hi((__m64)a, (int)d, (int)n) 2220193326Sed 2221341825Sdim/// Compares each of the corresponding packed 16-bit integer values of 2222309124Sdim/// the 64-bit integer vectors, and writes the greater value to the 2223309124Sdim/// corresponding bits in the destination. 2224309124Sdim/// 2225309124Sdim/// \headerfile <x86intrin.h> 2226309124Sdim/// 2227314564Sdim/// This intrinsic corresponds to the <c> PMAXSW </c> instruction. 2228309124Sdim/// 2229309124Sdim/// \param __a 2230309124Sdim/// A 64-bit integer vector containing one of the source operands. 2231309124Sdim/// \param __b 2232309124Sdim/// A 64-bit integer vector containing one of the source operands. 2233309124Sdim/// \returns A 64-bit integer vector containing the comparison results. 2234341825Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2235249423Sdim_mm_max_pi16(__m64 __a, __m64 __b) 2236193326Sed{ 2237249423Sdim return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); 2238193326Sed} 2239193326Sed 2240341825Sdim/// Compares each of the corresponding packed 8-bit unsigned integer 2241309124Sdim/// values of the 64-bit integer vectors, and writes the greater value to the 2242309124Sdim/// corresponding bits in the destination. 2243309124Sdim/// 2244309124Sdim/// \headerfile <x86intrin.h> 2245309124Sdim/// 2246314564Sdim/// This intrinsic corresponds to the <c> PMAXUB </c> instruction. 
2247309124Sdim/// 2248309124Sdim/// \param __a 2249309124Sdim/// A 64-bit integer vector containing one of the source operands. 2250309124Sdim/// \param __b 2251309124Sdim/// A 64-bit integer vector containing one of the source operands. 2252309124Sdim/// \returns A 64-bit integer vector containing the comparison results. 2253341825Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2254249423Sdim_mm_max_pu8(__m64 __a, __m64 __b) 2255193326Sed{ 2256249423Sdim return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); 2257193326Sed} 2258193326Sed 2259341825Sdim/// Compares each of the corresponding packed 16-bit integer values of 2260309124Sdim/// the 64-bit integer vectors, and writes the lesser value to the 2261309124Sdim/// corresponding bits in the destination. 2262309124Sdim/// 2263309124Sdim/// \headerfile <x86intrin.h> 2264309124Sdim/// 2265314564Sdim/// This intrinsic corresponds to the <c> PMINSW </c> instruction. 2266309124Sdim/// 2267309124Sdim/// \param __a 2268309124Sdim/// A 64-bit integer vector containing one of the source operands. 2269309124Sdim/// \param __b 2270309124Sdim/// A 64-bit integer vector containing one of the source operands. 2271309124Sdim/// \returns A 64-bit integer vector containing the comparison results. 2272341825Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2273249423Sdim_mm_min_pi16(__m64 __a, __m64 __b) 2274193326Sed{ 2275249423Sdim return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); 2276193326Sed} 2277193326Sed 2278341825Sdim/// Compares each of the corresponding packed 8-bit unsigned integer 2279309124Sdim/// values of the 64-bit integer vectors, and writes the lesser value to the 2280309124Sdim/// corresponding bits in the destination. 2281309124Sdim/// 2282309124Sdim/// \headerfile <x86intrin.h> 2283309124Sdim/// 2284314564Sdim/// This intrinsic corresponds to the <c> PMINUB </c> instruction. 
2285309124Sdim/// 2286309124Sdim/// \param __a 2287309124Sdim/// A 64-bit integer vector containing one of the source operands. 2288309124Sdim/// \param __b 2289309124Sdim/// A 64-bit integer vector containing one of the source operands. 2290309124Sdim/// \returns A 64-bit integer vector containing the comparison results. 2291341825Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2292249423Sdim_mm_min_pu8(__m64 __a, __m64 __b) 2293193326Sed{ 2294249423Sdim return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); 2295193326Sed} 2296193326Sed 2297341825Sdim/// Takes the most significant bit from each 8-bit element in a 64-bit 2298341825Sdim/// integer vector to create an 8-bit mask value. Zero-extends the value to 2299309124Sdim/// 32-bit integer and writes it to the destination. 2300309124Sdim/// 2301309124Sdim/// \headerfile <x86intrin.h> 2302309124Sdim/// 2303314564Sdim/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction. 2304309124Sdim/// 2305309124Sdim/// \param __a 2306309124Sdim/// A 64-bit integer vector containing the values with bits to be extracted. 2307341825Sdim/// \returns The most significant bit from each 8-bit element in \a __a, 2308341825Sdim/// written to bits [7:0]. 2309341825Sdimstatic __inline__ int __DEFAULT_FN_ATTRS_MMX 2310249423Sdim_mm_movemask_pi8(__m64 __a) 2311193326Sed{ 2312249423Sdim return __builtin_ia32_pmovmskb((__v8qi)__a); 2313193326Sed} 2314193326Sed 2315341825Sdim/// Multiplies packed 16-bit unsigned integer values and writes the 2316309124Sdim/// high-order 16 bits of each 32-bit product to the corresponding bits in 2317309124Sdim/// the destination. 2318309124Sdim/// 2319309124Sdim/// \headerfile <x86intrin.h> 2320309124Sdim/// 2321314564Sdim/// This intrinsic corresponds to the <c> PMULHUW </c> instruction. 2322309124Sdim/// 2323309124Sdim/// \param __a 2324309124Sdim/// A 64-bit integer vector containing one of the source operands. 
2325309124Sdim/// \param __b 2326309124Sdim/// A 64-bit integer vector containing one of the source operands. 2327309124Sdim/// \returns A 64-bit integer vector containing the products of both operands. 2328341825Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2329249423Sdim_mm_mulhi_pu16(__m64 __a, __m64 __b) 2330193326Sed{ 2331249423Sdim return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); 2332193326Sed} 2333193326Sed 2334341825Sdim/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the 2335309124Sdim/// destination, as specified by the immediate value operand. 2336309124Sdim/// 2337309124Sdim/// \headerfile <x86intrin.h> 2338309124Sdim/// 2339309124Sdim/// \code 2340309124Sdim/// __m64 _mm_shuffle_pi16(__m64 a, const int n); 2341309124Sdim/// \endcode 2342309124Sdim/// 2343314564Sdim/// This intrinsic corresponds to the <c> PSHUFW </c> instruction. 2344314564Sdim/// 2345309124Sdim/// \param a 2346309124Sdim/// A 64-bit integer vector containing the values to be shuffled. 2347309124Sdim/// \param n 2348309124Sdim/// An immediate value containing an 8-bit value specifying which elements to 2349314564Sdim/// copy from \a a. The destinations within the 64-bit destination are 2350314564Sdim/// assigned values as follows: \n 2351314564Sdim/// Bits [1:0] are used to assign values to bits [15:0] in the 2352314564Sdim/// destination. \n 2353314564Sdim/// Bits [3:2] are used to assign values to bits [31:16] in the 2354314564Sdim/// destination. \n 2355314564Sdim/// Bits [5:4] are used to assign values to bits [47:32] in the 2356314564Sdim/// destination. \n 2357314564Sdim/// Bits [7:6] are used to assign values to bits [63:48] in the 2358314564Sdim/// destination. \n 2359314564Sdim/// Bit value assignments: \n 2360314564Sdim/// 00: assigned from bits [15:0] of \a a. \n 2361314564Sdim/// 01: assigned from bits [31:16] of \a a. \n 2362314564Sdim/// 10: assigned from bits [47:32] of \a a. 
///    \n
///    11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
/* The whole expansion is parenthesized so that a following postfix or binary
   operator applies to the cast result, as it would for a function call. */
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))

/// Conditionally copies the values from each 8-bit element in the first
///    64-bit integer vector operand to the specified memory location, as
///    specified by the most significant bit in the corresponding element in the
///    second 64-bit integer vector operand.
///
///    To minimize caching, the data is flagged as non-temporal
///    (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
///
/// \param __d
///    A 64-bit integer vector containing the values with elements to be copied.
/// \param __n
///    A 64-bit integer vector operand. The most significant bit from each 8-bit
///    element determines whether the corresponding element in operand \a __d
///    is copied. If the most significant bit of a given element is 1, the
///    corresponding element in operand \a __d is copied.
/// \param __p
///    A pointer to a 64-bit memory location that will receive the conditionally
///    copied integer values. The address of the memory location does not have
///    to be aligned.
2391341825Sdimstatic __inline__ void __DEFAULT_FN_ATTRS_MMX 2392249423Sdim_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) 2393193326Sed{ 2394249423Sdim __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); 2395193326Sed} 2396193326Sed 2397341825Sdim/// Computes the rounded averages of the packed unsigned 8-bit integer 2398309124Sdim/// values and writes the averages to the corresponding bits in the 2399309124Sdim/// destination. 2400309124Sdim/// 2401309124Sdim/// \headerfile <x86intrin.h> 2402309124Sdim/// 2403314564Sdim/// This intrinsic corresponds to the <c> PAVGB </c> instruction. 2404309124Sdim/// 2405309124Sdim/// \param __a 2406309124Sdim/// A 64-bit integer vector containing one of the source operands. 2407309124Sdim/// \param __b 2408309124Sdim/// A 64-bit integer vector containing one of the source operands. 2409309124Sdim/// \returns A 64-bit integer vector containing the averages of both operands. 2410341825Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2411249423Sdim_mm_avg_pu8(__m64 __a, __m64 __b) 2412193326Sed{ 2413249423Sdim return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); 2414193326Sed} 2415193326Sed 2416341825Sdim/// Computes the rounded averages of the packed unsigned 16-bit integer 2417309124Sdim/// values and writes the averages to the corresponding bits in the 2418309124Sdim/// destination. 2419309124Sdim/// 2420309124Sdim/// \headerfile <x86intrin.h> 2421309124Sdim/// 2422314564Sdim/// This intrinsic corresponds to the <c> PAVGW </c> instruction. 2423309124Sdim/// 2424309124Sdim/// \param __a 2425309124Sdim/// A 64-bit integer vector containing one of the source operands. 2426309124Sdim/// \param __b 2427309124Sdim/// A 64-bit integer vector containing one of the source operands. 2428309124Sdim/// \returns A 64-bit integer vector containing the averages of both operands. 
2429341825Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2430249423Sdim_mm_avg_pu16(__m64 __a, __m64 __b) 2431193326Sed{ 2432249423Sdim return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); 2433193326Sed} 2434193326Sed 2435341825Sdim/// Subtracts the corresponding 8-bit unsigned integer values of the two 2436309124Sdim/// 64-bit vector operands and computes the absolute value for each of the 2437309124Sdim/// difference. Then sum of the 8 absolute differences is written to the 2438309124Sdim/// bits [15:0] of the destination; the remaining bits [63:16] are cleared. 2439309124Sdim/// 2440309124Sdim/// \headerfile <x86intrin.h> 2441309124Sdim/// 2442314564Sdim/// This intrinsic corresponds to the <c> PSADBW </c> instruction. 2443309124Sdim/// 2444309124Sdim/// \param __a 2445309124Sdim/// A 64-bit integer vector containing one of the source operands. 2446309124Sdim/// \param __b 2447309124Sdim/// A 64-bit integer vector containing one of the source operands. 2448309124Sdim/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the 2449309124Sdim/// sets of absolute differences between both operands. The upper bits are 2450309124Sdim/// cleared. 2451341825Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2452249423Sdim_mm_sad_pu8(__m64 __a, __m64 __b) 2453193326Sed{ 2454249423Sdim return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); 2455193326Sed} 2456193326Sed 2457314564Sdim#if defined(__cplusplus) 2458314564Sdimextern "C" { 2459314564Sdim#endif 2460314564Sdim 2461341825Sdim/// Returns the contents of the MXCSR register as a 32-bit unsigned 2462314564Sdim/// integer value. 
///
///    There are several groups of macros associated with this
///    intrinsic, including:
///    <ul>
///    <li>
///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_GET_EXCEPTION_STATE().
///    </li>
///    <li>
///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
///    </li>
///    <li>
///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_GET_ROUNDING_MODE().
///    </li>
///    <li>
///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
///    </li>
///    <li>
///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_GET_DENORMALS_ZERO_MODE().
///    </li>
///    </ul>
///
///    For example, the following expression checks if an overflow exception has
///    occurred:
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    The following expression gets the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
unsigned int _mm_getcsr(void);

/// Sets the MXCSR register with the 32-bit unsigned integer value.
///
///    There are several groups of macros associated with this intrinsic,
///    including:
///    <ul>
///    <li>
///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
///      of these macros.
///    </li>
///    <li>
///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO.
///      There is a convenience wrapper
///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
///      one of these macros.
///    </li>
///    <li>
///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
///    </li>
///    </ul>
///
///    For example, the following expression causes subsequent floating-point
///    operations to round up. The rounding-control field is cleared first,
///    because it is a multi-bit field and OR-ing a new mode onto a non-zero
///    old mode would select a different mode:
///    \code
///      _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_UP)
///    \endcode
///
///    The following example sets the DAZ and FTZ flags:
///    \code
///    void setFlags() {
///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
///    }
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
///    A 32-bit unsigned integer value to be written to the MXCSR register.
void _mm_setcsr(unsigned int __i);

#if defined(__cplusplus)
} // extern "C"
#endif

/// Selects 4 float values from the 128-bit operands of [4 x float], as
///    specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 128-bit destination are assigned values as
///    follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] in the
///    destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
2607341825Sdim#define _mm_shuffle_ps(a, b, mask) \ 2608341825Sdim (__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \ 2609341825Sdim (int)(mask)) 2610193326Sed 2611341825Sdim/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of 2612314564Sdim/// [4 x float] and interleaves them into a 128-bit vector of [4 x float]. 2613309124Sdim/// 2614309124Sdim/// \headerfile <x86intrin.h> 2615309124Sdim/// 2616314564Sdim/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction. 2617309124Sdim/// 2618309124Sdim/// \param __a 2619314564Sdim/// A 128-bit vector of [4 x float]. \n 2620314564Sdim/// Bits [95:64] are written to bits [31:0] of the destination. \n 2621309124Sdim/// Bits [127:96] are written to bits [95:64] of the destination. 2622309124Sdim/// \param __b 2623309124Sdim/// A 128-bit vector of [4 x float]. 2624314564Sdim/// Bits [95:64] are written to bits [63:32] of the destination. \n 2625309124Sdim/// Bits [127:96] are written to bits [127:96] of the destination. 2626309124Sdim/// \returns A 128-bit vector of [4 x float] containing the interleaved values. 2627288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 2628249423Sdim_mm_unpackhi_ps(__m128 __a, __m128 __b) 2629193326Sed{ 2630309124Sdim return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7); 2631193326Sed} 2632193326Sed 2633341825Sdim/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of 2634314564Sdim/// [4 x float] and interleaves them into a 128-bit vector of [4 x float]. 2635309124Sdim/// 2636309124Sdim/// \headerfile <x86intrin.h> 2637309124Sdim/// 2638314564Sdim/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction. 2639309124Sdim/// 2640309124Sdim/// \param __a 2641314564Sdim/// A 128-bit vector of [4 x float]. \n 2642314564Sdim/// Bits [31:0] are written to bits [31:0] of the destination. \n 2643309124Sdim/// Bits [63:32] are written to bits [95:64] of the destination. 
2644309124Sdim/// \param __b 2645314564Sdim/// A 128-bit vector of [4 x float]. \n 2646314564Sdim/// Bits [31:0] are written to bits [63:32] of the destination. \n 2647309124Sdim/// Bits [63:32] are written to bits [127:96] of the destination. 2648309124Sdim/// \returns A 128-bit vector of [4 x float] containing the interleaved values. 2649288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 2650249423Sdim_mm_unpacklo_ps(__m128 __a, __m128 __b) 2651193326Sed{ 2652309124Sdim return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5); 2653193326Sed} 2654193326Sed 2655341825Sdim/// Constructs a 128-bit floating-point vector of [4 x float]. The lower 2656309124Sdim/// 32 bits are set to the lower 32 bits of the second parameter. The upper 2657309124Sdim/// 96 bits are set to the upper 96 bits of the first parameter. 2658309124Sdim/// 2659309124Sdim/// \headerfile <x86intrin.h> 2660309124Sdim/// 2661341825Sdim/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c> 2662341825Sdim/// instruction. 2663309124Sdim/// 2664309124Sdim/// \param __a 2665309124Sdim/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are 2666309124Sdim/// written to the upper 96 bits of the result. 2667309124Sdim/// \param __b 2668309124Sdim/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are 2669309124Sdim/// written to the lower 32 bits of the result. 2670309124Sdim/// \returns A 128-bit floating-point vector of [4 x float]. 2671288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 2672249423Sdim_mm_move_ss(__m128 __a, __m128 __b) 2673193326Sed{ 2674341825Sdim __a[0] = __b[0]; 2675341825Sdim return __a; 2676193326Sed} 2677193326Sed 2678341825Sdim/// Constructs a 128-bit floating-point vector of [4 x float]. The lower 2679309124Sdim/// 64 bits are set to the upper 64 bits of the second parameter. The upper 2680309124Sdim/// 64 bits are set to the upper 64 bits of the first parameter. 
2681309124Sdim/// 2682309124Sdim/// \headerfile <x86intrin.h> 2683309124Sdim/// 2684314564Sdim/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction. 2685309124Sdim/// 2686309124Sdim/// \param __a 2687309124Sdim/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are 2688309124Sdim/// written to the upper 64 bits of the result. 2689309124Sdim/// \param __b 2690309124Sdim/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are 2691309124Sdim/// written to the lower 64 bits of the result. 2692309124Sdim/// \returns A 128-bit floating-point vector of [4 x float]. 2693288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 2694249423Sdim_mm_movehl_ps(__m128 __a, __m128 __b) 2695193326Sed{ 2696309124Sdim return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3); 2697193326Sed} 2698193326Sed 2699341825Sdim/// Constructs a 128-bit floating-point vector of [4 x float]. The lower 2700309124Sdim/// 64 bits are set to the lower 64 bits of the first parameter. The upper 2701309124Sdim/// 64 bits are set to the lower 64 bits of the second parameter. 2702309124Sdim/// 2703309124Sdim/// \headerfile <x86intrin.h> 2704309124Sdim/// 2705314564Sdim/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 2706309124Sdim/// 2707309124Sdim/// \param __a 2708309124Sdim/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are 2709309124Sdim/// written to the lower 64 bits of the result. 2710309124Sdim/// \param __b 2711309124Sdim/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are 2712309124Sdim/// written to the upper 64 bits of the result. 2713309124Sdim/// \returns A 128-bit floating-point vector of [4 x float]. 
2714288943Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS 2715249423Sdim_mm_movelh_ps(__m128 __a, __m128 __b) 2716193326Sed{ 2717309124Sdim return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5); 2718193326Sed} 2719193326Sed 2720341825Sdim/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x 2721309124Sdim/// float]. 2722309124Sdim/// 2723309124Sdim/// \headerfile <x86intrin.h> 2724309124Sdim/// 2725321369Sdim/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2726309124Sdim/// 2727309124Sdim/// \param __a 2728309124Sdim/// A 64-bit vector of [4 x i16]. The elements of the destination are copied 2729309124Sdim/// from the corresponding elements in this operand. 2730309124Sdim/// \returns A 128-bit vector of [4 x float] containing the copied and converted 2731309124Sdim/// values from the operand. 2732341825Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2733249423Sdim_mm_cvtpi16_ps(__m64 __a) 2734193326Sed{ 2735249423Sdim __m64 __b, __c; 2736249423Sdim __m128 __r; 2737193326Sed 2738249423Sdim __b = _mm_setzero_si64(); 2739249423Sdim __b = _mm_cmpgt_pi16(__b, __a); 2740249423Sdim __c = _mm_unpackhi_pi16(__a, __b); 2741249423Sdim __r = _mm_setzero_ps(); 2742249423Sdim __r = _mm_cvtpi32_ps(__r, __c); 2743249423Sdim __r = _mm_movelh_ps(__r, __r); 2744249423Sdim __c = _mm_unpacklo_pi16(__a, __b); 2745249423Sdim __r = _mm_cvtpi32_ps(__r, __c); 2746193326Sed 2747249423Sdim return __r; 2748193326Sed} 2749193326Sed 2750341825Sdim/// Converts a 64-bit vector of 16-bit unsigned integer values into a 2751309124Sdim/// 128-bit vector of [4 x float]. 2752309124Sdim/// 2753309124Sdim/// \headerfile <x86intrin.h> 2754309124Sdim/// 2755321369Sdim/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2756309124Sdim/// 2757309124Sdim/// \param __a 2758309124Sdim/// A 64-bit vector of 16-bit unsigned integer values. 
The elements of the 2759309124Sdim/// destination are copied from the corresponding elements in this operand. 2760309124Sdim/// \returns A 128-bit vector of [4 x float] containing the copied and converted 2761309124Sdim/// values from the operand. 2762341825Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2763249423Sdim_mm_cvtpu16_ps(__m64 __a) 2764193326Sed{ 2765249423Sdim __m64 __b, __c; 2766249423Sdim __m128 __r; 2767193326Sed 2768249423Sdim __b = _mm_setzero_si64(); 2769249423Sdim __c = _mm_unpackhi_pi16(__a, __b); 2770249423Sdim __r = _mm_setzero_ps(); 2771249423Sdim __r = _mm_cvtpi32_ps(__r, __c); 2772249423Sdim __r = _mm_movelh_ps(__r, __r); 2773249423Sdim __c = _mm_unpacklo_pi16(__a, __b); 2774249423Sdim __r = _mm_cvtpi32_ps(__r, __c); 2775193326Sed 2776249423Sdim return __r; 2777193326Sed} 2778193326Sed 2779341825Sdim/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] 2780309124Sdim/// into a 128-bit vector of [4 x float]. 2781309124Sdim/// 2782309124Sdim/// \headerfile <x86intrin.h> 2783309124Sdim/// 2784321369Sdim/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2785309124Sdim/// 2786309124Sdim/// \param __a 2787309124Sdim/// A 64-bit vector of [8 x i8]. The elements of the destination are copied 2788309124Sdim/// from the corresponding lower 4 elements in this operand. 2789309124Sdim/// \returns A 128-bit vector of [4 x float] containing the copied and converted 2790309124Sdim/// values from the operand. 
2791341825Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2792249423Sdim_mm_cvtpi8_ps(__m64 __a) 2793193326Sed{ 2794249423Sdim __m64 __b; 2795296417Sdim 2796249423Sdim __b = _mm_setzero_si64(); 2797249423Sdim __b = _mm_cmpgt_pi8(__b, __a); 2798249423Sdim __b = _mm_unpacklo_pi8(__a, __b); 2799193326Sed 2800249423Sdim return _mm_cvtpi16_ps(__b); 2801193326Sed} 2802193326Sed 2803341825Sdim/// Converts the lower four unsigned 8-bit integer values from a 64-bit 2804309124Sdim/// vector of [8 x u8] into a 128-bit vector of [4 x float]. 2805309124Sdim/// 2806309124Sdim/// \headerfile <x86intrin.h> 2807309124Sdim/// 2808321369Sdim/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2809309124Sdim/// 2810309124Sdim/// \param __a 2811309124Sdim/// A 64-bit vector of unsigned 8-bit integer values. The elements of the 2812309124Sdim/// destination are copied from the corresponding lower 4 elements in this 2813309124Sdim/// operand. 2814309124Sdim/// \returns A 128-bit vector of [4 x float] containing the copied and converted 2815309124Sdim/// values from the source operand. 2816341825Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2817249423Sdim_mm_cvtpu8_ps(__m64 __a) 2818193326Sed{ 2819249423Sdim __m64 __b; 2820296417Sdim 2821249423Sdim __b = _mm_setzero_si64(); 2822249423Sdim __b = _mm_unpacklo_pi8(__a, __b); 2823193326Sed 2824249423Sdim return _mm_cvtpi16_ps(__b); 2825193326Sed} 2826193326Sed 2827341825Sdim/// Converts the two 32-bit signed integer values from each 64-bit vector 2828309124Sdim/// operand of [2 x i32] into a 128-bit vector of [4 x float]. 2829309124Sdim/// 2830309124Sdim/// \headerfile <x86intrin.h> 2831309124Sdim/// 2832321369Sdim/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2833309124Sdim/// 2834309124Sdim/// \param __a 2835309124Sdim/// A 64-bit vector of [2 x i32]. The lower elements of the destination are 2836309124Sdim/// copied from the elements in this operand. 
2837309124Sdim/// \param __b 2838309124Sdim/// A 64-bit vector of [2 x i32]. The upper elements of the destination are 2839309124Sdim/// copied from the elements in this operand. 2840309124Sdim/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 2841309124Sdim/// copied and converted values from the first operand. The upper 64 bits 2842309124Sdim/// contain the copied and converted values from the second operand. 2843341825Sdimstatic __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2844249423Sdim_mm_cvtpi32x2_ps(__m64 __a, __m64 __b) 2845193326Sed{ 2846249423Sdim __m128 __c; 2847296417Sdim 2848249423Sdim __c = _mm_setzero_ps(); 2849249423Sdim __c = _mm_cvtpi32_ps(__c, __b); 2850249423Sdim __c = _mm_movelh_ps(__c, __c); 2851193326Sed 2852249423Sdim return _mm_cvtpi32_ps(__c, __a); 2853193326Sed} 2854193326Sed 2855341825Sdim/// Converts each single-precision floating-point element of a 128-bit 2856309124Sdim/// floating-point vector of [4 x float] into a 16-bit signed integer, and 2857321369Sdim/// packs the results into a 64-bit integer vector of [4 x i16]. 2858309124Sdim/// 2859321369Sdim/// If the floating-point element is NaN or infinity, or if the 2860321369Sdim/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000, 2861321369Sdim/// it is converted to 0x8000. Otherwise if the floating-point element is 2862321369Sdim/// greater than 0x7FFF, it is converted to 0x7FFF. 2863321369Sdim/// 2864309124Sdim/// \headerfile <x86intrin.h> 2865309124Sdim/// 2866321369Sdim/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction. 2867309124Sdim/// 2868309124Sdim/// \param __a 2869309124Sdim/// A 128-bit floating-point vector of [4 x float]. 2870309124Sdim/// \returns A 64-bit integer vector of [4 x i16] containing the converted 2871309124Sdim/// values. 
2872341825Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2873249423Sdim_mm_cvtps_pi16(__m128 __a) 2874193326Sed{ 2875249423Sdim __m64 __b, __c; 2876296417Sdim 2877249423Sdim __b = _mm_cvtps_pi32(__a); 2878249423Sdim __a = _mm_movehl_ps(__a, __a); 2879249423Sdim __c = _mm_cvtps_pi32(__a); 2880296417Sdim 2881266674Sdim return _mm_packs_pi32(__b, __c); 2882193326Sed} 2883193326Sed 2884341825Sdim/// Converts each single-precision floating-point element of a 128-bit 2885309124Sdim/// floating-point vector of [4 x float] into an 8-bit signed integer, and 2886309124Sdim/// packs the results into the lower 32 bits of a 64-bit integer vector of 2887321369Sdim/// [8 x i8]. The upper 32 bits of the vector are set to 0. 2888309124Sdim/// 2889321369Sdim/// If the floating-point element is NaN or infinity, or if the 2890321369Sdim/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it 2891321369Sdim/// is converted to 0x80. Otherwise if the floating-point element is greater 2892321369Sdim/// than 0x7F, it is converted to 0x7F. 2893321369Sdim/// 2894309124Sdim/// \headerfile <x86intrin.h> 2895309124Sdim/// 2896321369Sdim/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction. 2897309124Sdim/// 2898309124Sdim/// \param __a 2899309124Sdim/// 128-bit floating-point vector of [4 x float]. 2900309124Sdim/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the 2901309124Sdim/// converted values and the uppper 32 bits are set to zero. 
2902341825Sdimstatic __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2903249423Sdim_mm_cvtps_pi8(__m128 __a) 2904193326Sed{ 2905249423Sdim __m64 __b, __c; 2906296417Sdim 2907249423Sdim __b = _mm_cvtps_pi16(__a); 2908249423Sdim __c = _mm_setzero_si64(); 2909296417Sdim 2910249423Sdim return _mm_packs_pi16(__b, __c); 2911193326Sed} 2912193326Sed 2913341825Sdim/// Extracts the sign bits from each single-precision floating-point 2914309124Sdim/// element of a 128-bit floating-point vector of [4 x float] and returns the 2915309124Sdim/// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set 2916309124Sdim/// to zero. 2917309124Sdim/// 2918309124Sdim/// \headerfile <x86intrin.h> 2919309124Sdim/// 2920314564Sdim/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction. 2921309124Sdim/// 2922309124Sdim/// \param __a 2923309124Sdim/// A 128-bit floating-point vector of [4 x float]. 2924309124Sdim/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each 2925309124Sdim/// single-precision floating-point element of the parameter. Bits [31:4] are 2926309124Sdim/// set to zero. 
2927288943Sdimstatic __inline__ int __DEFAULT_FN_ATTRS 2928249423Sdim_mm_movemask_ps(__m128 __a) 2929193326Sed{ 2930309124Sdim return __builtin_ia32_movmskps((__v4sf)__a); 2931193326Sed} 2932193326Sed 2933296417Sdim 2934309124Sdim#define _MM_ALIGN16 __attribute__((aligned(16))) 2935296417Sdim 2936193326Sed#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 2937193326Sed 2938193326Sed#define _MM_EXCEPT_INVALID (0x0001) 2939193326Sed#define _MM_EXCEPT_DENORM (0x0002) 2940193326Sed#define _MM_EXCEPT_DIV_ZERO (0x0004) 2941193326Sed#define _MM_EXCEPT_OVERFLOW (0x0008) 2942193326Sed#define _MM_EXCEPT_UNDERFLOW (0x0010) 2943193326Sed#define _MM_EXCEPT_INEXACT (0x0020) 2944193326Sed#define _MM_EXCEPT_MASK (0x003f) 2945193326Sed 2946193326Sed#define _MM_MASK_INVALID (0x0080) 2947193326Sed#define _MM_MASK_DENORM (0x0100) 2948193326Sed#define _MM_MASK_DIV_ZERO (0x0200) 2949193326Sed#define _MM_MASK_OVERFLOW (0x0400) 2950193326Sed#define _MM_MASK_UNDERFLOW (0x0800) 2951193326Sed#define _MM_MASK_INEXACT (0x1000) 2952193326Sed#define _MM_MASK_MASK (0x1f80) 2953193326Sed 2954193326Sed#define _MM_ROUND_NEAREST (0x0000) 2955193326Sed#define _MM_ROUND_DOWN (0x2000) 2956193326Sed#define _MM_ROUND_UP (0x4000) 2957193326Sed#define _MM_ROUND_TOWARD_ZERO (0x6000) 2958193326Sed#define _MM_ROUND_MASK (0x6000) 2959193326Sed 2960193326Sed#define _MM_FLUSH_ZERO_MASK (0x8000) 2961193326Sed#define _MM_FLUSH_ZERO_ON (0x8000) 2962234353Sdim#define _MM_FLUSH_ZERO_OFF (0x0000) 2963193326Sed 2964193326Sed#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 2965193326Sed#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 2966193326Sed#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 2967193326Sed#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 2968193326Sed 2969193326Sed#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 2970193326Sed#define _MM_SET_EXCEPTION_STATE(x) 
(_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 2971193326Sed#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 2972193326Sed#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 2973193326Sed 2974193326Sed#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 2975193326Seddo { \ 2976193326Sed __m128 tmp3, tmp2, tmp1, tmp0; \ 2977193326Sed tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 2978193326Sed tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 2979193326Sed tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 2980193326Sed tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 2981193326Sed (row0) = _mm_movelh_ps(tmp0, tmp2); \ 2982193326Sed (row1) = _mm_movehl_ps(tmp2, tmp0); \ 2983193326Sed (row2) = _mm_movelh_ps(tmp1, tmp3); \ 2984203955Srdivacky (row3) = _mm_movehl_ps(tmp3, tmp1); \ 2985193326Sed} while (0) 2986193326Sed 2987212904Sdim/* Aliases for compatibility. */ 2988212904Sdim#define _m_pextrw _mm_extract_pi16 2989212904Sdim#define _m_pinsrw _mm_insert_pi16 2990212904Sdim#define _m_pmaxsw _mm_max_pi16 2991212904Sdim#define _m_pmaxub _mm_max_pu8 2992212904Sdim#define _m_pminsw _mm_min_pi16 2993212904Sdim#define _m_pminub _mm_min_pu8 2994212904Sdim#define _m_pmovmskb _mm_movemask_pi8 2995212904Sdim#define _m_pmulhuw _mm_mulhi_pu16 2996212904Sdim#define _m_pshufw _mm_shuffle_pi16 2997212904Sdim#define _m_maskmovq _mm_maskmove_si64 2998212904Sdim#define _m_pavgb _mm_avg_pu8 2999212904Sdim#define _m_pavgw _mm_avg_pu16 3000212904Sdim#define _m_psadbw _mm_sad_pu8 3001212904Sdim#define _m_ _mm_ 3002212904Sdim#define _m_ _mm_ 3003212904Sdim 3004288943Sdim#undef __DEFAULT_FN_ATTRS 3005341825Sdim#undef __DEFAULT_FN_ATTRS_MMX 3006288943Sdim 3007194179Sed/* Ugly hack for backwards-compatibility (compatible with gcc) */ 3008309124Sdim#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics) 3009193326Sed#include <emmintrin.h> 3010194179Sed#endif 3011193326Sed 3012193326Sed#endif /* __XMMINTRIN_H */ 3013