/* xmmintrin.h revision 341825 */
159078Smdodd/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 259078Smdodd * 359078Smdodd * Permission is hereby granted, free of charge, to any person obtaining a copy 434480Sjulian * of this software and associated documentation files (the "Software"), to deal 534480Sjulian * in the Software without restriction, including without limitation the rights 634480Sjulian * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 734480Sjulian * copies of the Software, and to permit persons to whom the Software is 834480Sjulian * furnished to do so, subject to the following conditions: 959078Smdodd * 1034480Sjulian * The above copyright notice and this permission notice shall be included in 1134480Sjulian * all copies or substantial portions of the Software. 1234480Sjulian * 1334480Sjulian * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1434480Sjulian * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1534480Sjulian * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 1634480Sjulian * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1759078Smdodd * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 1859078Smdodd * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 1934480Sjulian * THE SOFTWARE. 
2034480Sjulian * 2134480Sjulian *===-----------------------------------------------------------------------=== 2234480Sjulian */ 2334480Sjulian 2434480Sjulian#ifndef __XMMINTRIN_H 2559078Smdodd#define __XMMINTRIN_H 2659078Smdodd 2734480Sjulian#include <mmintrin.h> 2834480Sjulian 2934480Sjuliantypedef int __v4si __attribute__((__vector_size__(16))); 3034480Sjuliantypedef float __v4sf __attribute__((__vector_size__(16))); 3134480Sjuliantypedef float __m128 __attribute__((__vector_size__(16))); 3245791Speter 3345791Speter/* Unsigned types */ 3434480Sjuliantypedef unsigned int __v4su __attribute__((__vector_size__(16))); 3539234Sgibbs 3639234Sgibbs/* This header should only be included in a hosted environment as it depends on 3745791Speter * a standard library to provide allocation routines. */ 3845791Speter#if __STDC_HOSTED__ 3934480Sjulian#include <mm_malloc.h> 4059078Smdodd#endif 4159078Smdodd 4239234Sgibbs/* Define the default attributes for the functions in this file. */ 4339234Sgibbs#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128))) 4439234Sgibbs#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64))) 4539234Sgibbs 4652042Smdodd/// Adds the 32-bit float values in the low-order bits of the operands. 4752042Smdodd/// 4852042Smdodd/// \headerfile <x86intrin.h> 4952042Smdodd/// 5059078Smdodd/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions. 5159078Smdodd/// 5259078Smdodd/// \param __a 5359078Smdodd/// A 128-bit vector of [4 x float] containing one of the source operands. 5459078Smdodd/// The lower 32 bits of this operand are used in the calculation. 5559078Smdodd/// \param __b 5659078Smdodd/// A 128-bit vector of [4 x float] containing one of the source operands. 5759078Smdodd/// The lower 32 bits of this operand are used in the calculation. 
5859078Smdodd/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum 5959078Smdodd/// of the lower 32 bits of both operands. The upper 96 bits are copied from 6059078Smdodd/// the upper 96 bits of the first source operand. 6159078Smdoddstatic __inline__ __m128 __DEFAULT_FN_ATTRS 6259078Smdodd_mm_add_ss(__m128 __a, __m128 __b) 6352042Smdodd{ 6434480Sjulian __a[0] += __b[0]; 6534480Sjulian return __a; 6659078Smdodd} 6759078Smdodd 6859078Smdodd/// Adds two 128-bit vectors of [4 x float], and returns the results of 6934480Sjulian/// the addition. 7059078Smdodd/// 7139234Sgibbs/// \headerfile <x86intrin.h> 7259078Smdodd/// 7334480Sjulian/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions. 7452042Smdodd/// 7552042Smdodd/// \param __a 7652042Smdodd/// A 128-bit vector of [4 x float] containing one of the source operands. 7734480Sjulian/// \param __b 7845791Speter/// A 128-bit vector of [4 x float] containing one of the source operands. 7945791Speter/// \returns A 128-bit vector of [4 x float] containing the sums of both 8045791Speter/// operands. 8145791Speterstatic __inline__ __m128 __DEFAULT_FN_ATTRS 8234480Sjulian_mm_add_ps(__m128 __a, __m128 __b) 8352042Smdodd{ 8445791Speter return (__m128)((__v4sf)__a + (__v4sf)__b); 8552042Smdodd} 8652042Smdodd 8752042Smdodd/// Subtracts the 32-bit float value in the low-order bits of the second 8852042Smdodd/// operand from the corresponding value in the first operand. 8945791Speter/// 9034480Sjulian/// \headerfile <x86intrin.h> 9152042Smdodd/// 9252042Smdodd/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions. 9352042Smdodd/// 9445791Speter/// \param __a 9545791Speter/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits 9634480Sjulian/// of this operand are used in the calculation. 9734480Sjulian/// \param __b 9845791Speter/// A 128-bit vector of [4 x float] containing the subtrahend. 
The lower 32 9959078Smdodd/// bits of this operand are used in the calculation. 10034480Sjulian/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 10159078Smdodd/// difference of the lower 32 bits of both operands. The upper 96 bits are 10245791Speter/// copied from the upper 96 bits of the first source operand. 10345791Speterstatic __inline__ __m128 __DEFAULT_FN_ATTRS 10439234Sgibbs_mm_sub_ss(__m128 __a, __m128 __b) 10545791Speter{ 10659078Smdodd __a[0] -= __b[0]; 10759078Smdodd return __a; 10834480Sjulian} 10945791Speter 11059078Smdodd/// Subtracts each of the values of the second operand from the first 11145791Speter/// operand, both of which are 128-bit vectors of [4 x float] and returns 11245791Speter/// the results of the subtraction. 11359078Smdodd/// 11459078Smdodd/// \headerfile <x86intrin.h> 11534480Sjulian/// 11634480Sjulian/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions. 11759078Smdodd/// 11859078Smdodd/// \param __a 11959078Smdodd/// A 128-bit vector of [4 x float] containing the minuend. 12059078Smdodd/// \param __b 12159078Smdodd/// A 128-bit vector of [4 x float] containing the subtrahend. 12259078Smdodd/// \returns A 128-bit vector of [4 x float] containing the differences between 12359078Smdodd/// both operands. 12459078Smdoddstatic __inline__ __m128 __DEFAULT_FN_ATTRS 12559078Smdodd_mm_sub_ps(__m128 __a, __m128 __b) 12645791Speter{ 12759078Smdodd return (__m128)((__v4sf)__a - (__v4sf)__b); 12859078Smdodd} 12945791Speter 13059078Smdodd/// Multiplies two 32-bit float values in the low-order bits of the 13134480Sjulian/// operands. 13239234Sgibbs/// 13339234Sgibbs/// \headerfile <x86intrin.h> 13459078Smdodd/// 13559078Smdodd/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions. 13659078Smdodd/// 13759078Smdodd/// \param __a 13859078Smdodd/// A 128-bit vector of [4 x float] containing one of the source operands. 
13959078Smdodd/// The lower 32 bits of this operand are used in the calculation. 14059078Smdodd/// \param __b 14159078Smdodd/// A 128-bit vector of [4 x float] containing one of the source operands. 142104710Speter/// The lower 32 bits of this operand are used in the calculation. 14359078Smdodd/// \returns A 128-bit vector of [4 x float] containing the product of the lower 14459078Smdodd/// 32 bits of both operands. The upper 96 bits are copied from the upper 96 14559078Smdodd/// bits of the first source operand. 14639234Sgibbsstatic __inline__ __m128 __DEFAULT_FN_ATTRS 14759078Smdodd_mm_mul_ss(__m128 __a, __m128 __b) 14845791Speter{ 14934480Sjulian __a[0] *= __b[0]; 15034480Sjulian return __a; 15159078Smdodd} 15234480Sjulian 15339234Sgibbs/// Multiplies two 128-bit vectors of [4 x float] and returns the 15439234Sgibbs/// results of the multiplication. 15559078Smdodd/// 15645791Speter/// \headerfile <x86intrin.h> 15734480Sjulian/// 15834480Sjulian/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions. 15939234Sgibbs/// 16039234Sgibbs/// \param __a 16145791Speter/// A 128-bit vector of [4 x float] containing one of the source operands. 16239234Sgibbs/// \param __b 16334480Sjulian/// A 128-bit vector of [4 x float] containing one of the source operands. 16473280Smarkm/// \returns A 128-bit vector of [4 x float] containing the products of both 16573280Smarkm/// operands. 16659078Smdoddstatic __inline__ __m128 __DEFAULT_FN_ATTRS 16759078Smdodd_mm_mul_ps(__m128 __a, __m128 __b) 16859078Smdodd{ 16959078Smdodd return (__m128)((__v4sf)__a * (__v4sf)__b); 17045791Speter} 17159078Smdodd 17259078Smdodd/// Divides the value in the low-order 32 bits of the first operand by 17345791Speter/// the corresponding value in the second operand. 17445791Speter/// 17545791Speter/// \headerfile <x86intrin.h> 17645791Speter/// 17745791Speter/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions. 
17859078Smdodd/// 17959078Smdodd/// \param __a 18034480Sjulian/// A 128-bit vector of [4 x float] containing the dividend. The lower 32 18134480Sjulian/// bits of this operand are used in the calculation. 18234480Sjulian/// \param __b 18334480Sjulian/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits 18434480Sjulian/// of this operand are used in the calculation. 18534480Sjulian/// \returns A 128-bit vector of [4 x float] containing the quotients of the 18634480Sjulian/// lower 32 bits of both operands. The upper 96 bits are copied from the 18759078Smdodd/// upper 96 bits of the first source operand. 18859078Smdoddstatic __inline__ __m128 __DEFAULT_FN_ATTRS 18959078Smdodd_mm_div_ss(__m128 __a, __m128 __b) 19059078Smdodd{ 19159078Smdodd __a[0] /= __b[0]; 19259078Smdodd return __a; 19359078Smdodd} 19459078Smdodd 19559078Smdodd/// Divides two 128-bit vectors of [4 x float]. 19659078Smdodd/// 19759078Smdodd/// \headerfile <x86intrin.h> 19859078Smdodd/// 19959078Smdodd/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions. 20059078Smdodd/// 20134480Sjulian/// \param __a 20234480Sjulian/// A 128-bit vector of [4 x float] containing the dividend. 20334480Sjulian/// \param __b 20434480Sjulian/// A 128-bit vector of [4 x float] containing the divisor. 20534480Sjulian/// \returns A 128-bit vector of [4 x float] containing the quotients of both 20634480Sjulian/// operands. 20734480Sjulianstatic __inline__ __m128 __DEFAULT_FN_ATTRS 20834480Sjulian_mm_div_ps(__m128 __a, __m128 __b) 20945791Speter{ 21045791Speter return (__m128)((__v4sf)__a / (__v4sf)__b); 21145791Speter} 21245791Speter 21345791Speter/// Calculates the square root of the value stored in the low-order bits 21445791Speter/// of a 128-bit vector of [4 x float]. 21545791Speter/// 21645791Speter/// \headerfile <x86intrin.h> 21745791Speter/// 21845791Speter/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions. 
21945791Speter/// 22059078Smdodd/// \param __a 22145791Speter/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 22245791Speter/// used in the calculation. 22345791Speter/// \returns A 128-bit vector of [4 x float] containing the square root of the 22445791Speter/// value in the low-order bits of the operand. 22545791Speterstatic __inline__ __m128 __DEFAULT_FN_ATTRS 226_mm_sqrt_ss(__m128 __a) 227{ 228 return (__m128)__builtin_ia32_sqrtss((__v4sf)__a); 229} 230 231/// Calculates the square roots of the values stored in a 128-bit vector 232/// of [4 x float]. 233/// 234/// \headerfile <x86intrin.h> 235/// 236/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions. 237/// 238/// \param __a 239/// A 128-bit vector of [4 x float]. 240/// \returns A 128-bit vector of [4 x float] containing the square roots of the 241/// values in the operand. 242static __inline__ __m128 __DEFAULT_FN_ATTRS 243_mm_sqrt_ps(__m128 __a) 244{ 245 return __builtin_ia32_sqrtps((__v4sf)__a); 246} 247 248/// Calculates the approximate reciprocal of the value stored in the 249/// low-order bits of a 128-bit vector of [4 x float]. 250/// 251/// \headerfile <x86intrin.h> 252/// 253/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions. 254/// 255/// \param __a 256/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 257/// used in the calculation. 258/// \returns A 128-bit vector of [4 x float] containing the approximate 259/// reciprocal of the value in the low-order bits of the operand. 260static __inline__ __m128 __DEFAULT_FN_ATTRS 261_mm_rcp_ss(__m128 __a) 262{ 263 return (__m128)__builtin_ia32_rcpss((__v4sf)__a); 264} 265 266/// Calculates the approximate reciprocals of the values stored in a 267/// 128-bit vector of [4 x float]. 268/// 269/// \headerfile <x86intrin.h> 270/// 271/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions. 
272/// 273/// \param __a 274/// A 128-bit vector of [4 x float]. 275/// \returns A 128-bit vector of [4 x float] containing the approximate 276/// reciprocals of the values in the operand. 277static __inline__ __m128 __DEFAULT_FN_ATTRS 278_mm_rcp_ps(__m128 __a) 279{ 280 return (__m128)__builtin_ia32_rcpps((__v4sf)__a); 281} 282 283/// Calculates the approximate reciprocal of the square root of the value 284/// stored in the low-order bits of a 128-bit vector of [4 x float]. 285/// 286/// \headerfile <x86intrin.h> 287/// 288/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions. 289/// 290/// \param __a 291/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 292/// used in the calculation. 293/// \returns A 128-bit vector of [4 x float] containing the approximate 294/// reciprocal of the square root of the value in the low-order bits of the 295/// operand. 296static __inline__ __m128 __DEFAULT_FN_ATTRS 297_mm_rsqrt_ss(__m128 __a) 298{ 299 return __builtin_ia32_rsqrtss((__v4sf)__a); 300} 301 302/// Calculates the approximate reciprocals of the square roots of the 303/// values stored in a 128-bit vector of [4 x float]. 304/// 305/// \headerfile <x86intrin.h> 306/// 307/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions. 308/// 309/// \param __a 310/// A 128-bit vector of [4 x float]. 311/// \returns A 128-bit vector of [4 x float] containing the approximate 312/// reciprocals of the square roots of the values in the operand. 313static __inline__ __m128 __DEFAULT_FN_ATTRS 314_mm_rsqrt_ps(__m128 __a) 315{ 316 return __builtin_ia32_rsqrtps((__v4sf)__a); 317} 318 319/// Compares two 32-bit float values in the low-order bits of both 320/// operands and returns the lesser value in the low-order bits of the 321/// vector of [4 x float]. 322/// 323/// \headerfile <x86intrin.h> 324/// 325/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions. 
326/// 327/// \param __a 328/// A 128-bit vector of [4 x float] containing one of the operands. The lower 329/// 32 bits of this operand are used in the comparison. 330/// \param __b 331/// A 128-bit vector of [4 x float] containing one of the operands. The lower 332/// 32 bits of this operand are used in the comparison. 333/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 334/// minimum value between both operands. The upper 96 bits are copied from 335/// the upper 96 bits of the first source operand. 336static __inline__ __m128 __DEFAULT_FN_ATTRS 337_mm_min_ss(__m128 __a, __m128 __b) 338{ 339 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b); 340} 341 342/// Compares two 128-bit vectors of [4 x float] and returns the lesser 343/// of each pair of values. 344/// 345/// \headerfile <x86intrin.h> 346/// 347/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions. 348/// 349/// \param __a 350/// A 128-bit vector of [4 x float] containing one of the operands. 351/// \param __b 352/// A 128-bit vector of [4 x float] containing one of the operands. 353/// \returns A 128-bit vector of [4 x float] containing the minimum values 354/// between both operands. 355static __inline__ __m128 __DEFAULT_FN_ATTRS 356_mm_min_ps(__m128 __a, __m128 __b) 357{ 358 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b); 359} 360 361/// Compares two 32-bit float values in the low-order bits of both 362/// operands and returns the greater value in the low-order bits of a 128-bit 363/// vector of [4 x float]. 364/// 365/// \headerfile <x86intrin.h> 366/// 367/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions. 368/// 369/// \param __a 370/// A 128-bit vector of [4 x float] containing one of the operands. The lower 371/// 32 bits of this operand are used in the comparison. 372/// \param __b 373/// A 128-bit vector of [4 x float] containing one of the operands. 
The lower 374/// 32 bits of this operand are used in the comparison. 375/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 376/// maximum value between both operands. The upper 96 bits are copied from 377/// the upper 96 bits of the first source operand. 378static __inline__ __m128 __DEFAULT_FN_ATTRS 379_mm_max_ss(__m128 __a, __m128 __b) 380{ 381 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b); 382} 383 384/// Compares two 128-bit vectors of [4 x float] and returns the greater 385/// of each pair of values. 386/// 387/// \headerfile <x86intrin.h> 388/// 389/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions. 390/// 391/// \param __a 392/// A 128-bit vector of [4 x float] containing one of the operands. 393/// \param __b 394/// A 128-bit vector of [4 x float] containing one of the operands. 395/// \returns A 128-bit vector of [4 x float] containing the maximum values 396/// between both operands. 397static __inline__ __m128 __DEFAULT_FN_ATTRS 398_mm_max_ps(__m128 __a, __m128 __b) 399{ 400 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b); 401} 402 403/// Performs a bitwise AND of two 128-bit vectors of [4 x float]. 404/// 405/// \headerfile <x86intrin.h> 406/// 407/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions. 408/// 409/// \param __a 410/// A 128-bit vector containing one of the source operands. 411/// \param __b 412/// A 128-bit vector containing one of the source operands. 413/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the 414/// values between both operands. 415static __inline__ __m128 __DEFAULT_FN_ATTRS 416_mm_and_ps(__m128 __a, __m128 __b) 417{ 418 return (__m128)((__v4su)__a & (__v4su)__b); 419} 420 421/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using 422/// the one's complement of the values contained in the first source 423/// operand. 
424/// 425/// \headerfile <x86intrin.h> 426/// 427/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions. 428/// 429/// \param __a 430/// A 128-bit vector of [4 x float] containing the first source operand. The 431/// one's complement of this value is used in the bitwise AND. 432/// \param __b 433/// A 128-bit vector of [4 x float] containing the second source operand. 434/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the 435/// one's complement of the first operand and the values in the second 436/// operand. 437static __inline__ __m128 __DEFAULT_FN_ATTRS 438_mm_andnot_ps(__m128 __a, __m128 __b) 439{ 440 return (__m128)(~(__v4su)__a & (__v4su)__b); 441} 442 443/// Performs a bitwise OR of two 128-bit vectors of [4 x float]. 444/// 445/// \headerfile <x86intrin.h> 446/// 447/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions. 448/// 449/// \param __a 450/// A 128-bit vector of [4 x float] containing one of the source operands. 451/// \param __b 452/// A 128-bit vector of [4 x float] containing one of the source operands. 453/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the 454/// values between both operands. 455static __inline__ __m128 __DEFAULT_FN_ATTRS 456_mm_or_ps(__m128 __a, __m128 __b) 457{ 458 return (__m128)((__v4su)__a | (__v4su)__b); 459} 460 461/// Performs a bitwise exclusive OR of two 128-bit vectors of 462/// [4 x float]. 463/// 464/// \headerfile <x86intrin.h> 465/// 466/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions. 467/// 468/// \param __a 469/// A 128-bit vector of [4 x float] containing one of the source operands. 470/// \param __b 471/// A 128-bit vector of [4 x float] containing one of the source operands. 472/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR 473/// of the values between both operands. 
474static __inline__ __m128 __DEFAULT_FN_ATTRS 475_mm_xor_ps(__m128 __a, __m128 __b) 476{ 477 return (__m128)((__v4su)__a ^ (__v4su)__b); 478} 479 480/// Compares two 32-bit float values in the low-order bits of both 481/// operands for equality and returns the result of the comparison in the 482/// low-order bits of a vector [4 x float]. 483/// 484/// \headerfile <x86intrin.h> 485/// 486/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions. 487/// 488/// \param __a 489/// A 128-bit vector of [4 x float] containing one of the operands. The lower 490/// 32 bits of this operand are used in the comparison. 491/// \param __b 492/// A 128-bit vector of [4 x float] containing one of the operands. The lower 493/// 32 bits of this operand are used in the comparison. 494/// \returns A 128-bit vector of [4 x float] containing the comparison results 495/// in the low-order bits. 496static __inline__ __m128 __DEFAULT_FN_ATTRS 497_mm_cmpeq_ss(__m128 __a, __m128 __b) 498{ 499 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b); 500} 501 502/// Compares each of the corresponding 32-bit float values of the 503/// 128-bit vectors of [4 x float] for equality. 504/// 505/// \headerfile <x86intrin.h> 506/// 507/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions. 508/// 509/// \param __a 510/// A 128-bit vector of [4 x float]. 511/// \param __b 512/// A 128-bit vector of [4 x float]. 513/// \returns A 128-bit vector of [4 x float] containing the comparison results. 514static __inline__ __m128 __DEFAULT_FN_ATTRS 515_mm_cmpeq_ps(__m128 __a, __m128 __b) 516{ 517 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b); 518} 519 520/// Compares two 32-bit float values in the low-order bits of both 521/// operands to determine if the value in the first operand is less than the 522/// corresponding value in the second operand and returns the result of the 523/// comparison in the low-order bits of a vector of [4 x float]. 
524/// 525/// \headerfile <x86intrin.h> 526/// 527/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions. 528/// 529/// \param __a 530/// A 128-bit vector of [4 x float] containing one of the operands. The lower 531/// 32 bits of this operand are used in the comparison. 532/// \param __b 533/// A 128-bit vector of [4 x float] containing one of the operands. The lower 534/// 32 bits of this operand are used in the comparison. 535/// \returns A 128-bit vector of [4 x float] containing the comparison results 536/// in the low-order bits. 537static __inline__ __m128 __DEFAULT_FN_ATTRS 538_mm_cmplt_ss(__m128 __a, __m128 __b) 539{ 540 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b); 541} 542 543/// Compares each of the corresponding 32-bit float values of the 544/// 128-bit vectors of [4 x float] to determine if the values in the first 545/// operand are less than those in the second operand. 546/// 547/// \headerfile <x86intrin.h> 548/// 549/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions. 550/// 551/// \param __a 552/// A 128-bit vector of [4 x float]. 553/// \param __b 554/// A 128-bit vector of [4 x float]. 555/// \returns A 128-bit vector of [4 x float] containing the comparison results. 556static __inline__ __m128 __DEFAULT_FN_ATTRS 557_mm_cmplt_ps(__m128 __a, __m128 __b) 558{ 559 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b); 560} 561 562/// Compares two 32-bit float values in the low-order bits of both 563/// operands to determine if the value in the first operand is less than or 564/// equal to the corresponding value in the second operand and returns the 565/// result of the comparison in the low-order bits of a vector of 566/// [4 x float]. 567/// 568/// \headerfile <x86intrin.h> 569/// 570/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions. 571/// 572/// \param __a 573/// A 128-bit vector of [4 x float] containing one of the operands. 
The lower 574/// 32 bits of this operand are used in the comparison. 575/// \param __b 576/// A 128-bit vector of [4 x float] containing one of the operands. The lower 577/// 32 bits of this operand are used in the comparison. 578/// \returns A 128-bit vector of [4 x float] containing the comparison results 579/// in the low-order bits. 580static __inline__ __m128 __DEFAULT_FN_ATTRS 581_mm_cmple_ss(__m128 __a, __m128 __b) 582{ 583 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b); 584} 585 586/// Compares each of the corresponding 32-bit float values of the 587/// 128-bit vectors of [4 x float] to determine if the values in the first 588/// operand are less than or equal to those in the second operand. 589/// 590/// \headerfile <x86intrin.h> 591/// 592/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions. 593/// 594/// \param __a 595/// A 128-bit vector of [4 x float]. 596/// \param __b 597/// A 128-bit vector of [4 x float]. 598/// \returns A 128-bit vector of [4 x float] containing the comparison results. 599static __inline__ __m128 __DEFAULT_FN_ATTRS 600_mm_cmple_ps(__m128 __a, __m128 __b) 601{ 602 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b); 603} 604 605/// Compares two 32-bit float values in the low-order bits of both 606/// operands to determine if the value in the first operand is greater than 607/// the corresponding value in the second operand and returns the result of 608/// the comparison in the low-order bits of a vector of [4 x float]. 609/// 610/// \headerfile <x86intrin.h> 611/// 612/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions. 613/// 614/// \param __a 615/// A 128-bit vector of [4 x float] containing one of the operands. The lower 616/// 32 bits of this operand are used in the comparison. 617/// \param __b 618/// A 128-bit vector of [4 x float] containing one of the operands. The lower 619/// 32 bits of this operand are used in the comparison. 
620/// \returns A 128-bit vector of [4 x float] containing the comparison results 621/// in the low-order bits. 622static __inline__ __m128 __DEFAULT_FN_ATTRS 623_mm_cmpgt_ss(__m128 __a, __m128 __b) 624{ 625 return (__m128)__builtin_shufflevector((__v4sf)__a, 626 (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a), 627 4, 1, 2, 3); 628} 629 630/// Compares each of the corresponding 32-bit float values of the 631/// 128-bit vectors of [4 x float] to determine if the values in the first 632/// operand are greater than those in the second operand. 633/// 634/// \headerfile <x86intrin.h> 635/// 636/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions. 637/// 638/// \param __a 639/// A 128-bit vector of [4 x float]. 640/// \param __b 641/// A 128-bit vector of [4 x float]. 642/// \returns A 128-bit vector of [4 x float] containing the comparison results. 643static __inline__ __m128 __DEFAULT_FN_ATTRS 644_mm_cmpgt_ps(__m128 __a, __m128 __b) 645{ 646 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a); 647} 648 649/// Compares two 32-bit float values in the low-order bits of both 650/// operands to determine if the value in the first operand is greater than 651/// or equal to the corresponding value in the second operand and returns 652/// the result of the comparison in the low-order bits of a vector of 653/// [4 x float]. 654/// 655/// \headerfile <x86intrin.h> 656/// 657/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions. 658/// 659/// \param __a 660/// A 128-bit vector of [4 x float] containing one of the operands. The lower 661/// 32 bits of this operand are used in the comparison. 662/// \param __b 663/// A 128-bit vector of [4 x float] containing one of the operands. The lower 664/// 32 bits of this operand are used in the comparison. 665/// \returns A 128-bit vector of [4 x float] containing the comparison results 666/// in the low-order bits. 
667static __inline__ __m128 __DEFAULT_FN_ATTRS 668_mm_cmpge_ss(__m128 __a, __m128 __b) 669{ 670 return (__m128)__builtin_shufflevector((__v4sf)__a, 671 (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a), 672 4, 1, 2, 3); 673} 674 675/// Compares each of the corresponding 32-bit float values of the 676/// 128-bit vectors of [4 x float] to determine if the values in the first 677/// operand are greater than or equal to those in the second operand. 678/// 679/// \headerfile <x86intrin.h> 680/// 681/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions. 682/// 683/// \param __a 684/// A 128-bit vector of [4 x float]. 685/// \param __b 686/// A 128-bit vector of [4 x float]. 687/// \returns A 128-bit vector of [4 x float] containing the comparison results. 688static __inline__ __m128 __DEFAULT_FN_ATTRS 689_mm_cmpge_ps(__m128 __a, __m128 __b) 690{ 691 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a); 692} 693 694/// Compares two 32-bit float values in the low-order bits of both 695/// operands for inequality and returns the result of the comparison in the 696/// low-order bits of a vector of [4 x float]. 697/// 698/// \headerfile <x86intrin.h> 699/// 700/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c> 701/// instructions. 702/// 703/// \param __a 704/// A 128-bit vector of [4 x float] containing one of the operands. The lower 705/// 32 bits of this operand are used in the comparison. 706/// \param __b 707/// A 128-bit vector of [4 x float] containing one of the operands. The lower 708/// 32 bits of this operand are used in the comparison. 709/// \returns A 128-bit vector of [4 x float] containing the comparison results 710/// in the low-order bits. 
711static __inline__ __m128 __DEFAULT_FN_ATTRS 712_mm_cmpneq_ss(__m128 __a, __m128 __b) 713{ 714 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b); 715} 716 717/// Compares each of the corresponding 32-bit float values of the 718/// 128-bit vectors of [4 x float] for inequality. 719/// 720/// \headerfile <x86intrin.h> 721/// 722/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c> 723/// instructions. 724/// 725/// \param __a 726/// A 128-bit vector of [4 x float]. 727/// \param __b 728/// A 128-bit vector of [4 x float]. 729/// \returns A 128-bit vector of [4 x float] containing the comparison results. 730static __inline__ __m128 __DEFAULT_FN_ATTRS 731_mm_cmpneq_ps(__m128 __a, __m128 __b) 732{ 733 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b); 734} 735 736/// Compares two 32-bit float values in the low-order bits of both 737/// operands to determine if the value in the first operand is not less than 738/// the corresponding value in the second operand and returns the result of 739/// the comparison in the low-order bits of a vector of [4 x float]. 740/// 741/// \headerfile <x86intrin.h> 742/// 743/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c> 744/// instructions. 745/// 746/// \param __a 747/// A 128-bit vector of [4 x float] containing one of the operands. The lower 748/// 32 bits of this operand are used in the comparison. 749/// \param __b 750/// A 128-bit vector of [4 x float] containing one of the operands. The lower 751/// 32 bits of this operand are used in the comparison. 752/// \returns A 128-bit vector of [4 x float] containing the comparison results 753/// in the low-order bits. 
754static __inline__ __m128 __DEFAULT_FN_ATTRS 755_mm_cmpnlt_ss(__m128 __a, __m128 __b) 756{ 757 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b); 758} 759 760/// Compares each of the corresponding 32-bit float values of the 761/// 128-bit vectors of [4 x float] to determine if the values in the first 762/// operand are not less than those in the second operand. 763/// 764/// \headerfile <x86intrin.h> 765/// 766/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c> 767/// instructions. 768/// 769/// \param __a 770/// A 128-bit vector of [4 x float]. 771/// \param __b 772/// A 128-bit vector of [4 x float]. 773/// \returns A 128-bit vector of [4 x float] containing the comparison results. 774static __inline__ __m128 __DEFAULT_FN_ATTRS 775_mm_cmpnlt_ps(__m128 __a, __m128 __b) 776{ 777 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b); 778} 779 780/// Compares two 32-bit float values in the low-order bits of both 781/// operands to determine if the value in the first operand is not less than 782/// or equal to the corresponding value in the second operand and returns 783/// the result of the comparison in the low-order bits of a vector of 784/// [4 x float]. 785/// 786/// \headerfile <x86intrin.h> 787/// 788/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c> 789/// instructions. 790/// 791/// \param __a 792/// A 128-bit vector of [4 x float] containing one of the operands. The lower 793/// 32 bits of this operand are used in the comparison. 794/// \param __b 795/// A 128-bit vector of [4 x float] containing one of the operands. The lower 796/// 32 bits of this operand are used in the comparison. 797/// \returns A 128-bit vector of [4 x float] containing the comparison results 798/// in the low-order bits. 
799static __inline__ __m128 __DEFAULT_FN_ATTRS 800_mm_cmpnle_ss(__m128 __a, __m128 __b) 801{ 802 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b); 803} 804 805/// Compares each of the corresponding 32-bit float values of the 806/// 128-bit vectors of [4 x float] to determine if the values in the first 807/// operand are not less than or equal to those in the second operand. 808/// 809/// \headerfile <x86intrin.h> 810/// 811/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c> 812/// instructions. 813/// 814/// \param __a 815/// A 128-bit vector of [4 x float]. 816/// \param __b 817/// A 128-bit vector of [4 x float]. 818/// \returns A 128-bit vector of [4 x float] containing the comparison results. 819static __inline__ __m128 __DEFAULT_FN_ATTRS 820_mm_cmpnle_ps(__m128 __a, __m128 __b) 821{ 822 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b); 823} 824 825/// Compares two 32-bit float values in the low-order bits of both 826/// operands to determine if the value in the first operand is not greater 827/// than the corresponding value in the second operand and returns the 828/// result of the comparison in the low-order bits of a vector of 829/// [4 x float]. 830/// 831/// \headerfile <x86intrin.h> 832/// 833/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c> 834/// instructions. 835/// 836/// \param __a 837/// A 128-bit vector of [4 x float] containing one of the operands. The lower 838/// 32 bits of this operand are used in the comparison. 839/// \param __b 840/// A 128-bit vector of [4 x float] containing one of the operands. The lower 841/// 32 bits of this operand are used in the comparison. 842/// \returns A 128-bit vector of [4 x float] containing the comparison results 843/// in the low-order bits. 
844static __inline__ __m128 __DEFAULT_FN_ATTRS 845_mm_cmpngt_ss(__m128 __a, __m128 __b) 846{ 847 return (__m128)__builtin_shufflevector((__v4sf)__a, 848 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a), 849 4, 1, 2, 3); 850} 851 852/// Compares each of the corresponding 32-bit float values of the 853/// 128-bit vectors of [4 x float] to determine if the values in the first 854/// operand are not greater than those in the second operand. 855/// 856/// \headerfile <x86intrin.h> 857/// 858/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c> 859/// instructions. 860/// 861/// \param __a 862/// A 128-bit vector of [4 x float]. 863/// \param __b 864/// A 128-bit vector of [4 x float]. 865/// \returns A 128-bit vector of [4 x float] containing the comparison results. 866static __inline__ __m128 __DEFAULT_FN_ATTRS 867_mm_cmpngt_ps(__m128 __a, __m128 __b) 868{ 869 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a); 870} 871 872/// Compares two 32-bit float values in the low-order bits of both 873/// operands to determine if the value in the first operand is not greater 874/// than or equal to the corresponding value in the second operand and 875/// returns the result of the comparison in the low-order bits of a vector 876/// of [4 x float]. 877/// 878/// \headerfile <x86intrin.h> 879/// 880/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c> 881/// instructions. 882/// 883/// \param __a 884/// A 128-bit vector of [4 x float] containing one of the operands. The lower 885/// 32 bits of this operand are used in the comparison. 886/// \param __b 887/// A 128-bit vector of [4 x float] containing one of the operands. The lower 888/// 32 bits of this operand are used in the comparison. 889/// \returns A 128-bit vector of [4 x float] containing the comparison results 890/// in the low-order bits. 
891static __inline__ __m128 __DEFAULT_FN_ATTRS 892_mm_cmpnge_ss(__m128 __a, __m128 __b) 893{ 894 return (__m128)__builtin_shufflevector((__v4sf)__a, 895 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a), 896 4, 1, 2, 3); 897} 898 899/// Compares each of the corresponding 32-bit float values of the 900/// 128-bit vectors of [4 x float] to determine if the values in the first 901/// operand are not greater than or equal to those in the second operand. 902/// 903/// \headerfile <x86intrin.h> 904/// 905/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c> 906/// instructions. 907/// 908/// \param __a 909/// A 128-bit vector of [4 x float]. 910/// \param __b 911/// A 128-bit vector of [4 x float]. 912/// \returns A 128-bit vector of [4 x float] containing the comparison results. 913static __inline__ __m128 __DEFAULT_FN_ATTRS 914_mm_cmpnge_ps(__m128 __a, __m128 __b) 915{ 916 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a); 917} 918 919/// Compares two 32-bit float values in the low-order bits of both 920/// operands to determine if the value in the first operand is ordered with 921/// respect to the corresponding value in the second operand and returns the 922/// result of the comparison in the low-order bits of a vector of 923/// [4 x float]. 924/// 925/// \headerfile <x86intrin.h> 926/// 927/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c> 928/// instructions. 929/// 930/// \param __a 931/// A 128-bit vector of [4 x float] containing one of the operands. The lower 932/// 32 bits of this operand are used in the comparison. 933/// \param __b 934/// A 128-bit vector of [4 x float] containing one of the operands. The lower 935/// 32 bits of this operand are used in the comparison. 936/// \returns A 128-bit vector of [4 x float] containing the comparison results 937/// in the low-order bits. 
938static __inline__ __m128 __DEFAULT_FN_ATTRS 939_mm_cmpord_ss(__m128 __a, __m128 __b) 940{ 941 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b); 942} 943 944/// Compares each of the corresponding 32-bit float values of the 945/// 128-bit vectors of [4 x float] to determine if the values in the first 946/// operand are ordered with respect to those in the second operand. 947/// 948/// \headerfile <x86intrin.h> 949/// 950/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c> 951/// instructions. 952/// 953/// \param __a 954/// A 128-bit vector of [4 x float]. 955/// \param __b 956/// A 128-bit vector of [4 x float]. 957/// \returns A 128-bit vector of [4 x float] containing the comparison results. 958static __inline__ __m128 __DEFAULT_FN_ATTRS 959_mm_cmpord_ps(__m128 __a, __m128 __b) 960{ 961 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b); 962} 963 964/// Compares two 32-bit float values in the low-order bits of both 965/// operands to determine if the value in the first operand is unordered 966/// with respect to the corresponding value in the second operand and 967/// returns the result of the comparison in the low-order bits of a vector 968/// of [4 x float]. 969/// 970/// \headerfile <x86intrin.h> 971/// 972/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c> 973/// instructions. 974/// 975/// \param __a 976/// A 128-bit vector of [4 x float] containing one of the operands. The lower 977/// 32 bits of this operand are used in the comparison. 978/// \param __b 979/// A 128-bit vector of [4 x float] containing one of the operands. The lower 980/// 32 bits of this operand are used in the comparison. 981/// \returns A 128-bit vector of [4 x float] containing the comparison results 982/// in the low-order bits. 
983static __inline__ __m128 __DEFAULT_FN_ATTRS 984_mm_cmpunord_ss(__m128 __a, __m128 __b) 985{ 986 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b); 987} 988 989/// Compares each of the corresponding 32-bit float values of the 990/// 128-bit vectors of [4 x float] to determine if the values in the first 991/// operand are unordered with respect to those in the second operand. 992/// 993/// \headerfile <x86intrin.h> 994/// 995/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c> 996/// instructions. 997/// 998/// \param __a 999/// A 128-bit vector of [4 x float]. 1000/// \param __b 1001/// A 128-bit vector of [4 x float]. 1002/// \returns A 128-bit vector of [4 x float] containing the comparison results. 1003static __inline__ __m128 __DEFAULT_FN_ATTRS 1004_mm_cmpunord_ps(__m128 __a, __m128 __b) 1005{ 1006 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b); 1007} 1008 1009/// Compares two 32-bit float values in the low-order bits of both 1010/// operands for equality and returns the result of the comparison. 1011/// 1012/// If either of the two lower 32-bit values is NaN, 0 is returned. 1013/// 1014/// \headerfile <x86intrin.h> 1015/// 1016/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> 1017/// instructions. 1018/// 1019/// \param __a 1020/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1021/// used in the comparison. 1022/// \param __b 1023/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1024/// used in the comparison. 1025/// \returns An integer containing the comparison results. If either of the 1026/// two lower 32-bit values is NaN, 0 is returned. 
1027static __inline__ int __DEFAULT_FN_ATTRS 1028_mm_comieq_ss(__m128 __a, __m128 __b) 1029{ 1030 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b); 1031} 1032 1033/// Compares two 32-bit float values in the low-order bits of both 1034/// operands to determine if the first operand is less than the second 1035/// operand and returns the result of the comparison. 1036/// 1037/// If either of the two lower 32-bit values is NaN, 0 is returned. 1038/// 1039/// \headerfile <x86intrin.h> 1040/// 1041/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> 1042/// instructions. 1043/// 1044/// \param __a 1045/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1046/// used in the comparison. 1047/// \param __b 1048/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1049/// used in the comparison. 1050/// \returns An integer containing the comparison results. If either of the two 1051/// lower 32-bit values is NaN, 0 is returned. 1052static __inline__ int __DEFAULT_FN_ATTRS 1053_mm_comilt_ss(__m128 __a, __m128 __b) 1054{ 1055 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b); 1056} 1057 1058/// Compares two 32-bit float values in the low-order bits of both 1059/// operands to determine if the first operand is less than or equal to the 1060/// second operand and returns the result of the comparison. 1061/// 1062/// If either of the two lower 32-bit values is NaN, 0 is returned. 1063/// 1064/// \headerfile <x86intrin.h> 1065/// 1066/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. 1067/// 1068/// \param __a 1069/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1070/// used in the comparison. 1071/// \param __b 1072/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1073/// used in the comparison. 1074/// \returns An integer containing the comparison results. If either of the two 1075/// lower 32-bit values is NaN, 0 is returned. 
1076static __inline__ int __DEFAULT_FN_ATTRS 1077_mm_comile_ss(__m128 __a, __m128 __b) 1078{ 1079 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b); 1080} 1081 1082/// Compares two 32-bit float values in the low-order bits of both 1083/// operands to determine if the first operand is greater than the second 1084/// operand and returns the result of the comparison. 1085/// 1086/// If either of the two lower 32-bit values is NaN, 0 is returned. 1087/// 1088/// \headerfile <x86intrin.h> 1089/// 1090/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. 1091/// 1092/// \param __a 1093/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1094/// used in the comparison. 1095/// \param __b 1096/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1097/// used in the comparison. 1098/// \returns An integer containing the comparison results. If either of the 1099/// two lower 32-bit values is NaN, 0 is returned. 1100static __inline__ int __DEFAULT_FN_ATTRS 1101_mm_comigt_ss(__m128 __a, __m128 __b) 1102{ 1103 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b); 1104} 1105 1106/// Compares two 32-bit float values in the low-order bits of both 1107/// operands to determine if the first operand is greater than or equal to 1108/// the second operand and returns the result of the comparison. 1109/// 1110/// If either of the two lower 32-bit values is NaN, 0 is returned. 1111/// 1112/// \headerfile <x86intrin.h> 1113/// 1114/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. 1115/// 1116/// \param __a 1117/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1118/// used in the comparison. 1119/// \param __b 1120/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1121/// used in the comparison. 1122/// \returns An integer containing the comparison results. If either of the two 1123/// lower 32-bit values is NaN, 0 is returned. 
1124static __inline__ int __DEFAULT_FN_ATTRS 1125_mm_comige_ss(__m128 __a, __m128 __b) 1126{ 1127 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b); 1128} 1129 1130/// Compares two 32-bit float values in the low-order bits of both 1131/// operands to determine if the first operand is not equal to the second 1132/// operand and returns the result of the comparison. 1133/// 1134/// If either of the two lower 32-bit values is NaN, 1 is returned. 1135/// 1136/// \headerfile <x86intrin.h> 1137/// 1138/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. 1139/// 1140/// \param __a 1141/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1142/// used in the comparison. 1143/// \param __b 1144/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1145/// used in the comparison. 1146/// \returns An integer containing the comparison results. If either of the 1147/// two lower 32-bit values is NaN, 1 is returned. 1148static __inline__ int __DEFAULT_FN_ATTRS 1149_mm_comineq_ss(__m128 __a, __m128 __b) 1150{ 1151 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b); 1152} 1153 1154/// Performs an unordered comparison of two 32-bit float values using 1155/// the low-order bits of both operands to determine equality and returns 1156/// the result of the comparison. 1157/// 1158/// If either of the two lower 32-bit values is NaN, 0 is returned. 1159/// 1160/// \headerfile <x86intrin.h> 1161/// 1162/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1163/// 1164/// \param __a 1165/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1166/// used in the comparison. 1167/// \param __b 1168/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1169/// used in the comparison. 1170/// \returns An integer containing the comparison results. If either of the two 1171/// lower 32-bit values is NaN, 0 is returned. 
1172static __inline__ int __DEFAULT_FN_ATTRS 1173_mm_ucomieq_ss(__m128 __a, __m128 __b) 1174{ 1175 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b); 1176} 1177 1178/// Performs an unordered comparison of two 32-bit float values using 1179/// the low-order bits of both operands to determine if the first operand is 1180/// less than the second operand and returns the result of the comparison. 1181/// 1182/// If either of the two lower 32-bit values is NaN, 0 is returned. 1183/// 1184/// \headerfile <x86intrin.h> 1185/// 1186/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1187/// 1188/// \param __a 1189/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1190/// used in the comparison. 1191/// \param __b 1192/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1193/// used in the comparison. 1194/// \returns An integer containing the comparison results. If either of the two 1195/// lower 32-bit values is NaN, 0 is returned. 1196static __inline__ int __DEFAULT_FN_ATTRS 1197_mm_ucomilt_ss(__m128 __a, __m128 __b) 1198{ 1199 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b); 1200} 1201 1202/// Performs an unordered comparison of two 32-bit float values using 1203/// the low-order bits of both operands to determine if the first operand is 1204/// less than or equal to the second operand and returns the result of the 1205/// comparison. 1206/// 1207/// If either of the two lower 32-bit values is NaN, 0 is returned. 1208/// 1209/// \headerfile <x86intrin.h> 1210/// 1211/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1212/// 1213/// \param __a 1214/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1215/// used in the comparison. 1216/// \param __b 1217/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1218/// used in the comparison. 1219/// \returns An integer containing the comparison results. 
If either of the two 1220/// lower 32-bit values is NaN, 0 is returned. 1221static __inline__ int __DEFAULT_FN_ATTRS 1222_mm_ucomile_ss(__m128 __a, __m128 __b) 1223{ 1224 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b); 1225} 1226 1227/// Performs an unordered comparison of two 32-bit float values using 1228/// the low-order bits of both operands to determine if the first operand is 1229/// greater than the second operand and returns the result of the 1230/// comparison. 1231/// 1232/// If either of the two lower 32-bit values is NaN, 0 is returned. 1233/// 1234/// \headerfile <x86intrin.h> 1235/// 1236/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1237/// 1238/// \param __a 1239/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1240/// used in the comparison. 1241/// \param __b 1242/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1243/// used in the comparison. 1244/// \returns An integer containing the comparison results. If either of the two 1245/// lower 32-bit values is NaN, 0 is returned. 1246static __inline__ int __DEFAULT_FN_ATTRS 1247_mm_ucomigt_ss(__m128 __a, __m128 __b) 1248{ 1249 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b); 1250} 1251 1252/// Performs an unordered comparison of two 32-bit float values using 1253/// the low-order bits of both operands to determine if the first operand is 1254/// greater than or equal to the second operand and returns the result of 1255/// the comparison. 1256/// 1257/// If either of the two lower 32-bit values is NaN, 0 is returned. 1258/// 1259/// \headerfile <x86intrin.h> 1260/// 1261/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1262/// 1263/// \param __a 1264/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1265/// used in the comparison. 1266/// \param __b 1267/// A 128-bit vector of [4 x float]. 
The lower 32 bits of this operand are 1268/// used in the comparison. 1269/// \returns An integer containing the comparison results. If either of the two 1270/// lower 32-bit values is NaN, 0 is returned. 1271static __inline__ int __DEFAULT_FN_ATTRS 1272_mm_ucomige_ss(__m128 __a, __m128 __b) 1273{ 1274 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b); 1275} 1276 1277/// Performs an unordered comparison of two 32-bit float values using 1278/// the low-order bits of both operands to determine inequality and returns 1279/// the result of the comparison. 1280/// 1281/// If either of the two lower 32-bit values is NaN, 1 is returned. 1282/// 1283/// \headerfile <x86intrin.h> 1284/// 1285/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1286/// 1287/// \param __a 1288/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1289/// used in the comparison. 1290/// \param __b 1291/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1292/// used in the comparison. 1293/// \returns An integer containing the comparison results. If either of the two 1294/// lower 32-bit values is NaN, 1 is returned. 1295static __inline__ int __DEFAULT_FN_ATTRS 1296_mm_ucomineq_ss(__m128 __a, __m128 __b) 1297{ 1298 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b); 1299} 1300 1301/// Converts a float value contained in the lower 32 bits of a vector of 1302/// [4 x float] into a 32-bit integer. 1303/// 1304/// \headerfile <x86intrin.h> 1305/// 1306/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c> 1307/// instructions. 1308/// 1309/// \param __a 1310/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1311/// used in the conversion. 1312/// \returns A 32-bit integer containing the converted value. 
1313static __inline__ int __DEFAULT_FN_ATTRS 1314_mm_cvtss_si32(__m128 __a) 1315{ 1316 return __builtin_ia32_cvtss2si((__v4sf)__a); 1317} 1318 1319/// Converts a float value contained in the lower 32 bits of a vector of 1320/// [4 x float] into a 32-bit integer. 1321/// 1322/// \headerfile <x86intrin.h> 1323/// 1324/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c> 1325/// instructions. 1326/// 1327/// \param __a 1328/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1329/// used in the conversion. 1330/// \returns A 32-bit integer containing the converted value. 1331static __inline__ int __DEFAULT_FN_ATTRS 1332_mm_cvt_ss2si(__m128 __a) 1333{ 1334 return _mm_cvtss_si32(__a); 1335} 1336 1337#ifdef __x86_64__ 1338 1339/// Converts a float value contained in the lower 32 bits of a vector of 1340/// [4 x float] into a 64-bit integer. 1341/// 1342/// \headerfile <x86intrin.h> 1343/// 1344/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c> 1345/// instructions. 1346/// 1347/// \param __a 1348/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1349/// used in the conversion. 1350/// \returns A 64-bit integer containing the converted value. 1351static __inline__ long long __DEFAULT_FN_ATTRS 1352_mm_cvtss_si64(__m128 __a) 1353{ 1354 return __builtin_ia32_cvtss2si64((__v4sf)__a); 1355} 1356 1357#endif 1358 1359/// Converts two low-order float values in a 128-bit vector of 1360/// [4 x float] into a 64-bit vector of [2 x i32]. 1361/// 1362/// \headerfile <x86intrin.h> 1363/// 1364/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction. 1365/// 1366/// \param __a 1367/// A 128-bit vector of [4 x float]. 1368/// \returns A 64-bit integer vector containing the converted values. 
1369static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1370_mm_cvtps_pi32(__m128 __a) 1371{ 1372 return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a); 1373} 1374 1375/// Converts two low-order float values in a 128-bit vector of 1376/// [4 x float] into a 64-bit vector of [2 x i32]. 1377/// 1378/// \headerfile <x86intrin.h> 1379/// 1380/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction. 1381/// 1382/// \param __a 1383/// A 128-bit vector of [4 x float]. 1384/// \returns A 64-bit integer vector containing the converted values. 1385static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1386_mm_cvt_ps2pi(__m128 __a) 1387{ 1388 return _mm_cvtps_pi32(__a); 1389} 1390 1391/// Converts a float value contained in the lower 32 bits of a vector of 1392/// [4 x float] into a 32-bit integer, truncating the result when it is 1393/// inexact. 1394/// 1395/// \headerfile <x86intrin.h> 1396/// 1397/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c> 1398/// instructions. 1399/// 1400/// \param __a 1401/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1402/// used in the conversion. 1403/// \returns A 32-bit integer containing the converted value. 1404static __inline__ int __DEFAULT_FN_ATTRS 1405_mm_cvttss_si32(__m128 __a) 1406{ 1407 return __builtin_ia32_cvttss2si((__v4sf)__a); 1408} 1409 1410/// Converts a float value contained in the lower 32 bits of a vector of 1411/// [4 x float] into a 32-bit integer, truncating the result when it is 1412/// inexact. 1413/// 1414/// \headerfile <x86intrin.h> 1415/// 1416/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c> 1417/// instructions. 1418/// 1419/// \param __a 1420/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1421/// used in the conversion. 1422/// \returns A 32-bit integer containing the converted value. 
1423static __inline__ int __DEFAULT_FN_ATTRS 1424_mm_cvtt_ss2si(__m128 __a) 1425{ 1426 return _mm_cvttss_si32(__a); 1427} 1428 1429#ifdef __x86_64__ 1430/// Converts a float value contained in the lower 32 bits of a vector of 1431/// [4 x float] into a 64-bit integer, truncating the result when it is 1432/// inexact. 1433/// 1434/// \headerfile <x86intrin.h> 1435/// 1436/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c> 1437/// instructions. 1438/// 1439/// \param __a 1440/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1441/// used in the conversion. 1442/// \returns A 64-bit integer containing the converted value. 1443static __inline__ long long __DEFAULT_FN_ATTRS 1444_mm_cvttss_si64(__m128 __a) 1445{ 1446 return __builtin_ia32_cvttss2si64((__v4sf)__a); 1447} 1448#endif 1449 1450/// Converts two low-order float values in a 128-bit vector of 1451/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result 1452/// when it is inexact. 1453/// 1454/// \headerfile <x86intrin.h> 1455/// 1456/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c> 1457/// instructions. 1458/// 1459/// \param __a 1460/// A 128-bit vector of [4 x float]. 1461/// \returns A 64-bit integer vector containing the converted values. 1462static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1463_mm_cvttps_pi32(__m128 __a) 1464{ 1465 return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a); 1466} 1467 1468/// Converts two low-order float values in a 128-bit vector of [4 x 1469/// float] into a 64-bit vector of [2 x i32], truncating the result when it 1470/// is inexact. 1471/// 1472/// \headerfile <x86intrin.h> 1473/// 1474/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction. 1475/// 1476/// \param __a 1477/// A 128-bit vector of [4 x float]. 1478/// \returns A 64-bit integer vector containing the converted values. 
1479static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1480_mm_cvtt_ps2pi(__m128 __a) 1481{ 1482 return _mm_cvttps_pi32(__a); 1483} 1484 1485/// Converts a 32-bit signed integer value into a floating point value 1486/// and writes it to the lower 32 bits of the destination. The remaining 1487/// higher order elements of the destination vector are copied from the 1488/// corresponding elements in the first operand. 1489/// 1490/// \headerfile <x86intrin.h> 1491/// 1492/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction. 1493/// 1494/// \param __a 1495/// A 128-bit vector of [4 x float]. 1496/// \param __b 1497/// A 32-bit signed integer operand containing the value to be converted. 1498/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 1499/// converted value of the second operand. The upper 96 bits are copied from 1500/// the upper 96 bits of the first operand. 1501static __inline__ __m128 __DEFAULT_FN_ATTRS 1502_mm_cvtsi32_ss(__m128 __a, int __b) 1503{ 1504 __a[0] = __b; 1505 return __a; 1506} 1507 1508/// Converts a 32-bit signed integer value into a floating point value 1509/// and writes it to the lower 32 bits of the destination. The remaining 1510/// higher order elements of the destination are copied from the 1511/// corresponding elements in the first operand. 1512/// 1513/// \headerfile <x86intrin.h> 1514/// 1515/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction. 1516/// 1517/// \param __a 1518/// A 128-bit vector of [4 x float]. 1519/// \param __b 1520/// A 32-bit signed integer operand containing the value to be converted. 1521/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 1522/// converted value of the second operand. The upper 96 bits are copied from 1523/// the upper 96 bits of the first operand. 
1524static __inline__ __m128 __DEFAULT_FN_ATTRS 1525_mm_cvt_si2ss(__m128 __a, int __b) 1526{ 1527 return _mm_cvtsi32_ss(__a, __b); 1528} 1529 1530#ifdef __x86_64__ 1531 1532/// Converts a 64-bit signed integer value into a floating point value 1533/// and writes it to the lower 32 bits of the destination. The remaining 1534/// higher order elements of the destination are copied from the 1535/// corresponding elements in the first operand. 1536/// 1537/// \headerfile <x86intrin.h> 1538/// 1539/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction. 1540/// 1541/// \param __a 1542/// A 128-bit vector of [4 x float]. 1543/// \param __b 1544/// A 64-bit signed integer operand containing the value to be converted. 1545/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 1546/// converted value of the second operand. The upper 96 bits are copied from 1547/// the upper 96 bits of the first operand. 1548static __inline__ __m128 __DEFAULT_FN_ATTRS 1549_mm_cvtsi64_ss(__m128 __a, long long __b) 1550{ 1551 __a[0] = __b; 1552 return __a; 1553} 1554 1555#endif 1556 1557/// Converts two elements of a 64-bit vector of [2 x i32] into two 1558/// floating point values and writes them to the lower 64-bits of the 1559/// destination. The remaining higher order elements of the destination are 1560/// copied from the corresponding elements in the first operand. 1561/// 1562/// \headerfile <x86intrin.h> 1563/// 1564/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction. 1565/// 1566/// \param __a 1567/// A 128-bit vector of [4 x float]. 1568/// \param __b 1569/// A 64-bit vector of [2 x i32]. The elements in this vector are converted 1570/// and written to the corresponding low-order elements in the destination. 1571/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1572/// converted value of the second operand. The upper 64 bits are copied from 1573/// the upper 64 bits of the first operand. 
1574static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 1575_mm_cvtpi32_ps(__m128 __a, __m64 __b) 1576{ 1577 return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b); 1578} 1579 1580/// Converts two elements of a 64-bit vector of [2 x i32] into two 1581/// floating point values and writes them to the lower 64-bits of the 1582/// destination. The remaining higher order elements of the destination are 1583/// copied from the corresponding elements in the first operand. 1584/// 1585/// \headerfile <x86intrin.h> 1586/// 1587/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction. 1588/// 1589/// \param __a 1590/// A 128-bit vector of [4 x float]. 1591/// \param __b 1592/// A 64-bit vector of [2 x i32]. The elements in this vector are converted 1593/// and written to the corresponding low-order elements in the destination. 1594/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1595/// converted value from the second operand. The upper 64 bits are copied 1596/// from the upper 64 bits of the first operand. 1597static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 1598_mm_cvt_pi2ps(__m128 __a, __m64 __b) 1599{ 1600 return _mm_cvtpi32_ps(__a, __b); 1601} 1602 1603/// Extracts a float value contained in the lower 32 bits of a vector of 1604/// [4 x float]. 1605/// 1606/// \headerfile <x86intrin.h> 1607/// 1608/// This intrinsic has no corresponding instruction. 1609/// 1610/// \param __a 1611/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1612/// used in the extraction. 1613/// \returns A 32-bit float containing the extracted value. 1614static __inline__ float __DEFAULT_FN_ATTRS 1615_mm_cvtss_f32(__m128 __a) 1616{ 1617 return __a[0]; 1618} 1619 1620/// Loads two packed float values from the address \a __p into the 1621/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits 1622/// are copied from the low-order bits of the first operand. 
1623/// 1624/// \headerfile <x86intrin.h> 1625/// 1626/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 1627/// 1628/// \param __a 1629/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0] 1630/// of the destination. 1631/// \param __p 1632/// A pointer to two packed float values. Bits [63:0] are written to bits 1633/// [127:64] of the destination. 1634/// \returns A 128-bit vector of [4 x float] containing the moved values. 1635static __inline__ __m128 __DEFAULT_FN_ATTRS 1636_mm_loadh_pi(__m128 __a, const __m64 *__p) 1637{ 1638 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8))); 1639 struct __mm_loadh_pi_struct { 1640 __mm_loadh_pi_v2f32 __u; 1641 } __attribute__((__packed__, __may_alias__)); 1642 __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u; 1643 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 1644 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5); 1645} 1646 1647/// Loads two packed float values from the address \a __p into the 1648/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits 1649/// are copied from the high-order bits of the first operand. 1650/// 1651/// \headerfile <x86intrin.h> 1652/// 1653/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 1654/// 1655/// \param __a 1656/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits 1657/// [127:64] of the destination. 1658/// \param __p 1659/// A pointer to two packed float values. Bits [63:0] are written to bits 1660/// [63:0] of the destination. 1661/// \returns A 128-bit vector of [4 x float] containing the moved values. 
1662static __inline__ __m128 __DEFAULT_FN_ATTRS 1663_mm_loadl_pi(__m128 __a, const __m64 *__p) 1664{ 1665 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8))); 1666 struct __mm_loadl_pi_struct { 1667 __mm_loadl_pi_v2f32 __u; 1668 } __attribute__((__packed__, __may_alias__)); 1669 __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u; 1670 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 1671 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3); 1672} 1673 1674/// Constructs a 128-bit floating-point vector of [4 x float]. The lower 1675/// 32 bits of the vector are initialized with the single-precision 1676/// floating-point value loaded from a specified memory location. The upper 1677/// 96 bits are set to zero. 1678/// 1679/// \headerfile <x86intrin.h> 1680/// 1681/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. 1682/// 1683/// \param __p 1684/// A pointer to a 32-bit memory location containing a single-precision 1685/// floating-point value. 1686/// \returns An initialized 128-bit floating-point vector of [4 x float]. The 1687/// lower 32 bits contain the value loaded from the memory location. The 1688/// upper 96 bits are set to zero. 1689static __inline__ __m128 __DEFAULT_FN_ATTRS 1690_mm_load_ss(const float *__p) 1691{ 1692 struct __mm_load_ss_struct { 1693 float __u; 1694 } __attribute__((__packed__, __may_alias__)); 1695 float __u = ((struct __mm_load_ss_struct*)__p)->__u; 1696 return __extension__ (__m128){ __u, 0, 0, 0 }; 1697} 1698 1699/// Loads a 32-bit float value and duplicates it to all four vector 1700/// elements of a 128-bit vector of [4 x float]. 1701/// 1702/// \headerfile <x86intrin.h> 1703/// 1704/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c> 1705/// instruction. 1706/// 1707/// \param __p 1708/// A pointer to a float value to be loaded and duplicated. 
1709/// \returns A 128-bit vector of [4 x float] containing the loaded and 1710/// duplicated values. 1711static __inline__ __m128 __DEFAULT_FN_ATTRS 1712_mm_load1_ps(const float *__p) 1713{ 1714 struct __mm_load1_ps_struct { 1715 float __u; 1716 } __attribute__((__packed__, __may_alias__)); 1717 float __u = ((struct __mm_load1_ps_struct*)__p)->__u; 1718 return __extension__ (__m128){ __u, __u, __u, __u }; 1719} 1720 1721#define _mm_load_ps1(p) _mm_load1_ps(p) 1722 1723/// Loads a 128-bit floating-point vector of [4 x float] from an aligned 1724/// memory location. 1725/// 1726/// \headerfile <x86intrin.h> 1727/// 1728/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. 1729/// 1730/// \param __p 1731/// A pointer to a 128-bit memory location. The address of the memory 1732/// location has to be 128-bit aligned. 1733/// \returns A 128-bit vector of [4 x float] containing the loaded values. 1734static __inline__ __m128 __DEFAULT_FN_ATTRS 1735_mm_load_ps(const float *__p) 1736{ 1737 return *(__m128*)__p; 1738} 1739 1740/// Loads a 128-bit floating-point vector of [4 x float] from an 1741/// unaligned memory location. 1742/// 1743/// \headerfile <x86intrin.h> 1744/// 1745/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. 1746/// 1747/// \param __p 1748/// A pointer to a 128-bit memory location. The address of the memory 1749/// location does not have to be aligned. 1750/// \returns A 128-bit vector of [4 x float] containing the loaded values. 1751static __inline__ __m128 __DEFAULT_FN_ATTRS 1752_mm_loadu_ps(const float *__p) 1753{ 1754 struct __loadu_ps { 1755 __m128 __v; 1756 } __attribute__((__packed__, __may_alias__)); 1757 return ((struct __loadu_ps*)__p)->__v; 1758} 1759 1760/// Loads four packed float values, in reverse order, from an aligned 1761/// memory location to 32-bit elements in a 128-bit vector of [4 x float]. 
1762/// 1763/// \headerfile <x86intrin.h> 1764/// 1765/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c> 1766/// instruction. 1767/// 1768/// \param __p 1769/// A pointer to a 128-bit memory location. The address of the memory 1770/// location has to be 128-bit aligned. 1771/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded 1772/// in reverse order. 1773static __inline__ __m128 __DEFAULT_FN_ATTRS 1774_mm_loadr_ps(const float *__p) 1775{ 1776 __m128 __a = _mm_load_ps(__p); 1777 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); 1778} 1779 1780/// Create a 128-bit vector of [4 x float] with undefined values. 1781/// 1782/// \headerfile <x86intrin.h> 1783/// 1784/// This intrinsic has no corresponding instruction. 1785/// 1786/// \returns A 128-bit vector of [4 x float] containing undefined values. 1787static __inline__ __m128 __DEFAULT_FN_ATTRS 1788_mm_undefined_ps(void) 1789{ 1790 return (__m128)__builtin_ia32_undef128(); 1791} 1792 1793/// Constructs a 128-bit floating-point vector of [4 x float]. The lower 1794/// 32 bits of the vector are initialized with the specified single-precision 1795/// floating-point value. The upper 96 bits are set to zero. 1796/// 1797/// \headerfile <x86intrin.h> 1798/// 1799/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. 1800/// 1801/// \param __w 1802/// A single-precision floating-point value used to initialize the lower 32 1803/// bits of the result. 1804/// \returns An initialized 128-bit floating-point vector of [4 x float]. The 1805/// lower 32 bits contain the value provided in the source operand. The 1806/// upper 96 bits are set to zero. 
1807static __inline__ __m128 __DEFAULT_FN_ATTRS 1808_mm_set_ss(float __w) 1809{ 1810 return __extension__ (__m128){ __w, 0, 0, 0 }; 1811} 1812 1813/// Constructs a 128-bit floating-point vector of [4 x float], with each 1814/// of the four single-precision floating-point vector elements set to the 1815/// specified single-precision floating-point value. 1816/// 1817/// \headerfile <x86intrin.h> 1818/// 1819/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction. 1820/// 1821/// \param __w 1822/// A single-precision floating-point value used to initialize each vector 1823/// element of the result. 1824/// \returns An initialized 128-bit floating-point vector of [4 x float]. 1825static __inline__ __m128 __DEFAULT_FN_ATTRS 1826_mm_set1_ps(float __w) 1827{ 1828 return __extension__ (__m128){ __w, __w, __w, __w }; 1829} 1830 1831/* Microsoft specific. */ 1832/// Constructs a 128-bit floating-point vector of [4 x float], with each 1833/// of the four single-precision floating-point vector elements set to the 1834/// specified single-precision floating-point value. 1835/// 1836/// \headerfile <x86intrin.h> 1837/// 1838/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction. 1839/// 1840/// \param __w 1841/// A single-precision floating-point value used to initialize each vector 1842/// element of the result. 1843/// \returns An initialized 128-bit floating-point vector of [4 x float]. 1844static __inline__ __m128 __DEFAULT_FN_ATTRS 1845_mm_set_ps1(float __w) 1846{ 1847 return _mm_set1_ps(__w); 1848} 1849 1850/// Constructs a 128-bit floating-point vector of [4 x float] 1851/// initialized with the specified single-precision floating-point values. 1852/// 1853/// \headerfile <x86intrin.h> 1854/// 1855/// This intrinsic is a utility function and does not correspond to a specific 1856/// instruction. 
1857/// 1858/// \param __z 1859/// A single-precision floating-point value used to initialize bits [127:96] 1860/// of the result. 1861/// \param __y 1862/// A single-precision floating-point value used to initialize bits [95:64] 1863/// of the result. 1864/// \param __x 1865/// A single-precision floating-point value used to initialize bits [63:32] 1866/// of the result. 1867/// \param __w 1868/// A single-precision floating-point value used to initialize bits [31:0] 1869/// of the result. 1870/// \returns An initialized 128-bit floating-point vector of [4 x float]. 1871static __inline__ __m128 __DEFAULT_FN_ATTRS 1872_mm_set_ps(float __z, float __y, float __x, float __w) 1873{ 1874 return __extension__ (__m128){ __w, __x, __y, __z }; 1875} 1876 1877/// Constructs a 128-bit floating-point vector of [4 x float], 1878/// initialized in reverse order with the specified 32-bit single-precision 1879/// float-point values. 1880/// 1881/// \headerfile <x86intrin.h> 1882/// 1883/// This intrinsic is a utility function and does not correspond to a specific 1884/// instruction. 1885/// 1886/// \param __z 1887/// A single-precision floating-point value used to initialize bits [31:0] 1888/// of the result. 1889/// \param __y 1890/// A single-precision floating-point value used to initialize bits [63:32] 1891/// of the result. 1892/// \param __x 1893/// A single-precision floating-point value used to initialize bits [95:64] 1894/// of the result. 1895/// \param __w 1896/// A single-precision floating-point value used to initialize bits [127:96] 1897/// of the result. 1898/// \returns An initialized 128-bit floating-point vector of [4 x float]. 1899static __inline__ __m128 __DEFAULT_FN_ATTRS 1900_mm_setr_ps(float __z, float __y, float __x, float __w) 1901{ 1902 return __extension__ (__m128){ __z, __y, __x, __w }; 1903} 1904 1905/// Constructs a 128-bit floating-point vector of [4 x float] initialized 1906/// to zero. 
1907/// 1908/// \headerfile <x86intrin.h> 1909/// 1910/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 1911/// 1912/// \returns An initialized 128-bit floating-point vector of [4 x float] with 1913/// all elements set to zero. 1914static __inline__ __m128 __DEFAULT_FN_ATTRS 1915_mm_setzero_ps(void) 1916{ 1917 return __extension__ (__m128){ 0, 0, 0, 0 }; 1918} 1919 1920/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a 1921/// memory location. 1922/// 1923/// \headerfile <x86intrin.h> 1924/// 1925/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction. 1926/// 1927/// \param __p 1928/// A pointer to a 64-bit memory location. 1929/// \param __a 1930/// A 128-bit vector of [4 x float] containing the values to be stored. 1931static __inline__ void __DEFAULT_FN_ATTRS 1932_mm_storeh_pi(__m64 *__p, __m128 __a) 1933{ 1934 __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a); 1935} 1936 1937/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a 1938/// memory location. 1939/// 1940/// \headerfile <x86intrin.h> 1941/// 1942/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction. 1943/// 1944/// \param __p 1945/// A pointer to a memory location that will receive the float values. 1946/// \param __a 1947/// A 128-bit vector of [4 x float] containing the values to be stored. 1948static __inline__ void __DEFAULT_FN_ATTRS 1949_mm_storel_pi(__m64 *__p, __m128 __a) 1950{ 1951 __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a); 1952} 1953 1954/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a 1955/// memory location. 1956/// 1957/// \headerfile <x86intrin.h> 1958/// 1959/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. 1960/// 1961/// \param __p 1962/// A pointer to a 32-bit memory location. 1963/// \param __a 1964/// A 128-bit vector of [4 x float] containing the value to be stored. 
1965static __inline__ void __DEFAULT_FN_ATTRS 1966_mm_store_ss(float *__p, __m128 __a) 1967{ 1968 struct __mm_store_ss_struct { 1969 float __u; 1970 } __attribute__((__packed__, __may_alias__)); 1971 ((struct __mm_store_ss_struct*)__p)->__u = __a[0]; 1972} 1973 1974/// Stores a 128-bit vector of [4 x float] to an unaligned memory 1975/// location. 1976/// 1977/// \headerfile <x86intrin.h> 1978/// 1979/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. 1980/// 1981/// \param __p 1982/// A pointer to a 128-bit memory location. The address of the memory 1983/// location does not have to be aligned. 1984/// \param __a 1985/// A 128-bit vector of [4 x float] containing the values to be stored. 1986static __inline__ void __DEFAULT_FN_ATTRS 1987_mm_storeu_ps(float *__p, __m128 __a) 1988{ 1989 struct __storeu_ps { 1990 __m128 __v; 1991 } __attribute__((__packed__, __may_alias__)); 1992 ((struct __storeu_ps*)__p)->__v = __a; 1993} 1994 1995/// Stores a 128-bit vector of [4 x float] into an aligned memory 1996/// location. 1997/// 1998/// \headerfile <x86intrin.h> 1999/// 2000/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. 2001/// 2002/// \param __p 2003/// A pointer to a 128-bit memory location. The address of the memory 2004/// location has to be 16-byte aligned. 2005/// \param __a 2006/// A 128-bit vector of [4 x float] containing the values to be stored. 2007static __inline__ void __DEFAULT_FN_ATTRS 2008_mm_store_ps(float *__p, __m128 __a) 2009{ 2010 *(__m128*)__p = __a; 2011} 2012 2013/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into 2014/// four contiguous elements in an aligned memory location. 2015/// 2016/// \headerfile <x86intrin.h> 2017/// 2018/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c> 2019/// instruction. 2020/// 2021/// \param __p 2022/// A pointer to a 128-bit memory location. 
2023/// \param __a 2024/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each 2025/// of the four contiguous elements pointed by \a __p. 2026static __inline__ void __DEFAULT_FN_ATTRS 2027_mm_store1_ps(float *__p, __m128 __a) 2028{ 2029 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0); 2030 _mm_store_ps(__p, __a); 2031} 2032 2033/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into 2034/// four contiguous elements in an aligned memory location. 2035/// 2036/// \headerfile <x86intrin.h> 2037/// 2038/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c> 2039/// instruction. 2040/// 2041/// \param __p 2042/// A pointer to a 128-bit memory location. 2043/// \param __a 2044/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each 2045/// of the four contiguous elements pointed by \a __p. 2046static __inline__ void __DEFAULT_FN_ATTRS 2047_mm_store_ps1(float *__p, __m128 __a) 2048{ 2049 _mm_store1_ps(__p, __a); 2050} 2051 2052/// Stores float values from a 128-bit vector of [4 x float] to an 2053/// aligned memory location in reverse order. 2054/// 2055/// \headerfile <x86intrin.h> 2056/// 2057/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c> 2058/// instruction. 2059/// 2060/// \param __p 2061/// A pointer to a 128-bit memory location. The address of the memory 2062/// location has to be 128-bit aligned. 2063/// \param __a 2064/// A 128-bit vector of [4 x float] containing the values to be stored. 
2065static __inline__ void __DEFAULT_FN_ATTRS 2066_mm_storer_ps(float *__p, __m128 __a) 2067{ 2068 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); 2069 _mm_store_ps(__p, __a); 2070} 2071 2072#define _MM_HINT_ET0 7 2073#define _MM_HINT_ET1 6 2074#define _MM_HINT_T0 3 2075#define _MM_HINT_T1 2 2076#define _MM_HINT_T2 1 2077#define _MM_HINT_NTA 0 2078 2079#ifndef _MSC_VER 2080/* FIXME: We have to #define this because "sel" must be a constant integer, and 2081 Sema doesn't do any form of constant propagation yet. */ 2082 2083/// Loads one cache line of data from the specified address to a location 2084/// closer to the processor. 2085/// 2086/// \headerfile <x86intrin.h> 2087/// 2088/// \code 2089/// void _mm_prefetch(const void * a, const int sel); 2090/// \endcode 2091/// 2092/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction. 2093/// 2094/// \param a 2095/// A pointer to a memory location containing a cache line of data. 2096/// \param sel 2097/// A predefined integer constant specifying the type of prefetch 2098/// operation: \n 2099/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The 2100/// PREFETCHNTA instruction will be generated. \n 2101/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will 2102/// be generated. \n 2103/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will 2104/// be generated. \n 2105/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will 2106/// be generated. 2107#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), \ 2108 ((sel) >> 2) & 1, (sel) & 0x3)) 2109#endif 2110 2111/// Stores a 64-bit integer in the specified aligned memory location. To 2112/// minimize caching, the data is flagged as non-temporal (unlikely to be 2113/// used again soon). 2114/// 2115/// \headerfile <x86intrin.h> 2116/// 2117/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction. 
2118/// 2119/// \param __p 2120/// A pointer to an aligned memory location used to store the register value. 2121/// \param __a 2122/// A 64-bit integer containing the value to be stored. 2123static __inline__ void __DEFAULT_FN_ATTRS_MMX 2124_mm_stream_pi(__m64 *__p, __m64 __a) 2125{ 2126 __builtin_ia32_movntq(__p, __a); 2127} 2128 2129/// Moves packed float values from a 128-bit vector of [4 x float] to a 2130/// 128-bit aligned memory location. To minimize caching, the data is flagged 2131/// as non-temporal (unlikely to be used again soon). 2132/// 2133/// \headerfile <x86intrin.h> 2134/// 2135/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 2136/// 2137/// \param __p 2138/// A pointer to a 128-bit aligned memory location that will receive the 2139/// single-precision floating-point values. 2140/// \param __a 2141/// A 128-bit vector of [4 x float] containing the values to be moved. 2142static __inline__ void __DEFAULT_FN_ATTRS 2143_mm_stream_ps(float *__p, __m128 __a) 2144{ 2145 __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p); 2146} 2147 2148#if defined(__cplusplus) 2149extern "C" { 2150#endif 2151 2152/// Forces strong memory ordering (serialization) between store 2153/// instructions preceding this instruction and store instructions following 2154/// this instruction, ensuring the system completes all previous stores 2155/// before executing subsequent stores. 2156/// 2157/// \headerfile <x86intrin.h> 2158/// 2159/// This intrinsic corresponds to the <c> SFENCE </c> instruction. 2160/// 2161void _mm_sfence(void); 2162 2163#if defined(__cplusplus) 2164} // extern "C" 2165#endif 2166 2167/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and 2168/// returns it, as specified by the immediate integer operand. 
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Every argument and the whole expansion are parenthesized so that
   expression arguments (for example a conditional expression) and
   surrounding operators cannot re-associate with the casts. */
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(__m64)(a), (int)(n)))

/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the value from \a d: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
2218#define _mm_insert_pi16(a, d, n) \ 2219 (__m64)__builtin_ia32_vec_set_v4hi((__m64)a, (int)d, (int)n) 2220 2221/// Compares each of the corresponding packed 16-bit integer values of 2222/// the 64-bit integer vectors, and writes the greater value to the 2223/// corresponding bits in the destination. 2224/// 2225/// \headerfile <x86intrin.h> 2226/// 2227/// This intrinsic corresponds to the <c> PMAXSW </c> instruction. 2228/// 2229/// \param __a 2230/// A 64-bit integer vector containing one of the source operands. 2231/// \param __b 2232/// A 64-bit integer vector containing one of the source operands. 2233/// \returns A 64-bit integer vector containing the comparison results. 2234static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2235_mm_max_pi16(__m64 __a, __m64 __b) 2236{ 2237 return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); 2238} 2239 2240/// Compares each of the corresponding packed 8-bit unsigned integer 2241/// values of the 64-bit integer vectors, and writes the greater value to the 2242/// corresponding bits in the destination. 2243/// 2244/// \headerfile <x86intrin.h> 2245/// 2246/// This intrinsic corresponds to the <c> PMAXUB </c> instruction. 2247/// 2248/// \param __a 2249/// A 64-bit integer vector containing one of the source operands. 2250/// \param __b 2251/// A 64-bit integer vector containing one of the source operands. 2252/// \returns A 64-bit integer vector containing the comparison results. 2253static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2254_mm_max_pu8(__m64 __a, __m64 __b) 2255{ 2256 return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); 2257} 2258 2259/// Compares each of the corresponding packed 16-bit integer values of 2260/// the 64-bit integer vectors, and writes the lesser value to the 2261/// corresponding bits in the destination. 2262/// 2263/// \headerfile <x86intrin.h> 2264/// 2265/// This intrinsic corresponds to the <c> PMINSW </c> instruction. 
2266/// 2267/// \param __a 2268/// A 64-bit integer vector containing one of the source operands. 2269/// \param __b 2270/// A 64-bit integer vector containing one of the source operands. 2271/// \returns A 64-bit integer vector containing the comparison results. 2272static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2273_mm_min_pi16(__m64 __a, __m64 __b) 2274{ 2275 return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); 2276} 2277 2278/// Compares each of the corresponding packed 8-bit unsigned integer 2279/// values of the 64-bit integer vectors, and writes the lesser value to the 2280/// corresponding bits in the destination. 2281/// 2282/// \headerfile <x86intrin.h> 2283/// 2284/// This intrinsic corresponds to the <c> PMINUB </c> instruction. 2285/// 2286/// \param __a 2287/// A 64-bit integer vector containing one of the source operands. 2288/// \param __b 2289/// A 64-bit integer vector containing one of the source operands. 2290/// \returns A 64-bit integer vector containing the comparison results. 2291static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2292_mm_min_pu8(__m64 __a, __m64 __b) 2293{ 2294 return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); 2295} 2296 2297/// Takes the most significant bit from each 8-bit element in a 64-bit 2298/// integer vector to create an 8-bit mask value. Zero-extends the value to 2299/// 32-bit integer and writes it to the destination. 2300/// 2301/// \headerfile <x86intrin.h> 2302/// 2303/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction. 2304/// 2305/// \param __a 2306/// A 64-bit integer vector containing the values with bits to be extracted. 2307/// \returns The most significant bit from each 8-bit element in \a __a, 2308/// written to bits [7:0]. 
2309static __inline__ int __DEFAULT_FN_ATTRS_MMX 2310_mm_movemask_pi8(__m64 __a) 2311{ 2312 return __builtin_ia32_pmovmskb((__v8qi)__a); 2313} 2314 2315/// Multiplies packed 16-bit unsigned integer values and writes the 2316/// high-order 16 bits of each 32-bit product to the corresponding bits in 2317/// the destination. 2318/// 2319/// \headerfile <x86intrin.h> 2320/// 2321/// This intrinsic corresponds to the <c> PMULHUW </c> instruction. 2322/// 2323/// \param __a 2324/// A 64-bit integer vector containing one of the source operands. 2325/// \param __b 2326/// A 64-bit integer vector containing one of the source operands. 2327/// \returns A 64-bit integer vector containing the products of both operands. 2328static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2329_mm_mulhi_pu16(__m64 __a, __m64 __b) 2330{ 2331 return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); 2332} 2333 2334/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the 2335/// destination, as specified by the immediate value operand. 2336/// 2337/// \headerfile <x86intrin.h> 2338/// 2339/// \code 2340/// __m64 _mm_shuffle_pi16(__m64 a, const int n); 2341/// \endcode 2342/// 2343/// This intrinsic corresponds to the <c> PSHUFW </c> instruction. 2344/// 2345/// \param a 2346/// A 64-bit integer vector containing the values to be shuffled. 2347/// \param n 2348/// An immediate value containing an 8-bit value specifying which elements to 2349/// copy from \a a. The destinations within the 64-bit destination are 2350/// assigned values as follows: \n 2351/// Bits [1:0] are used to assign values to bits [15:0] in the 2352/// destination. \n 2353/// Bits [3:2] are used to assign values to bits [31:16] in the 2354/// destination. \n 2355/// Bits [5:4] are used to assign values to bits [47:32] in the 2356/// destination. \n 2357/// Bits [7:6] are used to assign values to bits [63:48] in the 2358/// destination. 
\n 2359/// Bit value assignments: \n 2360/// 00: assigned from bits [15:0] of \a a. \n 2361/// 01: assigned from bits [31:16] of \a a. \n 2362/// 10: assigned from bits [47:32] of \a a. \n 2363/// 11: assigned from bits [63:48] of \a a. 2364/// \returns A 64-bit integer vector containing the shuffled values. 2365#define _mm_shuffle_pi16(a, n) \ 2366 (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)) 2367 2368/// Conditionally copies the values from each 8-bit element in the first 2369/// 64-bit integer vector operand to the specified memory location, as 2370/// specified by the most significant bit in the corresponding element in the 2371/// second 64-bit integer vector operand. 2372/// 2373/// To minimize caching, the data is flagged as non-temporal 2374/// (unlikely to be used again soon). 2375/// 2376/// \headerfile <x86intrin.h> 2377/// 2378/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction. 2379/// 2380/// \param __d 2381/// A 64-bit integer vector containing the values with elements to be copied. 2382/// \param __n 2383/// A 64-bit integer vector operand. The most significant bit from each 8-bit 2384/// element determines whether the corresponding element in operand \a __d 2385/// is copied. If the most significant bit of a given element is 1, the 2386/// corresponding element in operand \a __d is copied. 2387/// \param __p 2388/// A pointer to a 64-bit memory location that will receive the conditionally 2389/// copied integer values. The address of the memory location does not have 2390/// to be aligned. 2391static __inline__ void __DEFAULT_FN_ATTRS_MMX 2392_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) 2393{ 2394 __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); 2395} 2396 2397/// Computes the rounded averages of the packed unsigned 8-bit integer 2398/// values and writes the averages to the corresponding bits in the 2399/// destination. 
2400/// 2401/// \headerfile <x86intrin.h> 2402/// 2403/// This intrinsic corresponds to the <c> PAVGB </c> instruction. 2404/// 2405/// \param __a 2406/// A 64-bit integer vector containing one of the source operands. 2407/// \param __b 2408/// A 64-bit integer vector containing one of the source operands. 2409/// \returns A 64-bit integer vector containing the averages of both operands. 2410static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2411_mm_avg_pu8(__m64 __a, __m64 __b) 2412{ 2413 return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); 2414} 2415 2416/// Computes the rounded averages of the packed unsigned 16-bit integer 2417/// values and writes the averages to the corresponding bits in the 2418/// destination. 2419/// 2420/// \headerfile <x86intrin.h> 2421/// 2422/// This intrinsic corresponds to the <c> PAVGW </c> instruction. 2423/// 2424/// \param __a 2425/// A 64-bit integer vector containing one of the source operands. 2426/// \param __b 2427/// A 64-bit integer vector containing one of the source operands. 2428/// \returns A 64-bit integer vector containing the averages of both operands. 2429static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2430_mm_avg_pu16(__m64 __a, __m64 __b) 2431{ 2432 return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); 2433} 2434 2435/// Subtracts the corresponding 8-bit unsigned integer values of the two 2436/// 64-bit vector operands and computes the absolute value for each of the 2437/// difference. Then sum of the 8 absolute differences is written to the 2438/// bits [15:0] of the destination; the remaining bits [63:16] are cleared. 2439/// 2440/// \headerfile <x86intrin.h> 2441/// 2442/// This intrinsic corresponds to the <c> PSADBW </c> instruction. 2443/// 2444/// \param __a 2445/// A 64-bit integer vector containing one of the source operands. 2446/// \param __b 2447/// A 64-bit integer vector containing one of the source operands. 
2448/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the 2449/// sets of absolute differences between both operands. The upper bits are 2450/// cleared. 2451static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2452_mm_sad_pu8(__m64 __a, __m64 __b) 2453{ 2454 return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); 2455} 2456 2457#if defined(__cplusplus) 2458extern "C" { 2459#endif 2460 2461/// Returns the contents of the MXCSR register as a 32-bit unsigned 2462/// integer value. 2463/// 2464/// There are several groups of macros associated with this 2465/// intrinsic, including: 2466/// <ul> 2467/// <li> 2468/// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, 2469/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, 2470/// _MM_EXCEPT_INEXACT. There is a convenience wrapper 2471/// _MM_GET_EXCEPTION_STATE(). 2472/// </li> 2473/// <li> 2474/// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW, 2475/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT. 2476/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK(). 2477/// </li> 2478/// <li> 2479/// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, 2480/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper 2481/// _MM_GET_ROUNDING_MODE(). 2482/// </li> 2483/// <li> 2484/// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. 2485/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE(). 2486/// </li> 2487/// <li> 2488/// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON, 2489/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper 2490/// _MM_GET_DENORMALS_ZERO_MODE(). 
2491/// </li> 2492/// </ul> 2493/// 2494/// For example, the following expression checks if an overflow exception has 2495/// occurred: 2496/// \code 2497/// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW ) 2498/// \endcode 2499/// 2500/// The following expression gets the current rounding mode: 2501/// \code 2502/// _MM_GET_ROUNDING_MODE() 2503/// \endcode 2504/// 2505/// \headerfile <x86intrin.h> 2506/// 2507/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction. 2508/// 2509/// \returns A 32-bit unsigned integer containing the contents of the MXCSR 2510/// register. 2511unsigned int _mm_getcsr(void); 2512 2513/// Sets the MXCSR register with the 32-bit unsigned integer value. 2514/// 2515/// There are several groups of macros associated with this intrinsic, 2516/// including: 2517/// <ul> 2518/// <li> 2519/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, 2520/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, 2521/// _MM_EXCEPT_INEXACT. There is a convenience wrapper 2522/// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros. 2523/// </li> 2524/// <li> 2525/// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW, 2526/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT. 2527/// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one 2528/// of these macros. 2529/// </li> 2530/// <li> 2531/// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, 2532/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper 2533/// _MM_SET_ROUNDING_MODE(x) where x is one of these macros. 2534/// </li> 2535/// <li> 2536/// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. 2537/// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is 2538/// one of these macros. 2539/// </li> 2540/// <li> 2541/// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON, 2542/// _MM_DENORMALS_ZERO_OFF. 
There is a convenience wrapper 2543/// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros. 2544/// </li> 2545/// </ul> 2546/// 2547/// For example, the following expression causes subsequent floating-point 2548/// operations to round up: 2549/// _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP) 2550/// 2551/// The following example sets the DAZ and FTZ flags: 2552/// \code 2553/// void setFlags() { 2554/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); 2555/// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); 2556/// } 2557/// \endcode 2558/// 2559/// \headerfile <x86intrin.h> 2560/// 2561/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction. 2562/// 2563/// \param __i 2564/// A 32-bit unsigned integer value to be written to the MXCSR register. 2565void _mm_setcsr(unsigned int __i); 2566 2567#if defined(__cplusplus) 2568} // extern "C" 2569#endif 2570 2571/// Selects 4 float values from the 128-bit operands of [4 x float], as 2572/// specified by the immediate value operand. 2573/// 2574/// \headerfile <x86intrin.h> 2575/// 2576/// \code 2577/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask); 2578/// \endcode 2579/// 2580/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction. 2581/// 2582/// \param a 2583/// A 128-bit vector of [4 x float]. 2584/// \param b 2585/// A 128-bit vector of [4 x float]. 2586/// \param mask 2587/// An immediate value containing an 8-bit value specifying which elements to 2588/// copy from \a a and \a b. \n 2589/// Bits [3:0] specify the values copied from operand \a a. \n 2590/// Bits [7:4] specify the values copied from operand \a b. \n 2591/// The destinations within the 128-bit destination are assigned values as 2592/// follows: \n 2593/// Bits [1:0] are used to assign values to bits [31:0] in the 2594/// destination. \n 2595/// Bits [3:2] are used to assign values to bits [63:32] in the 2596/// destination. 
\n 2597/// Bits [5:4] are used to assign values to bits [95:64] in the 2598/// destination. \n 2599/// Bits [7:6] are used to assign values to bits [127:96] in the 2600/// destination. \n 2601/// Bit value assignments: \n 2602/// 00: Bits [31:0] copied from the specified operand. \n 2603/// 01: Bits [63:32] copied from the specified operand. \n 2604/// 10: Bits [95:64] copied from the specified operand. \n 2605/// 11: Bits [127:96] copied from the specified operand. 2606/// \returns A 128-bit vector of [4 x float] containing the shuffled values. 2607#define _mm_shuffle_ps(a, b, mask) \ 2608 (__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \ 2609 (int)(mask)) 2610 2611/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of 2612/// [4 x float] and interleaves them into a 128-bit vector of [4 x float]. 2613/// 2614/// \headerfile <x86intrin.h> 2615/// 2616/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction. 2617/// 2618/// \param __a 2619/// A 128-bit vector of [4 x float]. \n 2620/// Bits [95:64] are written to bits [31:0] of the destination. \n 2621/// Bits [127:96] are written to bits [95:64] of the destination. 2622/// \param __b 2623/// A 128-bit vector of [4 x float]. 2624/// Bits [95:64] are written to bits [63:32] of the destination. \n 2625/// Bits [127:96] are written to bits [127:96] of the destination. 2626/// \returns A 128-bit vector of [4 x float] containing the interleaved values. 2627static __inline__ __m128 __DEFAULT_FN_ATTRS 2628_mm_unpackhi_ps(__m128 __a, __m128 __b) 2629{ 2630 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7); 2631} 2632 2633/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of 2634/// [4 x float] and interleaves them into a 128-bit vector of [4 x float]. 2635/// 2636/// \headerfile <x86intrin.h> 2637/// 2638/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction. 
2639/// 2640/// \param __a 2641/// A 128-bit vector of [4 x float]. \n 2642/// Bits [31:0] are written to bits [31:0] of the destination. \n 2643/// Bits [63:32] are written to bits [95:64] of the destination. 2644/// \param __b 2645/// A 128-bit vector of [4 x float]. \n 2646/// Bits [31:0] are written to bits [63:32] of the destination. \n 2647/// Bits [63:32] are written to bits [127:96] of the destination. 2648/// \returns A 128-bit vector of [4 x float] containing the interleaved values. 2649static __inline__ __m128 __DEFAULT_FN_ATTRS 2650_mm_unpacklo_ps(__m128 __a, __m128 __b) 2651{ 2652 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5); 2653} 2654 2655/// Constructs a 128-bit floating-point vector of [4 x float]. The lower 2656/// 32 bits are set to the lower 32 bits of the second parameter. The upper 2657/// 96 bits are set to the upper 96 bits of the first parameter. 2658/// 2659/// \headerfile <x86intrin.h> 2660/// 2661/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c> 2662/// instruction. 2663/// 2664/// \param __a 2665/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are 2666/// written to the upper 96 bits of the result. 2667/// \param __b 2668/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are 2669/// written to the lower 32 bits of the result. 2670/// \returns A 128-bit floating-point vector of [4 x float]. 2671static __inline__ __m128 __DEFAULT_FN_ATTRS 2672_mm_move_ss(__m128 __a, __m128 __b) 2673{ 2674 __a[0] = __b[0]; 2675 return __a; 2676} 2677 2678/// Constructs a 128-bit floating-point vector of [4 x float]. The lower 2679/// 64 bits are set to the upper 64 bits of the second parameter. The upper 2680/// 64 bits are set to the upper 64 bits of the first parameter. 2681/// 2682/// \headerfile <x86intrin.h> 2683/// 2684/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction. 
2685/// 2686/// \param __a 2687/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are 2688/// written to the upper 64 bits of the result. 2689/// \param __b 2690/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are 2691/// written to the lower 64 bits of the result. 2692/// \returns A 128-bit floating-point vector of [4 x float]. 2693static __inline__ __m128 __DEFAULT_FN_ATTRS 2694_mm_movehl_ps(__m128 __a, __m128 __b) 2695{ 2696 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3); 2697} 2698 2699/// Constructs a 128-bit floating-point vector of [4 x float]. The lower 2700/// 64 bits are set to the lower 64 bits of the first parameter. The upper 2701/// 64 bits are set to the lower 64 bits of the second parameter. 2702/// 2703/// \headerfile <x86intrin.h> 2704/// 2705/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 2706/// 2707/// \param __a 2708/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are 2709/// written to the lower 64 bits of the result. 2710/// \param __b 2711/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are 2712/// written to the upper 64 bits of the result. 2713/// \returns A 128-bit floating-point vector of [4 x float]. 2714static __inline__ __m128 __DEFAULT_FN_ATTRS 2715_mm_movelh_ps(__m128 __a, __m128 __b) 2716{ 2717 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5); 2718} 2719 2720/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x 2721/// float]. 2722/// 2723/// \headerfile <x86intrin.h> 2724/// 2725/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2726/// 2727/// \param __a 2728/// A 64-bit vector of [4 x i16]. The elements of the destination are copied 2729/// from the corresponding elements in this operand. 2730/// \returns A 128-bit vector of [4 x float] containing the copied and converted 2731/// values from the operand. 
2732static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2733_mm_cvtpi16_ps(__m64 __a) 2734{ 2735 __m64 __b, __c; 2736 __m128 __r; 2737 2738 __b = _mm_setzero_si64(); 2739 __b = _mm_cmpgt_pi16(__b, __a); 2740 __c = _mm_unpackhi_pi16(__a, __b); 2741 __r = _mm_setzero_ps(); 2742 __r = _mm_cvtpi32_ps(__r, __c); 2743 __r = _mm_movelh_ps(__r, __r); 2744 __c = _mm_unpacklo_pi16(__a, __b); 2745 __r = _mm_cvtpi32_ps(__r, __c); 2746 2747 return __r; 2748} 2749 2750/// Converts a 64-bit vector of 16-bit unsigned integer values into a 2751/// 128-bit vector of [4 x float]. 2752/// 2753/// \headerfile <x86intrin.h> 2754/// 2755/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2756/// 2757/// \param __a 2758/// A 64-bit vector of 16-bit unsigned integer values. The elements of the 2759/// destination are copied from the corresponding elements in this operand. 2760/// \returns A 128-bit vector of [4 x float] containing the copied and converted 2761/// values from the operand. 2762static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2763_mm_cvtpu16_ps(__m64 __a) 2764{ 2765 __m64 __b, __c; 2766 __m128 __r; 2767 2768 __b = _mm_setzero_si64(); 2769 __c = _mm_unpackhi_pi16(__a, __b); 2770 __r = _mm_setzero_ps(); 2771 __r = _mm_cvtpi32_ps(__r, __c); 2772 __r = _mm_movelh_ps(__r, __r); 2773 __c = _mm_unpacklo_pi16(__a, __b); 2774 __r = _mm_cvtpi32_ps(__r, __c); 2775 2776 return __r; 2777} 2778 2779/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] 2780/// into a 128-bit vector of [4 x float]. 2781/// 2782/// \headerfile <x86intrin.h> 2783/// 2784/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2785/// 2786/// \param __a 2787/// A 64-bit vector of [8 x i8]. The elements of the destination are copied 2788/// from the corresponding lower 4 elements in this operand. 2789/// \returns A 128-bit vector of [4 x float] containing the copied and converted 2790/// values from the operand. 
2791static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2792_mm_cvtpi8_ps(__m64 __a) 2793{ 2794 __m64 __b; 2795 2796 __b = _mm_setzero_si64(); 2797 __b = _mm_cmpgt_pi8(__b, __a); 2798 __b = _mm_unpacklo_pi8(__a, __b); 2799 2800 return _mm_cvtpi16_ps(__b); 2801} 2802 2803/// Converts the lower four unsigned 8-bit integer values from a 64-bit 2804/// vector of [8 x u8] into a 128-bit vector of [4 x float]. 2805/// 2806/// \headerfile <x86intrin.h> 2807/// 2808/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2809/// 2810/// \param __a 2811/// A 64-bit vector of unsigned 8-bit integer values. The elements of the 2812/// destination are copied from the corresponding lower 4 elements in this 2813/// operand. 2814/// \returns A 128-bit vector of [4 x float] containing the copied and converted 2815/// values from the source operand. 2816static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2817_mm_cvtpu8_ps(__m64 __a) 2818{ 2819 __m64 __b; 2820 2821 __b = _mm_setzero_si64(); 2822 __b = _mm_unpacklo_pi8(__a, __b); 2823 2824 return _mm_cvtpi16_ps(__b); 2825} 2826 2827/// Converts the two 32-bit signed integer values from each 64-bit vector 2828/// operand of [2 x i32] into a 128-bit vector of [4 x float]. 2829/// 2830/// \headerfile <x86intrin.h> 2831/// 2832/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2833/// 2834/// \param __a 2835/// A 64-bit vector of [2 x i32]. The lower elements of the destination are 2836/// copied from the elements in this operand. 2837/// \param __b 2838/// A 64-bit vector of [2 x i32]. The upper elements of the destination are 2839/// copied from the elements in this operand. 2840/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 2841/// copied and converted values from the first operand. The upper 64 bits 2842/// contain the copied and converted values from the second operand. 
2843static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2844_mm_cvtpi32x2_ps(__m64 __a, __m64 __b) 2845{ 2846 __m128 __c; 2847 2848 __c = _mm_setzero_ps(); 2849 __c = _mm_cvtpi32_ps(__c, __b); 2850 __c = _mm_movelh_ps(__c, __c); 2851 2852 return _mm_cvtpi32_ps(__c, __a); 2853} 2854 2855/// Converts each single-precision floating-point element of a 128-bit 2856/// floating-point vector of [4 x float] into a 16-bit signed integer, and 2857/// packs the results into a 64-bit integer vector of [4 x i16]. 2858/// 2859/// If the floating-point element is NaN or infinity, or if the 2860/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000, 2861/// it is converted to 0x8000. Otherwise if the floating-point element is 2862/// greater than 0x7FFF, it is converted to 0x7FFF. 2863/// 2864/// \headerfile <x86intrin.h> 2865/// 2866/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction. 2867/// 2868/// \param __a 2869/// A 128-bit floating-point vector of [4 x float]. 2870/// \returns A 64-bit integer vector of [4 x i16] containing the converted 2871/// values. 2872static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2873_mm_cvtps_pi16(__m128 __a) 2874{ 2875 __m64 __b, __c; 2876 2877 __b = _mm_cvtps_pi32(__a); 2878 __a = _mm_movehl_ps(__a, __a); 2879 __c = _mm_cvtps_pi32(__a); 2880 2881 return _mm_packs_pi32(__b, __c); 2882} 2883 2884/// Converts each single-precision floating-point element of a 128-bit 2885/// floating-point vector of [4 x float] into an 8-bit signed integer, and 2886/// packs the results into the lower 32 bits of a 64-bit integer vector of 2887/// [8 x i8]. The upper 32 bits of the vector are set to 0. 2888/// 2889/// If the floating-point element is NaN or infinity, or if the 2890/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it 2891/// is converted to 0x80. Otherwise if the floating-point element is greater 2892/// than 0x7F, it is converted to 0x7F. 
2893/// 2894/// \headerfile <x86intrin.h> 2895/// 2896/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction. 2897/// 2898/// \param __a 2899/// 128-bit floating-point vector of [4 x float]. 2900/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the 2901/// converted values and the uppper 32 bits are set to zero. 2902static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2903_mm_cvtps_pi8(__m128 __a) 2904{ 2905 __m64 __b, __c; 2906 2907 __b = _mm_cvtps_pi16(__a); 2908 __c = _mm_setzero_si64(); 2909 2910 return _mm_packs_pi16(__b, __c); 2911} 2912 2913/// Extracts the sign bits from each single-precision floating-point 2914/// element of a 128-bit floating-point vector of [4 x float] and returns the 2915/// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set 2916/// to zero. 2917/// 2918/// \headerfile <x86intrin.h> 2919/// 2920/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction. 2921/// 2922/// \param __a 2923/// A 128-bit floating-point vector of [4 x float]. 2924/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each 2925/// single-precision floating-point element of the parameter. Bits [31:4] are 2926/// set to zero. 
2927static __inline__ int __DEFAULT_FN_ATTRS 2928_mm_movemask_ps(__m128 __a) 2929{ 2930 return __builtin_ia32_movmskps((__v4sf)__a); 2931} 2932 2933 2934#define _MM_ALIGN16 __attribute__((aligned(16))) 2935 2936#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 2937 2938#define _MM_EXCEPT_INVALID (0x0001) 2939#define _MM_EXCEPT_DENORM (0x0002) 2940#define _MM_EXCEPT_DIV_ZERO (0x0004) 2941#define _MM_EXCEPT_OVERFLOW (0x0008) 2942#define _MM_EXCEPT_UNDERFLOW (0x0010) 2943#define _MM_EXCEPT_INEXACT (0x0020) 2944#define _MM_EXCEPT_MASK (0x003f) 2945 2946#define _MM_MASK_INVALID (0x0080) 2947#define _MM_MASK_DENORM (0x0100) 2948#define _MM_MASK_DIV_ZERO (0x0200) 2949#define _MM_MASK_OVERFLOW (0x0400) 2950#define _MM_MASK_UNDERFLOW (0x0800) 2951#define _MM_MASK_INEXACT (0x1000) 2952#define _MM_MASK_MASK (0x1f80) 2953 2954#define _MM_ROUND_NEAREST (0x0000) 2955#define _MM_ROUND_DOWN (0x2000) 2956#define _MM_ROUND_UP (0x4000) 2957#define _MM_ROUND_TOWARD_ZERO (0x6000) 2958#define _MM_ROUND_MASK (0x6000) 2959 2960#define _MM_FLUSH_ZERO_MASK (0x8000) 2961#define _MM_FLUSH_ZERO_ON (0x8000) 2962#define _MM_FLUSH_ZERO_OFF (0x0000) 2963 2964#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 2965#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 2966#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 2967#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 2968 2969#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 2970#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 2971#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 2972#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 2973 2974#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 2975do { \ 2976 __m128 tmp3, tmp2, tmp1, tmp0; \ 2977 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 2978 tmp2 = 
_mm_unpacklo_ps((row2), (row3)); \ 2979 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 2980 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 2981 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 2982 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 2983 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 2984 (row3) = _mm_movehl_ps(tmp3, tmp1); \ 2985} while (0) 2986 2987/* Aliases for compatibility. */ 2988#define _m_pextrw _mm_extract_pi16 2989#define _m_pinsrw _mm_insert_pi16 2990#define _m_pmaxsw _mm_max_pi16 2991#define _m_pmaxub _mm_max_pu8 2992#define _m_pminsw _mm_min_pi16 2993#define _m_pminub _mm_min_pu8 2994#define _m_pmovmskb _mm_movemask_pi8 2995#define _m_pmulhuw _mm_mulhi_pu16 2996#define _m_pshufw _mm_shuffle_pi16 2997#define _m_maskmovq _mm_maskmove_si64 2998#define _m_pavgb _mm_avg_pu8 2999#define _m_pavgw _mm_avg_pu16 3000#define _m_psadbw _mm_sad_pu8 3001#define _m_ _mm_ 3002#define _m_ _mm_ 3003 3004#undef __DEFAULT_FN_ATTRS 3005#undef __DEFAULT_FN_ATTRS_MMX 3006 3007/* Ugly hack for backwards-compatibility (compatible with gcc) */ 3008#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics) 3009#include <emmintrin.h> 3010#endif 3011 3012#endif /* __XMMINTRIN_H */ 3013