/* xmmintrin.h revision 360784 */
157434Smarkm/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 257434Smarkm * 3156813Sru * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4156813Sru * See https://llvm.org/LICENSE.txt for license information. 557434Smarkm * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6255386Sdes * 7195767Skensmith *===-----------------------------------------------------------------------=== 8248619Sdes */ 9248619Sdes 10126282Sdes#ifndef __XMMINTRIN_H 11126282Sdes#define __XMMINTRIN_H 12261320Sdes 13181111Sdes#include <mmintrin.h> 14137018Sdes 15221420Sdestypedef int __v4si __attribute__((__vector_size__(16))); 16221420Sdestypedef float __v4sf __attribute__((__vector_size__(16))); 17261320Sdestypedef float __m128 __attribute__((__vector_size__(16), __aligned__(16))); 18263712Sdes 19261320Sdestypedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1))); 20263712Sdes 21261320Sdes/* Unsigned types */ 22147098Sdestypedef unsigned int __v4su __attribute__((__vector_size__(16))); 23147098Sdes 24147098Sdes/* This header should only be included in a hosted environment as it depends on 25147098Sdes * a standard library to provide allocation routines. */ 2698820Sdes#if __STDC_HOSTED__ 27263712Sdes#include <mm_malloc.h> 28263712Sdes#endif 29221420Sdes 3057434Smarkm/* Define the default attributes for the functions in this file. */ 31255460Sdes#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128))) 32255460Sdes#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64))) 33255460Sdes 34255460Sdes/// Adds the 32-bit float values in the low-order bits of the operands. 35255460Sdes/// 36255460Sdes/// \headerfile <x86intrin.h> 37255460Sdes/// 38255460Sdes/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions. 
39255460Sdes/// 40106538Sobrien/// \param __a 41103960Smarkm/// A 128-bit vector of [4 x float] containing one of the source operands. 42158519Sdes/// The lower 32 bits of this operand are used in the calculation. 43124250Sru/// \param __b 44156813Sru/// A 128-bit vector of [4 x float] containing one of the source operands. 45255829Sdes/// The lower 32 bits of this operand are used in the calculation. 46178828Sdfr/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum 47178828Sdfr/// of the lower 32 bits of both operands. The upper 96 bits are copied from 48106132Sdes/// the upper 96 bits of the first source operand. 49106132Sdesstatic __inline__ __m128 __DEFAULT_FN_ATTRS 50245527Sbz_mm_add_ss(__m128 __a, __m128 __b) 51245527Sbz{ 52245527Sbz __a[0] += __b[0]; 53245527Sbz return __a; 54139106Sru} 5557434Smarkm 56255460Sdes/// Adds two 128-bit vectors of [4 x float], and returns the results of 57255460Sdes/// the addition. 5890405Sru/// 5957434Smarkm/// \headerfile <x86intrin.h> 6074818Sru/// 61106538Sobrien/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions. 62158529Sdes/// 63158529Sdes/// \param __a 64255829Sdes/// A 128-bit vector of [4 x float] containing one of the source operands. 65255829Sdes/// \param __b 66255829Sdes/// A 128-bit vector of [4 x float] containing one of the source operands. 67/// \returns A 128-bit vector of [4 x float] containing the sums of both 68/// operands. 69static __inline__ __m128 __DEFAULT_FN_ATTRS 70_mm_add_ps(__m128 __a, __m128 __b) 71{ 72 return (__m128)((__v4sf)__a + (__v4sf)__b); 73} 74 75/// Subtracts the 32-bit float value in the low-order bits of the second 76/// operand from the corresponding value in the first operand. 77/// 78/// \headerfile <x86intrin.h> 79/// 80/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions. 81/// 82/// \param __a 83/// A 128-bit vector of [4 x float] containing the minuend. 
The lower 32 bits 84/// of this operand are used in the calculation. 85/// \param __b 86/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32 87/// bits of this operand are used in the calculation. 88/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 89/// difference of the lower 32 bits of both operands. The upper 96 bits are 90/// copied from the upper 96 bits of the first source operand. 91static __inline__ __m128 __DEFAULT_FN_ATTRS 92_mm_sub_ss(__m128 __a, __m128 __b) 93{ 94 __a[0] -= __b[0]; 95 return __a; 96} 97 98/// Subtracts each of the values of the second operand from the first 99/// operand, both of which are 128-bit vectors of [4 x float] and returns 100/// the results of the subtraction. 101/// 102/// \headerfile <x86intrin.h> 103/// 104/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions. 105/// 106/// \param __a 107/// A 128-bit vector of [4 x float] containing the minuend. 108/// \param __b 109/// A 128-bit vector of [4 x float] containing the subtrahend. 110/// \returns A 128-bit vector of [4 x float] containing the differences between 111/// both operands. 112static __inline__ __m128 __DEFAULT_FN_ATTRS 113_mm_sub_ps(__m128 __a, __m128 __b) 114{ 115 return (__m128)((__v4sf)__a - (__v4sf)__b); 116} 117 118/// Multiplies two 32-bit float values in the low-order bits of the 119/// operands. 120/// 121/// \headerfile <x86intrin.h> 122/// 123/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions. 124/// 125/// \param __a 126/// A 128-bit vector of [4 x float] containing one of the source operands. 127/// The lower 32 bits of this operand are used in the calculation. 128/// \param __b 129/// A 128-bit vector of [4 x float] containing one of the source operands. 130/// The lower 32 bits of this operand are used in the calculation. 131/// \returns A 128-bit vector of [4 x float] containing the product of the lower 132/// 32 bits of both operands. 
The upper 96 bits are copied from the upper 96 133/// bits of the first source operand. 134static __inline__ __m128 __DEFAULT_FN_ATTRS 135_mm_mul_ss(__m128 __a, __m128 __b) 136{ 137 __a[0] *= __b[0]; 138 return __a; 139} 140 141/// Multiplies two 128-bit vectors of [4 x float] and returns the 142/// results of the multiplication. 143/// 144/// \headerfile <x86intrin.h> 145/// 146/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions. 147/// 148/// \param __a 149/// A 128-bit vector of [4 x float] containing one of the source operands. 150/// \param __b 151/// A 128-bit vector of [4 x float] containing one of the source operands. 152/// \returns A 128-bit vector of [4 x float] containing the products of both 153/// operands. 154static __inline__ __m128 __DEFAULT_FN_ATTRS 155_mm_mul_ps(__m128 __a, __m128 __b) 156{ 157 return (__m128)((__v4sf)__a * (__v4sf)__b); 158} 159 160/// Divides the value in the low-order 32 bits of the first operand by 161/// the corresponding value in the second operand. 162/// 163/// \headerfile <x86intrin.h> 164/// 165/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions. 166/// 167/// \param __a 168/// A 128-bit vector of [4 x float] containing the dividend. The lower 32 169/// bits of this operand are used in the calculation. 170/// \param __b 171/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits 172/// of this operand are used in the calculation. 173/// \returns A 128-bit vector of [4 x float] containing the quotients of the 174/// lower 32 bits of both operands. The upper 96 bits are copied from the 175/// upper 96 bits of the first source operand. 176static __inline__ __m128 __DEFAULT_FN_ATTRS 177_mm_div_ss(__m128 __a, __m128 __b) 178{ 179 __a[0] /= __b[0]; 180 return __a; 181} 182 183/// Divides two 128-bit vectors of [4 x float]. 184/// 185/// \headerfile <x86intrin.h> 186/// 187/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions. 
188/// 189/// \param __a 190/// A 128-bit vector of [4 x float] containing the dividend. 191/// \param __b 192/// A 128-bit vector of [4 x float] containing the divisor. 193/// \returns A 128-bit vector of [4 x float] containing the quotients of both 194/// operands. 195static __inline__ __m128 __DEFAULT_FN_ATTRS 196_mm_div_ps(__m128 __a, __m128 __b) 197{ 198 return (__m128)((__v4sf)__a / (__v4sf)__b); 199} 200 201/// Calculates the square root of the value stored in the low-order bits 202/// of a 128-bit vector of [4 x float]. 203/// 204/// \headerfile <x86intrin.h> 205/// 206/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions. 207/// 208/// \param __a 209/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 210/// used in the calculation. 211/// \returns A 128-bit vector of [4 x float] containing the square root of the 212/// value in the low-order bits of the operand. 213static __inline__ __m128 __DEFAULT_FN_ATTRS 214_mm_sqrt_ss(__m128 __a) 215{ 216 return (__m128)__builtin_ia32_sqrtss((__v4sf)__a); 217} 218 219/// Calculates the square roots of the values stored in a 128-bit vector 220/// of [4 x float]. 221/// 222/// \headerfile <x86intrin.h> 223/// 224/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions. 225/// 226/// \param __a 227/// A 128-bit vector of [4 x float]. 228/// \returns A 128-bit vector of [4 x float] containing the square roots of the 229/// values in the operand. 230static __inline__ __m128 __DEFAULT_FN_ATTRS 231_mm_sqrt_ps(__m128 __a) 232{ 233 return __builtin_ia32_sqrtps((__v4sf)__a); 234} 235 236/// Calculates the approximate reciprocal of the value stored in the 237/// low-order bits of a 128-bit vector of [4 x float]. 238/// 239/// \headerfile <x86intrin.h> 240/// 241/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions. 242/// 243/// \param __a 244/// A 128-bit vector of [4 x float]. 
The lower 32 bits of this operand are 245/// used in the calculation. 246/// \returns A 128-bit vector of [4 x float] containing the approximate 247/// reciprocal of the value in the low-order bits of the operand. 248static __inline__ __m128 __DEFAULT_FN_ATTRS 249_mm_rcp_ss(__m128 __a) 250{ 251 return (__m128)__builtin_ia32_rcpss((__v4sf)__a); 252} 253 254/// Calculates the approximate reciprocals of the values stored in a 255/// 128-bit vector of [4 x float]. 256/// 257/// \headerfile <x86intrin.h> 258/// 259/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions. 260/// 261/// \param __a 262/// A 128-bit vector of [4 x float]. 263/// \returns A 128-bit vector of [4 x float] containing the approximate 264/// reciprocals of the values in the operand. 265static __inline__ __m128 __DEFAULT_FN_ATTRS 266_mm_rcp_ps(__m128 __a) 267{ 268 return (__m128)__builtin_ia32_rcpps((__v4sf)__a); 269} 270 271/// Calculates the approximate reciprocal of the square root of the value 272/// stored in the low-order bits of a 128-bit vector of [4 x float]. 273/// 274/// \headerfile <x86intrin.h> 275/// 276/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions. 277/// 278/// \param __a 279/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 280/// used in the calculation. 281/// \returns A 128-bit vector of [4 x float] containing the approximate 282/// reciprocal of the square root of the value in the low-order bits of the 283/// operand. 284static __inline__ __m128 __DEFAULT_FN_ATTRS 285_mm_rsqrt_ss(__m128 __a) 286{ 287 return __builtin_ia32_rsqrtss((__v4sf)__a); 288} 289 290/// Calculates the approximate reciprocals of the square roots of the 291/// values stored in a 128-bit vector of [4 x float]. 292/// 293/// \headerfile <x86intrin.h> 294/// 295/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions. 296/// 297/// \param __a 298/// A 128-bit vector of [4 x float]. 
299/// \returns A 128-bit vector of [4 x float] containing the approximate 300/// reciprocals of the square roots of the values in the operand. 301static __inline__ __m128 __DEFAULT_FN_ATTRS 302_mm_rsqrt_ps(__m128 __a) 303{ 304 return __builtin_ia32_rsqrtps((__v4sf)__a); 305} 306 307/// Compares two 32-bit float values in the low-order bits of both 308/// operands and returns the lesser value in the low-order bits of the 309/// vector of [4 x float]. 310/// 311/// \headerfile <x86intrin.h> 312/// 313/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions. 314/// 315/// \param __a 316/// A 128-bit vector of [4 x float] containing one of the operands. The lower 317/// 32 bits of this operand are used in the comparison. 318/// \param __b 319/// A 128-bit vector of [4 x float] containing one of the operands. The lower 320/// 32 bits of this operand are used in the comparison. 321/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 322/// minimum value between both operands. The upper 96 bits are copied from 323/// the upper 96 bits of the first source operand. 324static __inline__ __m128 __DEFAULT_FN_ATTRS 325_mm_min_ss(__m128 __a, __m128 __b) 326{ 327 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b); 328} 329 330/// Compares two 128-bit vectors of [4 x float] and returns the lesser 331/// of each pair of values. 332/// 333/// \headerfile <x86intrin.h> 334/// 335/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions. 336/// 337/// \param __a 338/// A 128-bit vector of [4 x float] containing one of the operands. 339/// \param __b 340/// A 128-bit vector of [4 x float] containing one of the operands. 341/// \returns A 128-bit vector of [4 x float] containing the minimum values 342/// between both operands. 
343static __inline__ __m128 __DEFAULT_FN_ATTRS 344_mm_min_ps(__m128 __a, __m128 __b) 345{ 346 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b); 347} 348 349/// Compares two 32-bit float values in the low-order bits of both 350/// operands and returns the greater value in the low-order bits of a 128-bit 351/// vector of [4 x float]. 352/// 353/// \headerfile <x86intrin.h> 354/// 355/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions. 356/// 357/// \param __a 358/// A 128-bit vector of [4 x float] containing one of the operands. The lower 359/// 32 bits of this operand are used in the comparison. 360/// \param __b 361/// A 128-bit vector of [4 x float] containing one of the operands. The lower 362/// 32 bits of this operand are used in the comparison. 363/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 364/// maximum value between both operands. The upper 96 bits are copied from 365/// the upper 96 bits of the first source operand. 366static __inline__ __m128 __DEFAULT_FN_ATTRS 367_mm_max_ss(__m128 __a, __m128 __b) 368{ 369 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b); 370} 371 372/// Compares two 128-bit vectors of [4 x float] and returns the greater 373/// of each pair of values. 374/// 375/// \headerfile <x86intrin.h> 376/// 377/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions. 378/// 379/// \param __a 380/// A 128-bit vector of [4 x float] containing one of the operands. 381/// \param __b 382/// A 128-bit vector of [4 x float] containing one of the operands. 383/// \returns A 128-bit vector of [4 x float] containing the maximum values 384/// between both operands. 385static __inline__ __m128 __DEFAULT_FN_ATTRS 386_mm_max_ps(__m128 __a, __m128 __b) 387{ 388 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b); 389} 390 391/// Performs a bitwise AND of two 128-bit vectors of [4 x float]. 
392/// 393/// \headerfile <x86intrin.h> 394/// 395/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions. 396/// 397/// \param __a 398/// A 128-bit vector containing one of the source operands. 399/// \param __b 400/// A 128-bit vector containing one of the source operands. 401/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the 402/// values between both operands. 403static __inline__ __m128 __DEFAULT_FN_ATTRS 404_mm_and_ps(__m128 __a, __m128 __b) 405{ 406 return (__m128)((__v4su)__a & (__v4su)__b); 407} 408 409/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using 410/// the one's complement of the values contained in the first source 411/// operand. 412/// 413/// \headerfile <x86intrin.h> 414/// 415/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions. 416/// 417/// \param __a 418/// A 128-bit vector of [4 x float] containing the first source operand. The 419/// one's complement of this value is used in the bitwise AND. 420/// \param __b 421/// A 128-bit vector of [4 x float] containing the second source operand. 422/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the 423/// one's complement of the first operand and the values in the second 424/// operand. 425static __inline__ __m128 __DEFAULT_FN_ATTRS 426_mm_andnot_ps(__m128 __a, __m128 __b) 427{ 428 return (__m128)(~(__v4su)__a & (__v4su)__b); 429} 430 431/// Performs a bitwise OR of two 128-bit vectors of [4 x float]. 432/// 433/// \headerfile <x86intrin.h> 434/// 435/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions. 436/// 437/// \param __a 438/// A 128-bit vector of [4 x float] containing one of the source operands. 439/// \param __b 440/// A 128-bit vector of [4 x float] containing one of the source operands. 441/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the 442/// values between both operands. 
443static __inline__ __m128 __DEFAULT_FN_ATTRS 444_mm_or_ps(__m128 __a, __m128 __b) 445{ 446 return (__m128)((__v4su)__a | (__v4su)__b); 447} 448 449/// Performs a bitwise exclusive OR of two 128-bit vectors of 450/// [4 x float]. 451/// 452/// \headerfile <x86intrin.h> 453/// 454/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions. 455/// 456/// \param __a 457/// A 128-bit vector of [4 x float] containing one of the source operands. 458/// \param __b 459/// A 128-bit vector of [4 x float] containing one of the source operands. 460/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR 461/// of the values between both operands. 462static __inline__ __m128 __DEFAULT_FN_ATTRS 463_mm_xor_ps(__m128 __a, __m128 __b) 464{ 465 return (__m128)((__v4su)__a ^ (__v4su)__b); 466} 467 468/// Compares two 32-bit float values in the low-order bits of both 469/// operands for equality and returns the result of the comparison in the 470/// low-order bits of a vector [4 x float]. 471/// 472/// \headerfile <x86intrin.h> 473/// 474/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions. 475/// 476/// \param __a 477/// A 128-bit vector of [4 x float] containing one of the operands. The lower 478/// 32 bits of this operand are used in the comparison. 479/// \param __b 480/// A 128-bit vector of [4 x float] containing one of the operands. The lower 481/// 32 bits of this operand are used in the comparison. 482/// \returns A 128-bit vector of [4 x float] containing the comparison results 483/// in the low-order bits. 484static __inline__ __m128 __DEFAULT_FN_ATTRS 485_mm_cmpeq_ss(__m128 __a, __m128 __b) 486{ 487 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b); 488} 489 490/// Compares each of the corresponding 32-bit float values of the 491/// 128-bit vectors of [4 x float] for equality. 
492/// 493/// \headerfile <x86intrin.h> 494/// 495/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions. 496/// 497/// \param __a 498/// A 128-bit vector of [4 x float]. 499/// \param __b 500/// A 128-bit vector of [4 x float]. 501/// \returns A 128-bit vector of [4 x float] containing the comparison results. 502static __inline__ __m128 __DEFAULT_FN_ATTRS 503_mm_cmpeq_ps(__m128 __a, __m128 __b) 504{ 505 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b); 506} 507 508/// Compares two 32-bit float values in the low-order bits of both 509/// operands to determine if the value in the first operand is less than the 510/// corresponding value in the second operand and returns the result of the 511/// comparison in the low-order bits of a vector of [4 x float]. 512/// 513/// \headerfile <x86intrin.h> 514/// 515/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions. 516/// 517/// \param __a 518/// A 128-bit vector of [4 x float] containing one of the operands. The lower 519/// 32 bits of this operand are used in the comparison. 520/// \param __b 521/// A 128-bit vector of [4 x float] containing one of the operands. The lower 522/// 32 bits of this operand are used in the comparison. 523/// \returns A 128-bit vector of [4 x float] containing the comparison results 524/// in the low-order bits. 525static __inline__ __m128 __DEFAULT_FN_ATTRS 526_mm_cmplt_ss(__m128 __a, __m128 __b) 527{ 528 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b); 529} 530 531/// Compares each of the corresponding 32-bit float values of the 532/// 128-bit vectors of [4 x float] to determine if the values in the first 533/// operand are less than those in the second operand. 534/// 535/// \headerfile <x86intrin.h> 536/// 537/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions. 538/// 539/// \param __a 540/// A 128-bit vector of [4 x float]. 541/// \param __b 542/// A 128-bit vector of [4 x float]. 
543/// \returns A 128-bit vector of [4 x float] containing the comparison results. 544static __inline__ __m128 __DEFAULT_FN_ATTRS 545_mm_cmplt_ps(__m128 __a, __m128 __b) 546{ 547 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b); 548} 549 550/// Compares two 32-bit float values in the low-order bits of both 551/// operands to determine if the value in the first operand is less than or 552/// equal to the corresponding value in the second operand and returns the 553/// result of the comparison in the low-order bits of a vector of 554/// [4 x float]. 555/// 556/// \headerfile <x86intrin.h> 557/// 558/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions. 559/// 560/// \param __a 561/// A 128-bit vector of [4 x float] containing one of the operands. The lower 562/// 32 bits of this operand are used in the comparison. 563/// \param __b 564/// A 128-bit vector of [4 x float] containing one of the operands. The lower 565/// 32 bits of this operand are used in the comparison. 566/// \returns A 128-bit vector of [4 x float] containing the comparison results 567/// in the low-order bits. 568static __inline__ __m128 __DEFAULT_FN_ATTRS 569_mm_cmple_ss(__m128 __a, __m128 __b) 570{ 571 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b); 572} 573 574/// Compares each of the corresponding 32-bit float values of the 575/// 128-bit vectors of [4 x float] to determine if the values in the first 576/// operand are less than or equal to those in the second operand. 577/// 578/// \headerfile <x86intrin.h> 579/// 580/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions. 581/// 582/// \param __a 583/// A 128-bit vector of [4 x float]. 584/// \param __b 585/// A 128-bit vector of [4 x float]. 586/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
587static __inline__ __m128 __DEFAULT_FN_ATTRS 588_mm_cmple_ps(__m128 __a, __m128 __b) 589{ 590 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b); 591} 592 593/// Compares two 32-bit float values in the low-order bits of both 594/// operands to determine if the value in the first operand is greater than 595/// the corresponding value in the second operand and returns the result of 596/// the comparison in the low-order bits of a vector of [4 x float]. 597/// 598/// \headerfile <x86intrin.h> 599/// 600/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions. 601/// 602/// \param __a 603/// A 128-bit vector of [4 x float] containing one of the operands. The lower 604/// 32 bits of this operand are used in the comparison. 605/// \param __b 606/// A 128-bit vector of [4 x float] containing one of the operands. The lower 607/// 32 bits of this operand are used in the comparison. 608/// \returns A 128-bit vector of [4 x float] containing the comparison results 609/// in the low-order bits. 610static __inline__ __m128 __DEFAULT_FN_ATTRS 611_mm_cmpgt_ss(__m128 __a, __m128 __b) 612{ 613 return (__m128)__builtin_shufflevector((__v4sf)__a, 614 (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a), 615 4, 1, 2, 3); 616} 617 618/// Compares each of the corresponding 32-bit float values of the 619/// 128-bit vectors of [4 x float] to determine if the values in the first 620/// operand are greater than those in the second operand. 621/// 622/// \headerfile <x86intrin.h> 623/// 624/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions. 625/// 626/// \param __a 627/// A 128-bit vector of [4 x float]. 628/// \param __b 629/// A 128-bit vector of [4 x float]. 630/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
631static __inline__ __m128 __DEFAULT_FN_ATTRS 632_mm_cmpgt_ps(__m128 __a, __m128 __b) 633{ 634 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a); 635} 636 637/// Compares two 32-bit float values in the low-order bits of both 638/// operands to determine if the value in the first operand is greater than 639/// or equal to the corresponding value in the second operand and returns 640/// the result of the comparison in the low-order bits of a vector of 641/// [4 x float]. 642/// 643/// \headerfile <x86intrin.h> 644/// 645/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions. 646/// 647/// \param __a 648/// A 128-bit vector of [4 x float] containing one of the operands. The lower 649/// 32 bits of this operand are used in the comparison. 650/// \param __b 651/// A 128-bit vector of [4 x float] containing one of the operands. The lower 652/// 32 bits of this operand are used in the comparison. 653/// \returns A 128-bit vector of [4 x float] containing the comparison results 654/// in the low-order bits. 655static __inline__ __m128 __DEFAULT_FN_ATTRS 656_mm_cmpge_ss(__m128 __a, __m128 __b) 657{ 658 return (__m128)__builtin_shufflevector((__v4sf)__a, 659 (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a), 660 4, 1, 2, 3); 661} 662 663/// Compares each of the corresponding 32-bit float values of the 664/// 128-bit vectors of [4 x float] to determine if the values in the first 665/// operand are greater than or equal to those in the second operand. 666/// 667/// \headerfile <x86intrin.h> 668/// 669/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions. 670/// 671/// \param __a 672/// A 128-bit vector of [4 x float]. 673/// \param __b 674/// A 128-bit vector of [4 x float]. 675/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
676static __inline__ __m128 __DEFAULT_FN_ATTRS 677_mm_cmpge_ps(__m128 __a, __m128 __b) 678{ 679 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a); 680} 681 682/// Compares two 32-bit float values in the low-order bits of both 683/// operands for inequality and returns the result of the comparison in the 684/// low-order bits of a vector of [4 x float]. 685/// 686/// \headerfile <x86intrin.h> 687/// 688/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c> 689/// instructions. 690/// 691/// \param __a 692/// A 128-bit vector of [4 x float] containing one of the operands. The lower 693/// 32 bits of this operand are used in the comparison. 694/// \param __b 695/// A 128-bit vector of [4 x float] containing one of the operands. The lower 696/// 32 bits of this operand are used in the comparison. 697/// \returns A 128-bit vector of [4 x float] containing the comparison results 698/// in the low-order bits. 699static __inline__ __m128 __DEFAULT_FN_ATTRS 700_mm_cmpneq_ss(__m128 __a, __m128 __b) 701{ 702 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b); 703} 704 705/// Compares each of the corresponding 32-bit float values of the 706/// 128-bit vectors of [4 x float] for inequality. 707/// 708/// \headerfile <x86intrin.h> 709/// 710/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c> 711/// instructions. 712/// 713/// \param __a 714/// A 128-bit vector of [4 x float]. 715/// \param __b 716/// A 128-bit vector of [4 x float]. 717/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
718static __inline__ __m128 __DEFAULT_FN_ATTRS 719_mm_cmpneq_ps(__m128 __a, __m128 __b) 720{ 721 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b); 722} 723 724/// Compares two 32-bit float values in the low-order bits of both 725/// operands to determine if the value in the first operand is not less than 726/// the corresponding value in the second operand and returns the result of 727/// the comparison in the low-order bits of a vector of [4 x float]. 728/// 729/// \headerfile <x86intrin.h> 730/// 731/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c> 732/// instructions. 733/// 734/// \param __a 735/// A 128-bit vector of [4 x float] containing one of the operands. The lower 736/// 32 bits of this operand are used in the comparison. 737/// \param __b 738/// A 128-bit vector of [4 x float] containing one of the operands. The lower 739/// 32 bits of this operand are used in the comparison. 740/// \returns A 128-bit vector of [4 x float] containing the comparison results 741/// in the low-order bits. 742static __inline__ __m128 __DEFAULT_FN_ATTRS 743_mm_cmpnlt_ss(__m128 __a, __m128 __b) 744{ 745 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b); 746} 747 748/// Compares each of the corresponding 32-bit float values of the 749/// 128-bit vectors of [4 x float] to determine if the values in the first 750/// operand are not less than those in the second operand. 751/// 752/// \headerfile <x86intrin.h> 753/// 754/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c> 755/// instructions. 756/// 757/// \param __a 758/// A 128-bit vector of [4 x float]. 759/// \param __b 760/// A 128-bit vector of [4 x float]. 761/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
762static __inline__ __m128 __DEFAULT_FN_ATTRS 763_mm_cmpnlt_ps(__m128 __a, __m128 __b) 764{ 765 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b); 766} 767 768/// Compares two 32-bit float values in the low-order bits of both 769/// operands to determine if the value in the first operand is not less than 770/// or equal to the corresponding value in the second operand and returns 771/// the result of the comparison in the low-order bits of a vector of 772/// [4 x float]. 773/// 774/// \headerfile <x86intrin.h> 775/// 776/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c> 777/// instructions. 778/// 779/// \param __a 780/// A 128-bit vector of [4 x float] containing one of the operands. The lower 781/// 32 bits of this operand are used in the comparison. 782/// \param __b 783/// A 128-bit vector of [4 x float] containing one of the operands. The lower 784/// 32 bits of this operand are used in the comparison. 785/// \returns A 128-bit vector of [4 x float] containing the comparison results 786/// in the low-order bits. 787static __inline__ __m128 __DEFAULT_FN_ATTRS 788_mm_cmpnle_ss(__m128 __a, __m128 __b) 789{ 790 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b); 791} 792 793/// Compares each of the corresponding 32-bit float values of the 794/// 128-bit vectors of [4 x float] to determine if the values in the first 795/// operand are not less than or equal to those in the second operand. 796/// 797/// \headerfile <x86intrin.h> 798/// 799/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c> 800/// instructions. 801/// 802/// \param __a 803/// A 128-bit vector of [4 x float]. 804/// \param __b 805/// A 128-bit vector of [4 x float]. 806/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
807static __inline__ __m128 __DEFAULT_FN_ATTRS 808_mm_cmpnle_ps(__m128 __a, __m128 __b) 809{ 810 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b); 811} 812 813/// Compares two 32-bit float values in the low-order bits of both 814/// operands to determine if the value in the first operand is not greater 815/// than the corresponding value in the second operand and returns the 816/// result of the comparison in the low-order bits of a vector of 817/// [4 x float]. 818/// 819/// \headerfile <x86intrin.h> 820/// 821/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c> 822/// instructions. 823/// 824/// \param __a 825/// A 128-bit vector of [4 x float] containing one of the operands. The lower 826/// 32 bits of this operand are used in the comparison. 827/// \param __b 828/// A 128-bit vector of [4 x float] containing one of the operands. The lower 829/// 32 bits of this operand are used in the comparison. 830/// \returns A 128-bit vector of [4 x float] containing the comparison results 831/// in the low-order bits. 832static __inline__ __m128 __DEFAULT_FN_ATTRS 833_mm_cmpngt_ss(__m128 __a, __m128 __b) 834{ 835 return (__m128)__builtin_shufflevector((__v4sf)__a, 836 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a), 837 4, 1, 2, 3); 838} 839 840/// Compares each of the corresponding 32-bit float values of the 841/// 128-bit vectors of [4 x float] to determine if the values in the first 842/// operand are not greater than those in the second operand. 843/// 844/// \headerfile <x86intrin.h> 845/// 846/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c> 847/// instructions. 848/// 849/// \param __a 850/// A 128-bit vector of [4 x float]. 851/// \param __b 852/// A 128-bit vector of [4 x float]. 853/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
854static __inline__ __m128 __DEFAULT_FN_ATTRS 855_mm_cmpngt_ps(__m128 __a, __m128 __b) 856{ 857 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a); 858} 859 860/// Compares two 32-bit float values in the low-order bits of both 861/// operands to determine if the value in the first operand is not greater 862/// than or equal to the corresponding value in the second operand and 863/// returns the result of the comparison in the low-order bits of a vector 864/// of [4 x float]. 865/// 866/// \headerfile <x86intrin.h> 867/// 868/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c> 869/// instructions. 870/// 871/// \param __a 872/// A 128-bit vector of [4 x float] containing one of the operands. The lower 873/// 32 bits of this operand are used in the comparison. 874/// \param __b 875/// A 128-bit vector of [4 x float] containing one of the operands. The lower 876/// 32 bits of this operand are used in the comparison. 877/// \returns A 128-bit vector of [4 x float] containing the comparison results 878/// in the low-order bits. 879static __inline__ __m128 __DEFAULT_FN_ATTRS 880_mm_cmpnge_ss(__m128 __a, __m128 __b) 881{ 882 return (__m128)__builtin_shufflevector((__v4sf)__a, 883 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a), 884 4, 1, 2, 3); 885} 886 887/// Compares each of the corresponding 32-bit float values of the 888/// 128-bit vectors of [4 x float] to determine if the values in the first 889/// operand are not greater than or equal to those in the second operand. 890/// 891/// \headerfile <x86intrin.h> 892/// 893/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c> 894/// instructions. 895/// 896/// \param __a 897/// A 128-bit vector of [4 x float]. 898/// \param __b 899/// A 128-bit vector of [4 x float]. 900/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
901static __inline__ __m128 __DEFAULT_FN_ATTRS 902_mm_cmpnge_ps(__m128 __a, __m128 __b) 903{ 904 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a); 905} 906 907/// Compares two 32-bit float values in the low-order bits of both 908/// operands to determine if the value in the first operand is ordered with 909/// respect to the corresponding value in the second operand and returns the 910/// result of the comparison in the low-order bits of a vector of 911/// [4 x float]. 912/// 913/// \headerfile <x86intrin.h> 914/// 915/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c> 916/// instructions. 917/// 918/// \param __a 919/// A 128-bit vector of [4 x float] containing one of the operands. The lower 920/// 32 bits of this operand are used in the comparison. 921/// \param __b 922/// A 128-bit vector of [4 x float] containing one of the operands. The lower 923/// 32 bits of this operand are used in the comparison. 924/// \returns A 128-bit vector of [4 x float] containing the comparison results 925/// in the low-order bits. 926static __inline__ __m128 __DEFAULT_FN_ATTRS 927_mm_cmpord_ss(__m128 __a, __m128 __b) 928{ 929 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b); 930} 931 932/// Compares each of the corresponding 32-bit float values of the 933/// 128-bit vectors of [4 x float] to determine if the values in the first 934/// operand are ordered with respect to those in the second operand. 935/// 936/// \headerfile <x86intrin.h> 937/// 938/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c> 939/// instructions. 940/// 941/// \param __a 942/// A 128-bit vector of [4 x float]. 943/// \param __b 944/// A 128-bit vector of [4 x float]. 945/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
946static __inline__ __m128 __DEFAULT_FN_ATTRS 947_mm_cmpord_ps(__m128 __a, __m128 __b) 948{ 949 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b); 950} 951 952/// Compares two 32-bit float values in the low-order bits of both 953/// operands to determine if the value in the first operand is unordered 954/// with respect to the corresponding value in the second operand and 955/// returns the result of the comparison in the low-order bits of a vector 956/// of [4 x float]. 957/// 958/// \headerfile <x86intrin.h> 959/// 960/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c> 961/// instructions. 962/// 963/// \param __a 964/// A 128-bit vector of [4 x float] containing one of the operands. The lower 965/// 32 bits of this operand are used in the comparison. 966/// \param __b 967/// A 128-bit vector of [4 x float] containing one of the operands. The lower 968/// 32 bits of this operand are used in the comparison. 969/// \returns A 128-bit vector of [4 x float] containing the comparison results 970/// in the low-order bits. 971static __inline__ __m128 __DEFAULT_FN_ATTRS 972_mm_cmpunord_ss(__m128 __a, __m128 __b) 973{ 974 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b); 975} 976 977/// Compares each of the corresponding 32-bit float values of the 978/// 128-bit vectors of [4 x float] to determine if the values in the first 979/// operand are unordered with respect to those in the second operand. 980/// 981/// \headerfile <x86intrin.h> 982/// 983/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c> 984/// instructions. 985/// 986/// \param __a 987/// A 128-bit vector of [4 x float]. 988/// \param __b 989/// A 128-bit vector of [4 x float]. 990/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
991static __inline__ __m128 __DEFAULT_FN_ATTRS 992_mm_cmpunord_ps(__m128 __a, __m128 __b) 993{ 994 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b); 995} 996 997/// Compares two 32-bit float values in the low-order bits of both 998/// operands for equality and returns the result of the comparison. 999/// 1000/// If either of the two lower 32-bit values is NaN, 0 is returned. 1001/// 1002/// \headerfile <x86intrin.h> 1003/// 1004/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> 1005/// instructions. 1006/// 1007/// \param __a 1008/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1009/// used in the comparison. 1010/// \param __b 1011/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1012/// used in the comparison. 1013/// \returns An integer containing the comparison results. If either of the 1014/// two lower 32-bit values is NaN, 0 is returned. 1015static __inline__ int __DEFAULT_FN_ATTRS 1016_mm_comieq_ss(__m128 __a, __m128 __b) 1017{ 1018 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b); 1019} 1020 1021/// Compares two 32-bit float values in the low-order bits of both 1022/// operands to determine if the first operand is less than the second 1023/// operand and returns the result of the comparison. 1024/// 1025/// If either of the two lower 32-bit values is NaN, 0 is returned. 1026/// 1027/// \headerfile <x86intrin.h> 1028/// 1029/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> 1030/// instructions. 1031/// 1032/// \param __a 1033/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1034/// used in the comparison. 1035/// \param __b 1036/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1037/// used in the comparison. 1038/// \returns An integer containing the comparison results. If either of the two 1039/// lower 32-bit values is NaN, 0 is returned. 
1040static __inline__ int __DEFAULT_FN_ATTRS 1041_mm_comilt_ss(__m128 __a, __m128 __b) 1042{ 1043 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b); 1044} 1045 1046/// Compares two 32-bit float values in the low-order bits of both 1047/// operands to determine if the first operand is less than or equal to the 1048/// second operand and returns the result of the comparison. 1049/// 1050/// If either of the two lower 32-bit values is NaN, 0 is returned. 1051/// 1052/// \headerfile <x86intrin.h> 1053/// 1054/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. 1055/// 1056/// \param __a 1057/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1058/// used in the comparison. 1059/// \param __b 1060/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1061/// used in the comparison. 1062/// \returns An integer containing the comparison results. If either of the two 1063/// lower 32-bit values is NaN, 0 is returned. 1064static __inline__ int __DEFAULT_FN_ATTRS 1065_mm_comile_ss(__m128 __a, __m128 __b) 1066{ 1067 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b); 1068} 1069 1070/// Compares two 32-bit float values in the low-order bits of both 1071/// operands to determine if the first operand is greater than the second 1072/// operand and returns the result of the comparison. 1073/// 1074/// If either of the two lower 32-bit values is NaN, 0 is returned. 1075/// 1076/// \headerfile <x86intrin.h> 1077/// 1078/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. 1079/// 1080/// \param __a 1081/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1082/// used in the comparison. 1083/// \param __b 1084/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1085/// used in the comparison. 1086/// \returns An integer containing the comparison results. If either of the 1087/// two lower 32-bit values is NaN, 0 is returned. 
1088static __inline__ int __DEFAULT_FN_ATTRS 1089_mm_comigt_ss(__m128 __a, __m128 __b) 1090{ 1091 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b); 1092} 1093 1094/// Compares two 32-bit float values in the low-order bits of both 1095/// operands to determine if the first operand is greater than or equal to 1096/// the second operand and returns the result of the comparison. 1097/// 1098/// If either of the two lower 32-bit values is NaN, 0 is returned. 1099/// 1100/// \headerfile <x86intrin.h> 1101/// 1102/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. 1103/// 1104/// \param __a 1105/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1106/// used in the comparison. 1107/// \param __b 1108/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1109/// used in the comparison. 1110/// \returns An integer containing the comparison results. If either of the two 1111/// lower 32-bit values is NaN, 0 is returned. 1112static __inline__ int __DEFAULT_FN_ATTRS 1113_mm_comige_ss(__m128 __a, __m128 __b) 1114{ 1115 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b); 1116} 1117 1118/// Compares two 32-bit float values in the low-order bits of both 1119/// operands to determine if the first operand is not equal to the second 1120/// operand and returns the result of the comparison. 1121/// 1122/// If either of the two lower 32-bit values is NaN, 1 is returned. 1123/// 1124/// \headerfile <x86intrin.h> 1125/// 1126/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. 1127/// 1128/// \param __a 1129/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1130/// used in the comparison. 1131/// \param __b 1132/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1133/// used in the comparison. 1134/// \returns An integer containing the comparison results. If either of the 1135/// two lower 32-bit values is NaN, 1 is returned. 
1136static __inline__ int __DEFAULT_FN_ATTRS 1137_mm_comineq_ss(__m128 __a, __m128 __b) 1138{ 1139 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b); 1140} 1141 1142/// Performs an unordered comparison of two 32-bit float values using 1143/// the low-order bits of both operands to determine equality and returns 1144/// the result of the comparison. 1145/// 1146/// If either of the two lower 32-bit values is NaN, 0 is returned. 1147/// 1148/// \headerfile <x86intrin.h> 1149/// 1150/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1151/// 1152/// \param __a 1153/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1154/// used in the comparison. 1155/// \param __b 1156/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1157/// used in the comparison. 1158/// \returns An integer containing the comparison results. If either of the two 1159/// lower 32-bit values is NaN, 0 is returned. 1160static __inline__ int __DEFAULT_FN_ATTRS 1161_mm_ucomieq_ss(__m128 __a, __m128 __b) 1162{ 1163 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b); 1164} 1165 1166/// Performs an unordered comparison of two 32-bit float values using 1167/// the low-order bits of both operands to determine if the first operand is 1168/// less than the second operand and returns the result of the comparison. 1169/// 1170/// If either of the two lower 32-bit values is NaN, 0 is returned. 1171/// 1172/// \headerfile <x86intrin.h> 1173/// 1174/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1175/// 1176/// \param __a 1177/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1178/// used in the comparison. 1179/// \param __b 1180/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1181/// used in the comparison. 1182/// \returns An integer containing the comparison results. If either of the two 1183/// lower 32-bit values is NaN, 0 is returned. 
1184static __inline__ int __DEFAULT_FN_ATTRS 1185_mm_ucomilt_ss(__m128 __a, __m128 __b) 1186{ 1187 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b); 1188} 1189 1190/// Performs an unordered comparison of two 32-bit float values using 1191/// the low-order bits of both operands to determine if the first operand is 1192/// less than or equal to the second operand and returns the result of the 1193/// comparison. 1194/// 1195/// If either of the two lower 32-bit values is NaN, 0 is returned. 1196/// 1197/// \headerfile <x86intrin.h> 1198/// 1199/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1200/// 1201/// \param __a 1202/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1203/// used in the comparison. 1204/// \param __b 1205/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1206/// used in the comparison. 1207/// \returns An integer containing the comparison results. If either of the two 1208/// lower 32-bit values is NaN, 0 is returned. 1209static __inline__ int __DEFAULT_FN_ATTRS 1210_mm_ucomile_ss(__m128 __a, __m128 __b) 1211{ 1212 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b); 1213} 1214 1215/// Performs an unordered comparison of two 32-bit float values using 1216/// the low-order bits of both operands to determine if the first operand is 1217/// greater than the second operand and returns the result of the 1218/// comparison. 1219/// 1220/// If either of the two lower 32-bit values is NaN, 0 is returned. 1221/// 1222/// \headerfile <x86intrin.h> 1223/// 1224/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1225/// 1226/// \param __a 1227/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1228/// used in the comparison. 1229/// \param __b 1230/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1231/// used in the comparison. 1232/// \returns An integer containing the comparison results. 
If either of the two 1233/// lower 32-bit values is NaN, 0 is returned. 1234static __inline__ int __DEFAULT_FN_ATTRS 1235_mm_ucomigt_ss(__m128 __a, __m128 __b) 1236{ 1237 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b); 1238} 1239 1240/// Performs an unordered comparison of two 32-bit float values using 1241/// the low-order bits of both operands to determine if the first operand is 1242/// greater than or equal to the second operand and returns the result of 1243/// the comparison. 1244/// 1245/// If either of the two lower 32-bit values is NaN, 0 is returned. 1246/// 1247/// \headerfile <x86intrin.h> 1248/// 1249/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1250/// 1251/// \param __a 1252/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1253/// used in the comparison. 1254/// \param __b 1255/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1256/// used in the comparison. 1257/// \returns An integer containing the comparison results. If either of the two 1258/// lower 32-bit values is NaN, 0 is returned. 1259static __inline__ int __DEFAULT_FN_ATTRS 1260_mm_ucomige_ss(__m128 __a, __m128 __b) 1261{ 1262 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b); 1263} 1264 1265/// Performs an unordered comparison of two 32-bit float values using 1266/// the low-order bits of both operands to determine inequality and returns 1267/// the result of the comparison. 1268/// 1269/// If either of the two lower 32-bit values is NaN, 1 is returned. 1270/// 1271/// \headerfile <x86intrin.h> 1272/// 1273/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1274/// 1275/// \param __a 1276/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1277/// used in the comparison. 1278/// \param __b 1279/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1280/// used in the comparison. 
1281/// \returns An integer containing the comparison results. If either of the two 1282/// lower 32-bit values is NaN, 1 is returned. 1283static __inline__ int __DEFAULT_FN_ATTRS 1284_mm_ucomineq_ss(__m128 __a, __m128 __b) 1285{ 1286 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b); 1287} 1288 1289/// Converts a float value contained in the lower 32 bits of a vector of 1290/// [4 x float] into a 32-bit integer. 1291/// 1292/// \headerfile <x86intrin.h> 1293/// 1294/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c> 1295/// instructions. 1296/// 1297/// \param __a 1298/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1299/// used in the conversion. 1300/// \returns A 32-bit integer containing the converted value. 1301static __inline__ int __DEFAULT_FN_ATTRS 1302_mm_cvtss_si32(__m128 __a) 1303{ 1304 return __builtin_ia32_cvtss2si((__v4sf)__a); 1305} 1306 1307/// Converts a float value contained in the lower 32 bits of a vector of 1308/// [4 x float] into a 32-bit integer. 1309/// 1310/// \headerfile <x86intrin.h> 1311/// 1312/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c> 1313/// instructions. 1314/// 1315/// \param __a 1316/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1317/// used in the conversion. 1318/// \returns A 32-bit integer containing the converted value. 1319static __inline__ int __DEFAULT_FN_ATTRS 1320_mm_cvt_ss2si(__m128 __a) 1321{ 1322 return _mm_cvtss_si32(__a); 1323} 1324 1325#ifdef __x86_64__ 1326 1327/// Converts a float value contained in the lower 32 bits of a vector of 1328/// [4 x float] into a 64-bit integer. 1329/// 1330/// \headerfile <x86intrin.h> 1331/// 1332/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c> 1333/// instructions. 1334/// 1335/// \param __a 1336/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1337/// used in the conversion. 
1338/// \returns A 64-bit integer containing the converted value. 1339static __inline__ long long __DEFAULT_FN_ATTRS 1340_mm_cvtss_si64(__m128 __a) 1341{ 1342 return __builtin_ia32_cvtss2si64((__v4sf)__a); 1343} 1344 1345#endif 1346 1347/// Converts two low-order float values in a 128-bit vector of 1348/// [4 x float] into a 64-bit vector of [2 x i32]. 1349/// 1350/// \headerfile <x86intrin.h> 1351/// 1352/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction. 1353/// 1354/// \param __a 1355/// A 128-bit vector of [4 x float]. 1356/// \returns A 64-bit integer vector containing the converted values. 1357static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1358_mm_cvtps_pi32(__m128 __a) 1359{ 1360 return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a); 1361} 1362 1363/// Converts two low-order float values in a 128-bit vector of 1364/// [4 x float] into a 64-bit vector of [2 x i32]. 1365/// 1366/// \headerfile <x86intrin.h> 1367/// 1368/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction. 1369/// 1370/// \param __a 1371/// A 128-bit vector of [4 x float]. 1372/// \returns A 64-bit integer vector containing the converted values. 1373static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1374_mm_cvt_ps2pi(__m128 __a) 1375{ 1376 return _mm_cvtps_pi32(__a); 1377} 1378 1379/// Converts a float value contained in the lower 32 bits of a vector of 1380/// [4 x float] into a 32-bit integer, truncating the result when it is 1381/// inexact. 1382/// 1383/// \headerfile <x86intrin.h> 1384/// 1385/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c> 1386/// instructions. 1387/// 1388/// \param __a 1389/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1390/// used in the conversion. 1391/// \returns A 32-bit integer containing the converted value. 
1392static __inline__ int __DEFAULT_FN_ATTRS 1393_mm_cvttss_si32(__m128 __a) 1394{ 1395 return __builtin_ia32_cvttss2si((__v4sf)__a); 1396} 1397 1398/// Converts a float value contained in the lower 32 bits of a vector of 1399/// [4 x float] into a 32-bit integer, truncating the result when it is 1400/// inexact. 1401/// 1402/// \headerfile <x86intrin.h> 1403/// 1404/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c> 1405/// instructions. 1406/// 1407/// \param __a 1408/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1409/// used in the conversion. 1410/// \returns A 32-bit integer containing the converted value. 1411static __inline__ int __DEFAULT_FN_ATTRS 1412_mm_cvtt_ss2si(__m128 __a) 1413{ 1414 return _mm_cvttss_si32(__a); 1415} 1416 1417#ifdef __x86_64__ 1418/// Converts a float value contained in the lower 32 bits of a vector of 1419/// [4 x float] into a 64-bit integer, truncating the result when it is 1420/// inexact. 1421/// 1422/// \headerfile <x86intrin.h> 1423/// 1424/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c> 1425/// instructions. 1426/// 1427/// \param __a 1428/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1429/// used in the conversion. 1430/// \returns A 64-bit integer containing the converted value. 1431static __inline__ long long __DEFAULT_FN_ATTRS 1432_mm_cvttss_si64(__m128 __a) 1433{ 1434 return __builtin_ia32_cvttss2si64((__v4sf)__a); 1435} 1436#endif 1437 1438/// Converts two low-order float values in a 128-bit vector of 1439/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result 1440/// when it is inexact. 1441/// 1442/// \headerfile <x86intrin.h> 1443/// 1444/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c> 1445/// instructions. 1446/// 1447/// \param __a 1448/// A 128-bit vector of [4 x float]. 1449/// \returns A 64-bit integer vector containing the converted values. 
1450static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1451_mm_cvttps_pi32(__m128 __a) 1452{ 1453 return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a); 1454} 1455 1456/// Converts two low-order float values in a 128-bit vector of [4 x 1457/// float] into a 64-bit vector of [2 x i32], truncating the result when it 1458/// is inexact. 1459/// 1460/// \headerfile <x86intrin.h> 1461/// 1462/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction. 1463/// 1464/// \param __a 1465/// A 128-bit vector of [4 x float]. 1466/// \returns A 64-bit integer vector containing the converted values. 1467static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1468_mm_cvtt_ps2pi(__m128 __a) 1469{ 1470 return _mm_cvttps_pi32(__a); 1471} 1472 1473/// Converts a 32-bit signed integer value into a floating point value 1474/// and writes it to the lower 32 bits of the destination. The remaining 1475/// higher order elements of the destination vector are copied from the 1476/// corresponding elements in the first operand. 1477/// 1478/// \headerfile <x86intrin.h> 1479/// 1480/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction. 1481/// 1482/// \param __a 1483/// A 128-bit vector of [4 x float]. 1484/// \param __b 1485/// A 32-bit signed integer operand containing the value to be converted. 1486/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 1487/// converted value of the second operand. The upper 96 bits are copied from 1488/// the upper 96 bits of the first operand. 1489static __inline__ __m128 __DEFAULT_FN_ATTRS 1490_mm_cvtsi32_ss(__m128 __a, int __b) 1491{ 1492 __a[0] = __b; 1493 return __a; 1494} 1495 1496/// Converts a 32-bit signed integer value into a floating point value 1497/// and writes it to the lower 32 bits of the destination. The remaining 1498/// higher order elements of the destination are copied from the 1499/// corresponding elements in the first operand. 
1500/// 1501/// \headerfile <x86intrin.h> 1502/// 1503/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction. 1504/// 1505/// \param __a 1506/// A 128-bit vector of [4 x float]. 1507/// \param __b 1508/// A 32-bit signed integer operand containing the value to be converted. 1509/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 1510/// converted value of the second operand. The upper 96 bits are copied from 1511/// the upper 96 bits of the first operand. 1512static __inline__ __m128 __DEFAULT_FN_ATTRS 1513_mm_cvt_si2ss(__m128 __a, int __b) 1514{ 1515 return _mm_cvtsi32_ss(__a, __b); 1516} 1517 1518#ifdef __x86_64__ 1519 1520/// Converts a 64-bit signed integer value into a floating point value 1521/// and writes it to the lower 32 bits of the destination. The remaining 1522/// higher order elements of the destination are copied from the 1523/// corresponding elements in the first operand. 1524/// 1525/// \headerfile <x86intrin.h> 1526/// 1527/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction. 1528/// 1529/// \param __a 1530/// A 128-bit vector of [4 x float]. 1531/// \param __b 1532/// A 64-bit signed integer operand containing the value to be converted. 1533/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 1534/// converted value of the second operand. The upper 96 bits are copied from 1535/// the upper 96 bits of the first operand. 1536static __inline__ __m128 __DEFAULT_FN_ATTRS 1537_mm_cvtsi64_ss(__m128 __a, long long __b) 1538{ 1539 __a[0] = __b; 1540 return __a; 1541} 1542 1543#endif 1544 1545/// Converts two elements of a 64-bit vector of [2 x i32] into two 1546/// floating point values and writes them to the lower 64-bits of the 1547/// destination. The remaining higher order elements of the destination are 1548/// copied from the corresponding elements in the first operand. 
1549/// 1550/// \headerfile <x86intrin.h> 1551/// 1552/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction. 1553/// 1554/// \param __a 1555/// A 128-bit vector of [4 x float]. 1556/// \param __b 1557/// A 64-bit vector of [2 x i32]. The elements in this vector are converted 1558/// and written to the corresponding low-order elements in the destination. 1559/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1560/// converted value of the second operand. The upper 64 bits are copied from 1561/// the upper 64 bits of the first operand. 1562static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 1563_mm_cvtpi32_ps(__m128 __a, __m64 __b) 1564{ 1565 return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b); 1566} 1567 1568/// Converts two elements of a 64-bit vector of [2 x i32] into two 1569/// floating point values and writes them to the lower 64-bits of the 1570/// destination. The remaining higher order elements of the destination are 1571/// copied from the corresponding elements in the first operand. 1572/// 1573/// \headerfile <x86intrin.h> 1574/// 1575/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction. 1576/// 1577/// \param __a 1578/// A 128-bit vector of [4 x float]. 1579/// \param __b 1580/// A 64-bit vector of [2 x i32]. The elements in this vector are converted 1581/// and written to the corresponding low-order elements in the destination. 1582/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1583/// converted value from the second operand. The upper 64 bits are copied 1584/// from the upper 64 bits of the first operand. 1585static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 1586_mm_cvt_pi2ps(__m128 __a, __m64 __b) 1587{ 1588 return _mm_cvtpi32_ps(__a, __b); 1589} 1590 1591/// Extracts a float value contained in the lower 32 bits of a vector of 1592/// [4 x float]. 1593/// 1594/// \headerfile <x86intrin.h> 1595/// 1596/// This intrinsic has no corresponding instruction. 
1597/// 1598/// \param __a 1599/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1600/// used in the extraction. 1601/// \returns A 32-bit float containing the extracted value. 1602static __inline__ float __DEFAULT_FN_ATTRS 1603_mm_cvtss_f32(__m128 __a) 1604{ 1605 return __a[0]; 1606} 1607 1608/// Loads two packed float values from the address \a __p into the 1609/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits 1610/// are copied from the low-order bits of the first operand. 1611/// 1612/// \headerfile <x86intrin.h> 1613/// 1614/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 1615/// 1616/// \param __a 1617/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0] 1618/// of the destination. 1619/// \param __p 1620/// A pointer to two packed float values. Bits [63:0] are written to bits 1621/// [127:64] of the destination. 1622/// \returns A 128-bit vector of [4 x float] containing the moved values. 1623static __inline__ __m128 __DEFAULT_FN_ATTRS 1624_mm_loadh_pi(__m128 __a, const __m64 *__p) 1625{ 1626 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8))); 1627 struct __mm_loadh_pi_struct { 1628 __mm_loadh_pi_v2f32 __u; 1629 } __attribute__((__packed__, __may_alias__)); 1630 __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u; 1631 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 1632 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5); 1633} 1634 1635/// Loads two packed float values from the address \a __p into the 1636/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits 1637/// are copied from the high-order bits of the first operand. 1638/// 1639/// \headerfile <x86intrin.h> 1640/// 1641/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 1642/// 1643/// \param __a 1644/// A 128-bit vector of [4 x float]. 
Bits [127:64] are written to bits 1645/// [127:64] of the destination. 1646/// \param __p 1647/// A pointer to two packed float values. Bits [63:0] are written to bits 1648/// [63:0] of the destination. 1649/// \returns A 128-bit vector of [4 x float] containing the moved values. 1650static __inline__ __m128 __DEFAULT_FN_ATTRS 1651_mm_loadl_pi(__m128 __a, const __m64 *__p) 1652{ 1653 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8))); 1654 struct __mm_loadl_pi_struct { 1655 __mm_loadl_pi_v2f32 __u; 1656 } __attribute__((__packed__, __may_alias__)); 1657 __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u; 1658 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 1659 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3); 1660} 1661 1662/// Constructs a 128-bit floating-point vector of [4 x float]. The lower 1663/// 32 bits of the vector are initialized with the single-precision 1664/// floating-point value loaded from a specified memory location. The upper 1665/// 96 bits are set to zero. 1666/// 1667/// \headerfile <x86intrin.h> 1668/// 1669/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. 1670/// 1671/// \param __p 1672/// A pointer to a 32-bit memory location containing a single-precision 1673/// floating-point value. 1674/// \returns An initialized 128-bit floating-point vector of [4 x float]. The 1675/// lower 32 bits contain the value loaded from the memory location. The 1676/// upper 96 bits are set to zero. 1677static __inline__ __m128 __DEFAULT_FN_ATTRS 1678_mm_load_ss(const float *__p) 1679{ 1680 struct __mm_load_ss_struct { 1681 float __u; 1682 } __attribute__((__packed__, __may_alias__)); 1683 float __u = ((const struct __mm_load_ss_struct*)__p)->__u; 1684 return __extension__ (__m128){ __u, 0, 0, 0 }; 1685} 1686 1687/// Loads a 32-bit float value and duplicates it to all four vector 1688/// elements of a 128-bit vector of [4 x float]. 
1689/// 1690/// \headerfile <x86intrin.h> 1691/// 1692/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c> 1693/// instruction. 1694/// 1695/// \param __p 1696/// A pointer to a float value to be loaded and duplicated. 1697/// \returns A 128-bit vector of [4 x float] containing the loaded and 1698/// duplicated values. 1699static __inline__ __m128 __DEFAULT_FN_ATTRS 1700_mm_load1_ps(const float *__p) 1701{ 1702 struct __mm_load1_ps_struct { 1703 float __u; 1704 } __attribute__((__packed__, __may_alias__)); 1705 float __u = ((const struct __mm_load1_ps_struct*)__p)->__u; 1706 return __extension__ (__m128){ __u, __u, __u, __u }; 1707} 1708 1709#define _mm_load_ps1(p) _mm_load1_ps(p) 1710 1711/// Loads a 128-bit floating-point vector of [4 x float] from an aligned 1712/// memory location. 1713/// 1714/// \headerfile <x86intrin.h> 1715/// 1716/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. 1717/// 1718/// \param __p 1719/// A pointer to a 128-bit memory location. The address of the memory 1720/// location has to be 128-bit aligned. 1721/// \returns A 128-bit vector of [4 x float] containing the loaded values. 1722static __inline__ __m128 __DEFAULT_FN_ATTRS 1723_mm_load_ps(const float *__p) 1724{ 1725 return *(const __m128*)__p; 1726} 1727 1728/// Loads a 128-bit floating-point vector of [4 x float] from an 1729/// unaligned memory location. 1730/// 1731/// \headerfile <x86intrin.h> 1732/// 1733/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. 1734/// 1735/// \param __p 1736/// A pointer to a 128-bit memory location. The address of the memory 1737/// location does not have to be aligned. 1738/// \returns A 128-bit vector of [4 x float] containing the loaded values. 
1739static __inline__ __m128 __DEFAULT_FN_ATTRS 1740_mm_loadu_ps(const float *__p) 1741{ 1742 struct __loadu_ps { 1743 __m128_u __v; 1744 } __attribute__((__packed__, __may_alias__)); 1745 return ((const struct __loadu_ps*)__p)->__v; 1746} 1747 1748/// Loads four packed float values, in reverse order, from an aligned 1749/// memory location to 32-bit elements in a 128-bit vector of [4 x float]. 1750/// 1751/// \headerfile <x86intrin.h> 1752/// 1753/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c> 1754/// instruction. 1755/// 1756/// \param __p 1757/// A pointer to a 128-bit memory location. The address of the memory 1758/// location has to be 128-bit aligned. 1759/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded 1760/// in reverse order. 1761static __inline__ __m128 __DEFAULT_FN_ATTRS 1762_mm_loadr_ps(const float *__p) 1763{ 1764 __m128 __a = _mm_load_ps(__p); 1765 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); 1766} 1767 1768/// Create a 128-bit vector of [4 x float] with undefined values. 1769/// 1770/// \headerfile <x86intrin.h> 1771/// 1772/// This intrinsic has no corresponding instruction. 1773/// 1774/// \returns A 128-bit vector of [4 x float] containing undefined values. 1775static __inline__ __m128 __DEFAULT_FN_ATTRS 1776_mm_undefined_ps(void) 1777{ 1778 return (__m128)__builtin_ia32_undef128(); 1779} 1780 1781/// Constructs a 128-bit floating-point vector of [4 x float]. The lower 1782/// 32 bits of the vector are initialized with the specified single-precision 1783/// floating-point value. The upper 96 bits are set to zero. 1784/// 1785/// \headerfile <x86intrin.h> 1786/// 1787/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. 1788/// 1789/// \param __w 1790/// A single-precision floating-point value used to initialize the lower 32 1791/// bits of the result. 1792/// \returns An initialized 128-bit floating-point vector of [4 x float]. 
The 1793/// lower 32 bits contain the value provided in the source operand. The 1794/// upper 96 bits are set to zero. 1795static __inline__ __m128 __DEFAULT_FN_ATTRS 1796_mm_set_ss(float __w) 1797{ 1798 return __extension__ (__m128){ __w, 0, 0, 0 }; 1799} 1800 1801/// Constructs a 128-bit floating-point vector of [4 x float], with each 1802/// of the four single-precision floating-point vector elements set to the 1803/// specified single-precision floating-point value. 1804/// 1805/// \headerfile <x86intrin.h> 1806/// 1807/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction. 1808/// 1809/// \param __w 1810/// A single-precision floating-point value used to initialize each vector 1811/// element of the result. 1812/// \returns An initialized 128-bit floating-point vector of [4 x float]. 1813static __inline__ __m128 __DEFAULT_FN_ATTRS 1814_mm_set1_ps(float __w) 1815{ 1816 return __extension__ (__m128){ __w, __w, __w, __w }; 1817} 1818 1819/* Microsoft specific. */ 1820/// Constructs a 128-bit floating-point vector of [4 x float], with each 1821/// of the four single-precision floating-point vector elements set to the 1822/// specified single-precision floating-point value. 1823/// 1824/// \headerfile <x86intrin.h> 1825/// 1826/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction. 1827/// 1828/// \param __w 1829/// A single-precision floating-point value used to initialize each vector 1830/// element of the result. 1831/// \returns An initialized 128-bit floating-point vector of [4 x float]. 1832static __inline__ __m128 __DEFAULT_FN_ATTRS 1833_mm_set_ps1(float __w) 1834{ 1835 return _mm_set1_ps(__w); 1836} 1837 1838/// Constructs a 128-bit floating-point vector of [4 x float] 1839/// initialized with the specified single-precision floating-point values. 1840/// 1841/// \headerfile <x86intrin.h> 1842/// 1843/// This intrinsic is a utility function and does not correspond to a specific 1844/// instruction. 
1845/// 1846/// \param __z 1847/// A single-precision floating-point value used to initialize bits [127:96] 1848/// of the result. 1849/// \param __y 1850/// A single-precision floating-point value used to initialize bits [95:64] 1851/// of the result. 1852/// \param __x 1853/// A single-precision floating-point value used to initialize bits [63:32] 1854/// of the result. 1855/// \param __w 1856/// A single-precision floating-point value used to initialize bits [31:0] 1857/// of the result. 1858/// \returns An initialized 128-bit floating-point vector of [4 x float]. 1859static __inline__ __m128 __DEFAULT_FN_ATTRS 1860_mm_set_ps(float __z, float __y, float __x, float __w) 1861{ 1862 return __extension__ (__m128){ __w, __x, __y, __z }; 1863} 1864 1865/// Constructs a 128-bit floating-point vector of [4 x float], 1866/// initialized in reverse order with the specified 32-bit single-precision 1867/// float-point values. 1868/// 1869/// \headerfile <x86intrin.h> 1870/// 1871/// This intrinsic is a utility function and does not correspond to a specific 1872/// instruction. 1873/// 1874/// \param __z 1875/// A single-precision floating-point value used to initialize bits [31:0] 1876/// of the result. 1877/// \param __y 1878/// A single-precision floating-point value used to initialize bits [63:32] 1879/// of the result. 1880/// \param __x 1881/// A single-precision floating-point value used to initialize bits [95:64] 1882/// of the result. 1883/// \param __w 1884/// A single-precision floating-point value used to initialize bits [127:96] 1885/// of the result. 1886/// \returns An initialized 128-bit floating-point vector of [4 x float]. 1887static __inline__ __m128 __DEFAULT_FN_ATTRS 1888_mm_setr_ps(float __z, float __y, float __x, float __w) 1889{ 1890 return __extension__ (__m128){ __z, __y, __x, __w }; 1891} 1892 1893/// Constructs a 128-bit floating-point vector of [4 x float] initialized 1894/// to zero. 
1895/// 1896/// \headerfile <x86intrin.h> 1897/// 1898/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 1899/// 1900/// \returns An initialized 128-bit floating-point vector of [4 x float] with 1901/// all elements set to zero. 1902static __inline__ __m128 __DEFAULT_FN_ATTRS 1903_mm_setzero_ps(void) 1904{ 1905 return __extension__ (__m128){ 0, 0, 0, 0 }; 1906} 1907 1908/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a 1909/// memory location. 1910/// 1911/// \headerfile <x86intrin.h> 1912/// 1913/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction. 1914/// 1915/// \param __p 1916/// A pointer to a 64-bit memory location. 1917/// \param __a 1918/// A 128-bit vector of [4 x float] containing the values to be stored. 1919static __inline__ void __DEFAULT_FN_ATTRS 1920_mm_storeh_pi(__m64 *__p, __m128 __a) 1921{ 1922 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8))); 1923 struct __mm_storeh_pi_struct { 1924 __mm_storeh_pi_v2f32 __u; 1925 } __attribute__((__packed__, __may_alias__)); 1926 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3); 1927} 1928 1929/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a 1930/// memory location. 1931/// 1932/// \headerfile <x86intrin.h> 1933/// 1934/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction. 1935/// 1936/// \param __p 1937/// A pointer to a memory location that will receive the float values. 1938/// \param __a 1939/// A 128-bit vector of [4 x float] containing the values to be stored. 
1940static __inline__ void __DEFAULT_FN_ATTRS 1941_mm_storel_pi(__m64 *__p, __m128 __a) 1942{ 1943 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8))); 1944 struct __mm_storeh_pi_struct { 1945 __mm_storeh_pi_v2f32 __u; 1946 } __attribute__((__packed__, __may_alias__)); 1947 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1); 1948} 1949 1950/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a 1951/// memory location. 1952/// 1953/// \headerfile <x86intrin.h> 1954/// 1955/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. 1956/// 1957/// \param __p 1958/// A pointer to a 32-bit memory location. 1959/// \param __a 1960/// A 128-bit vector of [4 x float] containing the value to be stored. 1961static __inline__ void __DEFAULT_FN_ATTRS 1962_mm_store_ss(float *__p, __m128 __a) 1963{ 1964 struct __mm_store_ss_struct { 1965 float __u; 1966 } __attribute__((__packed__, __may_alias__)); 1967 ((struct __mm_store_ss_struct*)__p)->__u = __a[0]; 1968} 1969 1970/// Stores a 128-bit vector of [4 x float] to an unaligned memory 1971/// location. 1972/// 1973/// \headerfile <x86intrin.h> 1974/// 1975/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. 1976/// 1977/// \param __p 1978/// A pointer to a 128-bit memory location. The address of the memory 1979/// location does not have to be aligned. 1980/// \param __a 1981/// A 128-bit vector of [4 x float] containing the values to be stored. 1982static __inline__ void __DEFAULT_FN_ATTRS 1983_mm_storeu_ps(float *__p, __m128 __a) 1984{ 1985 struct __storeu_ps { 1986 __m128_u __v; 1987 } __attribute__((__packed__, __may_alias__)); 1988 ((struct __storeu_ps*)__p)->__v = __a; 1989} 1990 1991/// Stores a 128-bit vector of [4 x float] into an aligned memory 1992/// location. 1993/// 1994/// \headerfile <x86intrin.h> 1995/// 1996/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. 
1997/// 1998/// \param __p 1999/// A pointer to a 128-bit memory location. The address of the memory 2000/// location has to be 16-byte aligned. 2001/// \param __a 2002/// A 128-bit vector of [4 x float] containing the values to be stored. 2003static __inline__ void __DEFAULT_FN_ATTRS 2004_mm_store_ps(float *__p, __m128 __a) 2005{ 2006 *(__m128*)__p = __a; 2007} 2008 2009/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into 2010/// four contiguous elements in an aligned memory location. 2011/// 2012/// \headerfile <x86intrin.h> 2013/// 2014/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c> 2015/// instruction. 2016/// 2017/// \param __p 2018/// A pointer to a 128-bit memory location. 2019/// \param __a 2020/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each 2021/// of the four contiguous elements pointed by \a __p. 2022static __inline__ void __DEFAULT_FN_ATTRS 2023_mm_store1_ps(float *__p, __m128 __a) 2024{ 2025 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0); 2026 _mm_store_ps(__p, __a); 2027} 2028 2029/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into 2030/// four contiguous elements in an aligned memory location. 2031/// 2032/// \headerfile <x86intrin.h> 2033/// 2034/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c> 2035/// instruction. 2036/// 2037/// \param __p 2038/// A pointer to a 128-bit memory location. 2039/// \param __a 2040/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each 2041/// of the four contiguous elements pointed by \a __p. 2042static __inline__ void __DEFAULT_FN_ATTRS 2043_mm_store_ps1(float *__p, __m128 __a) 2044{ 2045 _mm_store1_ps(__p, __a); 2046} 2047 2048/// Stores float values from a 128-bit vector of [4 x float] to an 2049/// aligned memory location in reverse order. 
2050/// 2051/// \headerfile <x86intrin.h> 2052/// 2053/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c> 2054/// instruction. 2055/// 2056/// \param __p 2057/// A pointer to a 128-bit memory location. The address of the memory 2058/// location has to be 128-bit aligned. 2059/// \param __a 2060/// A 128-bit vector of [4 x float] containing the values to be stored. 2061static __inline__ void __DEFAULT_FN_ATTRS 2062_mm_storer_ps(float *__p, __m128 __a) 2063{ 2064 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); 2065 _mm_store_ps(__p, __a); 2066} 2067 2068#define _MM_HINT_ET0 7 2069#define _MM_HINT_ET1 6 2070#define _MM_HINT_T0 3 2071#define _MM_HINT_T1 2 2072#define _MM_HINT_T2 1 2073#define _MM_HINT_NTA 0 2074 2075#ifndef _MSC_VER 2076/* FIXME: We have to #define this because "sel" must be a constant integer, and 2077 Sema doesn't do any form of constant propagation yet. */ 2078 2079/// Loads one cache line of data from the specified address to a location 2080/// closer to the processor. 2081/// 2082/// \headerfile <x86intrin.h> 2083/// 2084/// \code 2085/// void _mm_prefetch(const void * a, const int sel); 2086/// \endcode 2087/// 2088/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction. 2089/// 2090/// \param a 2091/// A pointer to a memory location containing a cache line of data. 2092/// \param sel 2093/// A predefined integer constant specifying the type of prefetch 2094/// operation: \n 2095/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The 2096/// PREFETCHNTA instruction will be generated. \n 2097/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will 2098/// be generated. \n 2099/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will 2100/// be generated. \n 2101/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will 2102/// be generated. 
2103#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \ 2104 ((sel) >> 2) & 1, (sel) & 0x3)) 2105#endif 2106 2107/// Stores a 64-bit integer in the specified aligned memory location. To 2108/// minimize caching, the data is flagged as non-temporal (unlikely to be 2109/// used again soon). 2110/// 2111/// \headerfile <x86intrin.h> 2112/// 2113/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction. 2114/// 2115/// \param __p 2116/// A pointer to an aligned memory location used to store the register value. 2117/// \param __a 2118/// A 64-bit integer containing the value to be stored. 2119static __inline__ void __DEFAULT_FN_ATTRS_MMX 2120_mm_stream_pi(__m64 *__p, __m64 __a) 2121{ 2122 __builtin_ia32_movntq(__p, __a); 2123} 2124 2125/// Moves packed float values from a 128-bit vector of [4 x float] to a 2126/// 128-bit aligned memory location. To minimize caching, the data is flagged 2127/// as non-temporal (unlikely to be used again soon). 2128/// 2129/// \headerfile <x86intrin.h> 2130/// 2131/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 2132/// 2133/// \param __p 2134/// A pointer to a 128-bit aligned memory location that will receive the 2135/// single-precision floating-point values. 2136/// \param __a 2137/// A 128-bit vector of [4 x float] containing the values to be moved. 2138static __inline__ void __DEFAULT_FN_ATTRS 2139_mm_stream_ps(float *__p, __m128 __a) 2140{ 2141 __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p); 2142} 2143 2144#if defined(__cplusplus) 2145extern "C" { 2146#endif 2147 2148/// Forces strong memory ordering (serialization) between store 2149/// instructions preceding this instruction and store instructions following 2150/// this instruction, ensuring the system completes all previous stores 2151/// before executing subsequent stores. 2152/// 2153/// \headerfile <x86intrin.h> 2154/// 2155/// This intrinsic corresponds to the <c> SFENCE </c> instruction. 
2156/// 2157void _mm_sfence(void); 2158 2159#if defined(__cplusplus) 2160} // extern "C" 2161#endif 2162 2163/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and 2164/// returns it, as specified by the immediate integer operand. 2165/// 2166/// \headerfile <x86intrin.h> 2167/// 2168/// \code 2169/// int _mm_extract_pi16(__m64 a, int n); 2170/// \endcode 2171/// 2172/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction. 2173/// 2174/// \param a 2175/// A 64-bit vector of [4 x i16]. 2176/// \param n 2177/// An immediate integer operand that determines which bits are extracted: \n 2178/// 0: Bits [15:0] are copied to the destination. \n 2179/// 1: Bits [31:16] are copied to the destination. \n 2180/// 2: Bits [47:32] are copied to the destination. \n 2181/// 3: Bits [63:48] are copied to the destination. 2182/// \returns A 16-bit integer containing the extracted 16 bits of packed data. 2183#define _mm_extract_pi16(a, n) \ 2184 (int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n) 2185 2186/// Copies data from the 64-bit vector of [4 x i16] to the destination, 2187/// and inserts the lower 16-bits of an integer operand at the 16-bit offset 2188/// specified by the immediate operand \a n. 2189/// 2190/// \headerfile <x86intrin.h> 2191/// 2192/// \code 2193/// __m64 _mm_insert_pi16(__m64 a, int d, int n); 2194/// \endcode 2195/// 2196/// This intrinsic corresponds to the <c> PINSRW </c> instruction. 2197/// 2198/// \param a 2199/// A 64-bit vector of [4 x i16]. 2200/// \param d 2201/// An integer. The lower 16-bit value from this operand is written to the 2202/// destination at the offset specified by operand \a n. 2203/// \param n 2204/// An immediate integer operant that determines which the bits to be used 2205/// in the destination. \n 2206/// 0: Bits [15:0] are copied to the destination. \n 2207/// 1: Bits [31:16] are copied to the destination. \n 2208/// 2: Bits [47:32] are copied to the destination. 
\n 2209/// 3: Bits [63:48] are copied to the destination. \n 2210/// The remaining bits in the destination are copied from the corresponding 2211/// bits in operand \a a. 2212/// \returns A 64-bit integer vector containing the copied packed data from the 2213/// operands. 2214#define _mm_insert_pi16(a, d, n) \ 2215 (__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n) 2216 2217/// Compares each of the corresponding packed 16-bit integer values of 2218/// the 64-bit integer vectors, and writes the greater value to the 2219/// corresponding bits in the destination. 2220/// 2221/// \headerfile <x86intrin.h> 2222/// 2223/// This intrinsic corresponds to the <c> PMAXSW </c> instruction. 2224/// 2225/// \param __a 2226/// A 64-bit integer vector containing one of the source operands. 2227/// \param __b 2228/// A 64-bit integer vector containing one of the source operands. 2229/// \returns A 64-bit integer vector containing the comparison results. 2230static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2231_mm_max_pi16(__m64 __a, __m64 __b) 2232{ 2233 return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); 2234} 2235 2236/// Compares each of the corresponding packed 8-bit unsigned integer 2237/// values of the 64-bit integer vectors, and writes the greater value to the 2238/// corresponding bits in the destination. 2239/// 2240/// \headerfile <x86intrin.h> 2241/// 2242/// This intrinsic corresponds to the <c> PMAXUB </c> instruction. 2243/// 2244/// \param __a 2245/// A 64-bit integer vector containing one of the source operands. 2246/// \param __b 2247/// A 64-bit integer vector containing one of the source operands. 2248/// \returns A 64-bit integer vector containing the comparison results. 
2249static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2250_mm_max_pu8(__m64 __a, __m64 __b) 2251{ 2252 return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); 2253} 2254 2255/// Compares each of the corresponding packed 16-bit integer values of 2256/// the 64-bit integer vectors, and writes the lesser value to the 2257/// corresponding bits in the destination. 2258/// 2259/// \headerfile <x86intrin.h> 2260/// 2261/// This intrinsic corresponds to the <c> PMINSW </c> instruction. 2262/// 2263/// \param __a 2264/// A 64-bit integer vector containing one of the source operands. 2265/// \param __b 2266/// A 64-bit integer vector containing one of the source operands. 2267/// \returns A 64-bit integer vector containing the comparison results. 2268static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2269_mm_min_pi16(__m64 __a, __m64 __b) 2270{ 2271 return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); 2272} 2273 2274/// Compares each of the corresponding packed 8-bit unsigned integer 2275/// values of the 64-bit integer vectors, and writes the lesser value to the 2276/// corresponding bits in the destination. 2277/// 2278/// \headerfile <x86intrin.h> 2279/// 2280/// This intrinsic corresponds to the <c> PMINUB </c> instruction. 2281/// 2282/// \param __a 2283/// A 64-bit integer vector containing one of the source operands. 2284/// \param __b 2285/// A 64-bit integer vector containing one of the source operands. 2286/// \returns A 64-bit integer vector containing the comparison results. 2287static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2288_mm_min_pu8(__m64 __a, __m64 __b) 2289{ 2290 return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); 2291} 2292 2293/// Takes the most significant bit from each 8-bit element in a 64-bit 2294/// integer vector to create an 8-bit mask value. Zero-extends the value to 2295/// 32-bit integer and writes it to the destination. 
2296/// 2297/// \headerfile <x86intrin.h> 2298/// 2299/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction. 2300/// 2301/// \param __a 2302/// A 64-bit integer vector containing the values with bits to be extracted. 2303/// \returns The most significant bit from each 8-bit element in \a __a, 2304/// written to bits [7:0]. 2305static __inline__ int __DEFAULT_FN_ATTRS_MMX 2306_mm_movemask_pi8(__m64 __a) 2307{ 2308 return __builtin_ia32_pmovmskb((__v8qi)__a); 2309} 2310 2311/// Multiplies packed 16-bit unsigned integer values and writes the 2312/// high-order 16 bits of each 32-bit product to the corresponding bits in 2313/// the destination. 2314/// 2315/// \headerfile <x86intrin.h> 2316/// 2317/// This intrinsic corresponds to the <c> PMULHUW </c> instruction. 2318/// 2319/// \param __a 2320/// A 64-bit integer vector containing one of the source operands. 2321/// \param __b 2322/// A 64-bit integer vector containing one of the source operands. 2323/// \returns A 64-bit integer vector containing the products of both operands. 2324static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2325_mm_mulhi_pu16(__m64 __a, __m64 __b) 2326{ 2327 return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); 2328} 2329 2330/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the 2331/// destination, as specified by the immediate value operand. 2332/// 2333/// \headerfile <x86intrin.h> 2334/// 2335/// \code 2336/// __m64 _mm_shuffle_pi16(__m64 a, const int n); 2337/// \endcode 2338/// 2339/// This intrinsic corresponds to the <c> PSHUFW </c> instruction. 2340/// 2341/// \param a 2342/// A 64-bit integer vector containing the values to be shuffled. 2343/// \param n 2344/// An immediate value containing an 8-bit value specifying which elements to 2345/// copy from \a a. The destinations within the 64-bit destination are 2346/// assigned values as follows: \n 2347/// Bits [1:0] are used to assign values to bits [15:0] in the 2348/// destination. 
\n 2349/// Bits [3:2] are used to assign values to bits [31:16] in the 2350/// destination. \n 2351/// Bits [5:4] are used to assign values to bits [47:32] in the 2352/// destination. \n 2353/// Bits [7:6] are used to assign values to bits [63:48] in the 2354/// destination. \n 2355/// Bit value assignments: \n 2356/// 00: assigned from bits [15:0] of \a a. \n 2357/// 01: assigned from bits [31:16] of \a a. \n 2358/// 10: assigned from bits [47:32] of \a a. \n 2359/// 11: assigned from bits [63:48] of \a a. 2360/// \returns A 64-bit integer vector containing the shuffled values. 2361#define _mm_shuffle_pi16(a, n) \ 2362 (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)) 2363 2364/// Conditionally copies the values from each 8-bit element in the first 2365/// 64-bit integer vector operand to the specified memory location, as 2366/// specified by the most significant bit in the corresponding element in the 2367/// second 64-bit integer vector operand. 2368/// 2369/// To minimize caching, the data is flagged as non-temporal 2370/// (unlikely to be used again soon). 2371/// 2372/// \headerfile <x86intrin.h> 2373/// 2374/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction. 2375/// 2376/// \param __d 2377/// A 64-bit integer vector containing the values with elements to be copied. 2378/// \param __n 2379/// A 64-bit integer vector operand. The most significant bit from each 8-bit 2380/// element determines whether the corresponding element in operand \a __d 2381/// is copied. If the most significant bit of a given element is 1, the 2382/// corresponding element in operand \a __d is copied. 2383/// \param __p 2384/// A pointer to a 64-bit memory location that will receive the conditionally 2385/// copied integer values. The address of the memory location does not have 2386/// to be aligned. 
2387static __inline__ void __DEFAULT_FN_ATTRS_MMX 2388_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) 2389{ 2390 __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); 2391} 2392 2393/// Computes the rounded averages of the packed unsigned 8-bit integer 2394/// values and writes the averages to the corresponding bits in the 2395/// destination. 2396/// 2397/// \headerfile <x86intrin.h> 2398/// 2399/// This intrinsic corresponds to the <c> PAVGB </c> instruction. 2400/// 2401/// \param __a 2402/// A 64-bit integer vector containing one of the source operands. 2403/// \param __b 2404/// A 64-bit integer vector containing one of the source operands. 2405/// \returns A 64-bit integer vector containing the averages of both operands. 2406static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2407_mm_avg_pu8(__m64 __a, __m64 __b) 2408{ 2409 return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); 2410} 2411 2412/// Computes the rounded averages of the packed unsigned 16-bit integer 2413/// values and writes the averages to the corresponding bits in the 2414/// destination. 2415/// 2416/// \headerfile <x86intrin.h> 2417/// 2418/// This intrinsic corresponds to the <c> PAVGW </c> instruction. 2419/// 2420/// \param __a 2421/// A 64-bit integer vector containing one of the source operands. 2422/// \param __b 2423/// A 64-bit integer vector containing one of the source operands. 2424/// \returns A 64-bit integer vector containing the averages of both operands. 2425static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2426_mm_avg_pu16(__m64 __a, __m64 __b) 2427{ 2428 return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); 2429} 2430 2431/// Subtracts the corresponding 8-bit unsigned integer values of the two 2432/// 64-bit vector operands and computes the absolute value for each of the 2433/// difference. Then sum of the 8 absolute differences is written to the 2434/// bits [15:0] of the destination; the remaining bits [63:16] are cleared. 
2435/// 2436/// \headerfile <x86intrin.h> 2437/// 2438/// This intrinsic corresponds to the <c> PSADBW </c> instruction. 2439/// 2440/// \param __a 2441/// A 64-bit integer vector containing one of the source operands. 2442/// \param __b 2443/// A 64-bit integer vector containing one of the source operands. 2444/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the 2445/// sets of absolute differences between both operands. The upper bits are 2446/// cleared. 2447static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2448_mm_sad_pu8(__m64 __a, __m64 __b) 2449{ 2450 return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); 2451} 2452 2453#if defined(__cplusplus) 2454extern "C" { 2455#endif 2456 2457/// Returns the contents of the MXCSR register as a 32-bit unsigned 2458/// integer value. 2459/// 2460/// There are several groups of macros associated with this 2461/// intrinsic, including: 2462/// <ul> 2463/// <li> 2464/// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, 2465/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, 2466/// _MM_EXCEPT_INEXACT. There is a convenience wrapper 2467/// _MM_GET_EXCEPTION_STATE(). 2468/// </li> 2469/// <li> 2470/// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW, 2471/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT. 2472/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK(). 2473/// </li> 2474/// <li> 2475/// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, 2476/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper 2477/// _MM_GET_ROUNDING_MODE(). 2478/// </li> 2479/// <li> 2480/// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. 2481/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE(). 2482/// </li> 2483/// <li> 2484/// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON, 2485/// _MM_DENORMALS_ZERO_OFF. 
There is a convenience wrapper 2486/// _MM_GET_DENORMALS_ZERO_MODE(). 2487/// </li> 2488/// </ul> 2489/// 2490/// For example, the following expression checks if an overflow exception has 2491/// occurred: 2492/// \code 2493/// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW ) 2494/// \endcode 2495/// 2496/// The following expression gets the current rounding mode: 2497/// \code 2498/// _MM_GET_ROUNDING_MODE() 2499/// \endcode 2500/// 2501/// \headerfile <x86intrin.h> 2502/// 2503/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction. 2504/// 2505/// \returns A 32-bit unsigned integer containing the contents of the MXCSR 2506/// register. 2507unsigned int _mm_getcsr(void); 2508 2509/// Sets the MXCSR register with the 32-bit unsigned integer value. 2510/// 2511/// There are several groups of macros associated with this intrinsic, 2512/// including: 2513/// <ul> 2514/// <li> 2515/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, 2516/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, 2517/// _MM_EXCEPT_INEXACT. There is a convenience wrapper 2518/// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros. 2519/// </li> 2520/// <li> 2521/// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW, 2522/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT. 2523/// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one 2524/// of these macros. 2525/// </li> 2526/// <li> 2527/// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, 2528/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper 2529/// _MM_SET_ROUNDING_MODE(x) where x is one of these macros. 2530/// </li> 2531/// <li> 2532/// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. 2533/// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is 2534/// one of these macros. 
2535/// </li> 2536/// <li> 2537/// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON, 2538/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper 2539/// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros. 2540/// </li> 2541/// </ul> 2542/// 2543/// For example, the following expression causes subsequent floating-point 2544/// operations to round up: 2545/// _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP) 2546/// 2547/// The following example sets the DAZ and FTZ flags: 2548/// \code 2549/// void setFlags() { 2550/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); 2551/// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); 2552/// } 2553/// \endcode 2554/// 2555/// \headerfile <x86intrin.h> 2556/// 2557/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction. 2558/// 2559/// \param __i 2560/// A 32-bit unsigned integer value to be written to the MXCSR register. 2561void _mm_setcsr(unsigned int __i); 2562 2563#if defined(__cplusplus) 2564} // extern "C" 2565#endif 2566 2567/// Selects 4 float values from the 128-bit operands of [4 x float], as 2568/// specified by the immediate value operand. 2569/// 2570/// \headerfile <x86intrin.h> 2571/// 2572/// \code 2573/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask); 2574/// \endcode 2575/// 2576/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction. 2577/// 2578/// \param a 2579/// A 128-bit vector of [4 x float]. 2580/// \param b 2581/// A 128-bit vector of [4 x float]. 2582/// \param mask 2583/// An immediate value containing an 8-bit value specifying which elements to 2584/// copy from \a a and \a b. \n 2585/// Bits [3:0] specify the values copied from operand \a a. \n 2586/// Bits [7:4] specify the values copied from operand \a b. \n 2587/// The destinations within the 128-bit destination are assigned values as 2588/// follows: \n 2589/// Bits [1:0] are used to assign values to bits [31:0] in the 2590/// destination. 
\n 2591/// Bits [3:2] are used to assign values to bits [63:32] in the 2592/// destination. \n 2593/// Bits [5:4] are used to assign values to bits [95:64] in the 2594/// destination. \n 2595/// Bits [7:6] are used to assign values to bits [127:96] in the 2596/// destination. \n 2597/// Bit value assignments: \n 2598/// 00: Bits [31:0] copied from the specified operand. \n 2599/// 01: Bits [63:32] copied from the specified operand. \n 2600/// 10: Bits [95:64] copied from the specified operand. \n 2601/// 11: Bits [127:96] copied from the specified operand. 2602/// \returns A 128-bit vector of [4 x float] containing the shuffled values. 2603#define _mm_shuffle_ps(a, b, mask) \ 2604 (__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \ 2605 (int)(mask)) 2606 2607/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of 2608/// [4 x float] and interleaves them into a 128-bit vector of [4 x float]. 2609/// 2610/// \headerfile <x86intrin.h> 2611/// 2612/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction. 2613/// 2614/// \param __a 2615/// A 128-bit vector of [4 x float]. \n 2616/// Bits [95:64] are written to bits [31:0] of the destination. \n 2617/// Bits [127:96] are written to bits [95:64] of the destination. 2618/// \param __b 2619/// A 128-bit vector of [4 x float]. 2620/// Bits [95:64] are written to bits [63:32] of the destination. \n 2621/// Bits [127:96] are written to bits [127:96] of the destination. 2622/// \returns A 128-bit vector of [4 x float] containing the interleaved values. 2623static __inline__ __m128 __DEFAULT_FN_ATTRS 2624_mm_unpackhi_ps(__m128 __a, __m128 __b) 2625{ 2626 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7); 2627} 2628 2629/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of 2630/// [4 x float] and interleaves them into a 128-bit vector of [4 x float]. 
2631/// 2632/// \headerfile <x86intrin.h> 2633/// 2634/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction. 2635/// 2636/// \param __a 2637/// A 128-bit vector of [4 x float]. \n 2638/// Bits [31:0] are written to bits [31:0] of the destination. \n 2639/// Bits [63:32] are written to bits [95:64] of the destination. 2640/// \param __b 2641/// A 128-bit vector of [4 x float]. \n 2642/// Bits [31:0] are written to bits [63:32] of the destination. \n 2643/// Bits [63:32] are written to bits [127:96] of the destination. 2644/// \returns A 128-bit vector of [4 x float] containing the interleaved values. 2645static __inline__ __m128 __DEFAULT_FN_ATTRS 2646_mm_unpacklo_ps(__m128 __a, __m128 __b) 2647{ 2648 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5); 2649} 2650 2651/// Constructs a 128-bit floating-point vector of [4 x float]. The lower 2652/// 32 bits are set to the lower 32 bits of the second parameter. The upper 2653/// 96 bits are set to the upper 96 bits of the first parameter. 2654/// 2655/// \headerfile <x86intrin.h> 2656/// 2657/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c> 2658/// instruction. 2659/// 2660/// \param __a 2661/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are 2662/// written to the upper 96 bits of the result. 2663/// \param __b 2664/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are 2665/// written to the lower 32 bits of the result. 2666/// \returns A 128-bit floating-point vector of [4 x float]. 2667static __inline__ __m128 __DEFAULT_FN_ATTRS 2668_mm_move_ss(__m128 __a, __m128 __b) 2669{ 2670 __a[0] = __b[0]; 2671 return __a; 2672} 2673 2674/// Constructs a 128-bit floating-point vector of [4 x float]. The lower 2675/// 64 bits are set to the upper 64 bits of the second parameter. The upper 2676/// 64 bits are set to the upper 64 bits of the first parameter. 
2677/// 2678/// \headerfile <x86intrin.h> 2679/// 2680/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction. 2681/// 2682/// \param __a 2683/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are 2684/// written to the upper 64 bits of the result. 2685/// \param __b 2686/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are 2687/// written to the lower 64 bits of the result. 2688/// \returns A 128-bit floating-point vector of [4 x float]. 2689static __inline__ __m128 __DEFAULT_FN_ATTRS 2690_mm_movehl_ps(__m128 __a, __m128 __b) 2691{ 2692 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3); 2693} 2694 2695/// Constructs a 128-bit floating-point vector of [4 x float]. The lower 2696/// 64 bits are set to the lower 64 bits of the first parameter. The upper 2697/// 64 bits are set to the lower 64 bits of the second parameter. 2698/// 2699/// \headerfile <x86intrin.h> 2700/// 2701/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 2702/// 2703/// \param __a 2704/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are 2705/// written to the lower 64 bits of the result. 2706/// \param __b 2707/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are 2708/// written to the upper 64 bits of the result. 2709/// \returns A 128-bit floating-point vector of [4 x float]. 2710static __inline__ __m128 __DEFAULT_FN_ATTRS 2711_mm_movelh_ps(__m128 __a, __m128 __b) 2712{ 2713 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5); 2714} 2715 2716/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x 2717/// float]. 2718/// 2719/// \headerfile <x86intrin.h> 2720/// 2721/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2722/// 2723/// \param __a 2724/// A 64-bit vector of [4 x i16]. The elements of the destination are copied 2725/// from the corresponding elements in this operand. 
2726/// \returns A 128-bit vector of [4 x float] containing the copied and converted 2727/// values from the operand. 2728static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2729_mm_cvtpi16_ps(__m64 __a) 2730{ 2731 __m64 __b, __c; 2732 __m128 __r; 2733 2734 __b = _mm_setzero_si64(); 2735 __b = _mm_cmpgt_pi16(__b, __a); 2736 __c = _mm_unpackhi_pi16(__a, __b); 2737 __r = _mm_setzero_ps(); 2738 __r = _mm_cvtpi32_ps(__r, __c); 2739 __r = _mm_movelh_ps(__r, __r); 2740 __c = _mm_unpacklo_pi16(__a, __b); 2741 __r = _mm_cvtpi32_ps(__r, __c); 2742 2743 return __r; 2744} 2745 2746/// Converts a 64-bit vector of 16-bit unsigned integer values into a 2747/// 128-bit vector of [4 x float]. 2748/// 2749/// \headerfile <x86intrin.h> 2750/// 2751/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2752/// 2753/// \param __a 2754/// A 64-bit vector of 16-bit unsigned integer values. The elements of the 2755/// destination are copied from the corresponding elements in this operand. 2756/// \returns A 128-bit vector of [4 x float] containing the copied and converted 2757/// values from the operand. 2758static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2759_mm_cvtpu16_ps(__m64 __a) 2760{ 2761 __m64 __b, __c; 2762 __m128 __r; 2763 2764 __b = _mm_setzero_si64(); 2765 __c = _mm_unpackhi_pi16(__a, __b); 2766 __r = _mm_setzero_ps(); 2767 __r = _mm_cvtpi32_ps(__r, __c); 2768 __r = _mm_movelh_ps(__r, __r); 2769 __c = _mm_unpacklo_pi16(__a, __b); 2770 __r = _mm_cvtpi32_ps(__r, __c); 2771 2772 return __r; 2773} 2774 2775/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] 2776/// into a 128-bit vector of [4 x float]. 2777/// 2778/// \headerfile <x86intrin.h> 2779/// 2780/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2781/// 2782/// \param __a 2783/// A 64-bit vector of [8 x i8]. The elements of the destination are copied 2784/// from the corresponding lower 4 elements in this operand. 
2785/// \returns A 128-bit vector of [4 x float] containing the copied and converted 2786/// values from the operand. 2787static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2788_mm_cvtpi8_ps(__m64 __a) 2789{ 2790 __m64 __b; 2791 2792 __b = _mm_setzero_si64(); 2793 __b = _mm_cmpgt_pi8(__b, __a); 2794 __b = _mm_unpacklo_pi8(__a, __b); 2795 2796 return _mm_cvtpi16_ps(__b); 2797} 2798 2799/// Converts the lower four unsigned 8-bit integer values from a 64-bit 2800/// vector of [8 x u8] into a 128-bit vector of [4 x float]. 2801/// 2802/// \headerfile <x86intrin.h> 2803/// 2804/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2805/// 2806/// \param __a 2807/// A 64-bit vector of unsigned 8-bit integer values. The elements of the 2808/// destination are copied from the corresponding lower 4 elements in this 2809/// operand. 2810/// \returns A 128-bit vector of [4 x float] containing the copied and converted 2811/// values from the source operand. 2812static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2813_mm_cvtpu8_ps(__m64 __a) 2814{ 2815 __m64 __b; 2816 2817 __b = _mm_setzero_si64(); 2818 __b = _mm_unpacklo_pi8(__a, __b); 2819 2820 return _mm_cvtpi16_ps(__b); 2821} 2822 2823/// Converts the two 32-bit signed integer values from each 64-bit vector 2824/// operand of [2 x i32] into a 128-bit vector of [4 x float]. 2825/// 2826/// \headerfile <x86intrin.h> 2827/// 2828/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2829/// 2830/// \param __a 2831/// A 64-bit vector of [2 x i32]. The lower elements of the destination are 2832/// copied from the elements in this operand. 2833/// \param __b 2834/// A 64-bit vector of [2 x i32]. The upper elements of the destination are 2835/// copied from the elements in this operand. 2836/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 2837/// copied and converted values from the first operand. 
The upper 64 bits 2838/// contain the copied and converted values from the second operand. 2839static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2840_mm_cvtpi32x2_ps(__m64 __a, __m64 __b) 2841{ 2842 __m128 __c; 2843 2844 __c = _mm_setzero_ps(); 2845 __c = _mm_cvtpi32_ps(__c, __b); 2846 __c = _mm_movelh_ps(__c, __c); 2847 2848 return _mm_cvtpi32_ps(__c, __a); 2849} 2850 2851/// Converts each single-precision floating-point element of a 128-bit 2852/// floating-point vector of [4 x float] into a 16-bit signed integer, and 2853/// packs the results into a 64-bit integer vector of [4 x i16]. 2854/// 2855/// If the floating-point element is NaN or infinity, or if the 2856/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000, 2857/// it is converted to 0x8000. Otherwise if the floating-point element is 2858/// greater than 0x7FFF, it is converted to 0x7FFF. 2859/// 2860/// \headerfile <x86intrin.h> 2861/// 2862/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction. 2863/// 2864/// \param __a 2865/// A 128-bit floating-point vector of [4 x float]. 2866/// \returns A 64-bit integer vector of [4 x i16] containing the converted 2867/// values. 2868static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2869_mm_cvtps_pi16(__m128 __a) 2870{ 2871 __m64 __b, __c; 2872 2873 __b = _mm_cvtps_pi32(__a); 2874 __a = _mm_movehl_ps(__a, __a); 2875 __c = _mm_cvtps_pi32(__a); 2876 2877 return _mm_packs_pi32(__b, __c); 2878} 2879 2880/// Converts each single-precision floating-point element of a 128-bit 2881/// floating-point vector of [4 x float] into an 8-bit signed integer, and 2882/// packs the results into the lower 32 bits of a 64-bit integer vector of 2883/// [8 x i8]. The upper 32 bits of the vector are set to 0. 2884/// 2885/// If the floating-point element is NaN or infinity, or if the 2886/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it 2887/// is converted to 0x80. 
Otherwise if the floating-point element is greater 2888/// than 0x7F, it is converted to 0x7F. 2889/// 2890/// \headerfile <x86intrin.h> 2891/// 2892/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction. 2893/// 2894/// \param __a 2895/// 128-bit floating-point vector of [4 x float]. 2896/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the 2897/// converted values and the uppper 32 bits are set to zero. 2898static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2899_mm_cvtps_pi8(__m128 __a) 2900{ 2901 __m64 __b, __c; 2902 2903 __b = _mm_cvtps_pi16(__a); 2904 __c = _mm_setzero_si64(); 2905 2906 return _mm_packs_pi16(__b, __c); 2907} 2908 2909/// Extracts the sign bits from each single-precision floating-point 2910/// element of a 128-bit floating-point vector of [4 x float] and returns the 2911/// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set 2912/// to zero. 2913/// 2914/// \headerfile <x86intrin.h> 2915/// 2916/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction. 2917/// 2918/// \param __a 2919/// A 128-bit floating-point vector of [4 x float]. 2920/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each 2921/// single-precision floating-point element of the parameter. Bits [31:4] are 2922/// set to zero. 
2923static __inline__ int __DEFAULT_FN_ATTRS 2924_mm_movemask_ps(__m128 __a) 2925{ 2926 return __builtin_ia32_movmskps((__v4sf)__a); 2927} 2928 2929 2930#define _MM_ALIGN16 __attribute__((aligned(16))) 2931 2932#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 2933 2934#define _MM_EXCEPT_INVALID (0x0001) 2935#define _MM_EXCEPT_DENORM (0x0002) 2936#define _MM_EXCEPT_DIV_ZERO (0x0004) 2937#define _MM_EXCEPT_OVERFLOW (0x0008) 2938#define _MM_EXCEPT_UNDERFLOW (0x0010) 2939#define _MM_EXCEPT_INEXACT (0x0020) 2940#define _MM_EXCEPT_MASK (0x003f) 2941 2942#define _MM_MASK_INVALID (0x0080) 2943#define _MM_MASK_DENORM (0x0100) 2944#define _MM_MASK_DIV_ZERO (0x0200) 2945#define _MM_MASK_OVERFLOW (0x0400) 2946#define _MM_MASK_UNDERFLOW (0x0800) 2947#define _MM_MASK_INEXACT (0x1000) 2948#define _MM_MASK_MASK (0x1f80) 2949 2950#define _MM_ROUND_NEAREST (0x0000) 2951#define _MM_ROUND_DOWN (0x2000) 2952#define _MM_ROUND_UP (0x4000) 2953#define _MM_ROUND_TOWARD_ZERO (0x6000) 2954#define _MM_ROUND_MASK (0x6000) 2955 2956#define _MM_FLUSH_ZERO_MASK (0x8000) 2957#define _MM_FLUSH_ZERO_ON (0x8000) 2958#define _MM_FLUSH_ZERO_OFF (0x0000) 2959 2960#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 2961#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 2962#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 2963#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 2964 2965#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 2966#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 2967#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 2968#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 2969 2970#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 2971do { \ 2972 __m128 tmp3, tmp2, tmp1, tmp0; \ 2973 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 2974 tmp2 = 
_mm_unpacklo_ps((row2), (row3)); \ 2975 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 2976 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 2977 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 2978 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 2979 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 2980 (row3) = _mm_movehl_ps(tmp3, tmp1); \ 2981} while (0) 2982 2983/* Aliases for compatibility. */ 2984#define _m_pextrw _mm_extract_pi16 2985#define _m_pinsrw _mm_insert_pi16 2986#define _m_pmaxsw _mm_max_pi16 2987#define _m_pmaxub _mm_max_pu8 2988#define _m_pminsw _mm_min_pi16 2989#define _m_pminub _mm_min_pu8 2990#define _m_pmovmskb _mm_movemask_pi8 2991#define _m_pmulhuw _mm_mulhi_pu16 2992#define _m_pshufw _mm_shuffle_pi16 2993#define _m_maskmovq _mm_maskmove_si64 2994#define _m_pavgb _mm_avg_pu8 2995#define _m_pavgw _mm_avg_pu16 2996#define _m_psadbw _mm_sad_pu8 2997#define _m_ _mm_ 2998#define _m_ _mm_ 2999 3000#undef __DEFAULT_FN_ATTRS 3001#undef __DEFAULT_FN_ATTRS_MMX 3002 3003/* Ugly hack for backwards-compatibility (compatible with gcc) */ 3004#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics) 3005#include <emmintrin.h> 3006#endif 3007 3008#endif /* __XMMINTRIN_H */ 3009