/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __EMMINTRIN_H
#define __EMMINTRIN_H

#include <xmmintrin.h>

/* 16-byte vectors of double / long long, declared with 16-byte alignment. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

/* Same element types and vector size as above, but declared with an
 * alignment of 1. */
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Type defines. */
/* 16-byte vector types without an explicit alignment attribute; the function
 * bodies in this file cast their arguments to these before operating on
 * them. */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));

/* Define the default attributes for the functions in this file. */
/* __DEFAULT_FN_ATTRS targets "sse2" with a minimum vector width of 128;
 * __DEFAULT_FN_ATTRS_MMX targets "mmx,sse2" with a minimum vector width of 64.
 * Both also apply __always_inline__ and __nodebug__. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64)))

/// Adds lower double-precision values in both operands and returns the
///    sum in the lower 64 bits of the result.
The upper 64 bits of the result 42/// are copied from the upper double-precision value of the first operand. 43/// 44/// \headerfile <x86intrin.h> 45/// 46/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction. 47/// 48/// \param __a 49/// A 128-bit vector of [2 x double] containing one of the source operands. 50/// \param __b 51/// A 128-bit vector of [2 x double] containing one of the source operands. 52/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 53/// sum of the lower 64 bits of both operands. The upper 64 bits are copied 54/// from the upper 64 bits of the first source operand. 55static __inline__ __m128d __DEFAULT_FN_ATTRS 56_mm_add_sd(__m128d __a, __m128d __b) 57{ 58 __a[0] += __b[0]; 59 return __a; 60} 61 62/// Adds two 128-bit vectors of [2 x double]. 63/// 64/// \headerfile <x86intrin.h> 65/// 66/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction. 67/// 68/// \param __a 69/// A 128-bit vector of [2 x double] containing one of the source operands. 70/// \param __b 71/// A 128-bit vector of [2 x double] containing one of the source operands. 72/// \returns A 128-bit vector of [2 x double] containing the sums of both 73/// operands. 74static __inline__ __m128d __DEFAULT_FN_ATTRS 75_mm_add_pd(__m128d __a, __m128d __b) 76{ 77 return (__m128d)((__v2df)__a + (__v2df)__b); 78} 79 80/// Subtracts the lower double-precision value of the second operand 81/// from the lower double-precision value of the first operand and returns 82/// the difference in the lower 64 bits of the result. The upper 64 bits of 83/// the result are copied from the upper double-precision value of the first 84/// operand. 85/// 86/// \headerfile <x86intrin.h> 87/// 88/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction. 89/// 90/// \param __a 91/// A 128-bit vector of [2 x double] containing the minuend. 92/// \param __b 93/// A 128-bit vector of [2 x double] containing the subtrahend. 
94/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 95/// difference of the lower 64 bits of both operands. The upper 64 bits are 96/// copied from the upper 64 bits of the first source operand. 97static __inline__ __m128d __DEFAULT_FN_ATTRS 98_mm_sub_sd(__m128d __a, __m128d __b) 99{ 100 __a[0] -= __b[0]; 101 return __a; 102} 103 104/// Subtracts two 128-bit vectors of [2 x double]. 105/// 106/// \headerfile <x86intrin.h> 107/// 108/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction. 109/// 110/// \param __a 111/// A 128-bit vector of [2 x double] containing the minuend. 112/// \param __b 113/// A 128-bit vector of [2 x double] containing the subtrahend. 114/// \returns A 128-bit vector of [2 x double] containing the differences between 115/// both operands. 116static __inline__ __m128d __DEFAULT_FN_ATTRS 117_mm_sub_pd(__m128d __a, __m128d __b) 118{ 119 return (__m128d)((__v2df)__a - (__v2df)__b); 120} 121 122/// Multiplies lower double-precision values in both operands and returns 123/// the product in the lower 64 bits of the result. The upper 64 bits of the 124/// result are copied from the upper double-precision value of the first 125/// operand. 126/// 127/// \headerfile <x86intrin.h> 128/// 129/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction. 130/// 131/// \param __a 132/// A 128-bit vector of [2 x double] containing one of the source operands. 133/// \param __b 134/// A 128-bit vector of [2 x double] containing one of the source operands. 135/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 136/// product of the lower 64 bits of both operands. The upper 64 bits are 137/// copied from the upper 64 bits of the first source operand. 138static __inline__ __m128d __DEFAULT_FN_ATTRS 139_mm_mul_sd(__m128d __a, __m128d __b) 140{ 141 __a[0] *= __b[0]; 142 return __a; 143} 144 145/// Multiplies two 128-bit vectors of [2 x double]. 
146/// 147/// \headerfile <x86intrin.h> 148/// 149/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction. 150/// 151/// \param __a 152/// A 128-bit vector of [2 x double] containing one of the operands. 153/// \param __b 154/// A 128-bit vector of [2 x double] containing one of the operands. 155/// \returns A 128-bit vector of [2 x double] containing the products of both 156/// operands. 157static __inline__ __m128d __DEFAULT_FN_ATTRS 158_mm_mul_pd(__m128d __a, __m128d __b) 159{ 160 return (__m128d)((__v2df)__a * (__v2df)__b); 161} 162 163/// Divides the lower double-precision value of the first operand by the 164/// lower double-precision value of the second operand and returns the 165/// quotient in the lower 64 bits of the result. The upper 64 bits of the 166/// result are copied from the upper double-precision value of the first 167/// operand. 168/// 169/// \headerfile <x86intrin.h> 170/// 171/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction. 172/// 173/// \param __a 174/// A 128-bit vector of [2 x double] containing the dividend. 175/// \param __b 176/// A 128-bit vector of [2 x double] containing divisor. 177/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 178/// quotient of the lower 64 bits of both operands. The upper 64 bits are 179/// copied from the upper 64 bits of the first source operand. 180static __inline__ __m128d __DEFAULT_FN_ATTRS 181_mm_div_sd(__m128d __a, __m128d __b) 182{ 183 __a[0] /= __b[0]; 184 return __a; 185} 186 187/// Performs an element-by-element division of two 128-bit vectors of 188/// [2 x double]. 189/// 190/// \headerfile <x86intrin.h> 191/// 192/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction. 193/// 194/// \param __a 195/// A 128-bit vector of [2 x double] containing the dividend. 196/// \param __b 197/// A 128-bit vector of [2 x double] containing the divisor. 
198/// \returns A 128-bit vector of [2 x double] containing the quotients of both 199/// operands. 200static __inline__ __m128d __DEFAULT_FN_ATTRS 201_mm_div_pd(__m128d __a, __m128d __b) 202{ 203 return (__m128d)((__v2df)__a / (__v2df)__b); 204} 205 206/// Calculates the square root of the lower double-precision value of 207/// the second operand and returns it in the lower 64 bits of the result. 208/// The upper 64 bits of the result are copied from the upper 209/// double-precision value of the first operand. 210/// 211/// \headerfile <x86intrin.h> 212/// 213/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction. 214/// 215/// \param __a 216/// A 128-bit vector of [2 x double] containing one of the operands. The 217/// upper 64 bits of this operand are copied to the upper 64 bits of the 218/// result. 219/// \param __b 220/// A 128-bit vector of [2 x double] containing one of the operands. The 221/// square root is calculated using the lower 64 bits of this operand. 222/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 223/// square root of the lower 64 bits of operand \a __b, and whose upper 64 224/// bits are copied from the upper 64 bits of operand \a __a. 225static __inline__ __m128d __DEFAULT_FN_ATTRS 226_mm_sqrt_sd(__m128d __a, __m128d __b) 227{ 228 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); 229 return __extension__ (__m128d) { __c[0], __a[1] }; 230} 231 232/// Calculates the square root of the each of two values stored in a 233/// 128-bit vector of [2 x double]. 234/// 235/// \headerfile <x86intrin.h> 236/// 237/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction. 238/// 239/// \param __a 240/// A 128-bit vector of [2 x double]. 241/// \returns A 128-bit vector of [2 x double] containing the square roots of the 242/// values in the operand. 
243static __inline__ __m128d __DEFAULT_FN_ATTRS 244_mm_sqrt_pd(__m128d __a) 245{ 246 return __builtin_ia32_sqrtpd((__v2df)__a); 247} 248 249/// Compares lower 64-bit double-precision values of both operands, and 250/// returns the lesser of the pair of values in the lower 64-bits of the 251/// result. The upper 64 bits of the result are copied from the upper 252/// double-precision value of the first operand. 253/// 254/// \headerfile <x86intrin.h> 255/// 256/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction. 257/// 258/// \param __a 259/// A 128-bit vector of [2 x double] containing one of the operands. The 260/// lower 64 bits of this operand are used in the comparison. 261/// \param __b 262/// A 128-bit vector of [2 x double] containing one of the operands. The 263/// lower 64 bits of this operand are used in the comparison. 264/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 265/// minimum value between both operands. The upper 64 bits are copied from 266/// the upper 64 bits of the first source operand. 267static __inline__ __m128d __DEFAULT_FN_ATTRS 268_mm_min_sd(__m128d __a, __m128d __b) 269{ 270 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b); 271} 272 273/// Performs element-by-element comparison of the two 128-bit vectors of 274/// [2 x double] and returns the vector containing the lesser of each pair of 275/// values. 276/// 277/// \headerfile <x86intrin.h> 278/// 279/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction. 280/// 281/// \param __a 282/// A 128-bit vector of [2 x double] containing one of the operands. 283/// \param __b 284/// A 128-bit vector of [2 x double] containing one of the operands. 285/// \returns A 128-bit vector of [2 x double] containing the minimum values 286/// between both operands. 
287static __inline__ __m128d __DEFAULT_FN_ATTRS 288_mm_min_pd(__m128d __a, __m128d __b) 289{ 290 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); 291} 292 293/// Compares lower 64-bit double-precision values of both operands, and 294/// returns the greater of the pair of values in the lower 64-bits of the 295/// result. The upper 64 bits of the result are copied from the upper 296/// double-precision value of the first operand. 297/// 298/// \headerfile <x86intrin.h> 299/// 300/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction. 301/// 302/// \param __a 303/// A 128-bit vector of [2 x double] containing one of the operands. The 304/// lower 64 bits of this operand are used in the comparison. 305/// \param __b 306/// A 128-bit vector of [2 x double] containing one of the operands. The 307/// lower 64 bits of this operand are used in the comparison. 308/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 309/// maximum value between both operands. The upper 64 bits are copied from 310/// the upper 64 bits of the first source operand. 311static __inline__ __m128d __DEFAULT_FN_ATTRS 312_mm_max_sd(__m128d __a, __m128d __b) 313{ 314 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b); 315} 316 317/// Performs element-by-element comparison of the two 128-bit vectors of 318/// [2 x double] and returns the vector containing the greater of each pair 319/// of values. 320/// 321/// \headerfile <x86intrin.h> 322/// 323/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction. 324/// 325/// \param __a 326/// A 128-bit vector of [2 x double] containing one of the operands. 327/// \param __b 328/// A 128-bit vector of [2 x double] containing one of the operands. 329/// \returns A 128-bit vector of [2 x double] containing the maximum values 330/// between both operands. 
331static __inline__ __m128d __DEFAULT_FN_ATTRS 332_mm_max_pd(__m128d __a, __m128d __b) 333{ 334 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b); 335} 336 337/// Performs a bitwise AND of two 128-bit vectors of [2 x double]. 338/// 339/// \headerfile <x86intrin.h> 340/// 341/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 342/// 343/// \param __a 344/// A 128-bit vector of [2 x double] containing one of the source operands. 345/// \param __b 346/// A 128-bit vector of [2 x double] containing one of the source operands. 347/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 348/// values between both operands. 349static __inline__ __m128d __DEFAULT_FN_ATTRS 350_mm_and_pd(__m128d __a, __m128d __b) 351{ 352 return (__m128d)((__v2du)__a & (__v2du)__b); 353} 354 355/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using 356/// the one's complement of the values contained in the first source operand. 357/// 358/// \headerfile <x86intrin.h> 359/// 360/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 361/// 362/// \param __a 363/// A 128-bit vector of [2 x double] containing the left source operand. The 364/// one's complement of this value is used in the bitwise AND. 365/// \param __b 366/// A 128-bit vector of [2 x double] containing the right source operand. 367/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 368/// values in the second operand and the one's complement of the first 369/// operand. 370static __inline__ __m128d __DEFAULT_FN_ATTRS 371_mm_andnot_pd(__m128d __a, __m128d __b) 372{ 373 return (__m128d)(~(__v2du)__a & (__v2du)__b); 374} 375 376/// Performs a bitwise OR of two 128-bit vectors of [2 x double]. 377/// 378/// \headerfile <x86intrin.h> 379/// 380/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 381/// 382/// \param __a 383/// A 128-bit vector of [2 x double] containing one of the source operands. 
384/// \param __b 385/// A 128-bit vector of [2 x double] containing one of the source operands. 386/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the 387/// values between both operands. 388static __inline__ __m128d __DEFAULT_FN_ATTRS 389_mm_or_pd(__m128d __a, __m128d __b) 390{ 391 return (__m128d)((__v2du)__a | (__v2du)__b); 392} 393 394/// Performs a bitwise XOR of two 128-bit vectors of [2 x double]. 395/// 396/// \headerfile <x86intrin.h> 397/// 398/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 399/// 400/// \param __a 401/// A 128-bit vector of [2 x double] containing one of the source operands. 402/// \param __b 403/// A 128-bit vector of [2 x double] containing one of the source operands. 404/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the 405/// values between both operands. 406static __inline__ __m128d __DEFAULT_FN_ATTRS 407_mm_xor_pd(__m128d __a, __m128d __b) 408{ 409 return (__m128d)((__v2du)__a ^ (__v2du)__b); 410} 411 412/// Compares each of the corresponding double-precision values of the 413/// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0 414/// for false, 0xFFFFFFFFFFFFFFFF for true. 415/// 416/// \headerfile <x86intrin.h> 417/// 418/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction. 419/// 420/// \param __a 421/// A 128-bit vector of [2 x double]. 422/// \param __b 423/// A 128-bit vector of [2 x double]. 424/// \returns A 128-bit vector containing the comparison results. 425static __inline__ __m128d __DEFAULT_FN_ATTRS 426_mm_cmpeq_pd(__m128d __a, __m128d __b) 427{ 428 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b); 429} 430 431/// Compares each of the corresponding double-precision values of the 432/// 128-bit vectors of [2 x double] to determine if the values in the first 433/// operand are less than those in the second operand. 
Each comparison 434/// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 435/// 436/// \headerfile <x86intrin.h> 437/// 438/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 439/// 440/// \param __a 441/// A 128-bit vector of [2 x double]. 442/// \param __b 443/// A 128-bit vector of [2 x double]. 444/// \returns A 128-bit vector containing the comparison results. 445static __inline__ __m128d __DEFAULT_FN_ATTRS 446_mm_cmplt_pd(__m128d __a, __m128d __b) 447{ 448 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b); 449} 450 451/// Compares each of the corresponding double-precision values of the 452/// 128-bit vectors of [2 x double] to determine if the values in the first 453/// operand are less than or equal to those in the second operand. 454/// 455/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 456/// 457/// \headerfile <x86intrin.h> 458/// 459/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 460/// 461/// \param __a 462/// A 128-bit vector of [2 x double]. 463/// \param __b 464/// A 128-bit vector of [2 x double]. 465/// \returns A 128-bit vector containing the comparison results. 466static __inline__ __m128d __DEFAULT_FN_ATTRS 467_mm_cmple_pd(__m128d __a, __m128d __b) 468{ 469 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b); 470} 471 472/// Compares each of the corresponding double-precision values of the 473/// 128-bit vectors of [2 x double] to determine if the values in the first 474/// operand are greater than those in the second operand. 475/// 476/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 477/// 478/// \headerfile <x86intrin.h> 479/// 480/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 481/// 482/// \param __a 483/// A 128-bit vector of [2 x double]. 484/// \param __b 485/// A 128-bit vector of [2 x double]. 486/// \returns A 128-bit vector containing the comparison results. 
487static __inline__ __m128d __DEFAULT_FN_ATTRS 488_mm_cmpgt_pd(__m128d __a, __m128d __b) 489{ 490 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a); 491} 492 493/// Compares each of the corresponding double-precision values of the 494/// 128-bit vectors of [2 x double] to determine if the values in the first 495/// operand are greater than or equal to those in the second operand. 496/// 497/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 498/// 499/// \headerfile <x86intrin.h> 500/// 501/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 502/// 503/// \param __a 504/// A 128-bit vector of [2 x double]. 505/// \param __b 506/// A 128-bit vector of [2 x double]. 507/// \returns A 128-bit vector containing the comparison results. 508static __inline__ __m128d __DEFAULT_FN_ATTRS 509_mm_cmpge_pd(__m128d __a, __m128d __b) 510{ 511 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a); 512} 513 514/// Compares each of the corresponding double-precision values of the 515/// 128-bit vectors of [2 x double] to determine if the values in the first 516/// operand are ordered with respect to those in the second operand. 517/// 518/// A pair of double-precision values are "ordered" with respect to each 519/// other if neither value is a NaN. Each comparison yields 0x0 for false, 520/// 0xFFFFFFFFFFFFFFFF for true. 521/// 522/// \headerfile <x86intrin.h> 523/// 524/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction. 525/// 526/// \param __a 527/// A 128-bit vector of [2 x double]. 528/// \param __b 529/// A 128-bit vector of [2 x double]. 530/// \returns A 128-bit vector containing the comparison results. 
531static __inline__ __m128d __DEFAULT_FN_ATTRS 532_mm_cmpord_pd(__m128d __a, __m128d __b) 533{ 534 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b); 535} 536 537/// Compares each of the corresponding double-precision values of the 538/// 128-bit vectors of [2 x double] to determine if the values in the first 539/// operand are unordered with respect to those in the second operand. 540/// 541/// A pair of double-precision values are "unordered" with respect to each 542/// other if one or both values are NaN. Each comparison yields 0x0 for 543/// false, 0xFFFFFFFFFFFFFFFF for true. 544/// 545/// \headerfile <x86intrin.h> 546/// 547/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c> 548/// instruction. 549/// 550/// \param __a 551/// A 128-bit vector of [2 x double]. 552/// \param __b 553/// A 128-bit vector of [2 x double]. 554/// \returns A 128-bit vector containing the comparison results. 555static __inline__ __m128d __DEFAULT_FN_ATTRS 556_mm_cmpunord_pd(__m128d __a, __m128d __b) 557{ 558 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b); 559} 560 561/// Compares each of the corresponding double-precision values of the 562/// 128-bit vectors of [2 x double] to determine if the values in the first 563/// operand are unequal to those in the second operand. 564/// 565/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 566/// 567/// \headerfile <x86intrin.h> 568/// 569/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction. 570/// 571/// \param __a 572/// A 128-bit vector of [2 x double]. 573/// \param __b 574/// A 128-bit vector of [2 x double]. 575/// \returns A 128-bit vector containing the comparison results. 
576static __inline__ __m128d __DEFAULT_FN_ATTRS 577_mm_cmpneq_pd(__m128d __a, __m128d __b) 578{ 579 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b); 580} 581 582/// Compares each of the corresponding double-precision values of the 583/// 128-bit vectors of [2 x double] to determine if the values in the first 584/// operand are not less than those in the second operand. 585/// 586/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 587/// 588/// \headerfile <x86intrin.h> 589/// 590/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 591/// 592/// \param __a 593/// A 128-bit vector of [2 x double]. 594/// \param __b 595/// A 128-bit vector of [2 x double]. 596/// \returns A 128-bit vector containing the comparison results. 597static __inline__ __m128d __DEFAULT_FN_ATTRS 598_mm_cmpnlt_pd(__m128d __a, __m128d __b) 599{ 600 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b); 601} 602 603/// Compares each of the corresponding double-precision values of the 604/// 128-bit vectors of [2 x double] to determine if the values in the first 605/// operand are not less than or equal to those in the second operand. 606/// 607/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 608/// 609/// \headerfile <x86intrin.h> 610/// 611/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 612/// 613/// \param __a 614/// A 128-bit vector of [2 x double]. 615/// \param __b 616/// A 128-bit vector of [2 x double]. 617/// \returns A 128-bit vector containing the comparison results. 618static __inline__ __m128d __DEFAULT_FN_ATTRS 619_mm_cmpnle_pd(__m128d __a, __m128d __b) 620{ 621 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b); 622} 623 624/// Compares each of the corresponding double-precision values of the 625/// 128-bit vectors of [2 x double] to determine if the values in the first 626/// operand are not greater than those in the second operand. 
627/// 628/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 629/// 630/// \headerfile <x86intrin.h> 631/// 632/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 633/// 634/// \param __a 635/// A 128-bit vector of [2 x double]. 636/// \param __b 637/// A 128-bit vector of [2 x double]. 638/// \returns A 128-bit vector containing the comparison results. 639static __inline__ __m128d __DEFAULT_FN_ATTRS 640_mm_cmpngt_pd(__m128d __a, __m128d __b) 641{ 642 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a); 643} 644 645/// Compares each of the corresponding double-precision values of the 646/// 128-bit vectors of [2 x double] to determine if the values in the first 647/// operand are not greater than or equal to those in the second operand. 648/// 649/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 650/// 651/// \headerfile <x86intrin.h> 652/// 653/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 654/// 655/// \param __a 656/// A 128-bit vector of [2 x double]. 657/// \param __b 658/// A 128-bit vector of [2 x double]. 659/// \returns A 128-bit vector containing the comparison results. 660static __inline__ __m128d __DEFAULT_FN_ATTRS 661_mm_cmpnge_pd(__m128d __a, __m128d __b) 662{ 663 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a); 664} 665 666/// Compares the lower double-precision floating-point values in each of 667/// the two 128-bit floating-point vectors of [2 x double] for equality. 668/// 669/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 670/// 671/// \headerfile <x86intrin.h> 672/// 673/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction. 674/// 675/// \param __a 676/// A 128-bit vector of [2 x double]. The lower double-precision value is 677/// compared to the lower double-precision value of \a __b. 678/// \param __b 679/// A 128-bit vector of [2 x double]. 
The lower double-precision value is 680/// compared to the lower double-precision value of \a __a. 681/// \returns A 128-bit vector. The lower 64 bits contains the comparison 682/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 683static __inline__ __m128d __DEFAULT_FN_ATTRS 684_mm_cmpeq_sd(__m128d __a, __m128d __b) 685{ 686 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b); 687} 688 689/// Compares the lower double-precision floating-point values in each of 690/// the two 128-bit floating-point vectors of [2 x double] to determine if 691/// the value in the first parameter is less than the corresponding value in 692/// the second parameter. 693/// 694/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 695/// 696/// \headerfile <x86intrin.h> 697/// 698/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 699/// 700/// \param __a 701/// A 128-bit vector of [2 x double]. The lower double-precision value is 702/// compared to the lower double-precision value of \a __b. 703/// \param __b 704/// A 128-bit vector of [2 x double]. The lower double-precision value is 705/// compared to the lower double-precision value of \a __a. 706/// \returns A 128-bit vector. The lower 64 bits contains the comparison 707/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 708static __inline__ __m128d __DEFAULT_FN_ATTRS 709_mm_cmplt_sd(__m128d __a, __m128d __b) 710{ 711 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b); 712} 713 714/// Compares the lower double-precision floating-point values in each of 715/// the two 128-bit floating-point vectors of [2 x double] to determine if 716/// the value in the first parameter is less than or equal to the 717/// corresponding value in the second parameter. 718/// 719/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 
720/// 721/// \headerfile <x86intrin.h> 722/// 723/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 724/// 725/// \param __a 726/// A 128-bit vector of [2 x double]. The lower double-precision value is 727/// compared to the lower double-precision value of \a __b. 728/// \param __b 729/// A 128-bit vector of [2 x double]. The lower double-precision value is 730/// compared to the lower double-precision value of \a __a. 731/// \returns A 128-bit vector. The lower 64 bits contains the comparison 732/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 733static __inline__ __m128d __DEFAULT_FN_ATTRS 734_mm_cmple_sd(__m128d __a, __m128d __b) 735{ 736 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b); 737} 738 739/// Compares the lower double-precision floating-point values in each of 740/// the two 128-bit floating-point vectors of [2 x double] to determine if 741/// the value in the first parameter is greater than the corresponding value 742/// in the second parameter. 743/// 744/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 745/// 746/// \headerfile <x86intrin.h> 747/// 748/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 749/// 750/// \param __a 751/// A 128-bit vector of [2 x double]. The lower double-precision value is 752/// compared to the lower double-precision value of \a __b. 753/// \param __b 754/// A 128-bit vector of [2 x double]. The lower double-precision value is 755/// compared to the lower double-precision value of \a __a. 756/// \returns A 128-bit vector. The lower 64 bits contains the comparison 757/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
758static __inline__ __m128d __DEFAULT_FN_ATTRS 759_mm_cmpgt_sd(__m128d __a, __m128d __b) 760{ 761 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); 762 return __extension__ (__m128d) { __c[0], __a[1] }; 763} 764 765/// Compares the lower double-precision floating-point values in each of 766/// the two 128-bit floating-point vectors of [2 x double] to determine if 767/// the value in the first parameter is greater than or equal to the 768/// corresponding value in the second parameter. 769/// 770/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 771/// 772/// \headerfile <x86intrin.h> 773/// 774/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 775/// 776/// \param __a 777/// A 128-bit vector of [2 x double]. The lower double-precision value is 778/// compared to the lower double-precision value of \a __b. 779/// \param __b 780/// A 128-bit vector of [2 x double]. The lower double-precision value is 781/// compared to the lower double-precision value of \a __a. 782/// \returns A 128-bit vector. The lower 64 bits contains the comparison 783/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 784static __inline__ __m128d __DEFAULT_FN_ATTRS 785_mm_cmpge_sd(__m128d __a, __m128d __b) 786{ 787 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); 788 return __extension__ (__m128d) { __c[0], __a[1] }; 789} 790 791/// Compares the lower double-precision floating-point values in each of 792/// the two 128-bit floating-point vectors of [2 x double] to determine if 793/// the value in the first parameter is "ordered" with respect to the 794/// corresponding value in the second parameter. 795/// 796/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair 797/// of double-precision values are "ordered" with respect to each other if 798/// neither value is a NaN. 
799/// 800/// \headerfile <x86intrin.h> 801/// 802/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction. 803/// 804/// \param __a 805/// A 128-bit vector of [2 x double]. The lower double-precision value is 806/// compared to the lower double-precision value of \a __b. 807/// \param __b 808/// A 128-bit vector of [2 x double]. The lower double-precision value is 809/// compared to the lower double-precision value of \a __a. 810/// \returns A 128-bit vector. The lower 64 bits contains the comparison 811/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 812static __inline__ __m128d __DEFAULT_FN_ATTRS 813_mm_cmpord_sd(__m128d __a, __m128d __b) 814{ 815 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b); 816} 817 818/// Compares the lower double-precision floating-point values in each of 819/// the two 128-bit floating-point vectors of [2 x double] to determine if 820/// the value in the first parameter is "unordered" with respect to the 821/// corresponding value in the second parameter. 822/// 823/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair 824/// of double-precision values are "unordered" with respect to each other if 825/// one or both values are NaN. 826/// 827/// \headerfile <x86intrin.h> 828/// 829/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c> 830/// instruction. 831/// 832/// \param __a 833/// A 128-bit vector of [2 x double]. The lower double-precision value is 834/// compared to the lower double-precision value of \a __b. 835/// \param __b 836/// A 128-bit vector of [2 x double]. The lower double-precision value is 837/// compared to the lower double-precision value of \a __a. 838/// \returns A 128-bit vector. The lower 64 bits contains the comparison 839/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
840static __inline__ __m128d __DEFAULT_FN_ATTRS 841_mm_cmpunord_sd(__m128d __a, __m128d __b) 842{ 843 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b); 844} 845 846/// Compares the lower double-precision floating-point values in each of 847/// the two 128-bit floating-point vectors of [2 x double] to determine if 848/// the value in the first parameter is unequal to the corresponding value in 849/// the second parameter. 850/// 851/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 852/// 853/// \headerfile <x86intrin.h> 854/// 855/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction. 856/// 857/// \param __a 858/// A 128-bit vector of [2 x double]. The lower double-precision value is 859/// compared to the lower double-precision value of \a __b. 860/// \param __b 861/// A 128-bit vector of [2 x double]. The lower double-precision value is 862/// compared to the lower double-precision value of \a __a. 863/// \returns A 128-bit vector. The lower 64 bits contains the comparison 864/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 865static __inline__ __m128d __DEFAULT_FN_ATTRS 866_mm_cmpneq_sd(__m128d __a, __m128d __b) 867{ 868 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b); 869} 870 871/// Compares the lower double-precision floating-point values in each of 872/// the two 128-bit floating-point vectors of [2 x double] to determine if 873/// the value in the first parameter is not less than the corresponding 874/// value in the second parameter. 875/// 876/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 877/// 878/// \headerfile <x86intrin.h> 879/// 880/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 881/// 882/// \param __a 883/// A 128-bit vector of [2 x double]. The lower double-precision value is 884/// compared to the lower double-precision value of \a __b. 
885/// \param __b 886/// A 128-bit vector of [2 x double]. The lower double-precision value is 887/// compared to the lower double-precision value of \a __a. 888/// \returns A 128-bit vector. The lower 64 bits contains the comparison 889/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 890static __inline__ __m128d __DEFAULT_FN_ATTRS 891_mm_cmpnlt_sd(__m128d __a, __m128d __b) 892{ 893 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b); 894} 895 896/// Compares the lower double-precision floating-point values in each of 897/// the two 128-bit floating-point vectors of [2 x double] to determine if 898/// the value in the first parameter is not less than or equal to the 899/// corresponding value in the second parameter. 900/// 901/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 902/// 903/// \headerfile <x86intrin.h> 904/// 905/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 906/// 907/// \param __a 908/// A 128-bit vector of [2 x double]. The lower double-precision value is 909/// compared to the lower double-precision value of \a __b. 910/// \param __b 911/// A 128-bit vector of [2 x double]. The lower double-precision value is 912/// compared to the lower double-precision value of \a __a. 913/// \returns A 128-bit vector. The lower 64 bits contains the comparison 914/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 915static __inline__ __m128d __DEFAULT_FN_ATTRS 916_mm_cmpnle_sd(__m128d __a, __m128d __b) 917{ 918 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b); 919} 920 921/// Compares the lower double-precision floating-point values in each of 922/// the two 128-bit floating-point vectors of [2 x double] to determine if 923/// the value in the first parameter is not greater than the corresponding 924/// value in the second parameter. 925/// 926/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 
927/// 928/// \headerfile <x86intrin.h> 929/// 930/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 931/// 932/// \param __a 933/// A 128-bit vector of [2 x double]. The lower double-precision value is 934/// compared to the lower double-precision value of \a __b. 935/// \param __b 936/// A 128-bit vector of [2 x double]. The lower double-precision value is 937/// compared to the lower double-precision value of \a __a. 938/// \returns A 128-bit vector. The lower 64 bits contains the comparison 939/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 940static __inline__ __m128d __DEFAULT_FN_ATTRS 941_mm_cmpngt_sd(__m128d __a, __m128d __b) 942{ 943 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); 944 return __extension__ (__m128d) { __c[0], __a[1] }; 945} 946 947/// Compares the lower double-precision floating-point values in each of 948/// the two 128-bit floating-point vectors of [2 x double] to determine if 949/// the value in the first parameter is not greater than or equal to the 950/// corresponding value in the second parameter. 951/// 952/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 953/// 954/// \headerfile <x86intrin.h> 955/// 956/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 957/// 958/// \param __a 959/// A 128-bit vector of [2 x double]. The lower double-precision value is 960/// compared to the lower double-precision value of \a __b. 961/// \param __b 962/// A 128-bit vector of [2 x double]. The lower double-precision value is 963/// compared to the lower double-precision value of \a __a. 964/// \returns A 128-bit vector. The lower 64 bits contains the comparison 965/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
966static __inline__ __m128d __DEFAULT_FN_ATTRS 967_mm_cmpnge_sd(__m128d __a, __m128d __b) 968{ 969 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); 970 return __extension__ (__m128d) { __c[0], __a[1] }; 971} 972 973/// Compares the lower double-precision floating-point values in each of 974/// the two 128-bit floating-point vectors of [2 x double] for equality. 975/// 976/// The comparison yields 0 for false, 1 for true. If either of the two 977/// lower double-precision values is NaN, 0 is returned. 978/// 979/// \headerfile <x86intrin.h> 980/// 981/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 982/// 983/// \param __a 984/// A 128-bit vector of [2 x double]. The lower double-precision value is 985/// compared to the lower double-precision value of \a __b. 986/// \param __b 987/// A 128-bit vector of [2 x double]. The lower double-precision value is 988/// compared to the lower double-precision value of \a __a. 989/// \returns An integer containing the comparison results. If either of the two 990/// lower double-precision values is NaN, 0 is returned. 991static __inline__ int __DEFAULT_FN_ATTRS 992_mm_comieq_sd(__m128d __a, __m128d __b) 993{ 994 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b); 995} 996 997/// Compares the lower double-precision floating-point values in each of 998/// the two 128-bit floating-point vectors of [2 x double] to determine if 999/// the value in the first parameter is less than the corresponding value in 1000/// the second parameter. 1001/// 1002/// The comparison yields 0 for false, 1 for true. If either of the two 1003/// lower double-precision values is NaN, 0 is returned. 1004/// 1005/// \headerfile <x86intrin.h> 1006/// 1007/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1008/// 1009/// \param __a 1010/// A 128-bit vector of [2 x double]. The lower double-precision value is 1011/// compared to the lower double-precision value of \a __b. 
1012/// \param __b 1013/// A 128-bit vector of [2 x double]. The lower double-precision value is 1014/// compared to the lower double-precision value of \a __a. 1015/// \returns An integer containing the comparison results. If either of the two 1016/// lower double-precision values is NaN, 0 is returned. 1017static __inline__ int __DEFAULT_FN_ATTRS 1018_mm_comilt_sd(__m128d __a, __m128d __b) 1019{ 1020 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b); 1021} 1022 1023/// Compares the lower double-precision floating-point values in each of 1024/// the two 128-bit floating-point vectors of [2 x double] to determine if 1025/// the value in the first parameter is less than or equal to the 1026/// corresponding value in the second parameter. 1027/// 1028/// The comparison yields 0 for false, 1 for true. If either of the two 1029/// lower double-precision values is NaN, 0 is returned. 1030/// 1031/// \headerfile <x86intrin.h> 1032/// 1033/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1034/// 1035/// \param __a 1036/// A 128-bit vector of [2 x double]. The lower double-precision value is 1037/// compared to the lower double-precision value of \a __b. 1038/// \param __b 1039/// A 128-bit vector of [2 x double]. The lower double-precision value is 1040/// compared to the lower double-precision value of \a __a. 1041/// \returns An integer containing the comparison results. If either of the two 1042/// lower double-precision values is NaN, 0 is returned. 1043static __inline__ int __DEFAULT_FN_ATTRS 1044_mm_comile_sd(__m128d __a, __m128d __b) 1045{ 1046 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b); 1047} 1048 1049/// Compares the lower double-precision floating-point values in each of 1050/// the two 128-bit floating-point vectors of [2 x double] to determine if 1051/// the value in the first parameter is greater than the corresponding value 1052/// in the second parameter. 
1053/// 1054/// The comparison yields 0 for false, 1 for true. If either of the two 1055/// lower double-precision values is NaN, 0 is returned. 1056/// 1057/// \headerfile <x86intrin.h> 1058/// 1059/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1060/// 1061/// \param __a 1062/// A 128-bit vector of [2 x double]. The lower double-precision value is 1063/// compared to the lower double-precision value of \a __b. 1064/// \param __b 1065/// A 128-bit vector of [2 x double]. The lower double-precision value is 1066/// compared to the lower double-precision value of \a __a. 1067/// \returns An integer containing the comparison results. If either of the two 1068/// lower double-precision values is NaN, 0 is returned. 1069static __inline__ int __DEFAULT_FN_ATTRS 1070_mm_comigt_sd(__m128d __a, __m128d __b) 1071{ 1072 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b); 1073} 1074 1075/// Compares the lower double-precision floating-point values in each of 1076/// the two 128-bit floating-point vectors of [2 x double] to determine if 1077/// the value in the first parameter is greater than or equal to the 1078/// corresponding value in the second parameter. 1079/// 1080/// The comparison yields 0 for false, 1 for true. If either of the two 1081/// lower double-precision values is NaN, 0 is returned. 1082/// 1083/// \headerfile <x86intrin.h> 1084/// 1085/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1086/// 1087/// \param __a 1088/// A 128-bit vector of [2 x double]. The lower double-precision value is 1089/// compared to the lower double-precision value of \a __b. 1090/// \param __b 1091/// A 128-bit vector of [2 x double]. The lower double-precision value is 1092/// compared to the lower double-precision value of \a __a. 1093/// \returns An integer containing the comparison results. If either of the two 1094/// lower double-precision values is NaN, 0 is returned. 
1095static __inline__ int __DEFAULT_FN_ATTRS 1096_mm_comige_sd(__m128d __a, __m128d __b) 1097{ 1098 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b); 1099} 1100 1101/// Compares the lower double-precision floating-point values in each of 1102/// the two 128-bit floating-point vectors of [2 x double] to determine if 1103/// the value in the first parameter is unequal to the corresponding value in 1104/// the second parameter. 1105/// 1106/// The comparison yields 0 for false, 1 for true. If either of the two 1107/// lower double-precision values is NaN, 1 is returned. 1108/// 1109/// \headerfile <x86intrin.h> 1110/// 1111/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1112/// 1113/// \param __a 1114/// A 128-bit vector of [2 x double]. The lower double-precision value is 1115/// compared to the lower double-precision value of \a __b. 1116/// \param __b 1117/// A 128-bit vector of [2 x double]. The lower double-precision value is 1118/// compared to the lower double-precision value of \a __a. 1119/// \returns An integer containing the comparison results. If either of the two 1120/// lower double-precision values is NaN, 1 is returned. 1121static __inline__ int __DEFAULT_FN_ATTRS 1122_mm_comineq_sd(__m128d __a, __m128d __b) 1123{ 1124 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b); 1125} 1126 1127/// Compares the lower double-precision floating-point values in each of 1128/// the two 128-bit floating-point vectors of [2 x double] for equality. The 1129/// comparison yields 0 for false, 1 for true. 1130/// 1131/// If either of the two lower double-precision values is NaN, 0 is returned. 1132/// 1133/// \headerfile <x86intrin.h> 1134/// 1135/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1136/// 1137/// \param __a 1138/// A 128-bit vector of [2 x double]. The lower double-precision value is 1139/// compared to the lower double-precision value of \a __b. 
1140/// \param __b 1141/// A 128-bit vector of [2 x double]. The lower double-precision value is 1142/// compared to the lower double-precision value of \a __a. 1143/// \returns An integer containing the comparison results. If either of the two 1144/// lower double-precision values is NaN, 0 is returned. 1145static __inline__ int __DEFAULT_FN_ATTRS 1146_mm_ucomieq_sd(__m128d __a, __m128d __b) 1147{ 1148 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b); 1149} 1150 1151/// Compares the lower double-precision floating-point values in each of 1152/// the two 128-bit floating-point vectors of [2 x double] to determine if 1153/// the value in the first parameter is less than the corresponding value in 1154/// the second parameter. 1155/// 1156/// The comparison yields 0 for false, 1 for true. If either of the two lower 1157/// double-precision values is NaN, 0 is returned. 1158/// 1159/// \headerfile <x86intrin.h> 1160/// 1161/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1162/// 1163/// \param __a 1164/// A 128-bit vector of [2 x double]. The lower double-precision value is 1165/// compared to the lower double-precision value of \a __b. 1166/// \param __b 1167/// A 128-bit vector of [2 x double]. The lower double-precision value is 1168/// compared to the lower double-precision value of \a __a. 1169/// \returns An integer containing the comparison results. If either of the two 1170/// lower double-precision values is NaN, 0 is returned. 1171static __inline__ int __DEFAULT_FN_ATTRS 1172_mm_ucomilt_sd(__m128d __a, __m128d __b) 1173{ 1174 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b); 1175} 1176 1177/// Compares the lower double-precision floating-point values in each of 1178/// the two 128-bit floating-point vectors of [2 x double] to determine if 1179/// the value in the first parameter is less than or equal to the 1180/// corresponding value in the second parameter. 
1181/// 1182/// The comparison yields 0 for false, 1 for true. If either of the two lower 1183/// double-precision values is NaN, 0 is returned. 1184/// 1185/// \headerfile <x86intrin.h> 1186/// 1187/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1188/// 1189/// \param __a 1190/// A 128-bit vector of [2 x double]. The lower double-precision value is 1191/// compared to the lower double-precision value of \a __b. 1192/// \param __b 1193/// A 128-bit vector of [2 x double]. The lower double-precision value is 1194/// compared to the lower double-precision value of \a __a. 1195/// \returns An integer containing the comparison results. If either of the two 1196/// lower double-precision values is NaN, 0 is returned. 1197static __inline__ int __DEFAULT_FN_ATTRS 1198_mm_ucomile_sd(__m128d __a, __m128d __b) 1199{ 1200 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b); 1201} 1202 1203/// Compares the lower double-precision floating-point values in each of 1204/// the two 128-bit floating-point vectors of [2 x double] to determine if 1205/// the value in the first parameter is greater than the corresponding value 1206/// in the second parameter. 1207/// 1208/// The comparison yields 0 for false, 1 for true. If either of the two lower 1209/// double-precision values is NaN, 0 is returned. 1210/// 1211/// \headerfile <x86intrin.h> 1212/// 1213/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1214/// 1215/// \param __a 1216/// A 128-bit vector of [2 x double]. The lower double-precision value is 1217/// compared to the lower double-precision value of \a __b. 1218/// \param __b 1219/// A 128-bit vector of [2 x double]. The lower double-precision value is 1220/// compared to the lower double-precision value of \a __a. 1221/// \returns An integer containing the comparison results. If either of the two 1222/// lower double-precision values is NaN, 0 is returned. 
1223static __inline__ int __DEFAULT_FN_ATTRS 1224_mm_ucomigt_sd(__m128d __a, __m128d __b) 1225{ 1226 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b); 1227} 1228 1229/// Compares the lower double-precision floating-point values in each of 1230/// the two 128-bit floating-point vectors of [2 x double] to determine if 1231/// the value in the first parameter is greater than or equal to the 1232/// corresponding value in the second parameter. 1233/// 1234/// The comparison yields 0 for false, 1 for true. If either of the two 1235/// lower double-precision values is NaN, 0 is returned. 1236/// 1237/// \headerfile <x86intrin.h> 1238/// 1239/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1240/// 1241/// \param __a 1242/// A 128-bit vector of [2 x double]. The lower double-precision value is 1243/// compared to the lower double-precision value of \a __b. 1244/// \param __b 1245/// A 128-bit vector of [2 x double]. The lower double-precision value is 1246/// compared to the lower double-precision value of \a __a. 1247/// \returns An integer containing the comparison results. If either of the two 1248/// lower double-precision values is NaN, 0 is returned. 1249static __inline__ int __DEFAULT_FN_ATTRS 1250_mm_ucomige_sd(__m128d __a, __m128d __b) 1251{ 1252 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b); 1253} 1254 1255/// Compares the lower double-precision floating-point values in each of 1256/// the two 128-bit floating-point vectors of [2 x double] to determine if 1257/// the value in the first parameter is unequal to the corresponding value in 1258/// the second parameter. 1259/// 1260/// The comparison yields 0 for false, 1 for true. If either of the two lower 1261/// double-precision values is NaN, 1 is returned. 1262/// 1263/// \headerfile <x86intrin.h> 1264/// 1265/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1266/// 1267/// \param __a 1268/// A 128-bit vector of [2 x double]. 
The lower double-precision value is 1269/// compared to the lower double-precision value of \a __b. 1270/// \param __b 1271/// A 128-bit vector of [2 x double]. The lower double-precision value is 1272/// compared to the lower double-precision value of \a __a. 1273/// \returns An integer containing the comparison result. If either of the two 1274/// lower double-precision values is NaN, 1 is returned. 1275static __inline__ int __DEFAULT_FN_ATTRS 1276_mm_ucomineq_sd(__m128d __a, __m128d __b) 1277{ 1278 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b); 1279} 1280 1281/// Converts the two double-precision floating-point elements of a 1282/// 128-bit vector of [2 x double] into two single-precision floating-point 1283/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float]. 1284/// The upper 64 bits of the result vector are set to zero. 1285/// 1286/// \headerfile <x86intrin.h> 1287/// 1288/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction. 1289/// 1290/// \param __a 1291/// A 128-bit vector of [2 x double]. 1292/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1293/// converted values. The upper 64 bits are set to zero. 1294static __inline__ __m128 __DEFAULT_FN_ATTRS 1295_mm_cvtpd_ps(__m128d __a) 1296{ 1297 return __builtin_ia32_cvtpd2ps((__v2df)__a); 1298} 1299 1300/// Converts the lower two single-precision floating-point elements of a 1301/// 128-bit vector of [4 x float] into two double-precision floating-point 1302/// values, returned in a 128-bit vector of [2 x double]. The upper two 1303/// elements of the input vector are unused. 1304/// 1305/// \headerfile <x86intrin.h> 1306/// 1307/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction. 1308/// 1309/// \param __a 1310/// A 128-bit vector of [4 x float]. The lower two single-precision 1311/// floating-point elements are converted to double-precision values. The 1312/// upper two elements are unused. 
1313/// \returns A 128-bit vector of [2 x double] containing the converted values. 1314static __inline__ __m128d __DEFAULT_FN_ATTRS 1315_mm_cvtps_pd(__m128 __a) 1316{ 1317 return (__m128d) __builtin_convertvector( 1318 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); 1319} 1320 1321/// Converts the lower two integer elements of a 128-bit vector of 1322/// [4 x i32] into two double-precision floating-point values, returned in a 1323/// 128-bit vector of [2 x double]. 1324/// 1325/// The upper two elements of the input vector are unused. 1326/// 1327/// \headerfile <x86intrin.h> 1328/// 1329/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction. 1330/// 1331/// \param __a 1332/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are 1333/// converted to double-precision values. 1334/// 1335/// The upper two elements are unused. 1336/// \returns A 128-bit vector of [2 x double] containing the converted values. 1337static __inline__ __m128d __DEFAULT_FN_ATTRS 1338_mm_cvtepi32_pd(__m128i __a) 1339{ 1340 return (__m128d) __builtin_convertvector( 1341 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); 1342} 1343 1344/// Converts the two double-precision floating-point elements of a 1345/// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1346/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper 1347/// 64 bits of the result vector are set to zero. 1348/// 1349/// \headerfile <x86intrin.h> 1350/// 1351/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction. 1352/// 1353/// \param __a 1354/// A 128-bit vector of [2 x double]. 1355/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1356/// converted values. The upper 64 bits are set to zero. 
1357static __inline__ __m128i __DEFAULT_FN_ATTRS 1358_mm_cvtpd_epi32(__m128d __a) 1359{ 1360 return __builtin_ia32_cvtpd2dq((__v2df)__a); 1361} 1362 1363/// Converts the low-order element of a 128-bit vector of [2 x double] 1364/// into a 32-bit signed integer value. 1365/// 1366/// \headerfile <x86intrin.h> 1367/// 1368/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 1369/// 1370/// \param __a 1371/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1372/// conversion. 1373/// \returns A 32-bit signed integer containing the converted value. 1374static __inline__ int __DEFAULT_FN_ATTRS 1375_mm_cvtsd_si32(__m128d __a) 1376{ 1377 return __builtin_ia32_cvtsd2si((__v2df)__a); 1378} 1379 1380/// Converts the lower double-precision floating-point element of a 1381/// 128-bit vector of [2 x double], in the second parameter, into a 1382/// single-precision floating-point value, returned in the lower 32 bits of a 1383/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are 1384/// copied from the upper 96 bits of the first parameter. 1385/// 1386/// \headerfile <x86intrin.h> 1387/// 1388/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction. 1389/// 1390/// \param __a 1391/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are 1392/// copied to the upper 96 bits of the result. 1393/// \param __b 1394/// A 128-bit vector of [2 x double]. The lower double-precision 1395/// floating-point element is used in the conversion. 1396/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the 1397/// converted value from the second parameter. The upper 96 bits are copied 1398/// from the upper 96 bits of the first parameter. 
1399static __inline__ __m128 __DEFAULT_FN_ATTRS 1400_mm_cvtsd_ss(__m128 __a, __m128d __b) 1401{ 1402 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b); 1403} 1404 1405/// Converts a 32-bit signed integer value, in the second parameter, into 1406/// a double-precision floating-point value, returned in the lower 64 bits of 1407/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1408/// are copied from the upper 64 bits of the first parameter. 1409/// 1410/// \headerfile <x86intrin.h> 1411/// 1412/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 1413/// 1414/// \param __a 1415/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1416/// copied to the upper 64 bits of the result. 1417/// \param __b 1418/// A 32-bit signed integer containing the value to be converted. 1419/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1420/// converted value from the second parameter. The upper 64 bits are copied 1421/// from the upper 64 bits of the first parameter. 1422static __inline__ __m128d __DEFAULT_FN_ATTRS 1423_mm_cvtsi32_sd(__m128d __a, int __b) 1424{ 1425 __a[0] = __b; 1426 return __a; 1427} 1428 1429/// Converts the lower single-precision floating-point element of a 1430/// 128-bit vector of [4 x float], in the second parameter, into a 1431/// double-precision floating-point value, returned in the lower 64 bits of 1432/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1433/// are copied from the upper 64 bits of the first parameter. 1434/// 1435/// \headerfile <x86intrin.h> 1436/// 1437/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction. 1438/// 1439/// \param __a 1440/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1441/// copied to the upper 64 bits of the result. 1442/// \param __b 1443/// A 128-bit vector of [4 x float]. 
The lower single-precision 1444/// floating-point element is used in the conversion. 1445/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1446/// converted value from the second parameter. The upper 64 bits are copied 1447/// from the upper 64 bits of the first parameter. 1448static __inline__ __m128d __DEFAULT_FN_ATTRS 1449_mm_cvtss_sd(__m128d __a, __m128 __b) 1450{ 1451 __a[0] = __b[0]; 1452 return __a; 1453} 1454 1455/// Converts the two double-precision floating-point elements of a 1456/// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1457/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. 1458/// 1459/// If the result of either conversion is inexact, the result is truncated 1460/// (rounded towards zero) regardless of the current MXCSR setting. The upper 1461/// 64 bits of the result vector are set to zero. 1462/// 1463/// \headerfile <x86intrin.h> 1464/// 1465/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c> 1466/// instruction. 1467/// 1468/// \param __a 1469/// A 128-bit vector of [2 x double]. 1470/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1471/// converted values. The upper 64 bits are set to zero. 1472static __inline__ __m128i __DEFAULT_FN_ATTRS 1473_mm_cvttpd_epi32(__m128d __a) 1474{ 1475 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a); 1476} 1477 1478/// Converts the low-order element of a [2 x double] vector into a 32-bit 1479/// signed integer value, truncating the result when it is inexact. 1480/// 1481/// \headerfile <x86intrin.h> 1482/// 1483/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 1484/// instruction. 1485/// 1486/// \param __a 1487/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1488/// conversion. 1489/// \returns A 32-bit signed integer containing the converted value. 
1490static __inline__ int __DEFAULT_FN_ATTRS 1491_mm_cvttsd_si32(__m128d __a) 1492{ 1493 return __builtin_ia32_cvttsd2si((__v2df)__a); 1494} 1495 1496/// Converts the two double-precision floating-point elements of a 1497/// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1498/// returned in a 64-bit vector of [2 x i32]. 1499/// 1500/// \headerfile <x86intrin.h> 1501/// 1502/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction. 1503/// 1504/// \param __a 1505/// A 128-bit vector of [2 x double]. 1506/// \returns A 64-bit vector of [2 x i32] containing the converted values. 1507static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1508_mm_cvtpd_pi32(__m128d __a) 1509{ 1510 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); 1511} 1512 1513/// Converts the two double-precision floating-point elements of a 1514/// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1515/// returned in a 64-bit vector of [2 x i32]. 1516/// 1517/// If the result of either conversion is inexact, the result is truncated 1518/// (rounded towards zero) regardless of the current MXCSR setting. 1519/// 1520/// \headerfile <x86intrin.h> 1521/// 1522/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction. 1523/// 1524/// \param __a 1525/// A 128-bit vector of [2 x double]. 1526/// \returns A 64-bit vector of [2 x i32] containing the converted values. 1527static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1528_mm_cvttpd_pi32(__m128d __a) 1529{ 1530 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); 1531} 1532 1533/// Converts the two signed 32-bit integer elements of a 64-bit vector of 1534/// [2 x i32] into two double-precision floating-point values, returned in a 1535/// 128-bit vector of [2 x double]. 1536/// 1537/// \headerfile <x86intrin.h> 1538/// 1539/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction. 1540/// 1541/// \param __a 1542/// A 64-bit vector of [2 x i32]. 
/// \returns A 128-bit vector of [2 x double] containing the converted values.
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX
_mm_cvtpi32_pd(__m64 __a)
{
  return __builtin_ia32_cvtpi2pd((__v2si)__a);
}

/// Returns the low-order element of a 128-bit vector of [2 x double] as
///    a double-precision floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
/// \returns A double-precision floating-point value copied from the lower 64
///    bits of \a __a.
static __inline__ double __DEFAULT_FN_ATTRS
_mm_cvtsd_f64(__m128d __a)
{
  return __a[0];
}

/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
///    memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
///
/// \param __dp
///    A pointer to a 128-bit memory location. The address of the memory
///    location has to be 16-byte aligned.
/// \returns A 128-bit vector of [2 x double] containing the loaded values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_pd(double const *__dp)
{
  /* The access goes through __m128d, which is declared with 16-byte
     alignment, hence the alignment requirement on __dp. */
  return *(const __m128d*)__dp;
}

/// Loads a double-precision floating-point value from a specified memory
///    location and duplicates it to both vector elements of a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
///
/// \param __dp
///    A pointer to a memory location containing a double-precision value.
/// \returns A 128-bit vector of [2 x double] containing the loaded and
///    duplicated values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load1_pd(double const *__dp)
{
  /* Packed, may_alias wrapper: the read carries no alignment or type-based
     aliasing assumptions about the pointed-to memory. */
  struct __mm_load1_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((const struct __mm_load1_pd_struct*)__dp)->__u;
  return __extension__ (__m128d){ __u, __u };
}

/* Alternate spelling of _mm_load1_pd. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)

/// Loads two double-precision values, in reverse order, from an aligned
///    memory location into a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
/// needed shuffling instructions. In AVX mode, the shuffling may be combined
/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
///
/// \param __dp
///    A 16-byte aligned pointer to an array of double-precision values to be
///    loaded in reverse order.
/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
///    values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadr_pd(double const *__dp)
{
  __m128d __u = *(const __m128d*)__dp;
  /* Indices (1, 0) swap the two elements of the loaded vector. */
  return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
}

/// Loads a 128-bit floating-point vector of [2 x double] from an
///    unaligned memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
///
/// \param __dp
///    A pointer to a 128-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns A 128-bit vector of [2 x double] containing the loaded values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadu_pd(double const *__dp)
{
  /* Packed, may_alias wrapper around the 1-byte-aligned vector type, so the
     read makes no alignment or type-based aliasing assumptions. */
  struct __loadu_pd {
    __m128d_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_pd*)__dp)->__v;
}

/// Loads a 64-bit integer value to the low element of a 128-bit integer
///    vector and clears the upper element.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
///
/// \param __a
///    A pointer to a 64-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si64(void const *__a)
{
  /* Packed, may_alias wrapper: unaligned, aliasing-safe scalar read. */
  struct __loadu_si64 {
    long long __v;
  } __attribute__((__packed__, __may_alias__));
  long long __u = ((const struct __loadu_si64*)__a)->__v;
  return __extension__ (__m128i)(__v2di){__u, 0LL};
}

/// Loads a 32-bit integer value to the low element of a 128-bit integer
///    vector and clears the upper elements.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
///
/// \param __a
///    A pointer to a 32-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns A 128-bit vector of [4 x i32] containing the loaded value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si32(void const *__a)
{
  /* Packed, may_alias wrapper: unaligned, aliasing-safe scalar read. */
  struct __loadu_si32 {
    int __v;
  } __attribute__((__packed__, __may_alias__));
  int __u = ((const struct __loadu_si32*)__a)->__v;
  /* The three upper 32-bit elements are zero-filled. */
  return __extension__ (__m128i)(__v4si){__u, 0, 0, 0};
}

/// Loads a 16-bit integer value to the low element of a 128-bit integer
///    vector and clears the upper elements.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic does not correspond to a specific instruction.
///
/// \param __a
///    A pointer to a 16-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si16(void const *__a)
{
  /* Packed, may_alias wrapper: unaligned, aliasing-safe scalar read. */
  struct __loadu_si16 {
    short __v;
  } __attribute__((__packed__, __may_alias__));
  short __u = ((const struct __loadu_si16*)__a)->__v;
  /* The seven upper 16-bit elements are zero-filled. */
  return __extension__ (__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
}

/// Loads a 64-bit double-precision value to the low element of a
///    128-bit vector of [2 x double] and clears the upper element.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
///
/// \param __dp
///    A pointer to a memory location containing a double-precision value.
///    The address of the memory location does not have to be aligned.
/// \returns A 128-bit vector of [2 x double] containing the loaded value.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_sd(double const *__dp)
{
  /* Packed, may_alias wrapper: unaligned, aliasing-safe scalar read. */
  struct __mm_load_sd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((const struct __mm_load_sd_struct*)__dp)->__u;
  return __extension__ (__m128d){ __u, 0 };
}

/// Loads a double-precision value into the high-order bits of a 128-bit
///    vector of [2 x double]. The low-order bits are copied from the low-order
///    bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double]. \n
///    Bits [63:0] are written to bits [63:0] of the result.
/// \param __dp
///    A pointer to a 64-bit memory location containing a double-precision
///    floating-point value that is loaded. The loaded value is written to bits
///    [127:64] of the result. The address of the memory location does not have
///    to be aligned.
/// \returns A 128-bit vector of [2 x double] containing the moved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadh_pd(__m128d __a, double const *__dp)
{
  /* Packed, may_alias wrapper: unaligned, aliasing-safe scalar read. */
  struct __mm_loadh_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((const struct __mm_loadh_pd_struct*)__dp)->__u;
  /* Low element comes from __a, high element from memory. */
  return __extension__ (__m128d){ __a[0], __u };
}

/// Loads a double-precision value into the low-order bits of a 128-bit
///    vector of [2 x double]. The high-order bits are copied from the
///    high-order bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double]. \n
///    Bits [127:64] are written to bits [127:64] of the result.
/// \param __dp
///    A pointer to a 64-bit memory location containing a double-precision
///    floating-point value that is loaded. The loaded value is written to bits
///    [63:0] of the result. The address of the memory location does not have to
///    be aligned.
/// \returns A 128-bit vector of [2 x double] containing the moved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadl_pd(__m128d __a, double const *__dp)
{
  /* Packed, may_alias wrapper: unaligned, aliasing-safe scalar read. */
  struct __mm_loadl_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((const struct __mm_loadl_pd_struct*)__dp)->__u;
  /* Low element comes from memory, high element from __a. */
  return __extension__ (__m128d){ __u, __a[1] };
}

/// Constructs a 128-bit floating-point vector of [2 x double] with
///    unspecified content.
///    This could be used as an argument to another intrinsic function where
///    the argument is required but the value is not actually used.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
///    content.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_undefined_pd(void)
{
  return (__m128d)__builtin_ia32_undef128();
}

/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
///    64 bits of the vector are initialized with the specified double-precision
///    floating-point value. The upper 64 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
///
/// \param __w
///    A double-precision floating-point value used to initialize the lower 64
///    bits of the result.
/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
///    lower 64 bits contain the value of the parameter. The upper 64 bits are
///    set to zero.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_sd(double __w)
{
  return __extension__ (__m128d){ __w, 0 };
}

/// Constructs a 128-bit floating-point vector of [2 x double], with each
///    of the two double-precision floating-point vector elements set to the
///    specified double-precision floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
///
/// \param __w
///    A double-precision floating-point value used to initialize each vector
///    element of the result.
/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set1_pd(double __w)
{
  return __extension__ (__m128d){ __w, __w };
}

/// Constructs a 128-bit floating-point vector of [2 x double], with each
///    of the two double-precision floating-point vector elements set to the
///    specified double-precision floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
///
/// \param __w
///    A double-precision floating-point value used to initialize each vector
///    element of the result.
/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_pd1(double __w)
{
  /* Alternate spelling; forwards to _mm_set1_pd. */
  return _mm_set1_pd(__w);
}

/// Constructs a 128-bit floating-point vector of [2 x double]
///    initialized with the specified double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
///
/// \param __w
///    A double-precision floating-point value used to initialize the upper 64
///    bits of the result.
/// \param __x
///    A double-precision floating-point value used to initialize the lower 64
///    bits of the result.
/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_pd(double __w, double __x)
{
  /* Arguments are given high-to-low, so element 0 is __x and element 1 is
     __w. */
  return __extension__ (__m128d){ __x, __w };
}

/// Constructs a 128-bit floating-point vector of [2 x double],
///    initialized in reverse order with the specified double-precision
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
///
/// \param __w
///    A double-precision floating-point value used to initialize the lower 64
///    bits of the result.
/// \param __x
///    A double-precision floating-point value used to initialize the upper 64
///    bits of the result.
/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setr_pd(double __w, double __x)
{
  /* Arguments are given low-to-high: element 0 is __w, element 1 is __x. */
  return __extension__ (__m128d){ __w, __x };
}

/// Constructs a 128-bit floating-point vector of [2 x double]
///    initialized to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
///
/// \returns An initialized 128-bit floating-point vector of [2 x double] with
///    all elements set to zero.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setzero_pd(void)
{
  return __extension__ (__m128d){ 0, 0 };
}

/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
///    64 bits are set to the lower 64 bits of the second parameter. The upper
///    64 bits are set to the upper 64 bits of the first parameter.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
///    upper 64 bits of the result.
/// \param __b
///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
///    lower 64 bits of the result.
/// \returns A 128-bit vector of [2 x double] containing the moved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_move_sd(__m128d __a, __m128d __b)
{
  /* Only element 0 of __a is replaced; element 1 is returned unchanged. */
  __a[0] = __b[0];
  return __a;
}

/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
///    memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
///
/// \param __dp
///    A pointer to a 64-bit memory location.
/// \param __a
///    A 128-bit vector of [2 x double] containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_sd(double *__dp, __m128d __a)
{
  /* Packed, may_alias wrapper: unaligned, aliasing-safe scalar write. */
  struct __mm_store_sd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
}

/// Moves packed double-precision values from a 128-bit vector of
///    [2 x double] to a memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
///
/// \param __dp
///    A pointer to an aligned memory location that can store two
///    double-precision values.
/// \param __a
///    A packed 128-bit vector of [2 x double] containing the values to be
///    moved.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_pd(double *__dp, __m128d __a)
{
  /* The write goes through __m128d, which is declared with 16-byte
     alignment, hence the alignment requirement on __dp. */
  *(__m128d*)__dp = __a;
}

/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
///    the upper and lower 64 bits of a memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the
///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
///
/// \param __dp
///    A pointer to a 16-byte aligned memory location that can store two
///    double-precision values.
/// \param __a
///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
///    of the values in \a __dp.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_pd(double *__dp, __m128d __a)
{
  /* Indices (0, 0) broadcast the low element to both positions. */
  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
  /* _mm_store_pd performs an aligned store. */
  _mm_store_pd(__dp, __a);
}

/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
///    the upper and lower 64 bits of a memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the
///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
///
/// \param __dp
///    A pointer to a 16-byte aligned memory location that can store two
///    double-precision values.
/// \param __a
///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
///    of the values in \a __dp.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_pd1(double *__dp, __m128d __a)
{
  /* Alternate spelling; forwards to _mm_store1_pd. */
  _mm_store1_pd(__dp, __a);
}

/// Stores a 128-bit vector of [2 x double] into an unaligned memory
///    location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
///
/// \param __dp
///    A pointer to a 128-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \param __a
///    A 128-bit vector of [2 x double] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_pd(double *__dp, __m128d __a)
{
  /* Packed, may_alias wrapper around the 1-byte-aligned vector type, so the
     write makes no alignment or type-based aliasing assumptions. */
  struct __storeu_pd {
    __m128d_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_pd*)__dp)->__v = __a;
}

/// Stores two double-precision values, in reverse order, from a 128-bit
///    vector of [2 x double] to a 16-byte aligned memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to a shuffling instruction followed by a
/// <c> VMOVAPD / MOVAPD </c> instruction.
2046/// 2047/// \param __dp 2048/// A pointer to a 16-byte aligned memory location that can store two 2049/// double-precision values. 2050/// \param __a 2051/// A 128-bit vector of [2 x double] containing the values to be reversed and 2052/// stored. 2053static __inline__ void __DEFAULT_FN_ATTRS 2054_mm_storer_pd(double *__dp, __m128d __a) 2055{ 2056 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0); 2057 *(__m128d *)__dp = __a; 2058} 2059 2060/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a 2061/// memory location. 2062/// 2063/// \headerfile <x86intrin.h> 2064/// 2065/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 2066/// 2067/// \param __dp 2068/// A pointer to a 64-bit memory location. 2069/// \param __a 2070/// A 128-bit vector of [2 x double] containing the value to be stored. 2071static __inline__ void __DEFAULT_FN_ATTRS 2072_mm_storeh_pd(double *__dp, __m128d __a) 2073{ 2074 struct __mm_storeh_pd_struct { 2075 double __u; 2076 } __attribute__((__packed__, __may_alias__)); 2077 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; 2078} 2079 2080/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 2081/// memory location. 2082/// 2083/// \headerfile <x86intrin.h> 2084/// 2085/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 2086/// 2087/// \param __dp 2088/// A pointer to a 64-bit memory location. 2089/// \param __a 2090/// A 128-bit vector of [2 x double] containing the value to be stored. 2091static __inline__ void __DEFAULT_FN_ATTRS 2092_mm_storel_pd(double *__dp, __m128d __a) 2093{ 2094 struct __mm_storeh_pd_struct { 2095 double __u; 2096 } __attribute__((__packed__, __may_alias__)); 2097 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; 2098} 2099 2100/// Adds the corresponding elements of two 128-bit vectors of [16 x i8], 2101/// saving the lower 8 bits of each sum in the corresponding element of a 2102/// 128-bit result vector of [16 x i8]. 
///
///    The integer elements of both parameters can be either signed or unsigned.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
///
/// \param __a
///    A 128-bit vector of [16 x i8].
/// \param __b
///    A 128-bit vector of [16 x i8].
/// \returns A 128-bit vector of [16 x i8] containing the sums of both
///    parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi8(__m128i __a, __m128i __b)
{
  /* Unsigned lanes: each sum wraps modulo 2^8, i.e. keeps the lower 8 bits. */
  return (__m128i)((__v16qu)__a + (__v16qu)__b);
}

/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
///    saving the lower 16 bits of each sum in the corresponding element of a
///    128-bit result vector of [8 x i16].
///
///    The integer elements of both parameters can be either signed or unsigned.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
///
/// \param __a
///    A 128-bit vector of [8 x i16].
/// \param __b
///    A 128-bit vector of [8 x i16].
/// \returns A 128-bit vector of [8 x i16] containing the sums of both
///    parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi16(__m128i __a, __m128i __b)
{
  /* Unsigned lanes: each sum wraps modulo 2^16, i.e. keeps the lower 16
     bits. */
  return (__m128i)((__v8hu)__a + (__v8hu)__b);
}

/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
///    saving the lower 32 bits of each sum in the corresponding element of a
///    128-bit result vector of [4 x i32].
///
///    The integer elements of both parameters can be either signed or unsigned.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x i32].
/// \param __b
///    A 128-bit vector of [4 x i32].
/// \returns A 128-bit vector of [4 x i32] containing the sums of both
///    parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi32(__m128i __a, __m128i __b)
{
  /* Unsigned lanes: each sum wraps modulo 2^32, i.e. keeps the lower 32
     bits. */
  return (__m128i)((__v4su)__a + (__v4su)__b);
}

/// Adds two signed or unsigned 64-bit integer values, returning the
///    lower 64 bits of the sum.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
///
/// \param __a
///    A 64-bit integer.
/// \param __b
///    A 64-bit integer.
/// \returns A 64-bit integer containing the sum of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_add_si64(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
}

/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
///    saving the lower 64 bits of each sum in the corresponding element of a
///    128-bit result vector of [2 x i64].
///
///    The integer elements of both parameters can be either signed or unsigned.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x i64].
/// \param __b
///    A 128-bit vector of [2 x i64].
/// \returns A 128-bit vector of [2 x i64] containing the sums of both
///    parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi64(__m128i __a, __m128i __b)
{
  /* Unsigned lanes: each sum wraps modulo 2^64, i.e. keeps the lower 64
     bits. */
  return (__m128i)((__v2du)__a + (__v2du)__b);
}

/// Adds, with saturation, the corresponding elements of two 128-bit
///    signed [16 x i8] vectors, saving each sum in the corresponding element of
///    a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
///    saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
///
/// \param __a
///    A 128-bit signed [16 x i8] vector.
/// \param __b
///    A 128-bit signed [16 x i8] vector.
/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
///    both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi8(__m128i __a, __m128i __b)
{
  /* Operands are reinterpreted as [16 x i8] for the builtin. */
  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
}

/// Adds, with saturation, the corresponding elements of two 128-bit
///    signed [8 x i16] vectors, saving each sum in the corresponding element of
///    a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
///    are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
///    0x8000.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
///
/// \param __a
///    A 128-bit signed [8 x i16] vector.
/// \param __b
///    A 128-bit signed [8 x i16] vector.
/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
///    both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi16(__m128i __a, __m128i __b)
{
  /* Operands are reinterpreted as [8 x i16] for the builtin. */
  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
}

/// Adds, with saturation, the corresponding elements of two 128-bit
///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
///    of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
///    are saturated to 0xFF. Negative sums are saturated to 0x00.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
///
/// \param __a
///    A 128-bit unsigned [16 x i8] vector.
/// \param __b
///    A 128-bit unsigned [16 x i8] vector.
/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
///    of both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu8(__m128i __a, __m128i __b)
{
  /* Operands are reinterpreted as [16 x i8] for the builtin. */
  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
}

/// Adds, with saturation, the corresponding elements of two 128-bit
///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
///    of a 128-bit result vector of [8 x i16]. Positive sums greater than
///    0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
///
/// \param __a
///    A 128-bit unsigned [8 x i16] vector.
/// \param __b
///    A 128-bit unsigned [8 x i16] vector.
/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
///    of both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu16(__m128i __a, __m128i __b)
{
  /* Operands are reinterpreted as [8 x i16] for the builtin. */
  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
}

/// Computes the rounded averages of corresponding elements of two
///    128-bit unsigned [16 x i8] vectors, saving each result in the
///    corresponding element of a 128-bit result vector of [16 x i8].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
///
/// \param __a
///    A 128-bit unsigned [16 x i8] vector.
/// \param __b
///    A 128-bit unsigned [16 x i8] vector.
/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
///    averages of both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu8(__m128i __a, __m128i __b)
{
  /* Operands are reinterpreted as [16 x i8] for the builtin. */
  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
}

/// Computes the rounded averages of corresponding elements of two
///    128-bit unsigned [8 x i16] vectors, saving each result in the
///    corresponding element of a 128-bit result vector of [8 x i16].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
///
/// \param __a
///    A 128-bit unsigned [8 x i16] vector.
/// \param __b
///    A 128-bit unsigned [8 x i16] vector.
/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
///    averages of both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu16(__m128i __a, __m128i __b)
{
  /* Operands are reinterpreted as [8 x i16] for the builtin. */
  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
}

/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
///    vectors, producing eight intermediate 32-bit signed integer products, and
///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
///    [4 x i32] vector.
///
///    For example, bits [15:0] of both parameters are multiplied producing a
///    32-bit product, bits [31:16] of both parameters are multiplied producing
///    a 32-bit product, and the sum of those two products becomes bits [31:0]
///    of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
///
/// \param __a
///    A 128-bit signed [8 x i16] vector.
/// \param __b
///    A 128-bit signed [8 x i16] vector.
/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
///    of both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_madd_epi16(__m128i __a, __m128i __b)
{
  /* Operands are reinterpreted as [8 x i16] for the builtin. */
  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
}

/// Compares corresponding elements of two 128-bit signed [8 x i16]
///    vectors, saving the greater value from each comparison in the
///    corresponding element of a 128-bit result vector of [8 x i16].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
///
/// \param __a
///    A 128-bit signed [8 x i16] vector.
/// \param __b
///    A 128-bit signed [8 x i16] vector.
/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
///    each comparison.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi16(__m128i __a, __m128i __b)
{
  /* Operands are reinterpreted as [8 x i16] for the builtin. */
  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
}

/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
///    vectors, saving the greater value from each comparison in the
///    corresponding element of a 128-bit result vector of [16 x i8].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
///
/// \param __a
///    A 128-bit unsigned [16 x i8] vector.
/// \param __b
///    A 128-bit unsigned [16 x i8] vector.
/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
///    each comparison.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu8(__m128i __a, __m128i __b)
{
  /* Operands are reinterpreted as [16 x i8] for the builtin. */
  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
}

/// Compares corresponding elements of two 128-bit signed [8 x i16]
///    vectors, saving the smaller value from each comparison in the
///    corresponding element of a 128-bit result vector of [8 x i16].
2400/// 2401/// \headerfile <x86intrin.h> 2402/// 2403/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction. 2404/// 2405/// \param __a 2406/// A 128-bit signed [8 x i16] vector. 2407/// \param __b 2408/// A 128-bit signed [8 x i16] vector. 2409/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of 2410/// each comparison. 2411static __inline__ __m128i __DEFAULT_FN_ATTRS 2412_mm_min_epi16(__m128i __a, __m128i __b) 2413{ 2414 return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); 2415} 2416 2417/// Compares corresponding elements of two 128-bit unsigned [16 x i8] 2418/// vectors, saving the smaller value from each comparison in the 2419/// corresponding element of a 128-bit result vector of [16 x i8]. 2420/// 2421/// \headerfile <x86intrin.h> 2422/// 2423/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction. 2424/// 2425/// \param __a 2426/// A 128-bit unsigned [16 x i8] vector. 2427/// \param __b 2428/// A 128-bit unsigned [16 x i8] vector. 2429/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of 2430/// each comparison. 2431static __inline__ __m128i __DEFAULT_FN_ATTRS 2432_mm_min_epu8(__m128i __a, __m128i __b) 2433{ 2434 return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); 2435} 2436 2437/// Multiplies the corresponding elements of two signed [8 x i16] 2438/// vectors, saving the upper 16 bits of each 32-bit product in the 2439/// corresponding element of a 128-bit signed [8 x i16] result vector. 2440/// 2441/// \headerfile <x86intrin.h> 2442/// 2443/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction. 2444/// 2445/// \param __a 2446/// A 128-bit signed [8 x i16] vector. 2447/// \param __b 2448/// A 128-bit signed [8 x i16] vector. 2449/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of 2450/// each of the eight 32-bit products. 
2451static __inline__ __m128i __DEFAULT_FN_ATTRS 2452_mm_mulhi_epi16(__m128i __a, __m128i __b) 2453{ 2454 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 2455} 2456 2457/// Multiplies the corresponding elements of two unsigned [8 x i16] 2458/// vectors, saving the upper 16 bits of each 32-bit product in the 2459/// corresponding element of a 128-bit unsigned [8 x i16] result vector. 2460/// 2461/// \headerfile <x86intrin.h> 2462/// 2463/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction. 2464/// 2465/// \param __a 2466/// A 128-bit unsigned [8 x i16] vector. 2467/// \param __b 2468/// A 128-bit unsigned [8 x i16] vector. 2469/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits 2470/// of each of the eight 32-bit products. 2471static __inline__ __m128i __DEFAULT_FN_ATTRS 2472_mm_mulhi_epu16(__m128i __a, __m128i __b) 2473{ 2474 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 2475} 2476 2477/// Multiplies the corresponding elements of two signed [8 x i16] 2478/// vectors, saving the lower 16 bits of each 32-bit product in the 2479/// corresponding element of a 128-bit signed [8 x i16] result vector. 2480/// 2481/// \headerfile <x86intrin.h> 2482/// 2483/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction. 2484/// 2485/// \param __a 2486/// A 128-bit signed [8 x i16] vector. 2487/// \param __b 2488/// A 128-bit signed [8 x i16] vector. 2489/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of 2490/// each of the eight 32-bit products. 2491static __inline__ __m128i __DEFAULT_FN_ATTRS 2492_mm_mullo_epi16(__m128i __a, __m128i __b) 2493{ 2494 return (__m128i)((__v8hu)__a * (__v8hu)__b); 2495} 2496 2497/// Multiplies 32-bit unsigned integer values contained in the lower bits 2498/// of the two 64-bit integer vectors and returns the 64-bit unsigned 2499/// product. 
2500/// 2501/// \headerfile <x86intrin.h> 2502/// 2503/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction. 2504/// 2505/// \param __a 2506/// A 64-bit integer containing one of the source operands. 2507/// \param __b 2508/// A 64-bit integer containing one of the source operands. 2509/// \returns A 64-bit integer vector containing the product of both operands. 2510static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2511_mm_mul_su32(__m64 __a, __m64 __b) 2512{ 2513 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); 2514} 2515 2516/// Multiplies 32-bit unsigned integer values contained in the lower 2517/// bits of the corresponding elements of two [2 x i64] vectors, and returns 2518/// the 64-bit products in the corresponding elements of a [2 x i64] vector. 2519/// 2520/// \headerfile <x86intrin.h> 2521/// 2522/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction. 2523/// 2524/// \param __a 2525/// A [2 x i64] vector containing one of the source operands. 2526/// \param __b 2527/// A [2 x i64] vector containing one of the source operands. 2528/// \returns A [2 x i64] vector containing the product of both operands. 2529static __inline__ __m128i __DEFAULT_FN_ATTRS 2530_mm_mul_epu32(__m128i __a, __m128i __b) 2531{ 2532 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 2533} 2534 2535/// Computes the absolute differences of corresponding 8-bit integer 2536/// values in two 128-bit vectors. Sums the first 8 absolute differences, and 2537/// separately sums the second 8 absolute differences. Packs these two 2538/// unsigned 16-bit integer sums into the upper and lower elements of a 2539/// [2 x i64] vector. 2540/// 2541/// \headerfile <x86intrin.h> 2542/// 2543/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction. 2544/// 2545/// \param __a 2546/// A 128-bit integer vector containing one of the source operands. 2547/// \param __b 2548/// A 128-bit integer vector containing one of the source operands. 
2549/// \returns A [2 x i64] vector containing the sums of the sets of absolute 2550/// differences between both operands. 2551static __inline__ __m128i __DEFAULT_FN_ATTRS 2552_mm_sad_epu8(__m128i __a, __m128i __b) 2553{ 2554 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 2555} 2556 2557/// Subtracts the corresponding 8-bit integer values in the operands. 2558/// 2559/// \headerfile <x86intrin.h> 2560/// 2561/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction. 2562/// 2563/// \param __a 2564/// A 128-bit integer vector containing the minuends. 2565/// \param __b 2566/// A 128-bit integer vector containing the subtrahends. 2567/// \returns A 128-bit integer vector containing the differences of the values 2568/// in the operands. 2569static __inline__ __m128i __DEFAULT_FN_ATTRS 2570_mm_sub_epi8(__m128i __a, __m128i __b) 2571{ 2572 return (__m128i)((__v16qu)__a - (__v16qu)__b); 2573} 2574 2575/// Subtracts the corresponding 16-bit integer values in the operands. 2576/// 2577/// \headerfile <x86intrin.h> 2578/// 2579/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction. 2580/// 2581/// \param __a 2582/// A 128-bit integer vector containing the minuends. 2583/// \param __b 2584/// A 128-bit integer vector containing the subtrahends. 2585/// \returns A 128-bit integer vector containing the differences of the values 2586/// in the operands. 2587static __inline__ __m128i __DEFAULT_FN_ATTRS 2588_mm_sub_epi16(__m128i __a, __m128i __b) 2589{ 2590 return (__m128i)((__v8hu)__a - (__v8hu)__b); 2591} 2592 2593/// Subtracts the corresponding 32-bit integer values in the operands. 2594/// 2595/// \headerfile <x86intrin.h> 2596/// 2597/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction. 2598/// 2599/// \param __a 2600/// A 128-bit integer vector containing the minuends. 2601/// \param __b 2602/// A 128-bit integer vector containing the subtrahends. 
2603/// \returns A 128-bit integer vector containing the differences of the values 2604/// in the operands. 2605static __inline__ __m128i __DEFAULT_FN_ATTRS 2606_mm_sub_epi32(__m128i __a, __m128i __b) 2607{ 2608 return (__m128i)((__v4su)__a - (__v4su)__b); 2609} 2610 2611/// Subtracts signed or unsigned 64-bit integer values and writes the 2612/// difference to the corresponding bits in the destination. 2613/// 2614/// \headerfile <x86intrin.h> 2615/// 2616/// This intrinsic corresponds to the <c> PSUBQ </c> instruction. 2617/// 2618/// \param __a 2619/// A 64-bit integer vector containing the minuend. 2620/// \param __b 2621/// A 64-bit integer vector containing the subtrahend. 2622/// \returns A 64-bit integer vector containing the difference of the values in 2623/// the operands. 2624static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2625_mm_sub_si64(__m64 __a, __m64 __b) 2626{ 2627 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); 2628} 2629 2630/// Subtracts the corresponding elements of two [2 x i64] vectors. 2631/// 2632/// \headerfile <x86intrin.h> 2633/// 2634/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction. 2635/// 2636/// \param __a 2637/// A 128-bit integer vector containing the minuends. 2638/// \param __b 2639/// A 128-bit integer vector containing the subtrahends. 2640/// \returns A 128-bit integer vector containing the differences of the values 2641/// in the operands. 2642static __inline__ __m128i __DEFAULT_FN_ATTRS 2643_mm_sub_epi64(__m128i __a, __m128i __b) 2644{ 2645 return (__m128i)((__v2du)__a - (__v2du)__b); 2646} 2647 2648/// Subtracts corresponding 8-bit signed integer values in the input and 2649/// returns the differences in the corresponding bytes in the destination. 2650/// Differences greater than 0x7F are saturated to 0x7F, and differences less 2651/// than 0x80 are saturated to 0x80. 
2652/// 2653/// \headerfile <x86intrin.h> 2654/// 2655/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction. 2656/// 2657/// \param __a 2658/// A 128-bit integer vector containing the minuends. 2659/// \param __b 2660/// A 128-bit integer vector containing the subtrahends. 2661/// \returns A 128-bit integer vector containing the differences of the values 2662/// in the operands. 2663static __inline__ __m128i __DEFAULT_FN_ATTRS 2664_mm_subs_epi8(__m128i __a, __m128i __b) 2665{ 2666 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); 2667} 2668 2669/// Subtracts corresponding 16-bit signed integer values in the input and 2670/// returns the differences in the corresponding bytes in the destination. 2671/// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less 2672/// than 0x8000 are saturated to 0x8000. 2673/// 2674/// \headerfile <x86intrin.h> 2675/// 2676/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction. 2677/// 2678/// \param __a 2679/// A 128-bit integer vector containing the minuends. 2680/// \param __b 2681/// A 128-bit integer vector containing the subtrahends. 2682/// \returns A 128-bit integer vector containing the differences of the values 2683/// in the operands. 2684static __inline__ __m128i __DEFAULT_FN_ATTRS 2685_mm_subs_epi16(__m128i __a, __m128i __b) 2686{ 2687 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); 2688} 2689 2690/// Subtracts corresponding 8-bit unsigned integer values in the input 2691/// and returns the differences in the corresponding bytes in the 2692/// destination. Differences less than 0x00 are saturated to 0x00. 2693/// 2694/// \headerfile <x86intrin.h> 2695/// 2696/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction. 2697/// 2698/// \param __a 2699/// A 128-bit integer vector containing the minuends. 2700/// \param __b 2701/// A 128-bit integer vector containing the subtrahends. 
2702/// \returns A 128-bit integer vector containing the unsigned integer 2703/// differences of the values in the operands. 2704static __inline__ __m128i __DEFAULT_FN_ATTRS 2705_mm_subs_epu8(__m128i __a, __m128i __b) 2706{ 2707 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); 2708} 2709 2710/// Subtracts corresponding 16-bit unsigned integer values in the input 2711/// and returns the differences in the corresponding bytes in the 2712/// destination. Differences less than 0x0000 are saturated to 0x0000. 2713/// 2714/// \headerfile <x86intrin.h> 2715/// 2716/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction. 2717/// 2718/// \param __a 2719/// A 128-bit integer vector containing the minuends. 2720/// \param __b 2721/// A 128-bit integer vector containing the subtrahends. 2722/// \returns A 128-bit integer vector containing the unsigned integer 2723/// differences of the values in the operands. 2724static __inline__ __m128i __DEFAULT_FN_ATTRS 2725_mm_subs_epu16(__m128i __a, __m128i __b) 2726{ 2727 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); 2728} 2729 2730/// Performs a bitwise AND of two 128-bit integer vectors. 2731/// 2732/// \headerfile <x86intrin.h> 2733/// 2734/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 2735/// 2736/// \param __a 2737/// A 128-bit integer vector containing one of the source operands. 2738/// \param __b 2739/// A 128-bit integer vector containing one of the source operands. 2740/// \returns A 128-bit integer vector containing the bitwise AND of the values 2741/// in both operands. 2742static __inline__ __m128i __DEFAULT_FN_ATTRS 2743_mm_and_si128(__m128i __a, __m128i __b) 2744{ 2745 return (__m128i)((__v2du)__a & (__v2du)__b); 2746} 2747 2748/// Performs a bitwise AND of two 128-bit integer vectors, using the 2749/// one's complement of the values contained in the first source operand. 
2750/// 2751/// \headerfile <x86intrin.h> 2752/// 2753/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 2754/// 2755/// \param __a 2756/// A 128-bit vector containing the left source operand. The one's complement 2757/// of this value is used in the bitwise AND. 2758/// \param __b 2759/// A 128-bit vector containing the right source operand. 2760/// \returns A 128-bit integer vector containing the bitwise AND of the one's 2761/// complement of the first operand and the values in the second operand. 2762static __inline__ __m128i __DEFAULT_FN_ATTRS 2763_mm_andnot_si128(__m128i __a, __m128i __b) 2764{ 2765 return (__m128i)(~(__v2du)__a & (__v2du)__b); 2766} 2767/// Performs a bitwise OR of two 128-bit integer vectors. 2768/// 2769/// \headerfile <x86intrin.h> 2770/// 2771/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 2772/// 2773/// \param __a 2774/// A 128-bit integer vector containing one of the source operands. 2775/// \param __b 2776/// A 128-bit integer vector containing one of the source operands. 2777/// \returns A 128-bit integer vector containing the bitwise OR of the values 2778/// in both operands. 2779static __inline__ __m128i __DEFAULT_FN_ATTRS 2780_mm_or_si128(__m128i __a, __m128i __b) 2781{ 2782 return (__m128i)((__v2du)__a | (__v2du)__b); 2783} 2784 2785/// Performs a bitwise exclusive OR of two 128-bit integer vectors. 2786/// 2787/// \headerfile <x86intrin.h> 2788/// 2789/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 2790/// 2791/// \param __a 2792/// A 128-bit integer vector containing one of the source operands. 2793/// \param __b 2794/// A 128-bit integer vector containing one of the source operands. 2795/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the 2796/// values in both operands. 
2797static __inline__ __m128i __DEFAULT_FN_ATTRS 2798_mm_xor_si128(__m128i __a, __m128i __b) 2799{ 2800 return (__m128i)((__v2du)__a ^ (__v2du)__b); 2801} 2802 2803/// Left-shifts the 128-bit integer vector operand by the specified 2804/// number of bytes. Low-order bits are cleared. 2805/// 2806/// \headerfile <x86intrin.h> 2807/// 2808/// \code 2809/// __m128i _mm_slli_si128(__m128i a, const int imm); 2810/// \endcode 2811/// 2812/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction. 2813/// 2814/// \param a 2815/// A 128-bit integer vector containing the source operand. 2816/// \param imm 2817/// An immediate value specifying the number of bytes to left-shift operand 2818/// \a a. 2819/// \returns A 128-bit integer vector containing the left-shifted value. 2820#define _mm_slli_si128(a, imm) \ 2821 (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) 2822 2823#define _mm_bslli_si128(a, imm) \ 2824 (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) 2825 2826/// Left-shifts each 16-bit value in the 128-bit integer vector operand 2827/// by the specified number of bits. Low-order bits are cleared. 2828/// 2829/// \headerfile <x86intrin.h> 2830/// 2831/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2832/// 2833/// \param __a 2834/// A 128-bit integer vector containing the source operand. 2835/// \param __count 2836/// An integer value specifying the number of bits to left-shift each value 2837/// in operand \a __a. 2838/// \returns A 128-bit integer vector containing the left-shifted values. 2839static __inline__ __m128i __DEFAULT_FN_ATTRS 2840_mm_slli_epi16(__m128i __a, int __count) 2841{ 2842 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 2843} 2844 2845/// Left-shifts each 16-bit value in the 128-bit integer vector operand 2846/// by the specified number of bits. Low-order bits are cleared. 
2847/// 2848/// \headerfile <x86intrin.h> 2849/// 2850/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2851/// 2852/// \param __a 2853/// A 128-bit integer vector containing the source operand. 2854/// \param __count 2855/// A 128-bit integer vector in which bits [63:0] specify the number of bits 2856/// to left-shift each value in operand \a __a. 2857/// \returns A 128-bit integer vector containing the left-shifted values. 2858static __inline__ __m128i __DEFAULT_FN_ATTRS 2859_mm_sll_epi16(__m128i __a, __m128i __count) 2860{ 2861 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 2862} 2863 2864/// Left-shifts each 32-bit value in the 128-bit integer vector operand 2865/// by the specified number of bits. Low-order bits are cleared. 2866/// 2867/// \headerfile <x86intrin.h> 2868/// 2869/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2870/// 2871/// \param __a 2872/// A 128-bit integer vector containing the source operand. 2873/// \param __count 2874/// An integer value specifying the number of bits to left-shift each value 2875/// in operand \a __a. 2876/// \returns A 128-bit integer vector containing the left-shifted values. 2877static __inline__ __m128i __DEFAULT_FN_ATTRS 2878_mm_slli_epi32(__m128i __a, int __count) 2879{ 2880 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 2881} 2882 2883/// Left-shifts each 32-bit value in the 128-bit integer vector operand 2884/// by the specified number of bits. Low-order bits are cleared. 2885/// 2886/// \headerfile <x86intrin.h> 2887/// 2888/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2889/// 2890/// \param __a 2891/// A 128-bit integer vector containing the source operand. 2892/// \param __count 2893/// A 128-bit integer vector in which bits [63:0] specify the number of bits 2894/// to left-shift each value in operand \a __a. 2895/// \returns A 128-bit integer vector containing the left-shifted values. 
2896static __inline__ __m128i __DEFAULT_FN_ATTRS 2897_mm_sll_epi32(__m128i __a, __m128i __count) 2898{ 2899 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 2900} 2901 2902/// Left-shifts each 64-bit value in the 128-bit integer vector operand 2903/// by the specified number of bits. Low-order bits are cleared. 2904/// 2905/// \headerfile <x86intrin.h> 2906/// 2907/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2908/// 2909/// \param __a 2910/// A 128-bit integer vector containing the source operand. 2911/// \param __count 2912/// An integer value specifying the number of bits to left-shift each value 2913/// in operand \a __a. 2914/// \returns A 128-bit integer vector containing the left-shifted values. 2915static __inline__ __m128i __DEFAULT_FN_ATTRS 2916_mm_slli_epi64(__m128i __a, int __count) 2917{ 2918 return __builtin_ia32_psllqi128((__v2di)__a, __count); 2919} 2920 2921/// Left-shifts each 64-bit value in the 128-bit integer vector operand 2922/// by the specified number of bits. Low-order bits are cleared. 2923/// 2924/// \headerfile <x86intrin.h> 2925/// 2926/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2927/// 2928/// \param __a 2929/// A 128-bit integer vector containing the source operand. 2930/// \param __count 2931/// A 128-bit integer vector in which bits [63:0] specify the number of bits 2932/// to left-shift each value in operand \a __a. 2933/// \returns A 128-bit integer vector containing the left-shifted values. 2934static __inline__ __m128i __DEFAULT_FN_ATTRS 2935_mm_sll_epi64(__m128i __a, __m128i __count) 2936{ 2937 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); 2938} 2939 2940/// Right-shifts each 16-bit value in the 128-bit integer vector operand 2941/// by the specified number of bits. High-order bits are filled with the sign 2942/// bit of the initial value. 
2943/// 2944/// \headerfile <x86intrin.h> 2945/// 2946/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2947/// 2948/// \param __a 2949/// A 128-bit integer vector containing the source operand. 2950/// \param __count 2951/// An integer value specifying the number of bits to right-shift each value 2952/// in operand \a __a. 2953/// \returns A 128-bit integer vector containing the right-shifted values. 2954static __inline__ __m128i __DEFAULT_FN_ATTRS 2955_mm_srai_epi16(__m128i __a, int __count) 2956{ 2957 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 2958} 2959 2960/// Right-shifts each 16-bit value in the 128-bit integer vector operand 2961/// by the specified number of bits. High-order bits are filled with the sign 2962/// bit of the initial value. 2963/// 2964/// \headerfile <x86intrin.h> 2965/// 2966/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2967/// 2968/// \param __a 2969/// A 128-bit integer vector containing the source operand. 2970/// \param __count 2971/// A 128-bit integer vector in which bits [63:0] specify the number of bits 2972/// to right-shift each value in operand \a __a. 2973/// \returns A 128-bit integer vector containing the right-shifted values. 2974static __inline__ __m128i __DEFAULT_FN_ATTRS 2975_mm_sra_epi16(__m128i __a, __m128i __count) 2976{ 2977 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 2978} 2979 2980/// Right-shifts each 32-bit value in the 128-bit integer vector operand 2981/// by the specified number of bits. High-order bits are filled with the sign 2982/// bit of the initial value. 2983/// 2984/// \headerfile <x86intrin.h> 2985/// 2986/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 2987/// 2988/// \param __a 2989/// A 128-bit integer vector containing the source operand. 2990/// \param __count 2991/// An integer value specifying the number of bits to right-shift each value 2992/// in operand \a __a. 
2993/// \returns A 128-bit integer vector containing the right-shifted values. 2994static __inline__ __m128i __DEFAULT_FN_ATTRS 2995_mm_srai_epi32(__m128i __a, int __count) 2996{ 2997 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 2998} 2999 3000/// Right-shifts each 32-bit value in the 128-bit integer vector operand 3001/// by the specified number of bits. High-order bits are filled with the sign 3002/// bit of the initial value. 3003/// 3004/// \headerfile <x86intrin.h> 3005/// 3006/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 3007/// 3008/// \param __a 3009/// A 128-bit integer vector containing the source operand. 3010/// \param __count 3011/// A 128-bit integer vector in which bits [63:0] specify the number of bits 3012/// to right-shift each value in operand \a __a. 3013/// \returns A 128-bit integer vector containing the right-shifted values. 3014static __inline__ __m128i __DEFAULT_FN_ATTRS 3015_mm_sra_epi32(__m128i __a, __m128i __count) 3016{ 3017 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 3018} 3019 3020/// Right-shifts the 128-bit integer vector operand by the specified 3021/// number of bytes. High-order bits are cleared. 3022/// 3023/// \headerfile <x86intrin.h> 3024/// 3025/// \code 3026/// __m128i _mm_srli_si128(__m128i a, const int imm); 3027/// \endcode 3028/// 3029/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction. 3030/// 3031/// \param a 3032/// A 128-bit integer vector containing the source operand. 3033/// \param imm 3034/// An immediate value specifying the number of bytes to right-shift operand 3035/// \a a. 3036/// \returns A 128-bit integer vector containing the right-shifted value. 
3037#define _mm_srli_si128(a, imm) \ 3038 (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) 3039 3040#define _mm_bsrli_si128(a, imm) \ 3041 (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) 3042 3043/// Right-shifts each of 16-bit values in the 128-bit integer vector 3044/// operand by the specified number of bits. High-order bits are cleared. 3045/// 3046/// \headerfile <x86intrin.h> 3047/// 3048/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 3049/// 3050/// \param __a 3051/// A 128-bit integer vector containing the source operand. 3052/// \param __count 3053/// An integer value specifying the number of bits to right-shift each value 3054/// in operand \a __a. 3055/// \returns A 128-bit integer vector containing the right-shifted values. 3056static __inline__ __m128i __DEFAULT_FN_ATTRS 3057_mm_srli_epi16(__m128i __a, int __count) 3058{ 3059 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 3060} 3061 3062/// Right-shifts each of 16-bit values in the 128-bit integer vector 3063/// operand by the specified number of bits. High-order bits are cleared. 3064/// 3065/// \headerfile <x86intrin.h> 3066/// 3067/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 3068/// 3069/// \param __a 3070/// A 128-bit integer vector containing the source operand. 3071/// \param __count 3072/// A 128-bit integer vector in which bits [63:0] specify the number of bits 3073/// to right-shift each value in operand \a __a. 3074/// \returns A 128-bit integer vector containing the right-shifted values. 3075static __inline__ __m128i __DEFAULT_FN_ATTRS 3076_mm_srl_epi16(__m128i __a, __m128i __count) 3077{ 3078 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 3079} 3080 3081/// Right-shifts each of 32-bit values in the 128-bit integer vector 3082/// operand by the specified number of bits. High-order bits are cleared. 
3083/// 3084/// \headerfile <x86intrin.h> 3085/// 3086/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 3087/// 3088/// \param __a 3089/// A 128-bit integer vector containing the source operand. 3090/// \param __count 3091/// An integer value specifying the number of bits to right-shift each value 3092/// in operand \a __a. 3093/// \returns A 128-bit integer vector containing the right-shifted values. 3094static __inline__ __m128i __DEFAULT_FN_ATTRS 3095_mm_srli_epi32(__m128i __a, int __count) 3096{ 3097 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 3098} 3099 3100/// Right-shifts each of 32-bit values in the 128-bit integer vector 3101/// operand by the specified number of bits. High-order bits are cleared. 3102/// 3103/// \headerfile <x86intrin.h> 3104/// 3105/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 3106/// 3107/// \param __a 3108/// A 128-bit integer vector containing the source operand. 3109/// \param __count 3110/// A 128-bit integer vector in which bits [63:0] specify the number of bits 3111/// to right-shift each value in operand \a __a. 3112/// \returns A 128-bit integer vector containing the right-shifted values. 3113static __inline__ __m128i __DEFAULT_FN_ATTRS 3114_mm_srl_epi32(__m128i __a, __m128i __count) 3115{ 3116 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 3117} 3118 3119/// Right-shifts each of 64-bit values in the 128-bit integer vector 3120/// operand by the specified number of bits. High-order bits are cleared. 3121/// 3122/// \headerfile <x86intrin.h> 3123/// 3124/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 3125/// 3126/// \param __a 3127/// A 128-bit integer vector containing the source operand. 3128/// \param __count 3129/// An integer value specifying the number of bits to right-shift each value 3130/// in operand \a __a. 3131/// \returns A 128-bit integer vector containing the right-shifted values. 
3132static __inline__ __m128i __DEFAULT_FN_ATTRS 3133_mm_srli_epi64(__m128i __a, int __count) 3134{ 3135 return __builtin_ia32_psrlqi128((__v2di)__a, __count); 3136} 3137 3138/// Right-shifts each of 64-bit values in the 128-bit integer vector 3139/// operand by the specified number of bits. High-order bits are cleared. 3140/// 3141/// \headerfile <x86intrin.h> 3142/// 3143/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 3144/// 3145/// \param __a 3146/// A 128-bit integer vector containing the source operand. 3147/// \param __count 3148/// A 128-bit integer vector in which bits [63:0] specify the number of bits 3149/// to right-shift each value in operand \a __a. 3150/// \returns A 128-bit integer vector containing the right-shifted values. 3151static __inline__ __m128i __DEFAULT_FN_ATTRS 3152_mm_srl_epi64(__m128i __a, __m128i __count) 3153{ 3154 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); 3155} 3156 3157/// Compares each of the corresponding 8-bit values of the 128-bit 3158/// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF 3159/// for true. 3160/// 3161/// \headerfile <x86intrin.h> 3162/// 3163/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction. 3164/// 3165/// \param __a 3166/// A 128-bit integer vector. 3167/// \param __b 3168/// A 128-bit integer vector. 3169/// \returns A 128-bit integer vector containing the comparison results. 3170static __inline__ __m128i __DEFAULT_FN_ATTRS 3171_mm_cmpeq_epi8(__m128i __a, __m128i __b) 3172{ 3173 return (__m128i)((__v16qi)__a == (__v16qi)__b); 3174} 3175 3176/// Compares each of the corresponding 16-bit values of the 128-bit 3177/// integer vectors for equality. Each comparison yields 0x0 for false, 3178/// 0xFFFF for true. 3179/// 3180/// \headerfile <x86intrin.h> 3181/// 3182/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction. 3183/// 3184/// \param __a 3185/// A 128-bit integer vector. 
3186/// \param __b 3187/// A 128-bit integer vector. 3188/// \returns A 128-bit integer vector containing the comparison results. 3189static __inline__ __m128i __DEFAULT_FN_ATTRS 3190_mm_cmpeq_epi16(__m128i __a, __m128i __b) 3191{ 3192 return (__m128i)((__v8hi)__a == (__v8hi)__b); 3193} 3194 3195/// Compares each of the corresponding 32-bit values of the 128-bit 3196/// integer vectors for equality. Each comparison yields 0x0 for false, 3197/// 0xFFFFFFFF for true. 3198/// 3199/// \headerfile <x86intrin.h> 3200/// 3201/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction. 3202/// 3203/// \param __a 3204/// A 128-bit integer vector. 3205/// \param __b 3206/// A 128-bit integer vector. 3207/// \returns A 128-bit integer vector containing the comparison results. 3208static __inline__ __m128i __DEFAULT_FN_ATTRS 3209_mm_cmpeq_epi32(__m128i __a, __m128i __b) 3210{ 3211 return (__m128i)((__v4si)__a == (__v4si)__b); 3212} 3213 3214/// Compares each of the corresponding signed 8-bit values of the 128-bit 3215/// integer vectors to determine if the values in the first operand are 3216/// greater than those in the second operand. Each comparison yields 0x0 for 3217/// false, 0xFF for true. 3218/// 3219/// \headerfile <x86intrin.h> 3220/// 3221/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 3222/// 3223/// \param __a 3224/// A 128-bit integer vector. 3225/// \param __b 3226/// A 128-bit integer vector. 3227/// \returns A 128-bit integer vector containing the comparison results. 3228static __inline__ __m128i __DEFAULT_FN_ATTRS 3229_mm_cmpgt_epi8(__m128i __a, __m128i __b) 3230{ 3231 /* This function always performs a signed comparison, but __v16qi is a char 3232 which may be signed or unsigned, so use __v16qs. 
*/ 3233 return (__m128i)((__v16qs)__a > (__v16qs)__b); 3234} 3235 3236/// Compares each of the corresponding signed 16-bit values of the 3237/// 128-bit integer vectors to determine if the values in the first operand 3238/// are greater than those in the second operand. 3239/// 3240/// Each comparison yields 0x0 for false, 0xFFFF for true. 3241/// 3242/// \headerfile <x86intrin.h> 3243/// 3244/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3245/// 3246/// \param __a 3247/// A 128-bit integer vector. 3248/// \param __b 3249/// A 128-bit integer vector. 3250/// \returns A 128-bit integer vector containing the comparison results. 3251static __inline__ __m128i __DEFAULT_FN_ATTRS 3252_mm_cmpgt_epi16(__m128i __a, __m128i __b) 3253{ 3254 return (__m128i)((__v8hi)__a > (__v8hi)__b); 3255} 3256 3257/// Compares each of the corresponding signed 32-bit values of the 3258/// 128-bit integer vectors to determine if the values in the first operand 3259/// are greater than those in the second operand. 3260/// 3261/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true. 3262/// 3263/// \headerfile <x86intrin.h> 3264/// 3265/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3266/// 3267/// \param __a 3268/// A 128-bit integer vector. 3269/// \param __b 3270/// A 128-bit integer vector. 3271/// \returns A 128-bit integer vector containing the comparison results. 3272static __inline__ __m128i __DEFAULT_FN_ATTRS 3273_mm_cmpgt_epi32(__m128i __a, __m128i __b) 3274{ 3275 return (__m128i)((__v4si)__a > (__v4si)__b); 3276} 3277 3278/// Compares each of the corresponding signed 8-bit values of the 128-bit 3279/// integer vectors to determine if the values in the first operand are less 3280/// than those in the second operand. 3281/// 3282/// Each comparison yields 0x0 for false, 0xFF for true. 3283/// 3284/// \headerfile <x86intrin.h> 3285/// 3286/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 
3287/// 3288/// \param __a 3289/// A 128-bit integer vector. 3290/// \param __b 3291/// A 128-bit integer vector. 3292/// \returns A 128-bit integer vector containing the comparison results. 3293static __inline__ __m128i __DEFAULT_FN_ATTRS 3294_mm_cmplt_epi8(__m128i __a, __m128i __b) 3295{ 3296 return _mm_cmpgt_epi8(__b, __a); 3297} 3298 3299/// Compares each of the corresponding signed 16-bit values of the 3300/// 128-bit integer vectors to determine if the values in the first operand 3301/// are less than those in the second operand. 3302/// 3303/// Each comparison yields 0x0 for false, 0xFFFF for true. 3304/// 3305/// \headerfile <x86intrin.h> 3306/// 3307/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3308/// 3309/// \param __a 3310/// A 128-bit integer vector. 3311/// \param __b 3312/// A 128-bit integer vector. 3313/// \returns A 128-bit integer vector containing the comparison results. 3314static __inline__ __m128i __DEFAULT_FN_ATTRS 3315_mm_cmplt_epi16(__m128i __a, __m128i __b) 3316{ 3317 return _mm_cmpgt_epi16(__b, __a); 3318} 3319 3320/// Compares each of the corresponding signed 32-bit values of the 3321/// 128-bit integer vectors to determine if the values in the first operand 3322/// are less than those in the second operand. 3323/// 3324/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true. 3325/// 3326/// \headerfile <x86intrin.h> 3327/// 3328/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3329/// 3330/// \param __a 3331/// A 128-bit integer vector. 3332/// \param __b 3333/// A 128-bit integer vector. 3334/// \returns A 128-bit integer vector containing the comparison results. 
3335static __inline__ __m128i __DEFAULT_FN_ATTRS 3336_mm_cmplt_epi32(__m128i __a, __m128i __b) 3337{ 3338 return _mm_cmpgt_epi32(__b, __a); 3339} 3340 3341#ifdef __x86_64__ 3342/// Converts a 64-bit signed integer value from the second operand into a 3343/// double-precision value and returns it in the lower element of a [2 x 3344/// double] vector; the upper element of the returned vector is copied from 3345/// the upper element of the first operand. 3346/// 3347/// \headerfile <x86intrin.h> 3348/// 3349/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 3350/// 3351/// \param __a 3352/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are 3353/// copied to the upper 64 bits of the destination. 3354/// \param __b 3355/// A 64-bit signed integer operand containing the value to be converted. 3356/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 3357/// converted value of the second operand. The upper 64 bits are copied from 3358/// the upper 64 bits of the first operand. 3359static __inline__ __m128d __DEFAULT_FN_ATTRS 3360_mm_cvtsi64_sd(__m128d __a, long long __b) 3361{ 3362 __a[0] = __b; 3363 return __a; 3364} 3365 3366/// Converts the first (lower) element of a vector of [2 x double] into a 3367/// 64-bit signed integer value, according to the current rounding mode. 3368/// 3369/// \headerfile <x86intrin.h> 3370/// 3371/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 3372/// 3373/// \param __a 3374/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3375/// conversion. 3376/// \returns A 64-bit signed integer containing the converted value. 
3377static __inline__ long long __DEFAULT_FN_ATTRS 3378_mm_cvtsd_si64(__m128d __a) 3379{ 3380 return __builtin_ia32_cvtsd2si64((__v2df)__a); 3381} 3382 3383/// Converts the first (lower) element of a vector of [2 x double] into a 3384/// 64-bit signed integer value, truncating the result when it is inexact. 3385/// 3386/// \headerfile <x86intrin.h> 3387/// 3388/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 3389/// instruction. 3390/// 3391/// \param __a 3392/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3393/// conversion. 3394/// \returns A 64-bit signed integer containing the converted value. 3395static __inline__ long long __DEFAULT_FN_ATTRS 3396_mm_cvttsd_si64(__m128d __a) 3397{ 3398 return __builtin_ia32_cvttsd2si64((__v2df)__a); 3399} 3400#endif 3401 3402/// Converts a vector of [4 x i32] into a vector of [4 x float]. 3403/// 3404/// \headerfile <x86intrin.h> 3405/// 3406/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction. 3407/// 3408/// \param __a 3409/// A 128-bit integer vector. 3410/// \returns A 128-bit vector of [4 x float] containing the converted values. 3411static __inline__ __m128 __DEFAULT_FN_ATTRS 3412_mm_cvtepi32_ps(__m128i __a) 3413{ 3414 return (__m128)__builtin_convertvector((__v4si)__a, __v4sf); 3415} 3416 3417/// Converts a vector of [4 x float] into a vector of [4 x i32]. 3418/// 3419/// \headerfile <x86intrin.h> 3420/// 3421/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction. 3422/// 3423/// \param __a 3424/// A 128-bit vector of [4 x float]. 3425/// \returns A 128-bit integer vector of [4 x i32] containing the converted 3426/// values. 3427static __inline__ __m128i __DEFAULT_FN_ATTRS 3428_mm_cvtps_epi32(__m128 __a) 3429{ 3430 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a); 3431} 3432 3433/// Converts a vector of [4 x float] into a vector of [4 x i32], 3434/// truncating the result when it is inexact. 
3435/// 3436/// \headerfile <x86intrin.h> 3437/// 3438/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c> 3439/// instruction. 3440/// 3441/// \param __a 3442/// A 128-bit vector of [4 x float]. 3443/// \returns A 128-bit vector of [4 x i32] containing the converted values. 3444static __inline__ __m128i __DEFAULT_FN_ATTRS 3445_mm_cvttps_epi32(__m128 __a) 3446{ 3447 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a); 3448} 3449 3450/// Returns a vector of [4 x i32] where the lowest element is the input 3451/// operand and the remaining elements are zero. 3452/// 3453/// \headerfile <x86intrin.h> 3454/// 3455/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3456/// 3457/// \param __a 3458/// A 32-bit signed integer operand. 3459/// \returns A 128-bit vector of [4 x i32]. 3460static __inline__ __m128i __DEFAULT_FN_ATTRS 3461_mm_cvtsi32_si128(int __a) 3462{ 3463 return __extension__ (__m128i)(__v4si){ __a, 0, 0, 0 }; 3464} 3465 3466#ifdef __x86_64__ 3467/// Returns a vector of [2 x i64] where the lower element is the input 3468/// operand and the upper element is zero. 3469/// 3470/// \headerfile <x86intrin.h> 3471/// 3472/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3473/// 3474/// \param __a 3475/// A 64-bit signed integer operand containing the value to be converted. 3476/// \returns A 128-bit vector of [2 x i64] containing the converted value. 3477static __inline__ __m128i __DEFAULT_FN_ATTRS 3478_mm_cvtsi64_si128(long long __a) 3479{ 3480 return __extension__ (__m128i)(__v2di){ __a, 0 }; 3481} 3482#endif 3483 3484/// Moves the least significant 32 bits of a vector of [4 x i32] to a 3485/// 32-bit signed integer value. 3486/// 3487/// \headerfile <x86intrin.h> 3488/// 3489/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3490/// 3491/// \param __a 3492/// A vector of [4 x i32]. The least significant 32 bits are moved to the 3493/// destination. 
3494/// \returns A 32-bit signed integer containing the moved value. 3495static __inline__ int __DEFAULT_FN_ATTRS 3496_mm_cvtsi128_si32(__m128i __a) 3497{ 3498 __v4si __b = (__v4si)__a; 3499 return __b[0]; 3500} 3501 3502#ifdef __x86_64__ 3503/// Moves the least significant 64 bits of a vector of [2 x i64] to a 3504/// 64-bit signed integer value. 3505/// 3506/// \headerfile <x86intrin.h> 3507/// 3508/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3509/// 3510/// \param __a 3511/// A vector of [2 x i64]. The least significant 64 bits are moved to the 3512/// destination. 3513/// \returns A 64-bit signed integer containing the moved value. 3514static __inline__ long long __DEFAULT_FN_ATTRS 3515_mm_cvtsi128_si64(__m128i __a) 3516{ 3517 return __a[0]; 3518} 3519#endif 3520 3521/// Moves packed integer values from an aligned 128-bit memory location 3522/// to elements in a 128-bit integer vector. 3523/// 3524/// \headerfile <x86intrin.h> 3525/// 3526/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction. 3527/// 3528/// \param __p 3529/// An aligned pointer to a memory location containing integer values. 3530/// \returns A 128-bit integer vector containing the moved values. 3531static __inline__ __m128i __DEFAULT_FN_ATTRS 3532_mm_load_si128(__m128i const *__p) 3533{ 3534 return *__p; 3535} 3536 3537/// Moves packed integer values from an unaligned 128-bit memory location 3538/// to elements in a 128-bit integer vector. 3539/// 3540/// \headerfile <x86intrin.h> 3541/// 3542/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction. 3543/// 3544/// \param __p 3545/// A pointer to a memory location containing integer values. 3546/// \returns A 128-bit integer vector containing the moved values. 
3547static __inline__ __m128i __DEFAULT_FN_ATTRS 3548_mm_loadu_si128(__m128i_u const *__p) 3549{ 3550 struct __loadu_si128 { 3551 __m128i_u __v; 3552 } __attribute__((__packed__, __may_alias__)); 3553 return ((const struct __loadu_si128*)__p)->__v; 3554} 3555 3556/// Returns a vector of [2 x i64] where the lower element is taken from 3557/// the lower element of the operand, and the upper element is zero. 3558/// 3559/// \headerfile <x86intrin.h> 3560/// 3561/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3562/// 3563/// \param __p 3564/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of 3565/// the destination. 3566/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the 3567/// moved value. The higher order bits are cleared. 3568static __inline__ __m128i __DEFAULT_FN_ATTRS 3569_mm_loadl_epi64(__m128i_u const *__p) 3570{ 3571 struct __mm_loadl_epi64_struct { 3572 long long __u; 3573 } __attribute__((__packed__, __may_alias__)); 3574 return __extension__ (__m128i) { ((const struct __mm_loadl_epi64_struct*)__p)->__u, 0}; 3575} 3576 3577/// Generates a 128-bit vector of [4 x i32] with unspecified content. 3578/// This could be used as an argument to another intrinsic function where the 3579/// argument is required but the value is not actually used. 3580/// 3581/// \headerfile <x86intrin.h> 3582/// 3583/// This intrinsic has no corresponding instruction. 3584/// 3585/// \returns A 128-bit vector of [4 x i32] with unspecified content. 3586static __inline__ __m128i __DEFAULT_FN_ATTRS 3587_mm_undefined_si128(void) 3588{ 3589 return (__m128i)__builtin_ia32_undef128(); 3590} 3591 3592/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3593/// the specified 64-bit integer values. 3594/// 3595/// \headerfile <x86intrin.h> 3596/// 3597/// This intrinsic is a utility function and does not correspond to a specific 3598/// instruction. 
3599/// 3600/// \param __q1 3601/// A 64-bit integer value used to initialize the upper 64 bits of the 3602/// destination vector of [2 x i64]. 3603/// \param __q0 3604/// A 64-bit integer value used to initialize the lower 64 bits of the 3605/// destination vector of [2 x i64]. 3606/// \returns An initialized 128-bit vector of [2 x i64] containing the values 3607/// provided in the operands. 3608static __inline__ __m128i __DEFAULT_FN_ATTRS 3609_mm_set_epi64x(long long __q1, long long __q0) 3610{ 3611 return __extension__ (__m128i)(__v2di){ __q0, __q1 }; 3612} 3613 3614/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3615/// the specified 64-bit integer values. 3616/// 3617/// \headerfile <x86intrin.h> 3618/// 3619/// This intrinsic is a utility function and does not correspond to a specific 3620/// instruction. 3621/// 3622/// \param __q1 3623/// A 64-bit integer value used to initialize the upper 64 bits of the 3624/// destination vector of [2 x i64]. 3625/// \param __q0 3626/// A 64-bit integer value used to initialize the lower 64 bits of the 3627/// destination vector of [2 x i64]. 3628/// \returns An initialized 128-bit vector of [2 x i64] containing the values 3629/// provided in the operands. 3630static __inline__ __m128i __DEFAULT_FN_ATTRS 3631_mm_set_epi64(__m64 __q1, __m64 __q0) 3632{ 3633 return _mm_set_epi64x((long long)__q1, (long long)__q0); 3634} 3635 3636/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with 3637/// the specified 32-bit integer values. 3638/// 3639/// \headerfile <x86intrin.h> 3640/// 3641/// This intrinsic is a utility function and does not correspond to a specific 3642/// instruction. 3643/// 3644/// \param __i3 3645/// A 32-bit integer value used to initialize bits [127:96] of the 3646/// destination vector. 3647/// \param __i2 3648/// A 32-bit integer value used to initialize bits [95:64] of the destination 3649/// vector. 
3650/// \param __i1 3651/// A 32-bit integer value used to initialize bits [63:32] of the destination 3652/// vector. 3653/// \param __i0 3654/// A 32-bit integer value used to initialize bits [31:0] of the destination 3655/// vector. 3656/// \returns An initialized 128-bit vector of [4 x i32] containing the values 3657/// provided in the operands. 3658static __inline__ __m128i __DEFAULT_FN_ATTRS 3659_mm_set_epi32(int __i3, int __i2, int __i1, int __i0) 3660{ 3661 return __extension__ (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 3662} 3663 3664/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with 3665/// the specified 16-bit integer values. 3666/// 3667/// \headerfile <x86intrin.h> 3668/// 3669/// This intrinsic is a utility function and does not correspond to a specific 3670/// instruction. 3671/// 3672/// \param __w7 3673/// A 16-bit integer value used to initialize bits [127:112] of the 3674/// destination vector. 3675/// \param __w6 3676/// A 16-bit integer value used to initialize bits [111:96] of the 3677/// destination vector. 3678/// \param __w5 3679/// A 16-bit integer value used to initialize bits [95:80] of the destination 3680/// vector. 3681/// \param __w4 3682/// A 16-bit integer value used to initialize bits [79:64] of the destination 3683/// vector. 3684/// \param __w3 3685/// A 16-bit integer value used to initialize bits [63:48] of the destination 3686/// vector. 3687/// \param __w2 3688/// A 16-bit integer value used to initialize bits [47:32] of the destination 3689/// vector. 3690/// \param __w1 3691/// A 16-bit integer value used to initialize bits [31:16] of the destination 3692/// vector. 3693/// \param __w0 3694/// A 16-bit integer value used to initialize bits [15:0] of the destination 3695/// vector. 3696/// \returns An initialized 128-bit vector of [8 x i16] containing the values 3697/// provided in the operands. 
3698static __inline__ __m128i __DEFAULT_FN_ATTRS 3699_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0) 3700{ 3701 return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 3702} 3703 3704/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with 3705/// the specified 8-bit integer values. 3706/// 3707/// \headerfile <x86intrin.h> 3708/// 3709/// This intrinsic is a utility function and does not correspond to a specific 3710/// instruction. 3711/// 3712/// \param __b15 3713/// Initializes bits [127:120] of the destination vector. 3714/// \param __b14 3715/// Initializes bits [119:112] of the destination vector. 3716/// \param __b13 3717/// Initializes bits [111:104] of the destination vector. 3718/// \param __b12 3719/// Initializes bits [103:96] of the destination vector. 3720/// \param __b11 3721/// Initializes bits [95:88] of the destination vector. 3722/// \param __b10 3723/// Initializes bits [87:80] of the destination vector. 3724/// \param __b9 3725/// Initializes bits [79:72] of the destination vector. 3726/// \param __b8 3727/// Initializes bits [71:64] of the destination vector. 3728/// \param __b7 3729/// Initializes bits [63:56] of the destination vector. 3730/// \param __b6 3731/// Initializes bits [55:48] of the destination vector. 3732/// \param __b5 3733/// Initializes bits [47:40] of the destination vector. 3734/// \param __b4 3735/// Initializes bits [39:32] of the destination vector. 3736/// \param __b3 3737/// Initializes bits [31:24] of the destination vector. 3738/// \param __b2 3739/// Initializes bits [23:16] of the destination vector. 3740/// \param __b1 3741/// Initializes bits [15:8] of the destination vector. 3742/// \param __b0 3743/// Initializes bits [7:0] of the destination vector. 3744/// \returns An initialized 128-bit vector of [16 x i8] containing the values 3745/// provided in the operands. 
3746static __inline__ __m128i __DEFAULT_FN_ATTRS 3747_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) 3748{ 3749 return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 3750} 3751 3752/// Initializes both values in a 128-bit integer vector with the 3753/// specified 64-bit integer value. 3754/// 3755/// \headerfile <x86intrin.h> 3756/// 3757/// This intrinsic is a utility function and does not correspond to a specific 3758/// instruction. 3759/// 3760/// \param __q 3761/// Integer value used to initialize the elements of the destination integer 3762/// vector. 3763/// \returns An initialized 128-bit integer vector of [2 x i64] with both 3764/// elements containing the value provided in the operand. 3765static __inline__ __m128i __DEFAULT_FN_ATTRS 3766_mm_set1_epi64x(long long __q) 3767{ 3768 return _mm_set_epi64x(__q, __q); 3769} 3770 3771/// Initializes both values in a 128-bit vector of [2 x i64] with the 3772/// specified 64-bit value. 3773/// 3774/// \headerfile <x86intrin.h> 3775/// 3776/// This intrinsic is a utility function and does not correspond to a specific 3777/// instruction. 3778/// 3779/// \param __q 3780/// A 64-bit value used to initialize the elements of the destination integer 3781/// vector. 3782/// \returns An initialized 128-bit vector of [2 x i64] with all elements 3783/// containing the value provided in the operand. 3784static __inline__ __m128i __DEFAULT_FN_ATTRS 3785_mm_set1_epi64(__m64 __q) 3786{ 3787 return _mm_set_epi64(__q, __q); 3788} 3789 3790/// Initializes all values in a 128-bit vector of [4 x i32] with the 3791/// specified 32-bit value. 3792/// 3793/// \headerfile <x86intrin.h> 3794/// 3795/// This intrinsic is a utility function and does not correspond to a specific 3796/// instruction. 
3797/// 3798/// \param __i 3799/// A 32-bit value used to initialize the elements of the destination integer 3800/// vector. 3801/// \returns An initialized 128-bit vector of [4 x i32] with all elements 3802/// containing the value provided in the operand. 3803static __inline__ __m128i __DEFAULT_FN_ATTRS 3804_mm_set1_epi32(int __i) 3805{ 3806 return _mm_set_epi32(__i, __i, __i, __i); 3807} 3808 3809/// Initializes all values in a 128-bit vector of [8 x i16] with the 3810/// specified 16-bit value. 3811/// 3812/// \headerfile <x86intrin.h> 3813/// 3814/// This intrinsic is a utility function and does not correspond to a specific 3815/// instruction. 3816/// 3817/// \param __w 3818/// A 16-bit value used to initialize the elements of the destination integer 3819/// vector. 3820/// \returns An initialized 128-bit vector of [8 x i16] with all elements 3821/// containing the value provided in the operand. 3822static __inline__ __m128i __DEFAULT_FN_ATTRS 3823_mm_set1_epi16(short __w) 3824{ 3825 return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w); 3826} 3827 3828/// Initializes all values in a 128-bit vector of [16 x i8] with the 3829/// specified 8-bit value. 3830/// 3831/// \headerfile <x86intrin.h> 3832/// 3833/// This intrinsic is a utility function and does not correspond to a specific 3834/// instruction. 3835/// 3836/// \param __b 3837/// An 8-bit value used to initialize the elements of the destination integer 3838/// vector. 3839/// \returns An initialized 128-bit vector of [16 x i8] with all elements 3840/// containing the value provided in the operand. 3841static __inline__ __m128i __DEFAULT_FN_ATTRS 3842_mm_set1_epi8(char __b) 3843{ 3844 return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b); 3845} 3846 3847/// Constructs a 128-bit integer vector, initialized in reverse order 3848/// with the specified 64-bit integral values. 
3849/// 3850/// \headerfile <x86intrin.h> 3851/// 3852/// This intrinsic does not correspond to a specific instruction. 3853/// 3854/// \param __q0 3855/// A 64-bit integral value used to initialize the lower 64 bits of the 3856/// result. 3857/// \param __q1 3858/// A 64-bit integral value used to initialize the upper 64 bits of the 3859/// result. 3860/// \returns An initialized 128-bit integer vector. 3861static __inline__ __m128i __DEFAULT_FN_ATTRS 3862_mm_setr_epi64(__m64 __q0, __m64 __q1) 3863{ 3864 return _mm_set_epi64(__q1, __q0); 3865} 3866 3867/// Constructs a 128-bit integer vector, initialized in reverse order 3868/// with the specified 32-bit integral values. 3869/// 3870/// \headerfile <x86intrin.h> 3871/// 3872/// This intrinsic is a utility function and does not correspond to a specific 3873/// instruction. 3874/// 3875/// \param __i0 3876/// A 32-bit integral value used to initialize bits [31:0] of the result. 3877/// \param __i1 3878/// A 32-bit integral value used to initialize bits [63:32] of the result. 3879/// \param __i2 3880/// A 32-bit integral value used to initialize bits [95:64] of the result. 3881/// \param __i3 3882/// A 32-bit integral value used to initialize bits [127:96] of the result. 3883/// \returns An initialized 128-bit integer vector. 3884static __inline__ __m128i __DEFAULT_FN_ATTRS 3885_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) 3886{ 3887 return _mm_set_epi32(__i3, __i2, __i1, __i0); 3888} 3889 3890/// Constructs a 128-bit integer vector, initialized in reverse order 3891/// with the specified 16-bit integral values. 3892/// 3893/// \headerfile <x86intrin.h> 3894/// 3895/// This intrinsic is a utility function and does not correspond to a specific 3896/// instruction. 3897/// 3898/// \param __w0 3899/// A 16-bit integral value used to initialize bits [15:0] of the result. 3900/// \param __w1 3901/// A 16-bit integral value used to initialize bits [31:16] of the result. 
3902/// \param __w2 3903/// A 16-bit integral value used to initialize bits [47:32] of the result. 3904/// \param __w3 3905/// A 16-bit integral value used to initialize bits [63:48] of the result. 3906/// \param __w4 3907/// A 16-bit integral value used to initialize bits [79:64] of the result. 3908/// \param __w5 3909/// A 16-bit integral value used to initialize bits [95:80] of the result. 3910/// \param __w6 3911/// A 16-bit integral value used to initialize bits [111:96] of the result. 3912/// \param __w7 3913/// A 16-bit integral value used to initialize bits [127:112] of the result. 3914/// \returns An initialized 128-bit integer vector. 3915static __inline__ __m128i __DEFAULT_FN_ATTRS 3916_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7) 3917{ 3918 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0); 3919} 3920 3921/// Constructs a 128-bit integer vector, initialized in reverse order 3922/// with the specified 8-bit integral values. 3923/// 3924/// \headerfile <x86intrin.h> 3925/// 3926/// This intrinsic is a utility function and does not correspond to a specific 3927/// instruction. 3928/// 3929/// \param __b0 3930/// An 8-bit integral value used to initialize bits [7:0] of the result. 3931/// \param __b1 3932/// An 8-bit integral value used to initialize bits [15:8] of the result. 3933/// \param __b2 3934/// An 8-bit integral value used to initialize bits [23:16] of the result. 3935/// \param __b3 3936/// An 8-bit integral value used to initialize bits [31:24] of the result. 3937/// \param __b4 3938/// An 8-bit integral value used to initialize bits [39:32] of the result. 3939/// \param __b5 3940/// An 8-bit integral value used to initialize bits [47:40] of the result. 3941/// \param __b6 3942/// An 8-bit integral value used to initialize bits [55:48] of the result. 3943/// \param __b7 3944/// An 8-bit integral value used to initialize bits [63:56] of the result. 
3945/// \param __b8 3946/// An 8-bit integral value used to initialize bits [71:64] of the result. 3947/// \param __b9 3948/// An 8-bit integral value used to initialize bits [79:72] of the result. 3949/// \param __b10 3950/// An 8-bit integral value used to initialize bits [87:80] of the result. 3951/// \param __b11 3952/// An 8-bit integral value used to initialize bits [95:88] of the result. 3953/// \param __b12 3954/// An 8-bit integral value used to initialize bits [103:96] of the result. 3955/// \param __b13 3956/// An 8-bit integral value used to initialize bits [111:104] of the result. 3957/// \param __b14 3958/// An 8-bit integral value used to initialize bits [119:112] of the result. 3959/// \param __b15 3960/// An 8-bit integral value used to initialize bits [127:120] of the result. 3961/// \returns An initialized 128-bit integer vector. 3962static __inline__ __m128i __DEFAULT_FN_ATTRS 3963_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15) 3964{ 3965 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); 3966} 3967 3968/// Creates a 128-bit integer vector initialized to zero. 3969/// 3970/// \headerfile <x86intrin.h> 3971/// 3972/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 3973/// 3974/// \returns An initialized 128-bit integer vector with all elements set to 3975/// zero. 3976static __inline__ __m128i __DEFAULT_FN_ATTRS 3977_mm_setzero_si128(void) 3978{ 3979 return __extension__ (__m128i)(__v2di){ 0LL, 0LL }; 3980} 3981 3982/// Stores a 128-bit integer vector to a memory location aligned on a 3983/// 128-bit boundary. 3984/// 3985/// \headerfile <x86intrin.h> 3986/// 3987/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. 
3988/// 3989/// \param __p 3990/// A pointer to an aligned memory location that will receive the integer 3991/// values. 3992/// \param __b 3993/// A 128-bit integer vector containing the values to be moved. 3994static __inline__ void __DEFAULT_FN_ATTRS 3995_mm_store_si128(__m128i *__p, __m128i __b) 3996{ 3997 *__p = __b; 3998} 3999 4000/// Stores a 128-bit integer vector to an unaligned memory location. 4001/// 4002/// \headerfile <x86intrin.h> 4003/// 4004/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. 4005/// 4006/// \param __p 4007/// A pointer to a memory location that will receive the integer values. 4008/// \param __b 4009/// A 128-bit integer vector containing the values to be moved. 4010static __inline__ void __DEFAULT_FN_ATTRS 4011_mm_storeu_si128(__m128i_u *__p, __m128i __b) 4012{ 4013 struct __storeu_si128 { 4014 __m128i_u __v; 4015 } __attribute__((__packed__, __may_alias__)); 4016 ((struct __storeu_si128*)__p)->__v = __b; 4017} 4018 4019/// Stores a 64-bit integer value from the low element of a 128-bit integer 4020/// vector. 4021/// 4022/// \headerfile <x86intrin.h> 4023/// 4024/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 4025/// 4026/// \param __p 4027/// A pointer to a 64-bit memory location. The address of the memory 4028/// location does not have to be algned. 4029/// \param __b 4030/// A 128-bit integer vector containing the value to be stored. 4031static __inline__ void __DEFAULT_FN_ATTRS 4032_mm_storeu_si64(void *__p, __m128i __b) 4033{ 4034 struct __storeu_si64 { 4035 long long __v; 4036 } __attribute__((__packed__, __may_alias__)); 4037 ((struct __storeu_si64*)__p)->__v = ((__v2di)__b)[0]; 4038} 4039 4040/// Stores a 32-bit integer value from the low element of a 128-bit integer 4041/// vector. 4042/// 4043/// \headerfile <x86intrin.h> 4044/// 4045/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 
4046/// 4047/// \param __p 4048/// A pointer to a 32-bit memory location. The address of the memory 4049/// location does not have to be aligned. 4050/// \param __b 4051/// A 128-bit integer vector containing the value to be stored. 4052static __inline__ void __DEFAULT_FN_ATTRS 4053_mm_storeu_si32(void *__p, __m128i __b) 4054{ 4055 struct __storeu_si32 { 4056 int __v; 4057 } __attribute__((__packed__, __may_alias__)); 4058 ((struct __storeu_si32*)__p)->__v = ((__v4si)__b)[0]; 4059} 4060 4061/// Stores a 16-bit integer value from the low element of a 128-bit integer 4062/// vector. 4063/// 4064/// \headerfile <x86intrin.h> 4065/// 4066/// This intrinsic does not correspond to a specific instruction. 4067/// 4068/// \param __p 4069/// A pointer to a 16-bit memory location. The address of the memory 4070/// location does not have to be aligned. 4071/// \param __b 4072/// A 128-bit integer vector containing the value to be stored. 4073static __inline__ void __DEFAULT_FN_ATTRS 4074_mm_storeu_si16(void *__p, __m128i __b) 4075{ 4076 struct __storeu_si16 { 4077 short __v; 4078 } __attribute__((__packed__, __may_alias__)); 4079 ((struct __storeu_si16*)__p)->__v = ((__v8hi)__b)[0]; 4080} 4081 4082/// Moves bytes selected by the mask from the first operand to the 4083/// specified unaligned memory location. When a mask bit is 1, the 4084/// corresponding byte is written, otherwise it is not written. 4085/// 4086/// To minimize caching, the data is flagged as non-temporal (unlikely to be 4087/// used again soon). Exception and trap behavior for elements not selected 4088/// for storage to memory are implementation dependent. 4089/// 4090/// \headerfile <x86intrin.h> 4091/// 4092/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c> 4093/// instruction. 4094/// 4095/// \param __d 4096/// A 128-bit integer vector containing the values to be moved. 4097/// \param __n 4098/// A 128-bit integer vector containing the mask. 
The most significant bit of 4099/// each byte represents the mask bits. 4100/// \param __p 4101/// A pointer to an unaligned 128-bit memory location where the specified 4102/// values are moved. 4103static __inline__ void __DEFAULT_FN_ATTRS 4104_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) 4105{ 4106 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 4107} 4108 4109/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to 4110/// a memory location. 4111/// 4112/// \headerfile <x86intrin.h> 4113/// 4114/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction. 4115/// 4116/// \param __p 4117/// A pointer to a 64-bit memory location that will receive the lower 64 bits 4118/// of the integer vector parameter. 4119/// \param __a 4120/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the 4121/// value to be stored. 4122static __inline__ void __DEFAULT_FN_ATTRS 4123_mm_storel_epi64(__m128i_u *__p, __m128i __a) 4124{ 4125 struct __mm_storel_epi64_struct { 4126 long long __u; 4127 } __attribute__((__packed__, __may_alias__)); 4128 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; 4129} 4130 4131/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit 4132/// aligned memory location. 4133/// 4134/// To minimize caching, the data is flagged as non-temporal (unlikely to be 4135/// used again soon). 4136/// 4137/// \headerfile <x86intrin.h> 4138/// 4139/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 4140/// 4141/// \param __p 4142/// A pointer to the 128-bit aligned memory location used to store the value. 4143/// \param __a 4144/// A vector of [2 x double] containing the 64-bit values to be stored. 4145static __inline__ void __DEFAULT_FN_ATTRS 4146_mm_stream_pd(double *__p, __m128d __a) 4147{ 4148 __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p); 4149} 4150 4151/// Stores a 128-bit integer vector to a 128-bit aligned memory location. 
4152/// 4153/// To minimize caching, the data is flagged as non-temporal (unlikely to be 4154/// used again soon). 4155/// 4156/// \headerfile <x86intrin.h> 4157/// 4158/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 4159/// 4160/// \param __p 4161/// A pointer to the 128-bit aligned memory location used to store the value. 4162/// \param __a 4163/// A 128-bit integer vector containing the values to be stored. 4164static __inline__ void __DEFAULT_FN_ATTRS 4165_mm_stream_si128(__m128i *__p, __m128i __a) 4166{ 4167 __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p); 4168} 4169 4170/// Stores a 32-bit integer value in the specified memory location. 4171/// 4172/// To minimize caching, the data is flagged as non-temporal (unlikely to be 4173/// used again soon). 4174/// 4175/// \headerfile <x86intrin.h> 4176/// 4177/// This intrinsic corresponds to the <c> MOVNTI </c> instruction. 4178/// 4179/// \param __p 4180/// A pointer to the 32-bit memory location used to store the value. 4181/// \param __a 4182/// A 32-bit integer containing the value to be stored. 4183static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 4184_mm_stream_si32(int *__p, int __a) 4185{ 4186 __builtin_ia32_movnti(__p, __a); 4187} 4188 4189#ifdef __x86_64__ 4190/// Stores a 64-bit integer value in the specified memory location. 4191/// 4192/// To minimize caching, the data is flagged as non-temporal (unlikely to be 4193/// used again soon). 4194/// 4195/// \headerfile <x86intrin.h> 4196/// 4197/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction. 4198/// 4199/// \param __p 4200/// A pointer to the 64-bit memory location used to store the value. 4201/// \param __a 4202/// A 64-bit integer containing the value to be stored. 
4203static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 4204_mm_stream_si64(long long *__p, long long __a) 4205{ 4206 __builtin_ia32_movnti64(__p, __a); 4207} 4208#endif 4209 4210#if defined(__cplusplus) 4211extern "C" { 4212#endif 4213 4214/// The cache line containing \a __p is flushed and invalidated from all 4215/// caches in the coherency domain. 4216/// 4217/// \headerfile <x86intrin.h> 4218/// 4219/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction. 4220/// 4221/// \param __p 4222/// A pointer to the memory location used to identify the cache line to be 4223/// flushed. 4224void _mm_clflush(void const * __p); 4225 4226/// Forces strong memory ordering (serialization) between load 4227/// instructions preceding this instruction and load instructions following 4228/// this instruction, ensuring the system completes all previous loads before 4229/// executing subsequent loads. 4230/// 4231/// \headerfile <x86intrin.h> 4232/// 4233/// This intrinsic corresponds to the <c> LFENCE </c> instruction. 4234/// 4235void _mm_lfence(void); 4236 4237/// Forces strong memory ordering (serialization) between load and store 4238/// instructions preceding this instruction and load and store instructions 4239/// following this instruction, ensuring that the system completes all 4240/// previous memory accesses before executing subsequent memory accesses. 4241/// 4242/// \headerfile <x86intrin.h> 4243/// 4244/// This intrinsic corresponds to the <c> MFENCE </c> instruction. 4245/// 4246void _mm_mfence(void); 4247 4248#if defined(__cplusplus) 4249} // extern "C" 4250#endif 4251 4252/// Converts 16-bit signed integers from both 128-bit integer vector 4253/// operands into 8-bit signed integers, and packs the results into the 4254/// destination. Positive values greater than 0x7F are saturated to 0x7F. 4255/// Negative values less than 0x80 are saturated to 0x80. 
4256/// 4257/// \headerfile <x86intrin.h> 4258/// 4259/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction. 4260/// 4261/// \param __a 4262/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4263/// a signed integer and is converted to a 8-bit signed integer with 4264/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less 4265/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are 4266/// written to the lower 64 bits of the result. 4267/// \param __b 4268/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4269/// a signed integer and is converted to a 8-bit signed integer with 4270/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less 4271/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are 4272/// written to the higher 64 bits of the result. 4273/// \returns A 128-bit vector of [16 x i8] containing the converted values. 4274static __inline__ __m128i __DEFAULT_FN_ATTRS 4275_mm_packs_epi16(__m128i __a, __m128i __b) 4276{ 4277 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 4278} 4279 4280/// Converts 32-bit signed integers from both 128-bit integer vector 4281/// operands into 16-bit signed integers, and packs the results into the 4282/// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF. 4283/// Negative values less than 0x8000 are saturated to 0x8000. 4284/// 4285/// \headerfile <x86intrin.h> 4286/// 4287/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction. 4288/// 4289/// \param __a 4290/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as 4291/// a signed integer and is converted to a 16-bit signed integer with 4292/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values 4293/// less than 0x8000 are saturated to 0x8000. 
The converted [4 x i16] values 4294/// are written to the lower 64 bits of the result. 4295/// \param __b 4296/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as 4297/// a signed integer and is converted to a 16-bit signed integer with 4298/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values 4299/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values 4300/// are written to the higher 64 bits of the result. 4301/// \returns A 128-bit vector of [8 x i16] containing the converted values. 4302static __inline__ __m128i __DEFAULT_FN_ATTRS 4303_mm_packs_epi32(__m128i __a, __m128i __b) 4304{ 4305 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 4306} 4307 4308/// Converts 16-bit signed integers from both 128-bit integer vector 4309/// operands into 8-bit unsigned integers, and packs the results into the 4310/// destination. Values greater than 0xFF are saturated to 0xFF. Values less 4311/// than 0x00 are saturated to 0x00. 4312/// 4313/// \headerfile <x86intrin.h> 4314/// 4315/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction. 4316/// 4317/// \param __a 4318/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4319/// a signed integer and is converted to an 8-bit unsigned integer with 4320/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less 4321/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are 4322/// written to the lower 64 bits of the result. 4323/// \param __b 4324/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4325/// a signed integer and is converted to an 8-bit unsigned integer with 4326/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less 4327/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are 4328/// written to the higher 64 bits of the result. 
4329/// \returns A 128-bit vector of [16 x i8] containing the converted values. 4330static __inline__ __m128i __DEFAULT_FN_ATTRS 4331_mm_packus_epi16(__m128i __a, __m128i __b) 4332{ 4333 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 4334} 4335 4336/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using 4337/// the immediate-value parameter as a selector. 4338/// 4339/// \headerfile <x86intrin.h> 4340/// 4341/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction. 4342/// 4343/// \param __a 4344/// A 128-bit integer vector. 4345/// \param __imm 4346/// An immediate value. Bits [2:0] selects values from \a __a to be assigned 4347/// to bits[15:0] of the result. \n 4348/// 000: assign values from bits [15:0] of \a __a. \n 4349/// 001: assign values from bits [31:16] of \a __a. \n 4350/// 010: assign values from bits [47:32] of \a __a. \n 4351/// 011: assign values from bits [63:48] of \a __a. \n 4352/// 100: assign values from bits [79:64] of \a __a. \n 4353/// 101: assign values from bits [95:80] of \a __a. \n 4354/// 110: assign values from bits [111:96] of \a __a. \n 4355/// 111: assign values from bits [127:112] of \a __a. 4356/// \returns An integer, whose lower 16 bits are selected from the 128-bit 4357/// integer vector parameter and the remaining bits are assigned zeros. 4358#define _mm_extract_epi16(a, imm) \ 4359 (int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \ 4360 (int)(imm)) 4361 4362/// Constructs a 128-bit integer vector by first making a copy of the 4363/// 128-bit integer vector parameter, and then inserting the lower 16 bits 4364/// of an integer parameter into an offset specified by the immediate-value 4365/// parameter. 4366/// 4367/// \headerfile <x86intrin.h> 4368/// 4369/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction. 4370/// 4371/// \param __a 4372/// A 128-bit integer vector of [8 x i16]. 
This vector is copied to the 4373/// result and then one of the eight elements in the result is replaced by 4374/// the lower 16 bits of \a __b. 4375/// \param __b 4376/// An integer. The lower 16 bits of this parameter are written to the 4377/// result beginning at an offset specified by \a __imm. 4378/// \param __imm 4379/// An immediate value specifying the bit offset in the result at which the 4380/// lower 16 bits of \a __b are written. 4381/// \returns A 128-bit integer vector containing the constructed values. 4382#define _mm_insert_epi16(a, b, imm) \ 4383 (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \ 4384 (int)(imm)) 4385 4386/// Copies the values of the most significant bits from each 8-bit 4387/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask 4388/// value, zero-extends the value, and writes it to the destination. 4389/// 4390/// \headerfile <x86intrin.h> 4391/// 4392/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction. 4393/// 4394/// \param __a 4395/// A 128-bit integer vector containing the values with bits to be extracted. 4396/// \returns The most significant bits from each 8-bit element in \a __a, 4397/// written to bits [15:0]. The other bits are assigned zeros. 4398static __inline__ int __DEFAULT_FN_ATTRS 4399_mm_movemask_epi8(__m128i __a) 4400{ 4401 return __builtin_ia32_pmovmskb128((__v16qi)__a); 4402} 4403 4404/// Constructs a 128-bit integer vector by shuffling four 32-bit 4405/// elements of a 128-bit integer vector parameter, using the immediate-value 4406/// parameter as a specifier. 4407/// 4408/// \headerfile <x86intrin.h> 4409/// 4410/// \code 4411/// __m128i _mm_shuffle_epi32(__m128i a, const int imm); 4412/// \endcode 4413/// 4414/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction. 4415/// 4416/// \param a 4417/// A 128-bit integer vector containing the values to be copied. 
/// \param imm
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from a. The destinations within the 128-bit destination are assigned
///    values as follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))

/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits[7:6] are used to assign values to bits [63:48] of the result.
///    \n Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))

/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))

/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
///   instruction.
///
/// \param __a
///    A 128-bit vector of [16 x i8].
///    Bits [71:64] are written to bits [7:0] of the result. \n
///    Bits [79:72] are written to bits [23:16] of the result. \n
///    Bits [87:80] are written to bits [39:32] of the result. \n
///    Bits [95:88] are written to bits [55:48] of the result. \n
///    Bits [103:96] are written to bits [71:64] of the result. \n
///    Bits [111:104] are written to bits [87:80] of the result. \n
///    Bits [119:112] are written to bits [103:96] of the result. \n
///    Bits [127:120] are written to bits [119:112] of the result.
/// \param __b
///    A 128-bit vector of [16 x i8]. \n
///    Bits [71:64] are written to bits [15:8] of the result. \n
///    Bits [79:72] are written to bits [31:24] of the result. \n
///    Bits [87:80] are written to bits [47:40] of the result. \n
///    Bits [95:88] are written to bits [63:56] of the result. \n
///    Bits [103:96] are written to bits [79:72] of the result. \n
///    Bits [111:104] are written to bits [95:88] of the result. \n
///    Bits [119:112] are written to bits [111:104] of the result. \n
///    Bits [127:120] are written to bits [127:120] of the result.
/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4524static __inline__ __m128i __DEFAULT_FN_ATTRS 4525_mm_unpackhi_epi8(__m128i __a, __m128i __b) 4526{ 4527 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 4528} 4529 4530/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of 4531/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16]. 4532/// 4533/// \headerfile <x86intrin.h> 4534/// 4535/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c> 4536/// instruction. 4537/// 4538/// \param __a 4539/// A 128-bit vector of [8 x i16]. 4540/// Bits [79:64] are written to bits [15:0] of the result. \n 4541/// Bits [95:80] are written to bits [47:32] of the result. \n 4542/// Bits [111:96] are written to bits [79:64] of the result. \n 4543/// Bits [127:112] are written to bits [111:96] of the result. 4544/// \param __b 4545/// A 128-bit vector of [8 x i16]. 4546/// Bits [79:64] are written to bits [31:16] of the result. \n 4547/// Bits [95:80] are written to bits [63:48] of the result. \n 4548/// Bits [111:96] are written to bits [95:80] of the result. \n 4549/// Bits [127:112] are written to bits [127:112] of the result. 4550/// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4551static __inline__ __m128i __DEFAULT_FN_ATTRS 4552_mm_unpackhi_epi16(__m128i __a, __m128i __b) 4553{ 4554 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 4555} 4556 4557/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of 4558/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4559/// 4560/// \headerfile <x86intrin.h> 4561/// 4562/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c> 4563/// instruction. 4564/// 4565/// \param __a 4566/// A 128-bit vector of [4 x i32]. \n 4567/// Bits [95:64] are written to bits [31:0] of the destination. 
\n 4568/// Bits [127:96] are written to bits [95:64] of the destination. 4569/// \param __b 4570/// A 128-bit vector of [4 x i32]. \n 4571/// Bits [95:64] are written to bits [64:32] of the destination. \n 4572/// Bits [127:96] are written to bits [127:96] of the destination. 4573/// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4574static __inline__ __m128i __DEFAULT_FN_ATTRS 4575_mm_unpackhi_epi32(__m128i __a, __m128i __b) 4576{ 4577 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); 4578} 4579 4580/// Unpacks the high-order 64-bit elements from two 128-bit vectors of 4581/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 4582/// 4583/// \headerfile <x86intrin.h> 4584/// 4585/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c> 4586/// instruction. 4587/// 4588/// \param __a 4589/// A 128-bit vector of [2 x i64]. \n 4590/// Bits [127:64] are written to bits [63:0] of the destination. 4591/// \param __b 4592/// A 128-bit vector of [2 x i64]. \n 4593/// Bits [127:64] are written to bits [127:64] of the destination. 4594/// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4595static __inline__ __m128i __DEFAULT_FN_ATTRS 4596_mm_unpackhi_epi64(__m128i __a, __m128i __b) 4597{ 4598 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1); 4599} 4600 4601/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of 4602/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. 4603/// 4604/// \headerfile <x86intrin.h> 4605/// 4606/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c> 4607/// instruction. 4608/// 4609/// \param __a 4610/// A 128-bit vector of [16 x i8]. \n 4611/// Bits [7:0] are written to bits [7:0] of the result. \n 4612/// Bits [15:8] are written to bits [23:16] of the result. \n 4613/// Bits [23:16] are written to bits [39:32] of the result. 
\n 4614/// Bits [31:24] are written to bits [55:48] of the result. \n 4615/// Bits [39:32] are written to bits [71:64] of the result. \n 4616/// Bits [47:40] are written to bits [87:80] of the result. \n 4617/// Bits [55:48] are written to bits [103:96] of the result. \n 4618/// Bits [63:56] are written to bits [119:112] of the result. 4619/// \param __b 4620/// A 128-bit vector of [16 x i8]. 4621/// Bits [7:0] are written to bits [15:8] of the result. \n 4622/// Bits [15:8] are written to bits [31:24] of the result. \n 4623/// Bits [23:16] are written to bits [47:40] of the result. \n 4624/// Bits [31:24] are written to bits [63:56] of the result. \n 4625/// Bits [39:32] are written to bits [79:72] of the result. \n 4626/// Bits [47:40] are written to bits [95:88] of the result. \n 4627/// Bits [55:48] are written to bits [111:104] of the result. \n 4628/// Bits [63:56] are written to bits [127:120] of the result. 4629/// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 4630static __inline__ __m128i __DEFAULT_FN_ATTRS 4631_mm_unpacklo_epi8(__m128i __a, __m128i __b) 4632{ 4633 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 4634} 4635 4636/// Unpacks the low-order (index 0-3) values from each of the two 128-bit 4637/// vectors of [8 x i16] and interleaves them into a 128-bit vector of 4638/// [8 x i16]. 4639/// 4640/// \headerfile <x86intrin.h> 4641/// 4642/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c> 4643/// instruction. 4644/// 4645/// \param __a 4646/// A 128-bit vector of [8 x i16]. 4647/// Bits [15:0] are written to bits [15:0] of the result. \n 4648/// Bits [31:16] are written to bits [47:32] of the result. \n 4649/// Bits [47:32] are written to bits [79:64] of the result. \n 4650/// Bits [63:48] are written to bits [111:96] of the result. 4651/// \param __b 4652/// A 128-bit vector of [8 x i16]. 
4653/// Bits [15:0] are written to bits [31:16] of the result. \n 4654/// Bits [31:16] are written to bits [63:48] of the result. \n 4655/// Bits [47:32] are written to bits [95:80] of the result. \n 4656/// Bits [63:48] are written to bits [127:112] of the result. 4657/// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4658static __inline__ __m128i __DEFAULT_FN_ATTRS 4659_mm_unpacklo_epi16(__m128i __a, __m128i __b) 4660{ 4661 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 4662} 4663 4664/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of 4665/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4666/// 4667/// \headerfile <x86intrin.h> 4668/// 4669/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c> 4670/// instruction. 4671/// 4672/// \param __a 4673/// A 128-bit vector of [4 x i32]. \n 4674/// Bits [31:0] are written to bits [31:0] of the destination. \n 4675/// Bits [63:32] are written to bits [95:64] of the destination. 4676/// \param __b 4677/// A 128-bit vector of [4 x i32]. \n 4678/// Bits [31:0] are written to bits [64:32] of the destination. \n 4679/// Bits [63:32] are written to bits [127:96] of the destination. 4680/// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4681static __inline__ __m128i __DEFAULT_FN_ATTRS 4682_mm_unpacklo_epi32(__m128i __a, __m128i __b) 4683{ 4684 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); 4685} 4686 4687/// Unpacks the low-order 64-bit elements from two 128-bit vectors of 4688/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 4689/// 4690/// \headerfile <x86intrin.h> 4691/// 4692/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c> 4693/// instruction. 4694/// 4695/// \param __a 4696/// A 128-bit vector of [2 x i64]. \n 4697/// Bits [63:0] are written to bits [63:0] of the destination. 
/// \param __b
///    A 128-bit vector of [2 x i64]. \n
///    Bits [63:0] are written to bits [127:64] of the destination. \n
/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi64(__m128i __a, __m128i __b)
{
  /* Index 0 is the low element of __a; index 2+0 is the low element of __b. */
  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
}

/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
///    integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
///
/// \param __a
///    A 128-bit integer vector operand. The lower 64 bits are moved to the
///    destination.
/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_movepi64_pi64(__m128i __a)
{
  /* The subscript binds tighter than the cast: element 0 is read first and
   * that 64-bit value is then converted to __m64. */
  return (__m64)__a[0];
}

/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
///    upper bits.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
///
/// \param __a
///    A 64-bit value.
/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
///    the operand. The upper 64 bits are assigned zeros.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_movpi64_epi64(__m64 __a)
{
  /* Element 0 receives the operand; element 1 is explicitly zero. */
  return __extension__ (__m128i)(__v2di){ (long long)__a, 0 };
}

/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
///    integer vector, zeroing the upper bits.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
///
/// \param __a
///    A 128-bit integer vector operand. The lower 64 bits are moved to the
///    destination.
/// \returns A 128-bit integer vector.
///    The lower 64 bits contain the value from
///    the operand. The upper 64 bits are assigned zeros.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_move_epi64(__m128i __a)
{
  /* Index 0 keeps the low element of __a; index 2 takes the low element of
   * the all-zero second operand, clearing the upper half. */
  return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
}

/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
///    [2 x double] and interleaves them into a 128-bit vector of [2 x
///    double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double]. \n
///    Bits [127:64] are written to bits [63:0] of the destination.
/// \param __b
///    A 128-bit vector of [2 x double]. \n
///    Bits [127:64] are written to bits [127:64] of the destination.
/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpackhi_pd(__m128d __a, __m128d __b)
{
  /* Index 1 is the high element of __a; index 2+1 is the high element of
   * __b. */
  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
}

/// Unpacks the low-order 64-bit elements from two 128-bit vectors
///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
///    double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double]. \n
///    Bits [63:0] are written to bits [63:0] of the destination.
/// \param __b
///    A 128-bit vector of [2 x double]. \n
///    Bits [63:0] are written to bits [127:64] of the destination.
/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpacklo_pd(__m128d __a, __m128d __b)
{
  /* Index 0 is the low element of __a; index 2+0 is the low element of __b. */
  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
}

/// Extracts the sign bits of the double-precision values in the 128-bit
///    vector of [2 x double], zero-extends the value, and writes it to the
///    low-order bits of the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing the values with sign bits to
///    be extracted.
/// \returns The sign bits from each of the double-precision elements in \a __a,
///    written to bits [1:0]. The remaining bits are assigned values of zero.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_pd(__m128d __a)
{
  return __builtin_ia32_movmskpd((__v2df)__a);
}


/// Constructs a 128-bit floating-point vector of [2 x double] from two
///    128-bit vector parameters of [2 x double], using the immediate-value
///     parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result.
\n 4845/// \returns A 128-bit vector of [2 x double] containing the shuffled values. 4846#define _mm_shuffle_pd(a, b, i) \ 4847 (__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ 4848 (int)(i)) 4849 4850/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4851/// floating-point vector of [4 x float]. 4852/// 4853/// \headerfile <x86intrin.h> 4854/// 4855/// This intrinsic has no corresponding instruction. 4856/// 4857/// \param __a 4858/// A 128-bit floating-point vector of [2 x double]. 4859/// \returns A 128-bit floating-point vector of [4 x float] containing the same 4860/// bitwise pattern as the parameter. 4861static __inline__ __m128 __DEFAULT_FN_ATTRS 4862_mm_castpd_ps(__m128d __a) 4863{ 4864 return (__m128)__a; 4865} 4866 4867/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4868/// integer vector. 4869/// 4870/// \headerfile <x86intrin.h> 4871/// 4872/// This intrinsic has no corresponding instruction. 4873/// 4874/// \param __a 4875/// A 128-bit floating-point vector of [2 x double]. 4876/// \returns A 128-bit integer vector containing the same bitwise pattern as the 4877/// parameter. 4878static __inline__ __m128i __DEFAULT_FN_ATTRS 4879_mm_castpd_si128(__m128d __a) 4880{ 4881 return (__m128i)__a; 4882} 4883 4884/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4885/// floating-point vector of [2 x double]. 4886/// 4887/// \headerfile <x86intrin.h> 4888/// 4889/// This intrinsic has no corresponding instruction. 4890/// 4891/// \param __a 4892/// A 128-bit floating-point vector of [4 x float]. 4893/// \returns A 128-bit floating-point vector of [2 x double] containing the same 4894/// bitwise pattern as the parameter. 4895static __inline__ __m128d __DEFAULT_FN_ATTRS 4896_mm_castps_pd(__m128 __a) 4897{ 4898 return (__m128d)__a; 4899} 4900 4901/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4902/// integer vector. 
4903/// 4904/// \headerfile <x86intrin.h> 4905/// 4906/// This intrinsic has no corresponding instruction. 4907/// 4908/// \param __a 4909/// A 128-bit floating-point vector of [4 x float]. 4910/// \returns A 128-bit integer vector containing the same bitwise pattern as the 4911/// parameter. 4912static __inline__ __m128i __DEFAULT_FN_ATTRS 4913_mm_castps_si128(__m128 __a) 4914{ 4915 return (__m128i)__a; 4916} 4917 4918/// Casts a 128-bit integer vector into a 128-bit floating-point vector 4919/// of [4 x float]. 4920/// 4921/// \headerfile <x86intrin.h> 4922/// 4923/// This intrinsic has no corresponding instruction. 4924/// 4925/// \param __a 4926/// A 128-bit integer vector. 4927/// \returns A 128-bit floating-point vector of [4 x float] containing the same 4928/// bitwise pattern as the parameter. 4929static __inline__ __m128 __DEFAULT_FN_ATTRS 4930_mm_castsi128_ps(__m128i __a) 4931{ 4932 return (__m128)__a; 4933} 4934 4935/// Casts a 128-bit integer vector into a 128-bit floating-point vector 4936/// of [2 x double]. 4937/// 4938/// \headerfile <x86intrin.h> 4939/// 4940/// This intrinsic has no corresponding instruction. 4941/// 4942/// \param __a 4943/// A 128-bit integer vector. 4944/// \returns A 128-bit floating-point vector of [2 x double] containing the same 4945/// bitwise pattern as the parameter. 4946static __inline__ __m128d __DEFAULT_FN_ATTRS 4947_mm_castsi128_pd(__m128i __a) 4948{ 4949 return (__m128d)__a; 4950} 4951 4952#if defined(__cplusplus) 4953extern "C" { 4954#endif 4955 4956/// Indicates that a spin loop is being executed for the purposes of 4957/// optimizing power consumption during the loop. 4958/// 4959/// \headerfile <x86intrin.h> 4960/// 4961/// This intrinsic corresponds to the <c> PAUSE </c> instruction. 
4962/// 4963void _mm_pause(void); 4964 4965#if defined(__cplusplus) 4966} // extern "C" 4967#endif 4968#undef __DEFAULT_FN_ATTRS 4969#undef __DEFAULT_FN_ATTRS_MMX 4970 4971#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 4972 4973#define _MM_DENORMALS_ZERO_ON (0x0040) 4974#define _MM_DENORMALS_ZERO_OFF (0x0000) 4975 4976#define _MM_DENORMALS_ZERO_MASK (0x0040) 4977 4978#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) 4979#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) 4980 4981#endif /* __EMMINTRIN_H */ 4982