avxintrin.h revision 322320
1/*===---- avxintrin.h - AVX intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __IMMINTRIN_H 25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead." 
26#endif 27 28#ifndef __AVXINTRIN_H 29#define __AVXINTRIN_H 30 31typedef double __v4df __attribute__ ((__vector_size__ (32))); 32typedef float __v8sf __attribute__ ((__vector_size__ (32))); 33typedef long long __v4di __attribute__ ((__vector_size__ (32))); 34typedef int __v8si __attribute__ ((__vector_size__ (32))); 35typedef short __v16hi __attribute__ ((__vector_size__ (32))); 36typedef char __v32qi __attribute__ ((__vector_size__ (32))); 37 38/* Unsigned types */ 39typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32))); 40typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); 41typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); 42typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32))); 43 44/* We need an explicitly signed variant for char. Note that this shouldn't 45 * appear in the interface though. */ 46typedef signed char __v32qs __attribute__((__vector_size__(32))); 47 48typedef float __m256 __attribute__ ((__vector_size__ (32))); 49typedef double __m256d __attribute__((__vector_size__(32))); 50typedef long long __m256i __attribute__((__vector_size__(32))); 51 52/* Define the default attributes for the functions in this file. */ 53#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"))) 54 55/* Arithmetic */ 56/// \brief Adds two 256-bit vectors of [4 x double]. 57/// 58/// \headerfile <x86intrin.h> 59/// 60/// This intrinsic corresponds to the <c> VADDPD </c> instruction. 61/// 62/// \param __a 63/// A 256-bit vector of [4 x double] containing one of the source operands. 64/// \param __b 65/// A 256-bit vector of [4 x double] containing one of the source operands. 66/// \returns A 256-bit vector of [4 x double] containing the sums of both 67/// operands. 68static __inline __m256d __DEFAULT_FN_ATTRS 69_mm256_add_pd(__m256d __a, __m256d __b) 70{ 71 return (__m256d)((__v4df)__a+(__v4df)__b); 72} 73 74/// \brief Adds two 256-bit vectors of [8 x float]. 
75/// 76/// \headerfile <x86intrin.h> 77/// 78/// This intrinsic corresponds to the <c> VADDPS </c> instruction. 79/// 80/// \param __a 81/// A 256-bit vector of [8 x float] containing one of the source operands. 82/// \param __b 83/// A 256-bit vector of [8 x float] containing one of the source operands. 84/// \returns A 256-bit vector of [8 x float] containing the sums of both 85/// operands. 86static __inline __m256 __DEFAULT_FN_ATTRS 87_mm256_add_ps(__m256 __a, __m256 __b) 88{ 89 return (__m256)((__v8sf)__a+(__v8sf)__b); 90} 91 92/// \brief Subtracts two 256-bit vectors of [4 x double]. 93/// 94/// \headerfile <x86intrin.h> 95/// 96/// This intrinsic corresponds to the <c> VSUBPD </c> instruction. 97/// 98/// \param __a 99/// A 256-bit vector of [4 x double] containing the minuend. 100/// \param __b 101/// A 256-bit vector of [4 x double] containing the subtrahend. 102/// \returns A 256-bit vector of [4 x double] containing the differences between 103/// both operands. 104static __inline __m256d __DEFAULT_FN_ATTRS 105_mm256_sub_pd(__m256d __a, __m256d __b) 106{ 107 return (__m256d)((__v4df)__a-(__v4df)__b); 108} 109 110/// \brief Subtracts two 256-bit vectors of [8 x float]. 111/// 112/// \headerfile <x86intrin.h> 113/// 114/// This intrinsic corresponds to the <c> VSUBPS </c> instruction. 115/// 116/// \param __a 117/// A 256-bit vector of [8 x float] containing the minuend. 118/// \param __b 119/// A 256-bit vector of [8 x float] containing the subtrahend. 120/// \returns A 256-bit vector of [8 x float] containing the differences between 121/// both operands. 122static __inline __m256 __DEFAULT_FN_ATTRS 123_mm256_sub_ps(__m256 __a, __m256 __b) 124{ 125 return (__m256)((__v8sf)__a-(__v8sf)__b); 126} 127 128/// \brief Adds the even-indexed values and subtracts the odd-indexed values of 129/// two 256-bit vectors of [4 x double]. 130/// 131/// \headerfile <x86intrin.h> 132/// 133/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction. 
134/// 135/// \param __a 136/// A 256-bit vector of [4 x double] containing the left source operand. 137/// \param __b 138/// A 256-bit vector of [4 x double] containing the right source operand. 139/// \returns A 256-bit vector of [4 x double] containing the alternating sums 140/// and differences between both operands. 141static __inline __m256d __DEFAULT_FN_ATTRS 142_mm256_addsub_pd(__m256d __a, __m256d __b) 143{ 144 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b); 145} 146 147/// \brief Adds the even-indexed values and subtracts the odd-indexed values of 148/// two 256-bit vectors of [8 x float]. 149/// 150/// \headerfile <x86intrin.h> 151/// 152/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction. 153/// 154/// \param __a 155/// A 256-bit vector of [8 x float] containing the left source operand. 156/// \param __b 157/// A 256-bit vector of [8 x float] containing the right source operand. 158/// \returns A 256-bit vector of [8 x float] containing the alternating sums and 159/// differences between both operands. 160static __inline __m256 __DEFAULT_FN_ATTRS 161_mm256_addsub_ps(__m256 __a, __m256 __b) 162{ 163 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b); 164} 165 166/// \brief Divides two 256-bit vectors of [4 x double]. 167/// 168/// \headerfile <x86intrin.h> 169/// 170/// This intrinsic corresponds to the <c> VDIVPD </c> instruction. 171/// 172/// \param __a 173/// A 256-bit vector of [4 x double] containing the dividend. 174/// \param __b 175/// A 256-bit vector of [4 x double] containing the divisor. 176/// \returns A 256-bit vector of [4 x double] containing the quotients of both 177/// operands. 178static __inline __m256d __DEFAULT_FN_ATTRS 179_mm256_div_pd(__m256d __a, __m256d __b) 180{ 181 return (__m256d)((__v4df)__a/(__v4df)__b); 182} 183 184/// \brief Divides two 256-bit vectors of [8 x float]. 
185/// 186/// \headerfile <x86intrin.h> 187/// 188/// This intrinsic corresponds to the <c> VDIVPS </c> instruction. 189/// 190/// \param __a 191/// A 256-bit vector of [8 x float] containing the dividend. 192/// \param __b 193/// A 256-bit vector of [8 x float] containing the divisor. 194/// \returns A 256-bit vector of [8 x float] containing the quotients of both 195/// operands. 196static __inline __m256 __DEFAULT_FN_ATTRS 197_mm256_div_ps(__m256 __a, __m256 __b) 198{ 199 return (__m256)((__v8sf)__a/(__v8sf)__b); 200} 201 202/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater 203/// of each pair of values. 204/// 205/// \headerfile <x86intrin.h> 206/// 207/// This intrinsic corresponds to the <c> VMAXPD </c> instruction. 208/// 209/// \param __a 210/// A 256-bit vector of [4 x double] containing one of the operands. 211/// \param __b 212/// A 256-bit vector of [4 x double] containing one of the operands. 213/// \returns A 256-bit vector of [4 x double] containing the maximum values 214/// between both operands. 215static __inline __m256d __DEFAULT_FN_ATTRS 216_mm256_max_pd(__m256d __a, __m256d __b) 217{ 218 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b); 219} 220 221/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater 222/// of each pair of values. 223/// 224/// \headerfile <x86intrin.h> 225/// 226/// This intrinsic corresponds to the <c> VMAXPS </c> instruction. 227/// 228/// \param __a 229/// A 256-bit vector of [8 x float] containing one of the operands. 230/// \param __b 231/// A 256-bit vector of [8 x float] containing one of the operands. 232/// \returns A 256-bit vector of [8 x float] containing the maximum values 233/// between both operands. 
234static __inline __m256 __DEFAULT_FN_ATTRS 235_mm256_max_ps(__m256 __a, __m256 __b) 236{ 237 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b); 238} 239 240/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser 241/// of each pair of values. 242/// 243/// \headerfile <x86intrin.h> 244/// 245/// This intrinsic corresponds to the <c> VMINPD </c> instruction. 246/// 247/// \param __a 248/// A 256-bit vector of [4 x double] containing one of the operands. 249/// \param __b 250/// A 256-bit vector of [4 x double] containing one of the operands. 251/// \returns A 256-bit vector of [4 x double] containing the minimum values 252/// between both operands. 253static __inline __m256d __DEFAULT_FN_ATTRS 254_mm256_min_pd(__m256d __a, __m256d __b) 255{ 256 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b); 257} 258 259/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser 260/// of each pair of values. 261/// 262/// \headerfile <x86intrin.h> 263/// 264/// This intrinsic corresponds to the <c> VMINPS </c> instruction. 265/// 266/// \param __a 267/// A 256-bit vector of [8 x float] containing one of the operands. 268/// \param __b 269/// A 256-bit vector of [8 x float] containing one of the operands. 270/// \returns A 256-bit vector of [8 x float] containing the minimum values 271/// between both operands. 272static __inline __m256 __DEFAULT_FN_ATTRS 273_mm256_min_ps(__m256 __a, __m256 __b) 274{ 275 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b); 276} 277 278/// \brief Multiplies two 256-bit vectors of [4 x double]. 279/// 280/// \headerfile <x86intrin.h> 281/// 282/// This intrinsic corresponds to the <c> VMULPD </c> instruction. 283/// 284/// \param __a 285/// A 256-bit vector of [4 x double] containing one of the operands. 286/// \param __b 287/// A 256-bit vector of [4 x double] containing one of the operands. 
288/// \returns A 256-bit vector of [4 x double] containing the products of both 289/// operands. 290static __inline __m256d __DEFAULT_FN_ATTRS 291_mm256_mul_pd(__m256d __a, __m256d __b) 292{ 293 return (__m256d)((__v4df)__a * (__v4df)__b); 294} 295 296/// \brief Multiplies two 256-bit vectors of [8 x float]. 297/// 298/// \headerfile <x86intrin.h> 299/// 300/// This intrinsic corresponds to the <c> VMULPS </c> instruction. 301/// 302/// \param __a 303/// A 256-bit vector of [8 x float] containing one of the operands. 304/// \param __b 305/// A 256-bit vector of [8 x float] containing one of the operands. 306/// \returns A 256-bit vector of [8 x float] containing the products of both 307/// operands. 308static __inline __m256 __DEFAULT_FN_ATTRS 309_mm256_mul_ps(__m256 __a, __m256 __b) 310{ 311 return (__m256)((__v8sf)__a * (__v8sf)__b); 312} 313 314/// \brief Calculates the square roots of the values in a 256-bit vector of 315/// [4 x double]. 316/// 317/// \headerfile <x86intrin.h> 318/// 319/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction. 320/// 321/// \param __a 322/// A 256-bit vector of [4 x double]. 323/// \returns A 256-bit vector of [4 x double] containing the square roots of the 324/// values in the operand. 325static __inline __m256d __DEFAULT_FN_ATTRS 326_mm256_sqrt_pd(__m256d __a) 327{ 328 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a); 329} 330 331/// \brief Calculates the square roots of the values in a 256-bit vector of 332/// [8 x float]. 333/// 334/// \headerfile <x86intrin.h> 335/// 336/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction. 337/// 338/// \param __a 339/// A 256-bit vector of [8 x float]. 340/// \returns A 256-bit vector of [8 x float] containing the square roots of the 341/// values in the operand. 
342static __inline __m256 __DEFAULT_FN_ATTRS 343_mm256_sqrt_ps(__m256 __a) 344{ 345 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a); 346} 347 348/// \brief Calculates the reciprocal square roots of the values in a 256-bit 349/// vector of [8 x float]. 350/// 351/// \headerfile <x86intrin.h> 352/// 353/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction. 354/// 355/// \param __a 356/// A 256-bit vector of [8 x float]. 357/// \returns A 256-bit vector of [8 x float] containing the reciprocal square 358/// roots of the values in the operand. 359static __inline __m256 __DEFAULT_FN_ATTRS 360_mm256_rsqrt_ps(__m256 __a) 361{ 362 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a); 363} 364 365/// \brief Calculates the reciprocals of the values in a 256-bit vector of 366/// [8 x float]. 367/// 368/// \headerfile <x86intrin.h> 369/// 370/// This intrinsic corresponds to the <c> VRCPPS </c> instruction. 371/// 372/// \param __a 373/// A 256-bit vector of [8 x float]. 374/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the 375/// values in the operand. 376static __inline __m256 __DEFAULT_FN_ATTRS 377_mm256_rcp_ps(__m256 __a) 378{ 379 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a); 380} 381 382/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified 383/// by the byte operand. The source values are rounded to integer values and 384/// returned as 64-bit double-precision floating-point values. 385/// 386/// \headerfile <x86intrin.h> 387/// 388/// \code 389/// __m256d _mm256_round_pd(__m256d V, const int M); 390/// \endcode 391/// 392/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction. 393/// 394/// \param V 395/// A 256-bit vector of [4 x double]. 396/// \param M 397/// An integer value that specifies the rounding operation. \n 398/// Bits [7:4] are reserved. \n 399/// Bit [3] is a precision exception value: \n 400/// 0: A normal PE exception is used. 
///      \n 1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* Expands to a plain parenthesized expression rather than a GNU statement
 * expression, so it can be used anywhere an expression is allowed. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))

/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* Expands to a plain parenthesized expression rather than a GNU statement
 * expression, so it can be used anywhere an expression is allowed. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))

/// \brief Rounds up the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_CEIL)

/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
///    Each source value is rounded down to an integer value and returned as a
///    64-bit double-precision floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_FLOOR)

/// \brief Rounds up the values stored in a 256-bit vector of [8 x float].
///    Each source value is rounded up to an integer value and returned as a
///    floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V) \
  _mm256_round_ps((V), _MM_FROUND_CEIL)

/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
501/// 502/// \headerfile <x86intrin.h> 503/// 504/// \code 505/// __m256 _mm256_floor_ps(__m256 V); 506/// \endcode 507/// 508/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. 509/// 510/// \param V 511/// A 256-bit vector of [8 x float]. 512/// \returns A 256-bit vector of [8 x float] containing the rounded down values. 513#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) 514 515/* Logical */ 516/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double]. 517/// 518/// \headerfile <x86intrin.h> 519/// 520/// This intrinsic corresponds to the <c> VANDPD </c> instruction. 521/// 522/// \param __a 523/// A 256-bit vector of [4 x double] containing one of the source operands. 524/// \param __b 525/// A 256-bit vector of [4 x double] containing one of the source operands. 526/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 527/// values between both operands. 528static __inline __m256d __DEFAULT_FN_ATTRS 529_mm256_and_pd(__m256d __a, __m256d __b) 530{ 531 return (__m256d)((__v4du)__a & (__v4du)__b); 532} 533 534/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float]. 535/// 536/// \headerfile <x86intrin.h> 537/// 538/// This intrinsic corresponds to the <c> VANDPS </c> instruction. 539/// 540/// \param __a 541/// A 256-bit vector of [8 x float] containing one of the source operands. 542/// \param __b 543/// A 256-bit vector of [8 x float] containing one of the source operands. 544/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 545/// values between both operands. 546static __inline __m256 __DEFAULT_FN_ATTRS 547_mm256_and_ps(__m256 __a, __m256 __b) 548{ 549 return (__m256)((__v8su)__a & (__v8su)__b); 550} 551 552/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using 553/// the one's complement of the values contained in the first source operand. 
554/// 555/// \headerfile <x86intrin.h> 556/// 557/// This intrinsic corresponds to the <c> VANDNPD </c> instruction. 558/// 559/// \param __a 560/// A 256-bit vector of [4 x double] containing the left source operand. The 561/// one's complement of this value is used in the bitwise AND. 562/// \param __b 563/// A 256-bit vector of [4 x double] containing the right source operand. 564/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 565/// values of the second operand and the one's complement of the first 566/// operand. 567static __inline __m256d __DEFAULT_FN_ATTRS 568_mm256_andnot_pd(__m256d __a, __m256d __b) 569{ 570 return (__m256d)(~(__v4du)__a & (__v4du)__b); 571} 572 573/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using 574/// the one's complement of the values contained in the first source operand. 575/// 576/// \headerfile <x86intrin.h> 577/// 578/// This intrinsic corresponds to the <c> VANDNPS </c> instruction. 579/// 580/// \param __a 581/// A 256-bit vector of [8 x float] containing the left source operand. The 582/// one's complement of this value is used in the bitwise AND. 583/// \param __b 584/// A 256-bit vector of [8 x float] containing the right source operand. 585/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 586/// values of the second operand and the one's complement of the first 587/// operand. 588static __inline __m256 __DEFAULT_FN_ATTRS 589_mm256_andnot_ps(__m256 __a, __m256 __b) 590{ 591 return (__m256)(~(__v8su)__a & (__v8su)__b); 592} 593 594/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double]. 595/// 596/// \headerfile <x86intrin.h> 597/// 598/// This intrinsic corresponds to the <c> VORPD </c> instruction. 599/// 600/// \param __a 601/// A 256-bit vector of [4 x double] containing one of the source operands. 602/// \param __b 603/// A 256-bit vector of [4 x double] containing one of the source operands. 
604/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the 605/// values between both operands. 606static __inline __m256d __DEFAULT_FN_ATTRS 607_mm256_or_pd(__m256d __a, __m256d __b) 608{ 609 return (__m256d)((__v4du)__a | (__v4du)__b); 610} 611 612/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float]. 613/// 614/// \headerfile <x86intrin.h> 615/// 616/// This intrinsic corresponds to the <c> VORPS </c> instruction. 617/// 618/// \param __a 619/// A 256-bit vector of [8 x float] containing one of the source operands. 620/// \param __b 621/// A 256-bit vector of [8 x float] containing one of the source operands. 622/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the 623/// values between both operands. 624static __inline __m256 __DEFAULT_FN_ATTRS 625_mm256_or_ps(__m256 __a, __m256 __b) 626{ 627 return (__m256)((__v8su)__a | (__v8su)__b); 628} 629 630/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double]. 631/// 632/// \headerfile <x86intrin.h> 633/// 634/// This intrinsic corresponds to the <c> VXORPD </c> instruction. 635/// 636/// \param __a 637/// A 256-bit vector of [4 x double] containing one of the source operands. 638/// \param __b 639/// A 256-bit vector of [4 x double] containing one of the source operands. 640/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the 641/// values between both operands. 642static __inline __m256d __DEFAULT_FN_ATTRS 643_mm256_xor_pd(__m256d __a, __m256d __b) 644{ 645 return (__m256d)((__v4du)__a ^ (__v4du)__b); 646} 647 648/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float]. 649/// 650/// \headerfile <x86intrin.h> 651/// 652/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 653/// 654/// \param __a 655/// A 256-bit vector of [8 x float] containing one of the source operands. 656/// \param __b 657/// A 256-bit vector of [8 x float] containing one of the source operands. 
658/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the 659/// values between both operands. 660static __inline __m256 __DEFAULT_FN_ATTRS 661_mm256_xor_ps(__m256 __a, __m256 __b) 662{ 663 return (__m256)((__v8su)__a ^ (__v8su)__b); 664} 665 666/* Horizontal arithmetic */ 667/// \brief Horizontally adds the adjacent pairs of values contained in two 668/// 256-bit vectors of [4 x double]. 669/// 670/// \headerfile <x86intrin.h> 671/// 672/// This intrinsic corresponds to the <c> VHADDPD </c> instruction. 673/// 674/// \param __a 675/// A 256-bit vector of [4 x double] containing one of the source operands. 676/// The horizontal sums of the values are returned in the even-indexed 677/// elements of a vector of [4 x double]. 678/// \param __b 679/// A 256-bit vector of [4 x double] containing one of the source operands. 680/// The horizontal sums of the values are returned in the odd-indexed 681/// elements of a vector of [4 x double]. 682/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of 683/// both operands. 684static __inline __m256d __DEFAULT_FN_ATTRS 685_mm256_hadd_pd(__m256d __a, __m256d __b) 686{ 687 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); 688} 689 690/// \brief Horizontally adds the adjacent pairs of values contained in two 691/// 256-bit vectors of [8 x float]. 692/// 693/// \headerfile <x86intrin.h> 694/// 695/// This intrinsic corresponds to the <c> VHADDPS </c> instruction. 696/// 697/// \param __a 698/// A 256-bit vector of [8 x float] containing one of the source operands. 699/// The horizontal sums of the values are returned in the elements with 700/// index 0, 1, 4, 5 of a vector of [8 x float]. 701/// \param __b 702/// A 256-bit vector of [8 x float] containing one of the source operands. 703/// The horizontal sums of the values are returned in the elements with 704/// index 2, 3, 6, 7 of a vector of [8 x float]. 
705/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of 706/// both operands. 707static __inline __m256 __DEFAULT_FN_ATTRS 708_mm256_hadd_ps(__m256 __a, __m256 __b) 709{ 710 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); 711} 712 713/// \brief Horizontally subtracts the adjacent pairs of values contained in two 714/// 256-bit vectors of [4 x double]. 715/// 716/// \headerfile <x86intrin.h> 717/// 718/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction. 719/// 720/// \param __a 721/// A 256-bit vector of [4 x double] containing one of the source operands. 722/// The horizontal differences between the values are returned in the 723/// even-indexed elements of a vector of [4 x double]. 724/// \param __b 725/// A 256-bit vector of [4 x double] containing one of the source operands. 726/// The horizontal differences between the values are returned in the 727/// odd-indexed elements of a vector of [4 x double]. 728/// \returns A 256-bit vector of [4 x double] containing the horizontal 729/// differences of both operands. 730static __inline __m256d __DEFAULT_FN_ATTRS 731_mm256_hsub_pd(__m256d __a, __m256d __b) 732{ 733 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); 734} 735 736/// \brief Horizontally subtracts the adjacent pairs of values contained in two 737/// 256-bit vectors of [8 x float]. 738/// 739/// \headerfile <x86intrin.h> 740/// 741/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction. 742/// 743/// \param __a 744/// A 256-bit vector of [8 x float] containing one of the source operands. 745/// The horizontal differences between the values are returned in the 746/// elements with index 0, 1, 4, 5 of a vector of [8 x float]. 747/// \param __b 748/// A 256-bit vector of [8 x float] containing one of the source operands. 749/// The horizontal differences between the values are returned in the 750/// elements with index 2, 3, 6, 7 of a vector of [8 x float]. 
751/// \returns A 256-bit vector of [8 x float] containing the horizontal 752/// differences of both operands. 753static __inline __m256 __DEFAULT_FN_ATTRS 754_mm256_hsub_ps(__m256 __a, __m256 __b) 755{ 756 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); 757} 758 759/* Vector permutations */ 760/// \brief Copies the values in a 128-bit vector of [2 x double] as specified 761/// by the 128-bit integer vector operand. 762/// 763/// \headerfile <x86intrin.h> 764/// 765/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 766/// 767/// \param __a 768/// A 128-bit vector of [2 x double]. 769/// \param __c 770/// A 128-bit integer vector operand specifying how the values are to be 771/// copied. \n 772/// Bit [1]: \n 773/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 774/// vector. \n 775/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 776/// returned vector. \n 777/// Bit [65]: \n 778/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 779/// returned vector. \n 780/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 781/// returned vector. 782/// \returns A 128-bit vector of [2 x double] containing the copied values. 783static __inline __m128d __DEFAULT_FN_ATTRS 784_mm_permutevar_pd(__m128d __a, __m128i __c) 785{ 786 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c); 787} 788 789/// \brief Copies the values in a 256-bit vector of [4 x double] as specified 790/// by the 256-bit integer vector operand. 791/// 792/// \headerfile <x86intrin.h> 793/// 794/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 795/// 796/// \param __a 797/// A 256-bit vector of [4 x double]. 798/// \param __c 799/// A 256-bit integer vector operand specifying how the values are to be 800/// copied. \n 801/// Bit [1]: \n 802/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 803/// vector. 
\n 804/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 805/// returned vector. \n 806/// Bit [65]: \n 807/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 808/// returned vector. \n 809/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 810/// returned vector. \n 811/// Bit [129]: \n 812/// 0: Bits [191:128] of the source are copied to bits [191:128] of the 813/// returned vector. \n 814/// 1: Bits [255:192] of the source are copied to bits [191:128] of the 815/// returned vector. \n 816/// Bit [193]: \n 817/// 0: Bits [191:128] of the source are copied to bits [255:192] of the 818/// returned vector. \n 819/// 1: Bits [255:192] of the source are copied to bits [255:192] of the 820/// returned vector. 821/// \returns A 256-bit vector of [4 x double] containing the copied values. 822static __inline __m256d __DEFAULT_FN_ATTRS 823_mm256_permutevar_pd(__m256d __a, __m256i __c) 824{ 825 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c); 826} 827 828/// \brief Copies the values stored in a 128-bit vector of [4 x float] as 829/// specified by the 128-bit integer vector operand. 830/// \headerfile <x86intrin.h> 831/// 832/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 833/// 834/// \param __a 835/// A 128-bit vector of [4 x float]. 836/// \param __c 837/// A 128-bit integer vector operand specifying how the values are to be 838/// copied. \n 839/// Bits [1:0]: \n 840/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 841/// returned vector. \n 842/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 843/// returned vector. \n 844/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 845/// returned vector. \n 846/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 847/// returned vector. \n 848/// Bits [33:32]: \n 849/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 850/// returned vector. 
\n 851/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 852/// returned vector. \n 853/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 854/// returned vector. \n 855/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 856/// returned vector. \n 857/// Bits [65:64]: \n 858/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 859/// returned vector. \n 860/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 861/// returned vector. \n 862/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 863/// returned vector. \n 864/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 865/// returned vector. \n 866/// Bits [97:96]: \n 867/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 868/// returned vector. \n 869/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 870/// returned vector. \n 871/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 872/// returned vector. \n 873/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 874/// returned vector. 875/// \returns A 128-bit vector of [4 x float] containing the copied values. 876static __inline __m128 __DEFAULT_FN_ATTRS 877_mm_permutevar_ps(__m128 __a, __m128i __c) 878{ 879 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c); 880} 881 882/// \brief Copies the values stored in a 256-bit vector of [8 x float] as 883/// specified by the 256-bit integer vector operand. 884/// 885/// \headerfile <x86intrin.h> 886/// 887/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 888/// 889/// \param __a 890/// A 256-bit vector of [8 x float]. 891/// \param __c 892/// A 256-bit integer vector operand specifying how the values are to be 893/// copied. \n 894/// Bits [1:0]: \n 895/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 896/// returned vector. 
\n 897/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 898/// returned vector. \n 899/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 900/// returned vector. \n 901/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 902/// returned vector. \n 903/// Bits [33:32]: \n 904/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 905/// returned vector. \n 906/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 907/// returned vector. \n 908/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 909/// returned vector. \n 910/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 911/// returned vector. \n 912/// Bits [65:64]: \n 913/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 914/// returned vector. \n 915/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 916/// returned vector. \n 917/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 918/// returned vector. \n 919/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 920/// returned vector. \n 921/// Bits [97:96]: \n 922/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 923/// returned vector. \n 924/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 925/// returned vector. \n 926/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 927/// returned vector. \n 928/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 929/// returned vector. \n 930/// Bits [129:128]: \n 931/// 00: Bits [159:128] of the source are copied to bits [159:128] of the 932/// returned vector. \n 933/// 01: Bits [191:160] of the source are copied to bits [159:128] of the 934/// returned vector. \n 935/// 10: Bits [223:192] of the source are copied to bits [159:128] of the 936/// returned vector. \n 937/// 11: Bits [255:224] of the source are copied to bits [159:128] of the 938/// returned vector. 
\n 939/// Bits [161:160]: \n 940/// 00: Bits [159:128] of the source are copied to bits [191:160] of the 941/// returned vector. \n 942/// 01: Bits [191:160] of the source are copied to bits [191:160] of the 943/// returned vector. \n 944/// 10: Bits [223:192] of the source are copied to bits [191:160] of the 945/// returned vector. \n 946/// 11: Bits [255:224] of the source are copied to bits [191:160] of the 947/// returned vector. \n 948/// Bits [193:192]: \n 949/// 00: Bits [159:128] of the source are copied to bits [223:192] of the 950/// returned vector. \n 951/// 01: Bits [191:160] of the source are copied to bits [223:192] of the 952/// returned vector. \n 953/// 10: Bits [223:192] of the source are copied to bits [223:192] of the 954/// returned vector. \n 955/// 11: Bits [255:224] of the source are copied to bits [223:192] of the 956/// returned vector. \n 957/// Bits [225:224]: \n 958/// 00: Bits [159:128] of the source are copied to bits [255:224] of the 959/// returned vector. \n 960/// 01: Bits [191:160] of the source are copied to bits [255:224] of the 961/// returned vector. \n 962/// 10: Bits [223:192] of the source are copied to bits [255:224] of the 963/// returned vector. \n 964/// 11: Bits [255:224] of the source are copied to bits [255:224] of the 965/// returned vector. 966/// \returns A 256-bit vector of [8 x float] containing the copied values. 967static __inline __m256 __DEFAULT_FN_ATTRS 968_mm256_permutevar_ps(__m256 __a, __m256i __c) 969{ 970 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c); 971} 972 973/// \brief Copies the values in a 128-bit vector of [2 x double] as specified 974/// by the immediate integer operand. 975/// 976/// \headerfile <x86intrin.h> 977/// 978/// \code 979/// __m128d _mm_permute_pd(__m128d A, const int C); 980/// \endcode 981/// 982/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 983/// 984/// \param A 985/// A 128-bit vector of [2 x double]. 
986/// \param C 987/// An immediate integer operand specifying how the values are to be 988/// copied. \n 989/// Bit [0]: \n 990/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 991/// vector. \n 992/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 993/// returned vector. \n 994/// Bit [1]: \n 995/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 996/// returned vector. \n 997/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 998/// returned vector. 999/// \returns A 128-bit vector of [2 x double] containing the copied values. 1000#define _mm_permute_pd(A, C) __extension__ ({ \ 1001 (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \ 1002 (__v2df)_mm_undefined_pd(), \ 1003 ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); }) 1004 1005/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by 1006/// the immediate integer operand. 1007/// 1008/// \headerfile <x86intrin.h> 1009/// 1010/// \code 1011/// __m256d _mm256_permute_pd(__m256d A, const int C); 1012/// \endcode 1013/// 1014/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 1015/// 1016/// \param A 1017/// A 256-bit vector of [4 x double]. 1018/// \param C 1019/// An immediate integer operand specifying how the values are to be 1020/// copied. \n 1021/// Bit [0]: \n 1022/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 1023/// vector. \n 1024/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 1025/// returned vector. \n 1026/// Bit [1]: \n 1027/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 1028/// returned vector. \n 1029/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 1030/// returned vector. \n 1031/// Bit [2]: \n 1032/// 0: Bits [191:128] of the source are copied to bits [191:128] of the 1033/// returned vector. \n 1034/// 1: Bits [255:192] of the source are copied to bits [191:128] of the 1035/// returned vector. 
\n 1036/// Bit [3]: \n 1037/// 0: Bits [191:128] of the source are copied to bits [255:192] of the 1038/// returned vector. \n 1039/// 1: Bits [255:192] of the source are copied to bits [255:192] of the 1040/// returned vector. 1041/// \returns A 256-bit vector of [4 x double] containing the copied values. 1042#define _mm256_permute_pd(A, C) __extension__ ({ \ 1043 (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \ 1044 (__v4df)_mm256_undefined_pd(), \ 1045 0 + (((C) >> 0) & 0x1), \ 1046 0 + (((C) >> 1) & 0x1), \ 1047 2 + (((C) >> 2) & 0x1), \ 1048 2 + (((C) >> 3) & 0x1)); }) 1049 1050/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by 1051/// the immediate integer operand. 1052/// 1053/// \headerfile <x86intrin.h> 1054/// 1055/// \code 1056/// __m128 _mm_permute_ps(__m128 A, const int C); 1057/// \endcode 1058/// 1059/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 1060/// 1061/// \param A 1062/// A 128-bit vector of [4 x float]. 1063/// \param C 1064/// An immediate integer operand specifying how the values are to be 1065/// copied. \n 1066/// Bits [1:0]: \n 1067/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 1068/// returned vector. \n 1069/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 1070/// returned vector. \n 1071/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 1072/// returned vector. \n 1073/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 1074/// returned vector. \n 1075/// Bits [3:2]: \n 1076/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 1077/// returned vector. \n 1078/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 1079/// returned vector. \n 1080/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 1081/// returned vector. \n 1082/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 1083/// returned vector. 
\n 1084/// Bits [5:4]: \n 1085/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 1086/// returned vector. \n 1087/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 1088/// returned vector. \n 1089/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 1090/// returned vector. \n 1091/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 1092/// returned vector. \n 1093/// Bits [7:6]: \n 1094/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 1095/// returned vector. \n 1096/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 1097/// returned vector. \n 1098/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 1099/// returned vector. \n 1100/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 1101/// returned vector. 1102/// \returns A 128-bit vector of [4 x float] containing the copied values. 1103#define _mm_permute_ps(A, C) __extension__ ({ \ 1104 (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \ 1105 (__v4sf)_mm_undefined_ps(), \ 1106 ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \ 1107 ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); }) 1108 1109/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by 1110/// the immediate integer operand. 1111/// 1112/// \headerfile <x86intrin.h> 1113/// 1114/// \code 1115/// __m256 _mm256_permute_ps(__m256 A, const int C); 1116/// \endcode 1117/// 1118/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 1119/// 1120/// \param A 1121/// A 256-bit vector of [8 x float]. 1122/// \param C 1123/// An immediate integer operand specifying how the values are to be 1124/// copied. \n 1125/// Bits [1:0]: \n 1126/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 1127/// returned vector. \n 1128/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 1129/// returned vector.
\n 1130/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 1131/// returned vector. \n 1132/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 1133/// returned vector. \n 1134/// Bits [3:2]: \n 1135/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 1136/// returned vector. \n 1137/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 1138/// returned vector. \n 1139/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 1140/// returned vector. \n 1141/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 1142/// returned vector. \n 1143/// Bits [5:4]: \n 1144/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 1145/// returned vector. \n 1146/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 1147/// returned vector. \n 1148/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 1149/// returned vector. \n 1150/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 1151/// returned vector. \n 1152/// Bits [7:6]: \n 1153/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 1154/// returned vector. \n 1155/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 1156/// returned vector. \n 1157/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 1158/// returned vector. \n 1159/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 1160/// returned vector. \n 1161/// Bits [1:0]: \n 1162/// 00: Bits [159:128] of the source are copied to bits [159:128] of the 1163/// returned vector. \n 1164/// 01: Bits [191:160] of the source are copied to bits [159:128] of the 1165/// returned vector. \n 1166/// 10: Bits [223:192] of the source are copied to bits [159:128] of the 1167/// returned vector. \n 1168/// 11: Bits [255:224] of the source are copied to bits [159:128] of the 1169/// returned vector.
\n 1170/// Bits [3:2]: \n 1171/// 00: Bits [159:128] of the source are copied to bits [191:160] of the 1172/// returned vector. \n 1173/// 01: Bits [191:160] of the source are copied to bits [191:160] of the 1174/// returned vector. \n 1175/// 10: Bits [223:192] of the source are copied to bits [191:160] of the 1176/// returned vector. \n 1177/// 11: Bits [255:224] of the source are copied to bits [191:160] of the 1178/// returned vector. \n 1179/// Bits [5:4]: \n 1180/// 00: Bits [159:128] of the source are copied to bits [223:192] of the 1181/// returned vector. \n 1182/// 01: Bits [191:160] of the source are copied to bits [223:192] of the 1183/// returned vector. \n 1184/// 10: Bits [223:192] of the source are copied to bits [223:192] of the 1185/// returned vector. \n 1186/// 11: Bits [255:224] of the source are copied to bits [223:192] of the 1187/// returned vector. \n 1188/// Bits [7:6]: \n 1189/// 00: Bits [159:128] of the source are copied to bits [255:224] of the 1190/// returned vector. \n 1191/// 01: Bits [191:160] of the source are copied to bits [255:224] of the 1192/// returned vector. \n 1193/// 10: Bits [223:192] of the source are copied to bits [255:224] of the 1194/// returned vector. \n 1195/// 11: Bits [255:224] of the source are copied to bits [255:224] of the 1196/// returned vector. 1197/// \returns A 256-bit vector of [8 x float] containing the copied values. 1198#define _mm256_permute_ps(A, C) __extension__ ({ \ 1199 (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \ 1200 (__v8sf)_mm256_undefined_ps(), \ 1201 0 + (((C) >> 0) & 0x3), \ 1202 0 + (((C) >> 2) & 0x3), \ 1203 0 + (((C) >> 4) & 0x3), \ 1204 0 + (((C) >> 6) & 0x3), \ 1205 4 + (((C) >> 0) & 0x3), \ 1206 4 + (((C) >> 2) & 0x3), \ 1207 4 + (((C) >> 4) & 0x3), \ 1208 4 + (((C) >> 6) & 0x3)); }) 1209 1210/// \brief Permutes 128-bit data values stored in two 256-bit vectors of 1211/// [4 x double], as specified by the immediate integer operand. 
1212/// 1213/// \headerfile <x86intrin.h> 1214/// 1215/// \code 1216/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M); 1217/// \endcode 1218/// 1219/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction. 1220/// 1221/// \param V1 1222/// A 256-bit vector of [4 x double]. 1223/// \param V2 1224/// A 256-bit vector of [4 x double]. 1225/// \param M 1226/// An immediate integer operand specifying how the values are to be 1227/// permuted. \n 1228/// Bits [1:0]: \n 1229/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the 1230/// destination. \n 1231/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the 1232/// destination. \n 1233/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the 1234/// destination. \n 1235/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the 1236/// destination. \n 1237/// Bits [5:4]: \n 1238/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the 1239/// destination. \n 1240/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the 1241/// destination. \n 1242/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the 1243/// destination. \n 1244/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the 1245/// destination. 1246/// \returns A 256-bit vector of [4 x double] containing the copied values. 1247#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \ 1248 (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \ 1249 (__v4df)(__m256d)(V2), (M)); }) 1250 1251/// \brief Permutes 128-bit data values stored in two 256-bit vectors of 1252/// [8 x float], as specified by the immediate integer operand. 1253/// 1254/// \headerfile <x86intrin.h> 1255/// 1256/// \code 1257/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M); 1258/// \endcode 1259/// 1260/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1261/// 1262/// \param V1 1263/// A 256-bit vector of [8 x float]. 1264/// \param V2 1265/// A 256-bit vector of [8 x float]. 1266/// \param M 1267/// An immediate integer operand specifying how the values are to be 1268/// permuted. \n 1269/// Bits [1:0]: \n 1270/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the 1271/// destination. \n 1272/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the 1273/// destination. \n 1274/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the 1275/// destination. \n 1276/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the 1277/// destination. \n 1278/// Bits [5:4]: \n 1279/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the 1280/// destination. \n 1281/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the 1282/// destination. \n 1283/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the 1284/// destination. \n 1285/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the 1286/// destination. 1287/// \returns A 256-bit vector of [8 x float] containing the copied values. 1288#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \ 1289 (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \ 1290 (__v8sf)(__m256)(V2), (M)); }) 1291 1292/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors, 1293/// as specified by the immediate integer operand. 1294/// 1295/// \headerfile <x86intrin.h> 1296/// 1297/// \code 1298/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M); 1299/// \endcode 1300/// 1301/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction. 1302/// 1303/// \param V1 1304/// A 256-bit integer vector. 1305/// \param V2 1306/// A 256-bit integer vector. 1307/// \param M 1308/// An immediate integer operand specifying how the values are to be copied. 
1309/// Bits [1:0]: \n 1310/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the 1311/// destination. \n 1312/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the 1313/// destination. \n 1314/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the 1315/// destination. \n 1316/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the 1317/// destination. \n 1318/// Bits [5:4]: \n 1319/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the 1320/// destination. \n 1321/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the 1322/// destination. \n 1323/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the 1324/// destination. \n 1325/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the 1326/// destination. 1327/// \returns A 256-bit integer vector containing the copied values. 1328#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \ 1329 (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \ 1330 (__v8si)(__m256i)(V2), (M)); }) 1331 1332/* Vector Blend */ 1333/// \brief Merges 64-bit double-precision data values stored in either of the 1334/// two 256-bit vectors of [4 x double], as specified by the immediate 1335/// integer operand. 1336/// 1337/// \headerfile <x86intrin.h> 1338/// 1339/// \code 1340/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M); 1341/// \endcode 1342/// 1343/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction. 1344/// 1345/// \param V1 1346/// A 256-bit vector of [4 x double]. 1347/// \param V2 1348/// A 256-bit vector of [4 x double]. 1349/// \param M 1350/// An immediate integer operand, with mask bits [3:0] specifying how the 1351/// values are to be copied. The position of the mask bit corresponds to the 1352/// index of a copied value. 
When a mask bit is 0, the corresponding 64-bit 1353/// element in operand \a V1 is copied to the same position in the 1354/// destination. When a mask bit is 1, the corresponding 64-bit element in 1355/// operand \a V2 is copied to the same position in the destination. 1356/// \returns A 256-bit vector of [4 x double] containing the copied values. 1357#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \ 1358 (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \ 1359 (__v4df)(__m256d)(V2), \ 1360 (((M) & 0x01) ? 4 : 0), \ 1361 (((M) & 0x02) ? 5 : 1), \ 1362 (((M) & 0x04) ? 6 : 2), \ 1363 (((M) & 0x08) ? 7 : 3)); }) 1364 1365/// \brief Merges 32-bit single-precision data values stored in either of the 1366/// two 256-bit vectors of [8 x float], as specified by the immediate 1367/// integer operand. 1368/// 1369/// \headerfile <x86intrin.h> 1370/// 1371/// \code 1372/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M); 1373/// \endcode 1374/// 1375/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction. 1376/// 1377/// \param V1 1378/// A 256-bit vector of [8 x float]. 1379/// \param V2 1380/// A 256-bit vector of [8 x float]. 1381/// \param M 1382/// An immediate integer operand, with mask bits [7:0] specifying how the 1383/// values are to be copied. The position of the mask bit corresponds to the 1384/// index of a copied value. When a mask bit is 0, the corresponding 32-bit 1385/// element in operand \a V1 is copied to the same position in the 1386/// destination. When a mask bit is 1, the corresponding 32-bit element in 1387/// operand \a V2 is copied to the same position in the destination. 1388/// \returns A 256-bit vector of [8 x float] containing the copied values. 1389#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \ 1390 (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \ 1391 (__v8sf)(__m256)(V2), \ 1392 (((M) & 0x01) ? 8 : 0), \ 1393 (((M) & 0x02) ? 9 : 1), \ 1394 (((M) & 0x04) ? 10 : 2), \ 1395 (((M) & 0x08) ? 
11 : 3), \ 1396 (((M) & 0x10) ? 12 : 4), \ 1397 (((M) & 0x20) ? 13 : 5), \ 1398 (((M) & 0x40) ? 14 : 6), \ 1399 (((M) & 0x80) ? 15 : 7)); }) 1400 1401/// \brief Merges 64-bit double-precision data values stored in either of the 1402/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector 1403/// operand. 1404/// 1405/// \headerfile <x86intrin.h> 1406/// 1407/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction. 1408/// 1409/// \param __a 1410/// A 256-bit vector of [4 x double]. 1411/// \param __b 1412/// A 256-bit vector of [4 x double]. 1413/// \param __c 1414/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying 1415/// how the values are to be copied. The position of the mask bit corresponds 1416/// to the most significant bit of a copied value. When a mask bit is 0, the 1417/// corresponding 64-bit element in operand \a __a is copied to the same 1418/// position in the destination. When a mask bit is 1, the corresponding 1419/// 64-bit element in operand \a __b is copied to the same position in the 1420/// destination. 1421/// \returns A 256-bit vector of [4 x double] containing the copied values. 1422static __inline __m256d __DEFAULT_FN_ATTRS 1423_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) 1424{ 1425 return (__m256d)__builtin_ia32_blendvpd256( 1426 (__v4df)__a, (__v4df)__b, (__v4df)__c); 1427} 1428 1429/// \brief Merges 32-bit single-precision data values stored in either of the 1430/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector 1431/// operand. 1432/// 1433/// \headerfile <x86intrin.h> 1434/// 1435/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction. 1436/// 1437/// \param __a 1438/// A 256-bit vector of [8 x float]. 1439/// \param __b 1440/// A 256-bit vector of [8 x float]. 1441/// \param __c 1442/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63, 1443/// and 31 specifying how the values are to be copied. 
The position of the 1444/// mask bit corresponds to the most significant bit of a copied value. When 1445/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is 1446/// copied to the same position in the destination. When a mask bit is 1, the 1447/// corresponding 32-bit element in operand \a __b is copied to the same 1448/// position in the destination. 1449/// \returns A 256-bit vector of [8 x float] containing the copied values. 1450static __inline __m256 __DEFAULT_FN_ATTRS 1451_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) 1452{ 1453 return (__m256)__builtin_ia32_blendvps256( 1454 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c); 1455} 1456 1457/* Vector Dot Product */ 1458/// \brief Computes two dot products in parallel, using the lower and upper 1459/// halves of two [8 x float] vectors as input to the two computations, and 1460/// returning the two dot products in the lower and upper halves of the 1461/// [8 x float] result. 1462/// 1463/// The immediate integer operand controls which input elements will 1464/// contribute to the dot product, and where the final results are returned. 1465/// In general, for each dot product, the four corresponding elements of the 1466/// input vectors are multiplied; the first two and second two products are 1467/// summed, then the two sums are added to form the final result. 1468/// 1469/// \headerfile <x86intrin.h> 1470/// 1471/// \code 1472/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M); 1473/// \endcode 1474/// 1475/// This intrinsic corresponds to the <c> VDPPS </c> instruction. 1476/// 1477/// \param V1 1478/// A vector of [8 x float] values, treated as two [4 x float] vectors. 1479/// \param V2 1480/// A vector of [8 x float] values, treated as two [4 x float] vectors. 1481/// \param M 1482/// An immediate integer argument. 
Bits [7:4] determine which elements of 1483/// the input vectors are used, with bit [4] corresponding to the lowest 1484/// element and bit [7] corresponding to the highest element of each [4 x 1485/// float] subvector. If a bit is set, the corresponding elements from the 1486/// two input vectors are used as an input for dot product; otherwise that 1487/// input is treated as zero. Bits [3:0] determine which elements of the 1488/// result will receive a copy of the final dot product, with bit [0] 1489/// corresponding to the lowest element and bit [3] corresponding to the 1490/// highest element of each [4 x float] subvector. If a bit is set, the dot 1491/// product is returned in the corresponding element; otherwise that element 1492/// is set to zero. The bitmask is applied in the same way to each of the 1493/// two parallel dot product computations. 1494/// \returns A 256-bit vector of [8 x float] containing the two dot products. 1495#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \ 1496 (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \ 1497 (__v8sf)(__m256)(V2), (M)); }) 1498 1499/* Vector shuffle */ 1500/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as 1501/// specified by the immediate value operand. 1502/// 1503/// The four selected elements in each operand are copied to the destination 1504/// according to the bits specified in the immediate operand. The selected 1505/// elements from the first 256-bit operand are copied to bits [63:0] and 1506/// bits [191:128] of the destination, and the selected elements from the 1507/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of 1508/// the destination. For example, if bits [7:0] of the immediate operand 1509/// contain a value of 0xFF, the 256-bit destination vector would contain the 1510/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3]. 
1511/// 1512/// \headerfile <x86intrin.h> 1513/// 1514/// \code 1515/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask); 1516/// \endcode 1517/// 1518/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction. 1519/// 1520/// \param a 1521/// A 256-bit vector of [8 x float]. The four selected elements in this 1522/// operand are copied to bits [63:0] and bits [191:128] in the destination, 1523/// according to the bits specified in the immediate operand. 1524/// \param b 1525/// A 256-bit vector of [8 x float]. The four selected elements in this 1526/// operand are copied to bits [127:64] and bits [255:192] in the 1527/// destination, according to the bits specified in the immediate operand. 1528/// \param mask 1529/// An immediate value containing an 8-bit value specifying which elements to 1530/// copy from \a a and \a b. \n 1531/// Bits [3:0] specify the values copied from operand \a a. \n 1532/// Bits [7:4] specify the values copied from operand \a b. \n 1533/// The destinations within the 256-bit destination are assigned values as 1534/// follows, according to the bit value assignments described below: \n 1535/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the 1536/// destination. \n 1537/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the 1538/// destination. \n 1539/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the 1540/// destination. \n 1541/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in 1542/// the destination. \n 1543/// Bit value assignments: \n 1544/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n 1545/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n 1546/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n 1547/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
  /* One parenthesized expression; no GNU statement expression is needed. */ \
  ((__m256)__builtin_shufflevector( \
      (__v8sf)(__m256)(a), (__v8sf)(__m256)(b), \
      /* Lower 128-bit lane: indices 0-3 pick from a, 8-11 pick from b. */ \
      0  + (((mask) >> 0) & 0x3), \
      0  + (((mask) >> 2) & 0x3), \
      8  + (((mask) >> 4) & 0x3), \
      8  + (((mask) >> 6) & 0x3), \
      /* Upper 128-bit lane: same selectors, offset by 4 into a and b. */ \
      4  + (((mask) >> 0) & 0x3), \
      4  + (((mask) >> 2) & 0x3), \
      12 + (((mask) >> 4) & 0x3), \
      12 + (((mask) >> 6) & 0x3)))

/// \brief Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value whose bits [3:0] specify which elements to copy from
///    \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  /* One selector bit per destination element; +4 selects from b. */ \
  ((__m256d)__builtin_shufflevector( \
      (__v4df)(__m256d)(a), (__v4df)(__m256d)(b), \
      0 + (((mask) >> 0) & 0x1), \
      4 + (((mask) >> 1) & 0x1), \
      2 + (((mask) >> 2) & 0x1), \
      6 + (((mask) >> 3) & 0x1)))

/* Compare */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

/// \brief Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [2 x double] vector consisting of two doubles corresponding to
///    the two comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
1665/// \param c 1666/// An immediate integer operand, with bits [4:0] specifying which comparison 1667/// operation to use: \n 1668/// 0x00 : Equal (ordered, non-signaling) 1669/// 0x01 : Less-than (ordered, signaling) 1670/// 0x02 : Less-than-or-equal (ordered, signaling) 1671/// 0x03 : Unordered (non-signaling) 1672/// 0x04 : Not-equal (unordered, non-signaling) 1673/// 0x05 : Not-less-than (unordered, signaling) 1674/// 0x06 : Not-less-than-or-equal (unordered, signaling) 1675/// 0x07 : Ordered (non-signaling) 1676/// 0x08 : Equal (unordered, non-signaling) 1677/// 0x09 : Not-greater-than-or-equal (unordered, signaling) 1678/// 0x0a : Not-greater-than (unordered, signaling) 1679/// 0x0b : False (ordered, non-signaling) 1680/// 0x0c : Not-equal (ordered, non-signaling) 1681/// 0x0d : Greater-than-or-equal (ordered, signaling) 1682/// 0x0e : Greater-than (ordered, signaling) 1683/// 0x0f : True (unordered, non-signaling) 1684/// 0x10 : Equal (ordered, signaling) 1685/// 0x11 : Less-than (ordered, non-signaling) 1686/// 0x12 : Less-than-or-equal (ordered, non-signaling) 1687/// 0x13 : Unordered (signaling) 1688/// 0x14 : Not-equal (unordered, signaling) 1689/// 0x15 : Not-less-than (unordered, non-signaling) 1690/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) 1691/// 0x17 : Ordered (signaling) 1692/// 0x18 : Equal (unordered, signaling) 1693/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) 1694/// 0x1a : Not-greater-than (unordered, non-signaling) 1695/// 0x1b : False (ordered, signaling) 1696/// 0x1c : Not-equal (ordered, signaling) 1697/// 0x1d : Greater-than-or-equal (ordered, non-signaling) 1698/// 0x1e : Greater-than (ordered, non-signaling) 1699/// 0x1f : True (unordered, signaling) 1700/// \returns A 128-bit vector of [2 x double] containing the comparison results. 
#define _mm_cmp_pd(a, b, c) \
  /* Single expression; c is forwarded as the comparison immediate. */ \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))

/// \brief Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [4 x float] vector consisting of four floats corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling)
///    0x01 : Less-than (ordered, signaling)
///    0x02 : Less-than-or-equal (ordered, signaling)
///    0x03 : Unordered (non-signaling)
///    0x04 : Not-equal (unordered, non-signaling)
///    0x05 : Not-less-than (unordered, signaling)
///    0x06 : Not-less-than-or-equal (unordered, signaling)
///    0x07 : Ordered (non-signaling)
///    0x08 : Equal (unordered, non-signaling)
///    0x09 : Not-greater-than-or-equal (unordered, signaling)
///    0x0a : Not-greater-than (unordered, signaling)
///    0x0b : False (ordered, non-signaling)
///    0x0c : Not-equal (ordered, non-signaling)
///    0x0d : Greater-than-or-equal (ordered, signaling)
///    0x0e : Greater-than (ordered, signaling)
///    0x0f : True (unordered, non-signaling)
///    0x10 : Equal (ordered, signaling)
///    0x11 : Less-than (ordered, non-signaling)
///    0x12 : Less-than-or-equal (ordered, non-signaling)
///    0x13 : Unordered (signaling)
///    0x14 : Not-equal (unordered, signaling)
///    0x15 : Not-less-than (unordered, non-signaling)
///    0x16 : Not-less-than-or-equal (unordered, non-signaling)
///    0x17 : Ordered (signaling)
///    0x18 : Equal (unordered, signaling)
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling)
///    0x1a : Not-greater-than (unordered, non-signaling)
///    0x1b : False (ordered, signaling)
///    0x1c : Not-equal (ordered, signaling)
///    0x1d : Greater-than-or-equal (ordered, non-signaling)
///    0x1e : Greater-than (ordered, non-signaling)
///    0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) \
  /* Single expression; c is forwarded as the comparison immediate. */ \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))

/// \brief Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [4 x double] vector consisting of four doubles corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
1785/// \param c 1786/// An immediate integer operand, with bits [4:0] specifying which comparison 1787/// operation to use: \n 1788/// 0x00 : Equal (ordered, non-signaling) 1789/// 0x01 : Less-than (ordered, signaling) 1790/// 0x02 : Less-than-or-equal (ordered, signaling) 1791/// 0x03 : Unordered (non-signaling) 1792/// 0x04 : Not-equal (unordered, non-signaling) 1793/// 0x05 : Not-less-than (unordered, signaling) 1794/// 0x06 : Not-less-than-or-equal (unordered, signaling) 1795/// 0x07 : Ordered (non-signaling) 1796/// 0x08 : Equal (unordered, non-signaling) 1797/// 0x09 : Not-greater-than-or-equal (unordered, signaling) 1798/// 0x0a : Not-greater-than (unordered, signaling) 1799/// 0x0b : False (ordered, non-signaling) 1800/// 0x0c : Not-equal (ordered, non-signaling) 1801/// 0x0d : Greater-than-or-equal (ordered, signaling) 1802/// 0x0e : Greater-than (ordered, signaling) 1803/// 0x0f : True (unordered, non-signaling) 1804/// 0x10 : Equal (ordered, signaling) 1805/// 0x11 : Less-than (ordered, non-signaling) 1806/// 0x12 : Less-than-or-equal (ordered, non-signaling) 1807/// 0x13 : Unordered (signaling) 1808/// 0x14 : Not-equal (unordered, signaling) 1809/// 0x15 : Not-less-than (unordered, non-signaling) 1810/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) 1811/// 0x17 : Ordered (signaling) 1812/// 0x18 : Equal (unordered, signaling) 1813/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) 1814/// 0x1a : Not-greater-than (unordered, non-signaling) 1815/// 0x1b : False (ordered, signaling) 1816/// 0x1c : Not-equal (ordered, signaling) 1817/// 0x1d : Greater-than-or-equal (ordered, non-signaling) 1818/// 0x1e : Greater-than (ordered, non-signaling) 1819/// 0x1f : True (unordered, signaling) 1820/// \returns A 256-bit vector of [4 x double] containing the comparison results. 
#define _mm256_cmp_pd(a, b, c) \
  /* Single expression; c is forwarded as the comparison immediate. */ \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))

/// \brief Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [8 x float] vector consisting of eight floats corresponding to
///    the eight comparison results: zero if the comparison is false, and all
///    1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling)
///    0x01 : Less-than (ordered, signaling)
///    0x02 : Less-than-or-equal (ordered, signaling)
///    0x03 : Unordered (non-signaling)
///    0x04 : Not-equal (unordered, non-signaling)
///    0x05 : Not-less-than (unordered, signaling)
///    0x06 : Not-less-than-or-equal (unordered, signaling)
///    0x07 : Ordered (non-signaling)
///    0x08 : Equal (unordered, non-signaling)
///    0x09 : Not-greater-than-or-equal (unordered, signaling)
///    0x0a : Not-greater-than (unordered, signaling)
///    0x0b : False (ordered, non-signaling)
///    0x0c : Not-equal (ordered, non-signaling)
///    0x0d : Greater-than-or-equal (ordered, signaling)
///    0x0e : Greater-than (ordered, signaling)
///    0x0f : True (unordered, non-signaling)
///    0x10 : Equal (ordered, signaling)
///    0x11 : Less-than (ordered, non-signaling)
///    0x12 : Less-than-or-equal (ordered, non-signaling)
///    0x13 : Unordered (signaling)
///    0x14 : Not-equal (unordered, signaling)
///    0x15 : Not-less-than (unordered, non-signaling)
///    0x16 : Not-less-than-or-equal (unordered, non-signaling)
///    0x17 : Ordered (signaling)
///    0x18 : Equal (unordered, signaling)
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling)
///    0x1a : Not-greater-than (unordered, non-signaling)
///    0x1b : False (ordered, signaling)
///    0x1c : Not-equal (ordered, signaling)
///    0x1d : Greater-than-or-equal (ordered, non-signaling)
///    0x1e : Greater-than (ordered, non-signaling)
///    0x1f : True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  /* Single expression; c is forwarded as the comparison immediate. */ \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))

/// \brief Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    If the result is true, all 64 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
1904/// \param c 1905/// An immediate integer operand, with bits [4:0] specifying which comparison 1906/// operation to use: \n 1907/// 0x00 : Equal (ordered, non-signaling) 1908/// 0x01 : Less-than (ordered, signaling) 1909/// 0x02 : Less-than-or-equal (ordered, signaling) 1910/// 0x03 : Unordered (non-signaling) 1911/// 0x04 : Not-equal (unordered, non-signaling) 1912/// 0x05 : Not-less-than (unordered, signaling) 1913/// 0x06 : Not-less-than-or-equal (unordered, signaling) 1914/// 0x07 : Ordered (non-signaling) 1915/// 0x08 : Equal (unordered, non-signaling) 1916/// 0x09 : Not-greater-than-or-equal (unordered, signaling) 1917/// 0x0a : Not-greater-than (unordered, signaling) 1918/// 0x0b : False (ordered, non-signaling) 1919/// 0x0c : Not-equal (ordered, non-signaling) 1920/// 0x0d : Greater-than-or-equal (ordered, signaling) 1921/// 0x0e : Greater-than (ordered, signaling) 1922/// 0x0f : True (unordered, non-signaling) 1923/// 0x10 : Equal (ordered, signaling) 1924/// 0x11 : Less-than (ordered, non-signaling) 1925/// 0x12 : Less-than-or-equal (ordered, non-signaling) 1926/// 0x13 : Unordered (signaling) 1927/// 0x14 : Not-equal (unordered, signaling) 1928/// 0x15 : Not-less-than (unordered, non-signaling) 1929/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) 1930/// 0x17 : Ordered (signaling) 1931/// 0x18 : Equal (unordered, signaling) 1932/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) 1933/// 0x1a : Not-greater-than (unordered, non-signaling) 1934/// 0x1b : False (ordered, signaling) 1935/// 0x1c : Not-equal (ordered, signaling) 1936/// 0x1d : Greater-than-or-equal (ordered, non-signaling) 1937/// 0x1e : Greater-than (ordered, non-signaling) 1938/// 0x1f : True (unordered, signaling) 1939/// \returns A 128-bit vector of [2 x double] containing the comparison results. 
1940#define _mm_cmp_sd(a, b, c) __extension__ ({ \ 1941 (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \ 1942 (__v2df)(__m128d)(b), (c)); }) 1943 1944/// \brief Compares each of the corresponding scalar values of two 128-bit 1945/// vectors of [4 x float], using the operation specified by the immediate 1946/// integer operand. 1947/// 1948/// If the result is true, all 32 bits of the destination vector are set; 1949/// otherwise they are cleared. 1950/// 1951/// \headerfile <x86intrin.h> 1952/// 1953/// \code 1954/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c); 1955/// \endcode 1956/// 1957/// This intrinsic corresponds to the <c> VCMPSS </c> instruction. 1958/// 1959/// \param a 1960/// A 128-bit vector of [4 x float]. 1961/// \param b 1962/// A 128-bit vector of [4 x float]. 1963/// \param c 1964/// An immediate integer operand, with bits [4:0] specifying which comparison 1965/// operation to use: \n 1966/// 0x00 : Equal (ordered, non-signaling) 1967/// 0x01 : Less-than (ordered, signaling) 1968/// 0x02 : Less-than-or-equal (ordered, signaling) 1969/// 0x03 : Unordered (non-signaling) 1970/// 0x04 : Not-equal (unordered, non-signaling) 1971/// 0x05 : Not-less-than (unordered, signaling) 1972/// 0x06 : Not-less-than-or-equal (unordered, signaling) 1973/// 0x07 : Ordered (non-signaling) 1974/// 0x08 : Equal (unordered, non-signaling) 1975/// 0x09 : Not-greater-than-or-equal (unordered, signaling) 1976/// 0x0a : Not-greater-than (unordered, signaling) 1977/// 0x0b : False (ordered, non-signaling) 1978/// 0x0c : Not-equal (ordered, non-signaling) 1979/// 0x0d : Greater-than-or-equal (ordered, signaling) 1980/// 0x0e : Greater-than (ordered, signaling) 1981/// 0x0f : True (unordered, non-signaling) 1982/// 0x10 : Equal (ordered, signaling) 1983/// 0x11 : Less-than (ordered, non-signaling) 1984/// 0x12 : Less-than-or-equal (ordered, non-signaling) 1985/// 0x13 : Unordered (signaling) 1986/// 0x14 : Not-equal (unordered, signaling) 1987/// 0x15 : 
Not-less-than (unordered, non-signaling) 1988/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) 1989/// 0x17 : Ordered (signaling) 1990/// 0x18 : Equal (unordered, signaling) 1991/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) 1992/// 0x1a : Not-greater-than (unordered, non-signaling) 1993/// 0x1b : False (ordered, signaling) 1994/// 0x1c : Not-equal (ordered, signaling) 1995/// 0x1d : Greater-than-or-equal (ordered, non-signaling) 1996/// 0x1e : Greater-than (ordered, non-signaling) 1997/// 0x1f : True (unordered, signaling) 1998/// \returns A 128-bit vector of [4 x float] containing the comparison results. 1999#define _mm_cmp_ss(a, b, c) __extension__ ({ \ 2000 (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \ 2001 (__v4sf)(__m128)(b), (c)); }) 2002 2003/// \brief Takes a [8 x i32] vector and returns the vector element value 2004/// indexed by the immediate constant operand. 2005/// 2006/// \headerfile <x86intrin.h> 2007/// 2008/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2009/// instruction. 2010/// 2011/// \param __a 2012/// A 256-bit vector of [8 x i32]. 2013/// \param __imm 2014/// An immediate integer operand with bits [2:0] determining which vector 2015/// element is extracted and returned. 2016/// \returns A 32-bit integer containing the extracted 32 bits of extended 2017/// packed data. 2018static __inline int __DEFAULT_FN_ATTRS 2019_mm256_extract_epi32(__m256i __a, const int __imm) 2020{ 2021 __v8si __b = (__v8si)__a; 2022 return __b[__imm & 7]; 2023} 2024 2025/// \brief Takes a [16 x i16] vector and returns the vector element value 2026/// indexed by the immediate constant operand. 2027/// 2028/// \headerfile <x86intrin.h> 2029/// 2030/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2031/// instruction. 2032/// 2033/// \param __a 2034/// A 256-bit integer vector of [16 x i16]. 
2035/// \param __imm 2036/// An immediate integer operand with bits [3:0] determining which vector 2037/// element is extracted and returned. 2038/// \returns A 32-bit integer containing the extracted 16 bits of zero extended 2039/// packed data. 2040static __inline int __DEFAULT_FN_ATTRS 2041_mm256_extract_epi16(__m256i __a, const int __imm) 2042{ 2043 __v16hi __b = (__v16hi)__a; 2044 return (unsigned short)__b[__imm & 15]; 2045} 2046 2047/// \brief Takes a [32 x i8] vector and returns the vector element value 2048/// indexed by the immediate constant operand. 2049/// 2050/// \headerfile <x86intrin.h> 2051/// 2052/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2053/// instruction. 2054/// 2055/// \param __a 2056/// A 256-bit integer vector of [32 x i8]. 2057/// \param __imm 2058/// An immediate integer operand with bits [4:0] determining which vector 2059/// element is extracted and returned. 2060/// \returns A 32-bit integer containing the extracted 8 bits of zero extended 2061/// packed data. 2062static __inline int __DEFAULT_FN_ATTRS 2063_mm256_extract_epi8(__m256i __a, const int __imm) 2064{ 2065 __v32qi __b = (__v32qi)__a; 2066 return (unsigned char)__b[__imm & 31]; 2067} 2068 2069#ifdef __x86_64__ 2070/// \brief Takes a [4 x i64] vector and returns the vector element value 2071/// indexed by the immediate constant operand. 2072/// 2073/// \headerfile <x86intrin.h> 2074/// 2075/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2076/// instruction. 2077/// 2078/// \param __a 2079/// A 256-bit integer vector of [4 x i64]. 2080/// \param __imm 2081/// An immediate integer operand with bits [1:0] determining which vector 2082/// element is extracted and returned. 2083/// \returns A 64-bit integer containing the extracted 64 bits of extended 2084/// packed data. 
2085static __inline long long __DEFAULT_FN_ATTRS 2086_mm256_extract_epi64(__m256i __a, const int __imm) 2087{ 2088 __v4di __b = (__v4di)__a; 2089 return __b[__imm & 3]; 2090} 2091#endif 2092 2093/// \brief Takes a [8 x i32] vector and replaces the vector element value 2094/// indexed by the immediate constant operand by a new value. Returns the 2095/// modified vector. 2096/// 2097/// \headerfile <x86intrin.h> 2098/// 2099/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2100/// instruction. 2101/// 2102/// \param __a 2103/// A vector of [8 x i32] to be used by the insert operation. 2104/// \param __b 2105/// An integer value. The replacement value for the insert operation. 2106/// \param __imm 2107/// An immediate integer specifying the index of the vector element to be 2108/// replaced. 2109/// \returns A copy of vector \a __a, after replacing its element indexed by 2110/// \a __imm with \a __b. 2111static __inline __m256i __DEFAULT_FN_ATTRS 2112_mm256_insert_epi32(__m256i __a, int __b, int const __imm) 2113{ 2114 __v8si __c = (__v8si)__a; 2115 __c[__imm & 7] = __b; 2116 return (__m256i)__c; 2117} 2118 2119 2120/// \brief Takes a [16 x i16] vector and replaces the vector element value 2121/// indexed by the immediate constant operand with a new value. Returns the 2122/// modified vector. 2123/// 2124/// \headerfile <x86intrin.h> 2125/// 2126/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2127/// instruction. 2128/// 2129/// \param __a 2130/// A vector of [16 x i16] to be used by the insert operation. 2131/// \param __b 2132/// An i16 integer value. The replacement value for the insert operation. 2133/// \param __imm 2134/// An immediate integer specifying the index of the vector element to be 2135/// replaced. 2136/// \returns A copy of vector \a __a, after replacing its element indexed by 2137/// \a __imm with \a __b. 
2138static __inline __m256i __DEFAULT_FN_ATTRS 2139_mm256_insert_epi16(__m256i __a, int __b, int const __imm) 2140{ 2141 __v16hi __c = (__v16hi)__a; 2142 __c[__imm & 15] = __b; 2143 return (__m256i)__c; 2144} 2145 2146/// \brief Takes a [32 x i8] vector and replaces the vector element value 2147/// indexed by the immediate constant operand with a new value. Returns the 2148/// modified vector. 2149/// 2150/// \headerfile <x86intrin.h> 2151/// 2152/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2153/// instruction. 2154/// 2155/// \param __a 2156/// A vector of [32 x i8] to be used by the insert operation. 2157/// \param __b 2158/// An i8 integer value. The replacement value for the insert operation. 2159/// \param __imm 2160/// An immediate integer specifying the index of the vector element to be 2161/// replaced. 2162/// \returns A copy of vector \a __a, after replacing its element indexed by 2163/// \a __imm with \a __b. 2164static __inline __m256i __DEFAULT_FN_ATTRS 2165_mm256_insert_epi8(__m256i __a, int __b, int const __imm) 2166{ 2167 __v32qi __c = (__v32qi)__a; 2168 __c[__imm & 31] = __b; 2169 return (__m256i)__c; 2170} 2171 2172#ifdef __x86_64__ 2173/// \brief Takes a [4 x i64] vector and replaces the vector element value 2174/// indexed by the immediate constant operand with a new value. Returns the 2175/// modified vector. 2176/// 2177/// \headerfile <x86intrin.h> 2178/// 2179/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2180/// instruction. 2181/// 2182/// \param __a 2183/// A vector of [4 x i64] to be used by the insert operation. 2184/// \param __b 2185/// A 64-bit integer value. The replacement value for the insert operation. 2186/// \param __imm 2187/// An immediate integer specifying the index of the vector element to be 2188/// replaced. 2189/// \returns A copy of vector \a __a, after replacing its element indexed by 2190/// \a __imm with \a __b. 
2191static __inline __m256i __DEFAULT_FN_ATTRS 2192_mm256_insert_epi64(__m256i __a, long long __b, int const __imm) 2193{ 2194 __v4di __c = (__v4di)__a; 2195 __c[__imm & 3] = __b; 2196 return (__m256i)__c; 2197} 2198#endif 2199 2200/* Conversion */ 2201/// \brief Converts a vector of [4 x i32] into a vector of [4 x double]. 2202/// 2203/// \headerfile <x86intrin.h> 2204/// 2205/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction. 2206/// 2207/// \param __a 2208/// A 128-bit integer vector of [4 x i32]. 2209/// \returns A 256-bit vector of [4 x double] containing the converted values. 2210static __inline __m256d __DEFAULT_FN_ATTRS 2211_mm256_cvtepi32_pd(__m128i __a) 2212{ 2213 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df); 2214} 2215 2216/// \brief Converts a vector of [8 x i32] into a vector of [8 x float]. 2217/// 2218/// \headerfile <x86intrin.h> 2219/// 2220/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction. 2221/// 2222/// \param __a 2223/// A 256-bit integer vector. 2224/// \returns A 256-bit vector of [8 x float] containing the converted values. 2225static __inline __m256 __DEFAULT_FN_ATTRS 2226_mm256_cvtepi32_ps(__m256i __a) 2227{ 2228 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a); 2229} 2230 2231/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of 2232/// [4 x float]. 2233/// 2234/// \headerfile <x86intrin.h> 2235/// 2236/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction. 2237/// 2238/// \param __a 2239/// A 256-bit vector of [4 x double]. 2240/// \returns A 128-bit vector of [4 x float] containing the converted values. 2241static __inline __m128 __DEFAULT_FN_ATTRS 2242_mm256_cvtpd_ps(__m256d __a) 2243{ 2244 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a); 2245} 2246 2247/// \brief Converts a vector of [8 x float] into a vector of [8 x i32]. 
2248/// 2249/// \headerfile <x86intrin.h> 2250/// 2251/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction. 2252/// 2253/// \param __a 2254/// A 256-bit vector of [8 x float]. 2255/// \returns A 256-bit integer vector containing the converted values. 2256static __inline __m256i __DEFAULT_FN_ATTRS 2257_mm256_cvtps_epi32(__m256 __a) 2258{ 2259 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a); 2260} 2261 2262/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 2263/// x double]. 2264/// 2265/// \headerfile <x86intrin.h> 2266/// 2267/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction. 2268/// 2269/// \param __a 2270/// A 128-bit vector of [4 x float]. 2271/// \returns A 256-bit vector of [4 x double] containing the converted values. 2272static __inline __m256d __DEFAULT_FN_ATTRS 2273_mm256_cvtps_pd(__m128 __a) 2274{ 2275 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df); 2276} 2277 2278/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 2279/// x i32], truncating the result by rounding towards zero when it is 2280/// inexact. 2281/// 2282/// \headerfile <x86intrin.h> 2283/// 2284/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction. 2285/// 2286/// \param __a 2287/// A 256-bit vector of [4 x double]. 2288/// \returns A 128-bit integer vector containing the converted values. 2289static __inline __m128i __DEFAULT_FN_ATTRS 2290_mm256_cvttpd_epi32(__m256d __a) 2291{ 2292 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a); 2293} 2294 2295/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 2296/// x i32]. When a conversion is inexact, the value returned is rounded 2297/// according to the rounding control bits in the MXCSR register. 2298/// 2299/// \headerfile <x86intrin.h> 2300/// 2301/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction. 
2302/// 2303/// \param __a 2304/// A 256-bit vector of [4 x double]. 2305/// \returns A 128-bit integer vector containing the converted values. 2306static __inline __m128i __DEFAULT_FN_ATTRS 2307_mm256_cvtpd_epi32(__m256d __a) 2308{ 2309 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a); 2310} 2311 2312/// \brief Converts a vector of [8 x float] into a vector of [8 x i32], 2313/// truncating the result by rounding towards zero when it is inexact. 2314/// 2315/// \headerfile <x86intrin.h> 2316/// 2317/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction. 2318/// 2319/// \param __a 2320/// A 256-bit vector of [8 x float]. 2321/// \returns A 256-bit integer vector containing the converted values. 2322static __inline __m256i __DEFAULT_FN_ATTRS 2323_mm256_cvttps_epi32(__m256 __a) 2324{ 2325 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a); 2326} 2327 2328/// \brief Returns the first element of the input vector of [4 x double]. 2329/// 2330/// \headerfile <avxintrin.h> 2331/// 2332/// This intrinsic is a utility function and does not correspond to a specific 2333/// instruction. 2334/// 2335/// \param __a 2336/// A 256-bit vector of [4 x double]. 2337/// \returns A 64 bit double containing the first element of the input vector. 2338static __inline double __DEFAULT_FN_ATTRS 2339_mm256_cvtsd_f64(__m256d __a) 2340{ 2341 return __a[0]; 2342} 2343 2344/// \brief Returns the first element of the input vector of [8 x i32]. 2345/// 2346/// \headerfile <avxintrin.h> 2347/// 2348/// This intrinsic is a utility function and does not correspond to a specific 2349/// instruction. 2350/// 2351/// \param __a 2352/// A 256-bit vector of [8 x i32]. 2353/// \returns A 32 bit integer containing the first element of the input vector. 2354static __inline int __DEFAULT_FN_ATTRS 2355_mm256_cvtsi256_si32(__m256i __a) 2356{ 2357 __v8si __b = (__v8si)__a; 2358 return __b[0]; 2359} 2360 2361/// \brief Returns the first element of the input vector of [8 x float]. 
2362/// 2363/// \headerfile <avxintrin.h> 2364/// 2365/// This intrinsic is a utility function and does not correspond to a specific 2366/// instruction. 2367/// 2368/// \param __a 2369/// A 256-bit vector of [8 x float]. 2370/// \returns A 32 bit float containing the first element of the input vector. 2371static __inline float __DEFAULT_FN_ATTRS 2372_mm256_cvtss_f32(__m256 __a) 2373{ 2374 return __a[0]; 2375} 2376 2377/* Vector replicate */ 2378/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit 2379/// vector of [8 x float] to float values in a 256-bit vector of 2380/// [8 x float]. 2381/// 2382/// \headerfile <x86intrin.h> 2383/// 2384/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction. 2385/// 2386/// \param __a 2387/// A 256-bit vector of [8 x float]. \n 2388/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of 2389/// the return value. \n 2390/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of 2391/// the return value. \n 2392/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the 2393/// return value. \n 2394/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the 2395/// return value. 2396/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2397/// values. 2398static __inline __m256 __DEFAULT_FN_ATTRS 2399_mm256_movehdup_ps(__m256 __a) 2400{ 2401 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7); 2402} 2403 2404/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit 2405/// vector of [8 x float] to float values in a 256-bit vector of [8 x float]. 2406/// 2407/// \headerfile <x86intrin.h> 2408/// 2409/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction. 2410/// 2411/// \param __a 2412/// A 256-bit vector of [8 x float]. \n 2413/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of 2414/// the return value. 
\n 2415/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of 2416/// the return value. \n 2417/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the 2418/// return value. \n 2419/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the 2420/// return value. 2421/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2422/// values. 2423static __inline __m256 __DEFAULT_FN_ATTRS 2424_mm256_moveldup_ps(__m256 __a) 2425{ 2426 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6); 2427} 2428 2429/// \brief Moves and duplicates double-precision floating point values from a 2430/// 256-bit vector of [4 x double] to double-precision values in a 256-bit 2431/// vector of [4 x double]. 2432/// 2433/// \headerfile <x86intrin.h> 2434/// 2435/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. 2436/// 2437/// \param __a 2438/// A 256-bit vector of [4 x double]. \n 2439/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the 2440/// return value. \n 2441/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of 2442/// the return value. 2443/// \returns A 256-bit vector of [4 x double] containing the moved and 2444/// duplicated values. 2445static __inline __m256d __DEFAULT_FN_ATTRS 2446_mm256_movedup_pd(__m256d __a) 2447{ 2448 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2); 2449} 2450 2451/* Unpack and Interleave */ 2452/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of 2453/// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2454/// 2455/// \headerfile <x86intrin.h> 2456/// 2457/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction. 2458/// 2459/// \param __a 2460/// A 256-bit floating-point vector of [4 x double]. \n 2461/// Bits [127:64] are written to bits [63:0] of the return value. 
\n 2462/// Bits [255:192] are written to bits [191:128] of the return value. \n 2463/// \param __b 2464/// A 256-bit floating-point vector of [4 x double]. \n 2465/// Bits [127:64] are written to bits [127:64] of the return value. \n 2466/// Bits [255:192] are written to bits [255:192] of the return value. \n 2467/// \returns A 256-bit vector of [4 x double] containing the interleaved values. 2468static __inline __m256d __DEFAULT_FN_ATTRS 2469_mm256_unpackhi_pd(__m256d __a, __m256d __b) 2470{ 2471 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2); 2472} 2473 2474/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of 2475/// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2476/// 2477/// \headerfile <x86intrin.h> 2478/// 2479/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction. 2480/// 2481/// \param __a 2482/// A 256-bit floating-point vector of [4 x double]. \n 2483/// Bits [63:0] are written to bits [63:0] of the return value. \n 2484/// Bits [191:128] are written to bits [191:128] of the return value. 2485/// \param __b 2486/// A 256-bit floating-point vector of [4 x double]. \n 2487/// Bits [63:0] are written to bits [127:64] of the return value. \n 2488/// Bits [191:128] are written to bits [255:192] of the return value. \n 2489/// \returns A 256-bit vector of [4 x double] containing the interleaved values. 2490static __inline __m256d __DEFAULT_FN_ATTRS 2491_mm256_unpacklo_pd(__m256d __a, __m256d __b) 2492{ 2493 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2); 2494} 2495 2496/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the 2497/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2498/// vector of [8 x float]. 2499/// 2500/// \headerfile <x86intrin.h> 2501/// 2502/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction. 2503/// 2504/// \param __a 2505/// A 256-bit vector of [8 x float]. 
\n 2506/// Bits [95:64] are written to bits [31:0] of the return value. \n 2507/// Bits [127:96] are written to bits [95:64] of the return value. \n 2508/// Bits [223:192] are written to bits [159:128] of the return value. \n 2509/// Bits [255:224] are written to bits [223:192] of the return value. 2510/// \param __b 2511/// A 256-bit vector of [8 x float]. \n 2512/// Bits [95:64] are written to bits [63:32] of the return value. \n 2513/// Bits [127:96] are written to bits [127:96] of the return value. \n 2514/// Bits [223:192] are written to bits [191:160] of the return value. \n 2515/// Bits [255:224] are written to bits [255:224] of the return value. 2516/// \returns A 256-bit vector of [8 x float] containing the interleaved values. 2517static __inline __m256 __DEFAULT_FN_ATTRS 2518_mm256_unpackhi_ps(__m256 __a, __m256 __b) 2519{ 2520 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); 2521} 2522 2523/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the 2524/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2525/// vector of [8 x float]. 2526/// 2527/// \headerfile <x86intrin.h> 2528/// 2529/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction. 2530/// 2531/// \param __a 2532/// A 256-bit vector of [8 x float]. \n 2533/// Bits [31:0] are written to bits [31:0] of the return value. \n 2534/// Bits [63:32] are written to bits [95:64] of the return value. \n 2535/// Bits [159:128] are written to bits [159:128] of the return value. \n 2536/// Bits [191:160] are written to bits [223:192] of the return value. 2537/// \param __b 2538/// A 256-bit vector of [8 x float]. \n 2539/// Bits [31:0] are written to bits [63:32] of the return value. \n 2540/// Bits [63:32] are written to bits [127:96] of the return value. \n 2541/// Bits [159:128] are written to bits [191:160] of the return value. \n 2542/// Bits [191:160] are written to bits [255:224] of the return value. 
2543/// \returns A 256-bit vector of [8 x float] containing the interleaved values. 2544static __inline __m256 __DEFAULT_FN_ATTRS 2545_mm256_unpacklo_ps(__m256 __a, __m256 __b) 2546{ 2547 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); 2548} 2549 2550/* Bit Test */ 2551/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2552/// element-by-element comparison of the double-precision element in the 2553/// first source vector and the corresponding element in the second source 2554/// vector. 2555/// 2556/// The EFLAGS register is updated as follows: \n 2557/// If there is at least one pair of double-precision elements where the 2558/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2559/// ZF flag is set to 1. \n 2560/// If there is at least one pair of double-precision elements where the 2561/// sign-bit of the first element is 0 and the sign-bit of the second element 2562/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2563/// This intrinsic returns the value of the ZF flag. 2564/// 2565/// \headerfile <x86intrin.h> 2566/// 2567/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2568/// 2569/// \param __a 2570/// A 128-bit vector of [2 x double]. 2571/// \param __b 2572/// A 128-bit vector of [2 x double]. 2573/// \returns the ZF flag in the EFLAGS register. 2574static __inline int __DEFAULT_FN_ATTRS 2575_mm_testz_pd(__m128d __a, __m128d __b) 2576{ 2577 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b); 2578} 2579 2580/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2581/// element-by-element comparison of the double-precision element in the 2582/// first source vector and the corresponding element in the second source 2583/// vector. 
2584/// 2585/// The EFLAGS register is updated as follows: \n 2586/// If there is at least one pair of double-precision elements where the 2587/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2588/// ZF flag is set to 1. \n 2589/// If there is at least one pair of double-precision elements where the 2590/// sign-bit of the first element is 0 and the sign-bit of the second element 2591/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2592/// This intrinsic returns the value of the CF flag. 2593/// 2594/// \headerfile <x86intrin.h> 2595/// 2596/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2597/// 2598/// \param __a 2599/// A 128-bit vector of [2 x double]. 2600/// \param __b 2601/// A 128-bit vector of [2 x double]. 2602/// \returns the CF flag in the EFLAGS register. 2603static __inline int __DEFAULT_FN_ATTRS 2604_mm_testc_pd(__m128d __a, __m128d __b) 2605{ 2606 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b); 2607} 2608 2609/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2610/// element-by-element comparison of the double-precision element in the 2611/// first source vector and the corresponding element in the second source 2612/// vector. 2613/// 2614/// The EFLAGS register is updated as follows: \n 2615/// If there is at least one pair of double-precision elements where the 2616/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2617/// ZF flag is set to 1. \n 2618/// If there is at least one pair of double-precision elements where the 2619/// sign-bit of the first element is 0 and the sign-bit of the second element 2620/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2621/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2622/// otherwise it returns 0. 2623/// 2624/// \headerfile <x86intrin.h> 2625/// 2626/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 
2627/// 2628/// \param __a 2629/// A 128-bit vector of [2 x double]. 2630/// \param __b 2631/// A 128-bit vector of [2 x double]. 2632/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2633static __inline int __DEFAULT_FN_ATTRS 2634_mm_testnzc_pd(__m128d __a, __m128d __b) 2635{ 2636 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b); 2637} 2638 2639/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2640/// element-by-element comparison of the single-precision element in the 2641/// first source vector and the corresponding element in the second source 2642/// vector. 2643/// 2644/// The EFLAGS register is updated as follows: \n 2645/// If there is at least one pair of single-precision elements where the 2646/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2647/// ZF flag is set to 1. \n 2648/// If there is at least one pair of single-precision elements where the 2649/// sign-bit of the first element is 0 and the sign-bit of the second element 2650/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2651/// This intrinsic returns the value of the ZF flag. 2652/// 2653/// \headerfile <x86intrin.h> 2654/// 2655/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2656/// 2657/// \param __a 2658/// A 128-bit vector of [4 x float]. 2659/// \param __b 2660/// A 128-bit vector of [4 x float]. 2661/// \returns the ZF flag. 2662static __inline int __DEFAULT_FN_ATTRS 2663_mm_testz_ps(__m128 __a, __m128 __b) 2664{ 2665 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b); 2666} 2667 2668/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2669/// element-by-element comparison of the single-precision element in the 2670/// first source vector and the corresponding element in the second source 2671/// vector. 
2672/// 2673/// The EFLAGS register is updated as follows: \n 2674/// If there is at least one pair of single-precision elements where the 2675/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2676/// ZF flag is set to 1. \n 2677/// If there is at least one pair of single-precision elements where the 2678/// sign-bit of the first element is 0 and the sign-bit of the second element 2679/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2680/// This intrinsic returns the value of the CF flag. 2681/// 2682/// \headerfile <x86intrin.h> 2683/// 2684/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2685/// 2686/// \param __a 2687/// A 128-bit vector of [4 x float]. 2688/// \param __b 2689/// A 128-bit vector of [4 x float]. 2690/// \returns the CF flag. 2691static __inline int __DEFAULT_FN_ATTRS 2692_mm_testc_ps(__m128 __a, __m128 __b) 2693{ 2694 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b); 2695} 2696 2697/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2698/// element-by-element comparison of the single-precision element in the 2699/// first source vector and the corresponding element in the second source 2700/// vector. 2701/// 2702/// The EFLAGS register is updated as follows: \n 2703/// If there is at least one pair of single-precision elements where the 2704/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2705/// ZF flag is set to 1. \n 2706/// If there is at least one pair of single-precision elements where the 2707/// sign-bit of the first element is 0 and the sign-bit of the second element 2708/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2709/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2710/// otherwise it returns 0. 2711/// 2712/// \headerfile <x86intrin.h> 2713/// 2714/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 
2715/// 2716/// \param __a 2717/// A 128-bit vector of [4 x float]. 2718/// \param __b 2719/// A 128-bit vector of [4 x float]. 2720/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2721static __inline int __DEFAULT_FN_ATTRS 2722_mm_testnzc_ps(__m128 __a, __m128 __b) 2723{ 2724 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b); 2725} 2726 2727/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2728/// element-by-element comparison of the double-precision elements in the 2729/// first source vector and the corresponding elements in the second source 2730/// vector. 2731/// 2732/// The EFLAGS register is updated as follows: \n 2733/// If there is at least one pair of double-precision elements where the 2734/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2735/// ZF flag is set to 1. \n 2736/// If there is at least one pair of double-precision elements where the 2737/// sign-bit of the first element is 0 and the sign-bit of the second element 2738/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2739/// This intrinsic returns the value of the ZF flag. 2740/// 2741/// \headerfile <x86intrin.h> 2742/// 2743/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2744/// 2745/// \param __a 2746/// A 256-bit vector of [4 x double]. 2747/// \param __b 2748/// A 256-bit vector of [4 x double]. 2749/// \returns the ZF flag. 2750static __inline int __DEFAULT_FN_ATTRS 2751_mm256_testz_pd(__m256d __a, __m256d __b) 2752{ 2753 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b); 2754} 2755 2756/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2757/// element-by-element comparison of the double-precision elements in the 2758/// first source vector and the corresponding elements in the second source 2759/// vector. 
2760/// 2761/// The EFLAGS register is updated as follows: \n 2762/// If there is at least one pair of double-precision elements where the 2763/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2764/// ZF flag is set to 1. \n 2765/// If there is at least one pair of double-precision elements where the 2766/// sign-bit of the first element is 0 and the sign-bit of the second element 2767/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2768/// This intrinsic returns the value of the CF flag. 2769/// 2770/// \headerfile <x86intrin.h> 2771/// 2772/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2773/// 2774/// \param __a 2775/// A 256-bit vector of [4 x double]. 2776/// \param __b 2777/// A 256-bit vector of [4 x double]. 2778/// \returns the CF flag. 2779static __inline int __DEFAULT_FN_ATTRS 2780_mm256_testc_pd(__m256d __a, __m256d __b) 2781{ 2782 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b); 2783} 2784 2785/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2786/// element-by-element comparison of the double-precision elements in the 2787/// first source vector and the corresponding elements in the second source 2788/// vector. 2789/// 2790/// The EFLAGS register is updated as follows: \n 2791/// If there is at least one pair of double-precision elements where the 2792/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2793/// ZF flag is set to 1. \n 2794/// If there is at least one pair of double-precision elements where the 2795/// sign-bit of the first element is 0 and the sign-bit of the second element 2796/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2797/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2798/// otherwise it returns 0. 2799/// 2800/// \headerfile <x86intrin.h> 2801/// 2802/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 
2803/// 2804/// \param __a 2805/// A 256-bit vector of [4 x double]. 2806/// \param __b 2807/// A 256-bit vector of [4 x double]. 2808/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2809static __inline int __DEFAULT_FN_ATTRS 2810_mm256_testnzc_pd(__m256d __a, __m256d __b) 2811{ 2812 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b); 2813} 2814 2815/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2816/// element-by-element comparison of the single-precision element in the 2817/// first source vector and the corresponding element in the second source 2818/// vector. 2819/// 2820/// The EFLAGS register is updated as follows: \n 2821/// If there is at least one pair of single-precision elements where the 2822/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2823/// ZF flag is set to 1. \n 2824/// If there is at least one pair of single-precision elements where the 2825/// sign-bit of the first element is 0 and the sign-bit of the second element 2826/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2827/// This intrinsic returns the value of the ZF flag. 2828/// 2829/// \headerfile <x86intrin.h> 2830/// 2831/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2832/// 2833/// \param __a 2834/// A 256-bit vector of [8 x float]. 2835/// \param __b 2836/// A 256-bit vector of [8 x float]. 2837/// \returns the ZF flag. 2838static __inline int __DEFAULT_FN_ATTRS 2839_mm256_testz_ps(__m256 __a, __m256 __b) 2840{ 2841 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b); 2842} 2843 2844/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2845/// element-by-element comparison of the single-precision element in the 2846/// first source vector and the corresponding element in the second source 2847/// vector. 
2848/// 2849/// The EFLAGS register is updated as follows: \n 2850/// If there is at least one pair of single-precision elements where the 2851/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2852/// ZF flag is set to 1. \n 2853/// If there is at least one pair of single-precision elements where the 2854/// sign-bit of the first element is 0 and the sign-bit of the second element 2855/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2856/// This intrinsic returns the value of the CF flag. 2857/// 2858/// \headerfile <x86intrin.h> 2859/// 2860/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2861/// 2862/// \param __a 2863/// A 256-bit vector of [8 x float]. 2864/// \param __b 2865/// A 256-bit vector of [8 x float]. 2866/// \returns the CF flag. 2867static __inline int __DEFAULT_FN_ATTRS 2868_mm256_testc_ps(__m256 __a, __m256 __b) 2869{ 2870 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b); 2871} 2872 2873/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2874/// element-by-element comparison of the single-precision elements in the 2875/// first source vector and the corresponding elements in the second source 2876/// vector. 2877/// 2878/// The EFLAGS register is updated as follows: \n 2879/// If there is at least one pair of single-precision elements where the 2880/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2881/// ZF flag is set to 1. \n 2882/// If there is at least one pair of single-precision elements where the 2883/// sign-bit of the first element is 0 and the sign-bit of the second element 2884/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2885/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2886/// otherwise it returns 0. 2887/// 2888/// \headerfile <x86intrin.h> 2889/// 2890/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 
2891/// 2892/// \param __a 2893/// A 256-bit vector of [8 x float]. 2894/// \param __b 2895/// A 256-bit vector of [8 x float]. 2896/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2897static __inline int __DEFAULT_FN_ATTRS 2898_mm256_testnzc_ps(__m256 __a, __m256 __b) 2899{ 2900 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b); 2901} 2902 2903/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2904/// of the two source vectors. 2905/// 2906/// The EFLAGS register is updated as follows: \n 2907/// If there is at least one pair of bits where both bits are 1, the ZF flag 2908/// is set to 0. Otherwise the ZF flag is set to 1. \n 2909/// If there is at least one pair of bits where the bit from the first source 2910/// vector is 0 and the bit from the second source vector is 1, the CF flag 2911/// is set to 0. Otherwise the CF flag is set to 1. \n 2912/// This intrinsic returns the value of the ZF flag. 2913/// 2914/// \headerfile <x86intrin.h> 2915/// 2916/// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2917/// 2918/// \param __a 2919/// A 256-bit integer vector. 2920/// \param __b 2921/// A 256-bit integer vector. 2922/// \returns the ZF flag. 2923static __inline int __DEFAULT_FN_ATTRS 2924_mm256_testz_si256(__m256i __a, __m256i __b) 2925{ 2926 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b); 2927} 2928 2929/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2930/// of the two source vectors. 2931/// 2932/// The EFLAGS register is updated as follows: \n 2933/// If there is at least one pair of bits where both bits are 1, the ZF flag 2934/// is set to 0. Otherwise the ZF flag is set to 1. \n 2935/// If there is at least one pair of bits where the bit from the first source 2936/// vector is 0 and the bit from the second source vector is 1, the CF flag 2937/// is set to 0. Otherwise the CF flag is set to 1. 
\n 2938/// This intrinsic returns the value of the CF flag. 2939/// 2940/// \headerfile <x86intrin.h> 2941/// 2942/// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2943/// 2944/// \param __a 2945/// A 256-bit integer vector. 2946/// \param __b 2947/// A 256-bit integer vector. 2948/// \returns the CF flag. 2949static __inline int __DEFAULT_FN_ATTRS 2950_mm256_testc_si256(__m256i __a, __m256i __b) 2951{ 2952 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b); 2953} 2954 2955/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2956/// of the two source vectors. 2957/// 2958/// The EFLAGS register is updated as follows: \n 2959/// If there is at least one pair of bits where both bits are 1, the ZF flag 2960/// is set to 0. Otherwise the ZF flag is set to 1. \n 2961/// If there is at least one pair of bits where the bit from the first source 2962/// vector is 0 and the bit from the second source vector is 1, the CF flag 2963/// is set to 0. Otherwise the CF flag is set to 1. \n 2964/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2965/// otherwise it returns 0. 2966/// 2967/// \headerfile <x86intrin.h> 2968/// 2969/// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2970/// 2971/// \param __a 2972/// A 256-bit integer vector. 2973/// \param __b 2974/// A 256-bit integer vector. 2975/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2976static __inline int __DEFAULT_FN_ATTRS 2977_mm256_testnzc_si256(__m256i __a, __m256i __b) 2978{ 2979 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b); 2980} 2981 2982/* Vector extract sign mask */ 2983/// \brief Extracts the sign bits of double-precision floating point elements 2984/// in a 256-bit vector of [4 x double] and writes them to the lower order 2985/// bits of the return value. 2986/// 2987/// \headerfile <x86intrin.h> 2988/// 2989/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction. 
2990/// 2991/// \param __a 2992/// A 256-bit vector of [4 x double] containing the double-precision 2993/// floating point values with sign bits to be extracted. 2994/// \returns The sign bits from the operand, written to bits [3:0]. 2995static __inline int __DEFAULT_FN_ATTRS 2996_mm256_movemask_pd(__m256d __a) 2997{ 2998 return __builtin_ia32_movmskpd256((__v4df)__a); 2999} 3000 3001/// \brief Extracts the sign bits of double-precision floating point elements 3002/// in a 256-bit vector of [8 x float] and writes them to the lower order 3003/// bits of the return value. 3004/// 3005/// \headerfile <x86intrin.h> 3006/// 3007/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction. 3008/// 3009/// \param __a 3010/// A 256-bit vector of [8 x float] containing the double-precision floating 3011/// point values with sign bits to be extracted. 3012/// \returns The sign bits from the operand, written to bits [7:0]. 3013static __inline int __DEFAULT_FN_ATTRS 3014_mm256_movemask_ps(__m256 __a) 3015{ 3016 return __builtin_ia32_movmskps256((__v8sf)__a); 3017} 3018 3019/* Vector __zero */ 3020/// \brief Zeroes the contents of all XMM or YMM registers. 3021/// 3022/// \headerfile <x86intrin.h> 3023/// 3024/// This intrinsic corresponds to the <c> VZEROALL </c> instruction. 3025static __inline void __DEFAULT_FN_ATTRS 3026_mm256_zeroall(void) 3027{ 3028 __builtin_ia32_vzeroall(); 3029} 3030 3031/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers. 3032/// 3033/// \headerfile <x86intrin.h> 3034/// 3035/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction. 3036static __inline void __DEFAULT_FN_ATTRS 3037_mm256_zeroupper(void) 3038{ 3039 __builtin_ia32_vzeroupper(); 3040} 3041 3042/* Vector load with broadcast */ 3043/// \brief Loads a scalar single-precision floating point value from the 3044/// specified address pointed to by \a __a and broadcasts it to the elements 3045/// of a [4 x float] vector. 
3046/// 3047/// \headerfile <x86intrin.h> 3048/// 3049/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 3050/// 3051/// \param __a 3052/// The single-precision floating point value to be broadcast. 3053/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set 3054/// equal to the broadcast value. 3055static __inline __m128 __DEFAULT_FN_ATTRS 3056_mm_broadcast_ss(float const *__a) 3057{ 3058 float __f = *__a; 3059 return (__m128)(__v4sf){ __f, __f, __f, __f }; 3060} 3061 3062/// \brief Loads a scalar double-precision floating point value from the 3063/// specified address pointed to by \a __a and broadcasts it to the elements 3064/// of a [4 x double] vector. 3065/// 3066/// \headerfile <x86intrin.h> 3067/// 3068/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction. 3069/// 3070/// \param __a 3071/// The double-precision floating point value to be broadcast. 3072/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set 3073/// equal to the broadcast value. 3074static __inline __m256d __DEFAULT_FN_ATTRS 3075_mm256_broadcast_sd(double const *__a) 3076{ 3077 double __d = *__a; 3078 return (__m256d)(__v4df){ __d, __d, __d, __d }; 3079} 3080 3081/// \brief Loads a scalar single-precision floating point value from the 3082/// specified address pointed to by \a __a and broadcasts it to the elements 3083/// of a [8 x float] vector. 3084/// 3085/// \headerfile <x86intrin.h> 3086/// 3087/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 3088/// 3089/// \param __a 3090/// The single-precision floating point value to be broadcast. 3091/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set 3092/// equal to the broadcast value. 
3093static __inline __m256 __DEFAULT_FN_ATTRS 3094_mm256_broadcast_ss(float const *__a) 3095{ 3096 float __f = *__a; 3097 return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f }; 3098} 3099 3100/// \brief Loads the data from a 128-bit vector of [2 x double] from the 3101/// specified address pointed to by \a __a and broadcasts it to 128-bit 3102/// elements in a 256-bit vector of [4 x double]. 3103/// 3104/// \headerfile <x86intrin.h> 3105/// 3106/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 3107/// 3108/// \param __a 3109/// The 128-bit vector of [2 x double] to be broadcast. 3110/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set 3111/// equal to the broadcast value. 3112static __inline __m256d __DEFAULT_FN_ATTRS 3113_mm256_broadcast_pd(__m128d const *__a) 3114{ 3115 return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a); 3116} 3117 3118/// \brief Loads the data from a 128-bit vector of [4 x float] from the 3119/// specified address pointed to by \a __a and broadcasts it to 128-bit 3120/// elements in a 256-bit vector of [8 x float]. 3121/// 3122/// \headerfile <x86intrin.h> 3123/// 3124/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 3125/// 3126/// \param __a 3127/// The 128-bit vector of [4 x float] to be broadcast. 3128/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set 3129/// equal to the broadcast value. 3130static __inline __m256 __DEFAULT_FN_ATTRS 3131_mm256_broadcast_ps(__m128 const *__a) 3132{ 3133 return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a); 3134} 3135 3136/* SIMD load ops */ 3137/// \brief Loads 4 double-precision floating point values from a 32-byte aligned 3138/// memory location pointed to by \a __p into a vector of [4 x double]. 3139/// 3140/// \headerfile <x86intrin.h> 3141/// 3142/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 
3143/// 3144/// \param __p 3145/// A 32-byte aligned pointer to a memory location containing 3146/// double-precision floating point values. 3147/// \returns A 256-bit vector of [4 x double] containing the moved values. 3148static __inline __m256d __DEFAULT_FN_ATTRS 3149_mm256_load_pd(double const *__p) 3150{ 3151 return *(__m256d *)__p; 3152} 3153 3154/// \brief Loads 8 single-precision floating point values from a 32-byte aligned 3155/// memory location pointed to by \a __p into a vector of [8 x float]. 3156/// 3157/// \headerfile <x86intrin.h> 3158/// 3159/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 3160/// 3161/// \param __p 3162/// A 32-byte aligned pointer to a memory location containing float values. 3163/// \returns A 256-bit vector of [8 x float] containing the moved values. 3164static __inline __m256 __DEFAULT_FN_ATTRS 3165_mm256_load_ps(float const *__p) 3166{ 3167 return *(__m256 *)__p; 3168} 3169 3170/// \brief Loads 4 double-precision floating point values from an unaligned 3171/// memory location pointed to by \a __p into a vector of [4 x double]. 3172/// 3173/// \headerfile <x86intrin.h> 3174/// 3175/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 3176/// 3177/// \param __p 3178/// A pointer to a memory location containing double-precision floating 3179/// point values. 3180/// \returns A 256-bit vector of [4 x double] containing the moved values. 3181static __inline __m256d __DEFAULT_FN_ATTRS 3182_mm256_loadu_pd(double const *__p) 3183{ 3184 struct __loadu_pd { 3185 __m256d __v; 3186 } __attribute__((__packed__, __may_alias__)); 3187 return ((struct __loadu_pd*)__p)->__v; 3188} 3189 3190/// \brief Loads 8 single-precision floating point values from an unaligned 3191/// memory location pointed to by \a __p into a vector of [8 x float]. 3192/// 3193/// \headerfile <x86intrin.h> 3194/// 3195/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 
3196/// 3197/// \param __p 3198/// A pointer to a memory location containing single-precision floating 3199/// point values. 3200/// \returns A 256-bit vector of [8 x float] containing the moved values. 3201static __inline __m256 __DEFAULT_FN_ATTRS 3202_mm256_loadu_ps(float const *__p) 3203{ 3204 struct __loadu_ps { 3205 __m256 __v; 3206 } __attribute__((__packed__, __may_alias__)); 3207 return ((struct __loadu_ps*)__p)->__v; 3208} 3209 3210/// \brief Loads 256 bits of integer data from a 32-byte aligned memory 3211/// location pointed to by \a __p into elements of a 256-bit integer vector. 3212/// 3213/// \headerfile <x86intrin.h> 3214/// 3215/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3216/// 3217/// \param __p 3218/// A 32-byte aligned pointer to a 256-bit integer vector containing integer 3219/// values. 3220/// \returns A 256-bit integer vector containing the moved values. 3221static __inline __m256i __DEFAULT_FN_ATTRS 3222_mm256_load_si256(__m256i const *__p) 3223{ 3224 return *__p; 3225} 3226 3227/// \brief Loads 256 bits of integer data from an unaligned memory location 3228/// pointed to by \a __p into a 256-bit integer vector. 3229/// 3230/// \headerfile <x86intrin.h> 3231/// 3232/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 3233/// 3234/// \param __p 3235/// A pointer to a 256-bit integer vector containing integer values. 3236/// \returns A 256-bit integer vector containing the moved values. 3237static __inline __m256i __DEFAULT_FN_ATTRS 3238_mm256_loadu_si256(__m256i const *__p) 3239{ 3240 struct __loadu_si256 { 3241 __m256i __v; 3242 } __attribute__((__packed__, __may_alias__)); 3243 return ((struct __loadu_si256*)__p)->__v; 3244} 3245 3246/// \brief Loads 256 bits of integer data from an unaligned memory location 3247/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may 3248/// perform better than \c _mm256_loadu_si256 when the data crosses a cache 3249/// line boundary. 
3250/// 3251/// \headerfile <x86intrin.h> 3252/// 3253/// This intrinsic corresponds to the <c> VLDDQU </c> instruction. 3254/// 3255/// \param __p 3256/// A pointer to a 256-bit integer vector containing integer values. 3257/// \returns A 256-bit integer vector containing the moved values. 3258static __inline __m256i __DEFAULT_FN_ATTRS 3259_mm256_lddqu_si256(__m256i const *__p) 3260{ 3261 return (__m256i)__builtin_ia32_lddqu256((char const *)__p); 3262} 3263 3264/* SIMD store ops */ 3265/// \brief Stores double-precision floating point values from a 256-bit vector 3266/// of [4 x double] to a 32-byte aligned memory location pointed to by 3267/// \a __p. 3268/// 3269/// \headerfile <x86intrin.h> 3270/// 3271/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 3272/// 3273/// \param __p 3274/// A 32-byte aligned pointer to a memory location that will receive the 3275/// double-precision floaing point values. 3276/// \param __a 3277/// A 256-bit vector of [4 x double] containing the values to be moved. 3278static __inline void __DEFAULT_FN_ATTRS 3279_mm256_store_pd(double *__p, __m256d __a) 3280{ 3281 *(__m256d *)__p = __a; 3282} 3283 3284/// \brief Stores single-precision floating point values from a 256-bit vector 3285/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p. 3286/// 3287/// \headerfile <x86intrin.h> 3288/// 3289/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 3290/// 3291/// \param __p 3292/// A 32-byte aligned pointer to a memory location that will receive the 3293/// float values. 3294/// \param __a 3295/// A 256-bit vector of [8 x float] containing the values to be moved. 3296static __inline void __DEFAULT_FN_ATTRS 3297_mm256_store_ps(float *__p, __m256 __a) 3298{ 3299 *(__m256 *)__p = __a; 3300} 3301 3302/// \brief Stores double-precision floating point values from a 256-bit vector 3303/// of [4 x double] to an unaligned memory location pointed to by \a __p. 
3304/// 3305/// \headerfile <x86intrin.h> 3306/// 3307/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 3308/// 3309/// \param __p 3310/// A pointer to a memory location that will receive the double-precision 3311/// floating point values. 3312/// \param __a 3313/// A 256-bit vector of [4 x double] containing the values to be moved. 3314static __inline void __DEFAULT_FN_ATTRS 3315_mm256_storeu_pd(double *__p, __m256d __a) 3316{ 3317 struct __storeu_pd { 3318 __m256d __v; 3319 } __attribute__((__packed__, __may_alias__)); 3320 ((struct __storeu_pd*)__p)->__v = __a; 3321} 3322 3323/// \brief Stores single-precision floating point values from a 256-bit vector 3324/// of [8 x float] to an unaligned memory location pointed to by \a __p. 3325/// 3326/// \headerfile <x86intrin.h> 3327/// 3328/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 3329/// 3330/// \param __p 3331/// A pointer to a memory location that will receive the float values. 3332/// \param __a 3333/// A 256-bit vector of [8 x float] containing the values to be moved. 3334static __inline void __DEFAULT_FN_ATTRS 3335_mm256_storeu_ps(float *__p, __m256 __a) 3336{ 3337 struct __storeu_ps { 3338 __m256 __v; 3339 } __attribute__((__packed__, __may_alias__)); 3340 ((struct __storeu_ps*)__p)->__v = __a; 3341} 3342 3343/// \brief Stores integer values from a 256-bit integer vector to a 32-byte 3344/// aligned memory location pointed to by \a __p. 3345/// 3346/// \headerfile <x86intrin.h> 3347/// 3348/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3349/// 3350/// \param __p 3351/// A 32-byte aligned pointer to a memory location that will receive the 3352/// integer values. 3353/// \param __a 3354/// A 256-bit integer vector containing the values to be moved. 
3355static __inline void __DEFAULT_FN_ATTRS 3356_mm256_store_si256(__m256i *__p, __m256i __a) 3357{ 3358 *__p = __a; 3359} 3360 3361/// \brief Stores integer values from a 256-bit integer vector to an unaligned 3362/// memory location pointed to by \a __p. 3363/// 3364/// \headerfile <x86intrin.h> 3365/// 3366/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 3367/// 3368/// \param __p 3369/// A pointer to a memory location that will receive the integer values. 3370/// \param __a 3371/// A 256-bit integer vector containing the values to be moved. 3372static __inline void __DEFAULT_FN_ATTRS 3373_mm256_storeu_si256(__m256i *__p, __m256i __a) 3374{ 3375 struct __storeu_si256 { 3376 __m256i __v; 3377 } __attribute__((__packed__, __may_alias__)); 3378 ((struct __storeu_si256*)__p)->__v = __a; 3379} 3380 3381/* Conditional load ops */ 3382/// \brief Conditionally loads double-precision floating point elements from a 3383/// memory location pointed to by \a __p into a 128-bit vector of 3384/// [2 x double], depending on the mask bits associated with each data 3385/// element. 3386/// 3387/// \headerfile <x86intrin.h> 3388/// 3389/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3390/// 3391/// \param __p 3392/// A pointer to a memory location that contains the double-precision 3393/// floating point values. 3394/// \param __m 3395/// A 128-bit integer vector containing the mask. The most significant bit of 3396/// each data element represents the mask bits. If a mask bit is zero, the 3397/// corresponding value in the memory location is not loaded and the 3398/// corresponding field in the return value is set to zero. 3399/// \returns A 128-bit vector of [2 x double] containing the loaded values. 
3400static __inline __m128d __DEFAULT_FN_ATTRS 3401_mm_maskload_pd(double const *__p, __m128i __m) 3402{ 3403 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m); 3404} 3405 3406/// \brief Conditionally loads double-precision floating point elements from a 3407/// memory location pointed to by \a __p into a 256-bit vector of 3408/// [4 x double], depending on the mask bits associated with each data 3409/// element. 3410/// 3411/// \headerfile <x86intrin.h> 3412/// 3413/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3414/// 3415/// \param __p 3416/// A pointer to a memory location that contains the double-precision 3417/// floating point values. 3418/// \param __m 3419/// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3420/// significant bit of each quadword element represents the mask bits. If a 3421/// mask bit is zero, the corresponding value in the memory location is not 3422/// loaded and the corresponding field in the return value is set to zero. 3423/// \returns A 256-bit vector of [4 x double] containing the loaded values. 3424static __inline __m256d __DEFAULT_FN_ATTRS 3425_mm256_maskload_pd(double const *__p, __m256i __m) 3426{ 3427 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p, 3428 (__v4di)__m); 3429} 3430 3431/// \brief Conditionally loads single-precision floating point elements from a 3432/// memory location pointed to by \a __p into a 128-bit vector of 3433/// [4 x float], depending on the mask bits associated with each data 3434/// element. 3435/// 3436/// \headerfile <x86intrin.h> 3437/// 3438/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3439/// 3440/// \param __p 3441/// A pointer to a memory location that contains the single-precision 3442/// floating point values. 3443/// \param __m 3444/// A 128-bit integer vector containing the mask. The most significant bit of 3445/// each data element represents the mask bits. 
If a mask bit is zero, the 3446/// corresponding value in the memory location is not loaded and the 3447/// corresponding field in the return value is set to zero. 3448/// \returns A 128-bit vector of [4 x float] containing the loaded values. 3449static __inline __m128 __DEFAULT_FN_ATTRS 3450_mm_maskload_ps(float const *__p, __m128i __m) 3451{ 3452 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m); 3453} 3454 3455/// \brief Conditionally loads single-precision floating point elements from a 3456/// memory location pointed to by \a __p into a 256-bit vector of 3457/// [8 x float], depending on the mask bits associated with each data 3458/// element. 3459/// 3460/// \headerfile <x86intrin.h> 3461/// 3462/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3463/// 3464/// \param __p 3465/// A pointer to a memory location that contains the single-precision 3466/// floating point values. 3467/// \param __m 3468/// A 256-bit integer vector of [8 x dword] containing the mask. The most 3469/// significant bit of each dword element represents the mask bits. If a mask 3470/// bit is zero, the corresponding value in the memory location is not loaded 3471/// and the corresponding field in the return value is set to zero. 3472/// \returns A 256-bit vector of [8 x float] containing the loaded values. 3473static __inline __m256 __DEFAULT_FN_ATTRS 3474_mm256_maskload_ps(float const *__p, __m256i __m) 3475{ 3476 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m); 3477} 3478 3479/* Conditional store ops */ 3480/// \brief Moves single-precision floating point values from a 256-bit vector 3481/// of [8 x float] to a memory location pointed to by \a __p, according to 3482/// the specified mask. 3483/// 3484/// \headerfile <x86intrin.h> 3485/// 3486/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3487/// 3488/// \param __p 3489/// A pointer to a memory location that will receive the float values. 
3490/// \param __m 3491/// A 256-bit integer vector of [8 x dword] containing the mask. The most 3492/// significant bit of each dword element in the mask vector represents the 3493/// mask bits. If a mask bit is zero, the corresponding value from vector 3494/// \a __a is not stored and the corresponding field in the memory location 3495/// pointed to by \a __p is not changed. 3496/// \param __a 3497/// A 256-bit vector of [8 x float] containing the values to be stored. 3498static __inline void __DEFAULT_FN_ATTRS 3499_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a) 3500{ 3501 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a); 3502} 3503 3504/// \brief Moves double-precision values from a 128-bit vector of [2 x double] 3505/// to a memory location pointed to by \a __p, according to the specified 3506/// mask. 3507/// 3508/// \headerfile <x86intrin.h> 3509/// 3510/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3511/// 3512/// \param __p 3513/// A pointer to a memory location that will receive the float values. 3514/// \param __m 3515/// A 128-bit integer vector containing the mask. The most significant bit of 3516/// each field in the mask vector represents the mask bits. If a mask bit is 3517/// zero, the corresponding value from vector \a __a is not stored and the 3518/// corresponding field in the memory location pointed to by \a __p is not 3519/// changed. 3520/// \param __a 3521/// A 128-bit vector of [2 x double] containing the values to be stored. 3522static __inline void __DEFAULT_FN_ATTRS 3523_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a) 3524{ 3525 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a); 3526} 3527 3528/// \brief Moves double-precision values from a 256-bit vector of [4 x double] 3529/// to a memory location pointed to by \a __p, according to the specified 3530/// mask. 
3531/// 3532/// \headerfile <x86intrin.h> 3533/// 3534/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3535/// 3536/// \param __p 3537/// A pointer to a memory location that will receive the float values. 3538/// \param __m 3539/// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3540/// significant bit of each quadword element in the mask vector represents 3541/// the mask bits. If a mask bit is zero, the corresponding value from vector 3542/// __a is not stored and the corresponding field in the memory location 3543/// pointed to by \a __p is not changed. 3544/// \param __a 3545/// A 256-bit vector of [4 x double] containing the values to be stored. 3546static __inline void __DEFAULT_FN_ATTRS 3547_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a) 3548{ 3549 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a); 3550} 3551 3552/// \brief Moves single-precision floating point values from a 128-bit vector 3553/// of [4 x float] to a memory location pointed to by \a __p, according to 3554/// the specified mask. 3555/// 3556/// \headerfile <x86intrin.h> 3557/// 3558/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3559/// 3560/// \param __p 3561/// A pointer to a memory location that will receive the float values. 3562/// \param __m 3563/// A 128-bit integer vector containing the mask. The most significant bit of 3564/// each field in the mask vector represents the mask bits. If a mask bit is 3565/// zero, the corresponding value from vector __a is not stored and the 3566/// corresponding field in the memory location pointed to by \a __p is not 3567/// changed. 3568/// \param __a 3569/// A 128-bit vector of [4 x float] containing the values to be stored. 
3570static __inline void __DEFAULT_FN_ATTRS 3571_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) 3572{ 3573 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a); 3574} 3575 3576/* Cacheability support ops */ 3577/// \brief Moves integer data from a 256-bit integer vector to a 32-byte 3578/// aligned memory location. To minimize caching, the data is flagged as 3579/// non-temporal (unlikely to be used again soon). 3580/// 3581/// \headerfile <x86intrin.h> 3582/// 3583/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction. 3584/// 3585/// \param __a 3586/// A pointer to a 32-byte aligned memory location that will receive the 3587/// integer values. 3588/// \param __b 3589/// A 256-bit integer vector containing the values to be moved. 3590static __inline void __DEFAULT_FN_ATTRS 3591_mm256_stream_si256(__m256i *__a, __m256i __b) 3592{ 3593 typedef __v4di __v4di_aligned __attribute__((aligned(32))); 3594 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a); 3595} 3596 3597/// \brief Moves double-precision values from a 256-bit vector of [4 x double] 3598/// to a 32-byte aligned memory location. To minimize caching, the data is 3599/// flagged as non-temporal (unlikely to be used again soon). 3600/// 3601/// \headerfile <x86intrin.h> 3602/// 3603/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction. 3604/// 3605/// \param __a 3606/// A pointer to a 32-byte aligned memory location that will receive the 3607/// double-precision floating-point values. 3608/// \param __b 3609/// A 256-bit vector of [4 x double] containing the values to be moved. 
3610static __inline void __DEFAULT_FN_ATTRS 3611_mm256_stream_pd(double *__a, __m256d __b) 3612{ 3613 typedef __v4df __v4df_aligned __attribute__((aligned(32))); 3614 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a); 3615} 3616 3617/// \brief Moves single-precision floating point values from a 256-bit vector 3618/// of [8 x float] to a 32-byte aligned memory location. To minimize 3619/// caching, the data is flagged as non-temporal (unlikely to be used again 3620/// soon). 3621/// 3622/// \headerfile <x86intrin.h> 3623/// 3624/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction. 3625/// 3626/// \param __p 3627/// A pointer to a 32-byte aligned memory location that will receive the 3628/// single-precision floating point values. 3629/// \param __a 3630/// A 256-bit vector of [8 x float] containing the values to be moved. 3631static __inline void __DEFAULT_FN_ATTRS 3632_mm256_stream_ps(float *__p, __m256 __a) 3633{ 3634 typedef __v8sf __v8sf_aligned __attribute__((aligned(32))); 3635 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p); 3636} 3637 3638/* Create vectors */ 3639/// \brief Create a 256-bit vector of [4 x double] with undefined values. 3640/// 3641/// \headerfile <x86intrin.h> 3642/// 3643/// This intrinsic has no corresponding instruction. 3644/// 3645/// \returns A 256-bit vector of [4 x double] containing undefined values. 3646static __inline__ __m256d __DEFAULT_FN_ATTRS 3647_mm256_undefined_pd(void) 3648{ 3649 return (__m256d)__builtin_ia32_undef256(); 3650} 3651 3652/// \brief Create a 256-bit vector of [8 x float] with undefined values. 3653/// 3654/// \headerfile <x86intrin.h> 3655/// 3656/// This intrinsic has no corresponding instruction. 3657/// 3658/// \returns A 256-bit vector of [8 x float] containing undefined values. 
3659static __inline__ __m256 __DEFAULT_FN_ATTRS 3660_mm256_undefined_ps(void) 3661{ 3662 return (__m256)__builtin_ia32_undef256(); 3663} 3664 3665/// \brief Create a 256-bit integer vector with undefined values. 3666/// 3667/// \headerfile <x86intrin.h> 3668/// 3669/// This intrinsic has no corresponding instruction. 3670/// 3671/// \returns A 256-bit integer vector containing undefined values. 3672static __inline__ __m256i __DEFAULT_FN_ATTRS 3673_mm256_undefined_si256(void) 3674{ 3675 return (__m256i)__builtin_ia32_undef256(); 3676} 3677 3678/// \brief Constructs a 256-bit floating-point vector of [4 x double] 3679/// initialized with the specified double-precision floating-point values. 3680/// 3681/// \headerfile <x86intrin.h> 3682/// 3683/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3684/// instruction. 3685/// 3686/// \param __a 3687/// A double-precision floating-point value used to initialize bits [255:192] 3688/// of the result. 3689/// \param __b 3690/// A double-precision floating-point value used to initialize bits [191:128] 3691/// of the result. 3692/// \param __c 3693/// A double-precision floating-point value used to initialize bits [127:64] 3694/// of the result. 3695/// \param __d 3696/// A double-precision floating-point value used to initialize bits [63:0] 3697/// of the result. 3698/// \returns An initialized 256-bit floating-point vector of [4 x double]. 3699static __inline __m256d __DEFAULT_FN_ATTRS 3700_mm256_set_pd(double __a, double __b, double __c, double __d) 3701{ 3702 return (__m256d){ __d, __c, __b, __a }; 3703} 3704 3705/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized 3706/// with the specified single-precision floating-point values. 3707/// 3708/// \headerfile <x86intrin.h> 3709/// 3710/// This intrinsic is a utility function and does not correspond to a specific 3711/// instruction. 
3712/// 3713/// \param __a 3714/// A single-precision floating-point value used to initialize bits [255:224] 3715/// of the result. 3716/// \param __b 3717/// A single-precision floating-point value used to initialize bits [223:192] 3718/// of the result. 3719/// \param __c 3720/// A single-precision floating-point value used to initialize bits [191:160] 3721/// of the result. 3722/// \param __d 3723/// A single-precision floating-point value used to initialize bits [159:128] 3724/// of the result. 3725/// \param __e 3726/// A single-precision floating-point value used to initialize bits [127:96] 3727/// of the result. 3728/// \param __f 3729/// A single-precision floating-point value used to initialize bits [95:64] 3730/// of the result. 3731/// \param __g 3732/// A single-precision floating-point value used to initialize bits [63:32] 3733/// of the result. 3734/// \param __h 3735/// A single-precision floating-point value used to initialize bits [31:0] 3736/// of the result. 3737/// \returns An initialized 256-bit floating-point vector of [8 x float]. 3738static __inline __m256 __DEFAULT_FN_ATTRS 3739_mm256_set_ps(float __a, float __b, float __c, float __d, 3740 float __e, float __f, float __g, float __h) 3741{ 3742 return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; 3743} 3744 3745/// \brief Constructs a 256-bit integer vector initialized with the specified 3746/// 32-bit integral values. 3747/// 3748/// \headerfile <x86intrin.h> 3749/// 3750/// This intrinsic is a utility function and does not correspond to a specific 3751/// instruction. 3752/// 3753/// \param __i0 3754/// A 32-bit integral value used to initialize bits [255:224] of the result. 3755/// \param __i1 3756/// A 32-bit integral value used to initialize bits [223:192] of the result. 3757/// \param __i2 3758/// A 32-bit integral value used to initialize bits [191:160] of the result. 3759/// \param __i3 3760/// A 32-bit integral value used to initialize bits [159:128] of the result. 
3761/// \param __i4 3762/// A 32-bit integral value used to initialize bits [127:96] of the result. 3763/// \param __i5 3764/// A 32-bit integral value used to initialize bits [95:64] of the result. 3765/// \param __i6 3766/// A 32-bit integral value used to initialize bits [63:32] of the result. 3767/// \param __i7 3768/// A 32-bit integral value used to initialize bits [31:0] of the result. 3769/// \returns An initialized 256-bit integer vector. 3770static __inline __m256i __DEFAULT_FN_ATTRS 3771_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, 3772 int __i4, int __i5, int __i6, int __i7) 3773{ 3774 return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; 3775} 3776 3777/// \brief Constructs a 256-bit integer vector initialized with the specified 3778/// 16-bit integral values. 3779/// 3780/// \headerfile <x86intrin.h> 3781/// 3782/// This intrinsic is a utility function and does not correspond to a specific 3783/// instruction. 3784/// 3785/// \param __w15 3786/// A 16-bit integral value used to initialize bits [255:240] of the result. 3787/// \param __w14 3788/// A 16-bit integral value used to initialize bits [239:224] of the result. 3789/// \param __w13 3790/// A 16-bit integral value used to initialize bits [223:208] of the result. 3791/// \param __w12 3792/// A 16-bit integral value used to initialize bits [207:192] of the result. 3793/// \param __w11 3794/// A 16-bit integral value used to initialize bits [191:176] of the result. 3795/// \param __w10 3796/// A 16-bit integral value used to initialize bits [175:160] of the result. 3797/// \param __w09 3798/// A 16-bit integral value used to initialize bits [159:144] of the result. 3799/// \param __w08 3800/// A 16-bit integral value used to initialize bits [143:128] of the result. 3801/// \param __w07 3802/// A 16-bit integral value used to initialize bits [127:112] of the result. 3803/// \param __w06 3804/// A 16-bit integral value used to initialize bits [111:96] of the result. 
3805/// \param __w05 3806/// A 16-bit integral value used to initialize bits [95:80] of the result. 3807/// \param __w04 3808/// A 16-bit integral value used to initialize bits [79:64] of the result. 3809/// \param __w03 3810/// A 16-bit integral value used to initialize bits [63:48] of the result. 3811/// \param __w02 3812/// A 16-bit integral value used to initialize bits [47:32] of the result. 3813/// \param __w01 3814/// A 16-bit integral value used to initialize bits [31:16] of the result. 3815/// \param __w00 3816/// A 16-bit integral value used to initialize bits [15:0] of the result. 3817/// \returns An initialized 256-bit integer vector. 3818static __inline __m256i __DEFAULT_FN_ATTRS 3819_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, 3820 short __w11, short __w10, short __w09, short __w08, 3821 short __w07, short __w06, short __w05, short __w04, 3822 short __w03, short __w02, short __w01, short __w00) 3823{ 3824 return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06, 3825 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 }; 3826} 3827 3828/// \brief Constructs a 256-bit integer vector initialized with the specified 3829/// 8-bit integral values. 3830/// 3831/// \headerfile <x86intrin.h> 3832/// 3833/// This intrinsic is a utility function and does not correspond to a specific 3834/// instruction. 3835/// 3836/// \param __b31 3837/// An 8-bit integral value used to initialize bits [255:248] of the result. 3838/// \param __b30 3839/// An 8-bit integral value used to initialize bits [247:240] of the result. 3840/// \param __b29 3841/// An 8-bit integral value used to initialize bits [239:232] of the result. 3842/// \param __b28 3843/// An 8-bit integral value used to initialize bits [231:224] of the result. 3844/// \param __b27 3845/// An 8-bit integral value used to initialize bits [223:216] of the result. 
3846/// \param __b26 3847/// An 8-bit integral value used to initialize bits [215:208] of the result. 3848/// \param __b25 3849/// An 8-bit integral value used to initialize bits [207:200] of the result. 3850/// \param __b24 3851/// An 8-bit integral value used to initialize bits [199:192] of the result. 3852/// \param __b23 3853/// An 8-bit integral value used to initialize bits [191:184] of the result. 3854/// \param __b22 3855/// An 8-bit integral value used to initialize bits [183:176] of the result. 3856/// \param __b21 3857/// An 8-bit integral value used to initialize bits [175:168] of the result. 3858/// \param __b20 3859/// An 8-bit integral value used to initialize bits [167:160] of the result. 3860/// \param __b19 3861/// An 8-bit integral value used to initialize bits [159:152] of the result. 3862/// \param __b18 3863/// An 8-bit integral value used to initialize bits [151:144] of the result. 3864/// \param __b17 3865/// An 8-bit integral value used to initialize bits [143:136] of the result. 3866/// \param __b16 3867/// An 8-bit integral value used to initialize bits [135:128] of the result. 3868/// \param __b15 3869/// An 8-bit integral value used to initialize bits [127:120] of the result. 3870/// \param __b14 3871/// An 8-bit integral value used to initialize bits [119:112] of the result. 3872/// \param __b13 3873/// An 8-bit integral value used to initialize bits [111:104] of the result. 3874/// \param __b12 3875/// An 8-bit integral value used to initialize bits [103:96] of the result. 3876/// \param __b11 3877/// An 8-bit integral value used to initialize bits [95:88] of the result. 3878/// \param __b10 3879/// An 8-bit integral value used to initialize bits [87:80] of the result. 3880/// \param __b09 3881/// An 8-bit integral value used to initialize bits [79:72] of the result. 3882/// \param __b08 3883/// An 8-bit integral value used to initialize bits [71:64] of the result. 
3884/// \param __b07 3885/// An 8-bit integral value used to initialize bits [63:56] of the result. 3886/// \param __b06 3887/// An 8-bit integral value used to initialize bits [55:48] of the result. 3888/// \param __b05 3889/// An 8-bit integral value used to initialize bits [47:40] of the result. 3890/// \param __b04 3891/// An 8-bit integral value used to initialize bits [39:32] of the result. 3892/// \param __b03 3893/// An 8-bit integral value used to initialize bits [31:24] of the result. 3894/// \param __b02 3895/// An 8-bit integral value used to initialize bits [23:16] of the result. 3896/// \param __b01 3897/// An 8-bit integral value used to initialize bits [15:8] of the result. 3898/// \param __b00 3899/// An 8-bit integral value used to initialize bits [7:0] of the result. 3900/// \returns An initialized 256-bit integer vector. 3901static __inline __m256i __DEFAULT_FN_ATTRS 3902_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, 3903 char __b27, char __b26, char __b25, char __b24, 3904 char __b23, char __b22, char __b21, char __b20, 3905 char __b19, char __b18, char __b17, char __b16, 3906 char __b15, char __b14, char __b13, char __b12, 3907 char __b11, char __b10, char __b09, char __b08, 3908 char __b07, char __b06, char __b05, char __b04, 3909 char __b03, char __b02, char __b01, char __b00) 3910{ 3911 return (__m256i)(__v32qi){ 3912 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, 3913 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, 3914 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, 3915 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31 3916 }; 3917} 3918 3919/// \brief Constructs a 256-bit integer vector initialized with the specified 3920/// 64-bit integral values. 3921/// 3922/// \headerfile <x86intrin.h> 3923/// 3924/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> 3925/// instruction. 
3926/// 3927/// \param __a 3928/// A 64-bit integral value used to initialize bits [255:192] of the result. 3929/// \param __b 3930/// A 64-bit integral value used to initialize bits [191:128] of the result. 3931/// \param __c 3932/// A 64-bit integral value used to initialize bits [127:64] of the result. 3933/// \param __d 3934/// A 64-bit integral value used to initialize bits [63:0] of the result. 3935/// \returns An initialized 256-bit integer vector. 3936static __inline __m256i __DEFAULT_FN_ATTRS 3937_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) 3938{ 3939 return (__m256i)(__v4di){ __d, __c, __b, __a }; 3940} 3941 3942/* Create vectors with elements in reverse order */ 3943/// \brief Constructs a 256-bit floating-point vector of [4 x double], 3944/// initialized in reverse order with the specified double-precision 3945/// floating-point values. 3946/// 3947/// \headerfile <x86intrin.h> 3948/// 3949/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3950/// instruction. 3951/// 3952/// \param __a 3953/// A double-precision floating-point value used to initialize bits [63:0] 3954/// of the result. 3955/// \param __b 3956/// A double-precision floating-point value used to initialize bits [127:64] 3957/// of the result. 3958/// \param __c 3959/// A double-precision floating-point value used to initialize bits [191:128] 3960/// of the result. 3961/// \param __d 3962/// A double-precision floating-point value used to initialize bits [255:192] 3963/// of the result. 3964/// \returns An initialized 256-bit floating-point vector of [4 x double]. 3965static __inline __m256d __DEFAULT_FN_ATTRS 3966_mm256_setr_pd(double __a, double __b, double __c, double __d) 3967{ 3968 return (__m256d){ __a, __b, __c, __d }; 3969} 3970 3971/// \brief Constructs a 256-bit floating-point vector of [8 x float], 3972/// initialized in reverse order with the specified single-precision 3973/// float-point values. 
3974/// 3975/// \headerfile <x86intrin.h> 3976/// 3977/// This intrinsic is a utility function and does not correspond to a specific 3978/// instruction. 3979/// 3980/// \param __a 3981/// A single-precision floating-point value used to initialize bits [31:0] 3982/// of the result. 3983/// \param __b 3984/// A single-precision floating-point value used to initialize bits [63:32] 3985/// of the result. 3986/// \param __c 3987/// A single-precision floating-point value used to initialize bits [95:64] 3988/// of the result. 3989/// \param __d 3990/// A single-precision floating-point value used to initialize bits [127:96] 3991/// of the result. 3992/// \param __e 3993/// A single-precision floating-point value used to initialize bits [159:128] 3994/// of the result. 3995/// \param __f 3996/// A single-precision floating-point value used to initialize bits [191:160] 3997/// of the result. 3998/// \param __g 3999/// A single-precision floating-point value used to initialize bits [223:192] 4000/// of the result. 4001/// \param __h 4002/// A single-precision floating-point value used to initialize bits [255:224] 4003/// of the result. 4004/// \returns An initialized 256-bit floating-point vector of [8 x float]. 4005static __inline __m256 __DEFAULT_FN_ATTRS 4006_mm256_setr_ps(float __a, float __b, float __c, float __d, 4007 float __e, float __f, float __g, float __h) 4008{ 4009 return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h }; 4010} 4011 4012/// \brief Constructs a 256-bit integer vector, initialized in reverse order 4013/// with the specified 32-bit integral values. 4014/// 4015/// \headerfile <x86intrin.h> 4016/// 4017/// This intrinsic is a utility function and does not correspond to a specific 4018/// instruction. 4019/// 4020/// \param __i0 4021/// A 32-bit integral value used to initialize bits [31:0] of the result. 4022/// \param __i1 4023/// A 32-bit integral value used to initialize bits [63:32] of the result. 
4024/// \param __i2 4025/// A 32-bit integral value used to initialize bits [95:64] of the result. 4026/// \param __i3 4027/// A 32-bit integral value used to initialize bits [127:96] of the result. 4028/// \param __i4 4029/// A 32-bit integral value used to initialize bits [159:128] of the result. 4030/// \param __i5 4031/// A 32-bit integral value used to initialize bits [191:160] of the result. 4032/// \param __i6 4033/// A 32-bit integral value used to initialize bits [223:192] of the result. 4034/// \param __i7 4035/// A 32-bit integral value used to initialize bits [255:224] of the result. 4036/// \returns An initialized 256-bit integer vector. 4037static __inline __m256i __DEFAULT_FN_ATTRS 4038_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, 4039 int __i4, int __i5, int __i6, int __i7) 4040{ 4041 return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 }; 4042} 4043 4044/// \brief Constructs a 256-bit integer vector, initialized in reverse order 4045/// with the specified 16-bit integral values. 4046/// 4047/// \headerfile <x86intrin.h> 4048/// 4049/// This intrinsic is a utility function and does not correspond to a specific 4050/// instruction. 4051/// 4052/// \param __w15 4053/// A 16-bit integral value used to initialize bits [15:0] of the result. 4054/// \param __w14 4055/// A 16-bit integral value used to initialize bits [31:16] of the result. 4056/// \param __w13 4057/// A 16-bit integral value used to initialize bits [47:32] of the result. 4058/// \param __w12 4059/// A 16-bit integral value used to initialize bits [63:48] of the result. 4060/// \param __w11 4061/// A 16-bit integral value used to initialize bits [79:64] of the result. 4062/// \param __w10 4063/// A 16-bit integral value used to initialize bits [95:80] of the result. 4064/// \param __w09 4065/// A 16-bit integral value used to initialize bits [111:96] of the result. 
4066/// \param __w08 4067/// A 16-bit integral value used to initialize bits [127:112] of the result. 4068/// \param __w07 4069/// A 16-bit integral value used to initialize bits [143:128] of the result. 4070/// \param __w06 4071/// A 16-bit integral value used to initialize bits [159:144] of the result. 4072/// \param __w05 4073/// A 16-bit integral value used to initialize bits [175:160] of the result. 4074/// \param __w04 4075/// A 16-bit integral value used to initialize bits [191:176] of the result. 4076/// \param __w03 4077/// A 16-bit integral value used to initialize bits [207:192] of the result. 4078/// \param __w02 4079/// A 16-bit integral value used to initialize bits [223:208] of the result. 4080/// \param __w01 4081/// A 16-bit integral value used to initialize bits [239:224] of the result. 4082/// \param __w00 4083/// A 16-bit integral value used to initialize bits [255:240] of the result. 4084/// \returns An initialized 256-bit integer vector. 4085static __inline __m256i __DEFAULT_FN_ATTRS 4086_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, 4087 short __w11, short __w10, short __w09, short __w08, 4088 short __w07, short __w06, short __w05, short __w04, 4089 short __w03, short __w02, short __w01, short __w00) 4090{ 4091 return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09, 4092 __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 }; 4093} 4094 4095/// \brief Constructs a 256-bit integer vector, initialized in reverse order 4096/// with the specified 8-bit integral values. 4097/// 4098/// \headerfile <x86intrin.h> 4099/// 4100/// This intrinsic is a utility function and does not correspond to a specific 4101/// instruction. 4102/// 4103/// \param __b31 4104/// An 8-bit integral value used to initialize bits [7:0] of the result. 4105/// \param __b30 4106/// An 8-bit integral value used to initialize bits [15:8] of the result. 
4107/// \param __b29 4108/// An 8-bit integral value used to initialize bits [23:16] of the result. 4109/// \param __b28 4110/// An 8-bit integral value used to initialize bits [31:24] of the result. 4111/// \param __b27 4112/// An 8-bit integral value used to initialize bits [39:32] of the result. 4113/// \param __b26 4114/// An 8-bit integral value used to initialize bits [47:40] of the result. 4115/// \param __b25 4116/// An 8-bit integral value used to initialize bits [55:48] of the result. 4117/// \param __b24 4118/// An 8-bit integral value used to initialize bits [63:56] of the result. 4119/// \param __b23 4120/// An 8-bit integral value used to initialize bits [71:64] of the result. 4121/// \param __b22 4122/// An 8-bit integral value used to initialize bits [79:72] of the result. 4123/// \param __b21 4124/// An 8-bit integral value used to initialize bits [87:80] of the result. 4125/// \param __b20 4126/// An 8-bit integral value used to initialize bits [95:88] of the result. 4127/// \param __b19 4128/// An 8-bit integral value used to initialize bits [103:96] of the result. 4129/// \param __b18 4130/// An 8-bit integral value used to initialize bits [111:104] of the result. 4131/// \param __b17 4132/// An 8-bit integral value used to initialize bits [119:112] of the result. 4133/// \param __b16 4134/// An 8-bit integral value used to initialize bits [127:120] of the result. 4135/// \param __b15 4136/// An 8-bit integral value used to initialize bits [135:128] of the result. 4137/// \param __b14 4138/// An 8-bit integral value used to initialize bits [143:136] of the result. 4139/// \param __b13 4140/// An 8-bit integral value used to initialize bits [151:144] of the result. 4141/// \param __b12 4142/// An 8-bit integral value used to initialize bits [159:152] of the result. 4143/// \param __b11 4144/// An 8-bit integral value used to initialize bits [167:160] of the result. 
4145/// \param __b10 4146/// An 8-bit integral value used to initialize bits [175:168] of the result. 4147/// \param __b09 4148/// An 8-bit integral value used to initialize bits [183:176] of the result. 4149/// \param __b08 4150/// An 8-bit integral value used to initialize bits [191:184] of the result. 4151/// \param __b07 4152/// An 8-bit integral value used to initialize bits [199:192] of the result. 4153/// \param __b06 4154/// An 8-bit integral value used to initialize bits [207:200] of the result. 4155/// \param __b05 4156/// An 8-bit integral value used to initialize bits [215:208] of the result. 4157/// \param __b04 4158/// An 8-bit integral value used to initialize bits [223:216] of the result. 4159/// \param __b03 4160/// An 8-bit integral value used to initialize bits [231:224] of the result. 4161/// \param __b02 4162/// An 8-bit integral value used to initialize bits [239:232] of the result. 4163/// \param __b01 4164/// An 8-bit integral value used to initialize bits [247:240] of the result. 4165/// \param __b00 4166/// An 8-bit integral value used to initialize bits [255:248] of the result. 4167/// \returns An initialized 256-bit integer vector. 
4168static __inline __m256i __DEFAULT_FN_ATTRS 4169_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, 4170 char __b27, char __b26, char __b25, char __b24, 4171 char __b23, char __b22, char __b21, char __b20, 4172 char __b19, char __b18, char __b17, char __b16, 4173 char __b15, char __b14, char __b13, char __b12, 4174 char __b11, char __b10, char __b09, char __b08, 4175 char __b07, char __b06, char __b05, char __b04, 4176 char __b03, char __b02, char __b01, char __b00) 4177{ 4178 return (__m256i)(__v32qi){ 4179 __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24, 4180 __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16, 4181 __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08, 4182 __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 }; 4183} 4184 4185/// \brief Constructs a 256-bit integer vector, initialized in reverse order 4186/// with the specified 64-bit integral values. 4187/// 4188/// \headerfile <x86intrin.h> 4189/// 4190/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> 4191/// instruction. 4192/// 4193/// \param __a 4194/// A 64-bit integral value used to initialize bits [63:0] of the result. 4195/// \param __b 4196/// A 64-bit integral value used to initialize bits [127:64] of the result. 4197/// \param __c 4198/// A 64-bit integral value used to initialize bits [191:128] of the result. 4199/// \param __d 4200/// A 64-bit integral value used to initialize bits [255:192] of the result. 4201/// \returns An initialized 256-bit integer vector. 4202static __inline __m256i __DEFAULT_FN_ATTRS 4203_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) 4204{ 4205 return (__m256i)(__v4di){ __a, __b, __c, __d }; 4206} 4207 4208/* Create vectors with repeated elements */ 4209/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each 4210/// of the four double-precision floating-point vector elements set to the 4211/// specified double-precision floating-point value. 
4212/// 4213/// \headerfile <x86intrin.h> 4214/// 4215/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 4216/// 4217/// \param __w 4218/// A double-precision floating-point value used to initialize each vector 4219/// element of the result. 4220/// \returns An initialized 256-bit floating-point vector of [4 x double]. 4221static __inline __m256d __DEFAULT_FN_ATTRS 4222_mm256_set1_pd(double __w) 4223{ 4224 return (__m256d){ __w, __w, __w, __w }; 4225} 4226 4227/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each 4228/// of the eight single-precision floating-point vector elements set to the 4229/// specified single-precision floating-point value. 4230/// 4231/// \headerfile <x86intrin.h> 4232/// 4233/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4234/// instruction. 4235/// 4236/// \param __w 4237/// A single-precision floating-point value used to initialize each vector 4238/// element of the result. 4239/// \returns An initialized 256-bit floating-point vector of [8 x float]. 4240static __inline __m256 __DEFAULT_FN_ATTRS 4241_mm256_set1_ps(float __w) 4242{ 4243 return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w }; 4244} 4245 4246/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the 4247/// 32-bit integral vector elements set to the specified 32-bit integral 4248/// value. 4249/// 4250/// \headerfile <x86intrin.h> 4251/// 4252/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4253/// instruction. 4254/// 4255/// \param __i 4256/// A 32-bit integral value used to initialize each vector element of the 4257/// result. 4258/// \returns An initialized 256-bit integer vector of [8 x i32]. 
4259static __inline __m256i __DEFAULT_FN_ATTRS 4260_mm256_set1_epi32(int __i) 4261{ 4262 return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i }; 4263} 4264 4265/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the 4266/// 16-bit integral vector elements set to the specified 16-bit integral 4267/// value. 4268/// 4269/// \headerfile <x86intrin.h> 4270/// 4271/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4272/// 4273/// \param __w 4274/// A 16-bit integral value used to initialize each vector element of the 4275/// result. 4276/// \returns An initialized 256-bit integer vector of [16 x i16]. 4277static __inline __m256i __DEFAULT_FN_ATTRS 4278_mm256_set1_epi16(short __w) 4279{ 4280 return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w, 4281 __w, __w, __w, __w, __w, __w }; 4282} 4283 4284/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the 4285/// 8-bit integral vector elements set to the specified 8-bit integral value. 4286/// 4287/// \headerfile <x86intrin.h> 4288/// 4289/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4290/// 4291/// \param __b 4292/// An 8-bit integral value used to initialize each vector element of the 4293/// result. 4294/// \returns An initialized 256-bit integer vector of [32 x i8]. 4295static __inline __m256i __DEFAULT_FN_ATTRS 4296_mm256_set1_epi8(char __b) 4297{ 4298 return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 4299 __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 4300 __b, __b, __b, __b, __b, __b, __b }; 4301} 4302 4303/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the 4304/// 64-bit integral vector elements set to the specified 64-bit integral 4305/// value. 4306/// 4307/// \headerfile <x86intrin.h> 4308/// 4309/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 
4310/// 4311/// \param __q 4312/// A 64-bit integral value used to initialize each vector element of the 4313/// result. 4314/// \returns An initialized 256-bit integer vector of [4 x i64]. 4315static __inline __m256i __DEFAULT_FN_ATTRS 4316_mm256_set1_epi64x(long long __q) 4317{ 4318 return (__m256i)(__v4di){ __q, __q, __q, __q }; 4319} 4320 4321/* Create __zeroed vectors */ 4322/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all 4323/// vector elements initialized to zero. 4324/// 4325/// \headerfile <x86intrin.h> 4326/// 4327/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4328/// 4329/// \returns A 256-bit vector of [4 x double] with all elements set to zero. 4330static __inline __m256d __DEFAULT_FN_ATTRS 4331_mm256_setzero_pd(void) 4332{ 4333 return (__m256d){ 0, 0, 0, 0 }; 4334} 4335 4336/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all 4337/// vector elements initialized to zero. 4338/// 4339/// \headerfile <x86intrin.h> 4340/// 4341/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4342/// 4343/// \returns A 256-bit vector of [8 x float] with all elements set to zero. 4344static __inline __m256 __DEFAULT_FN_ATTRS 4345_mm256_setzero_ps(void) 4346{ 4347 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; 4348} 4349 4350/// \brief Constructs a 256-bit integer vector initialized to zero. 4351/// 4352/// \headerfile <x86intrin.h> 4353/// 4354/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4355/// 4356/// \returns A 256-bit integer vector initialized to zero. 4357static __inline __m256i __DEFAULT_FN_ATTRS 4358_mm256_setzero_si256(void) 4359{ 4360 return (__m256i){ 0LL, 0LL, 0LL, 0LL }; 4361} 4362 4363/* Cast between vector types */ 4364/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit 4365/// floating-point vector of [8 x float]. 4366/// 4367/// \headerfile <x86intrin.h> 4368/// 4369/// This intrinsic has no corresponding instruction. 
4370/// 4371/// \param __a 4372/// A 256-bit floating-point vector of [4 x double]. 4373/// \returns A 256-bit floating-point vector of [8 x float] containing the same 4374/// bitwise pattern as the parameter. 4375static __inline __m256 __DEFAULT_FN_ATTRS 4376_mm256_castpd_ps(__m256d __a) 4377{ 4378 return (__m256)__a; 4379} 4380 4381/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit 4382/// integer vector. 4383/// 4384/// \headerfile <x86intrin.h> 4385/// 4386/// This intrinsic has no corresponding instruction. 4387/// 4388/// \param __a 4389/// A 256-bit floating-point vector of [4 x double]. 4390/// \returns A 256-bit integer vector containing the same bitwise pattern as the 4391/// parameter. 4392static __inline __m256i __DEFAULT_FN_ATTRS 4393_mm256_castpd_si256(__m256d __a) 4394{ 4395 return (__m256i)__a; 4396} 4397 4398/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit 4399/// floating-point vector of [4 x double]. 4400/// 4401/// \headerfile <x86intrin.h> 4402/// 4403/// This intrinsic has no corresponding instruction. 4404/// 4405/// \param __a 4406/// A 256-bit floating-point vector of [8 x float]. 4407/// \returns A 256-bit floating-point vector of [4 x double] containing the same 4408/// bitwise pattern as the parameter. 4409static __inline __m256d __DEFAULT_FN_ATTRS 4410_mm256_castps_pd(__m256 __a) 4411{ 4412 return (__m256d)__a; 4413} 4414 4415/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit 4416/// integer vector. 4417/// 4418/// \headerfile <x86intrin.h> 4419/// 4420/// This intrinsic has no corresponding instruction. 4421/// 4422/// \param __a 4423/// A 256-bit floating-point vector of [8 x float]. 4424/// \returns A 256-bit integer vector containing the same bitwise pattern as the 4425/// parameter. 
4426static __inline __m256i __DEFAULT_FN_ATTRS 4427_mm256_castps_si256(__m256 __a) 4428{ 4429 return (__m256i)__a; 4430} 4431 4432/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector 4433/// of [8 x float]. 4434/// 4435/// \headerfile <x86intrin.h> 4436/// 4437/// This intrinsic has no corresponding instruction. 4438/// 4439/// \param __a 4440/// A 256-bit integer vector. 4441/// \returns A 256-bit floating-point vector of [8 x float] containing the same 4442/// bitwise pattern as the parameter. 4443static __inline __m256 __DEFAULT_FN_ATTRS 4444_mm256_castsi256_ps(__m256i __a) 4445{ 4446 return (__m256)__a; 4447} 4448 4449/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector 4450/// of [4 x double]. 4451/// 4452/// \headerfile <x86intrin.h> 4453/// 4454/// This intrinsic has no corresponding instruction. 4455/// 4456/// \param __a 4457/// A 256-bit integer vector. 4458/// \returns A 256-bit floating-point vector of [4 x double] containing the same 4459/// bitwise pattern as the parameter. 4460static __inline __m256d __DEFAULT_FN_ATTRS 4461_mm256_castsi256_pd(__m256i __a) 4462{ 4463 return (__m256d)__a; 4464} 4465 4466/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of 4467/// [4 x double] as a 128-bit floating-point vector of [2 x double]. 4468/// 4469/// \headerfile <x86intrin.h> 4470/// 4471/// This intrinsic has no corresponding instruction. 4472/// 4473/// \param __a 4474/// A 256-bit floating-point vector of [4 x double]. 4475/// \returns A 128-bit floating-point vector of [2 x double] containing the 4476/// lower 128 bits of the parameter. 4477static __inline __m128d __DEFAULT_FN_ATTRS 4478_mm256_castpd256_pd128(__m256d __a) 4479{ 4480 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1); 4481} 4482 4483/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of 4484/// [8 x float] as a 128-bit floating-point vector of [4 x float]. 
4485/// 4486/// \headerfile <x86intrin.h> 4487/// 4488/// This intrinsic has no corresponding instruction. 4489/// 4490/// \param __a 4491/// A 256-bit floating-point vector of [8 x float]. 4492/// \returns A 128-bit floating-point vector of [4 x float] containing the 4493/// lower 128 bits of the parameter. 4494static __inline __m128 __DEFAULT_FN_ATTRS 4495_mm256_castps256_ps128(__m256 __a) 4496{ 4497 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3); 4498} 4499 4500/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector. 4501/// 4502/// \headerfile <x86intrin.h> 4503/// 4504/// This intrinsic has no corresponding instruction. 4505/// 4506/// \param __a 4507/// A 256-bit integer vector. 4508/// \returns A 128-bit integer vector containing the lower 128 bits of the 4509/// parameter. 4510static __inline __m128i __DEFAULT_FN_ATTRS 4511_mm256_castsi256_si128(__m256i __a) 4512{ 4513 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1); 4514} 4515 4516/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a 4517/// 128-bit floating-point vector of [2 x double]. 4518/// 4519/// The lower 128 bits contain the value of the source vector. The contents 4520/// of the upper 128 bits are undefined. 4521/// 4522/// \headerfile <x86intrin.h> 4523/// 4524/// This intrinsic has no corresponding instruction. 4525/// 4526/// \param __a 4527/// A 128-bit vector of [2 x double]. 4528/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits 4529/// contain the value of the parameter. The contents of the upper 128 bits 4530/// are undefined. 4531static __inline __m256d __DEFAULT_FN_ATTRS 4532_mm256_castpd128_pd256(__m128d __a) 4533{ 4534 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1); 4535} 4536 4537/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a 4538/// 128-bit floating-point vector of [4 x float]. 
4539/// 4540/// The lower 128 bits contain the value of the source vector. The contents 4541/// of the upper 128 bits are undefined. 4542/// 4543/// \headerfile <x86intrin.h> 4544/// 4545/// This intrinsic has no corresponding instruction. 4546/// 4547/// \param __a 4548/// A 128-bit vector of [4 x float]. 4549/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits 4550/// contain the value of the parameter. The contents of the upper 128 bits 4551/// are undefined. 4552static __inline __m256 __DEFAULT_FN_ATTRS 4553_mm256_castps128_ps256(__m128 __a) 4554{ 4555 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1); 4556} 4557 4558/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector. 4559/// 4560/// The lower 128 bits contain the value of the source vector. The contents 4561/// of the upper 128 bits are undefined. 4562/// 4563/// \headerfile <x86intrin.h> 4564/// 4565/// This intrinsic has no corresponding instruction. 4566/// 4567/// \param __a 4568/// A 128-bit integer vector. 4569/// \returns A 256-bit integer vector. The lower 128 bits contain the value of 4570/// the parameter. The contents of the upper 128 bits are undefined. 4571static __inline __m256i __DEFAULT_FN_ATTRS 4572_mm256_castsi128_si256(__m128i __a) 4573{ 4574 return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1); 4575} 4576 4577/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a 4578/// 128-bit floating-point vector of [2 x double]. The lower 128 bits 4579/// contain the value of the source vector. The upper 128 bits are set 4580/// to zero. 4581/// 4582/// \headerfile <x86intrin.h> 4583/// 4584/// This intrinsic has no corresponding instruction. 4585/// 4586/// \param __a 4587/// A 128-bit vector of [2 x double]. 4588/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits 4589/// contain the value of the parameter. The upper 128 bits are set to zero. 
4590static __inline __m256d __DEFAULT_FN_ATTRS 4591_mm256_zextpd128_pd256(__m128d __a) 4592{ 4593 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3); 4594} 4595 4596/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a 4597/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain 4598/// the value of the source vector. The upper 128 bits are set to zero. 4599/// 4600/// \headerfile <x86intrin.h> 4601/// 4602/// This intrinsic has no corresponding instruction. 4603/// 4604/// \param __a 4605/// A 128-bit vector of [4 x float]. 4606/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits 4607/// contain the value of the parameter. The upper 128 bits are set to zero. 4608static __inline __m256 __DEFAULT_FN_ATTRS 4609_mm256_zextps128_ps256(__m128 __a) 4610{ 4611 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7); 4612} 4613 4614/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector. 4615/// The lower 128 bits contain the value of the source vector. The upper 4616/// 128 bits are set to zero. 4617/// 4618/// \headerfile <x86intrin.h> 4619/// 4620/// This intrinsic has no corresponding instruction. 4621/// 4622/// \param __a 4623/// A 128-bit integer vector. 4624/// \returns A 256-bit integer vector. The lower 128 bits contain the value of 4625/// the parameter. The upper 128 bits are set to zero. 4626static __inline __m256i __DEFAULT_FN_ATTRS 4627_mm256_zextsi128_si256(__m128i __a) 4628{ 4629 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3); 4630} 4631 4632/* 4633 Vector insert. 4634 We use macros rather than inlines because we only want to accept 4635 invocations where the immediate M is a constant expression. 
4636*/ 4637/// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating 4638/// a 256-bit vector of [8 x float] given in the first parameter, and then 4639/// replacing either the upper or the lower 128 bits with the contents of a 4640/// 128-bit vector of [4 x float] in the second parameter. 4641/// 4642/// The immediate integer parameter determines between the upper or the lower 4643/// 128 bits. 4644/// 4645/// \headerfile <x86intrin.h> 4646/// 4647/// \code 4648/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M); 4649/// \endcode 4650/// 4651/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4652/// 4653/// \param V1 4654/// A 256-bit vector of [8 x float]. This vector is copied to the result 4655/// first, and then either the upper or the lower 128 bits of the result will 4656/// be replaced by the contents of \a V2. 4657/// \param V2 4658/// A 128-bit vector of [4 x float]. The contents of this parameter are 4659/// written to either the upper or the lower 128 bits of the result depending 4660/// on the value of parameter \a M. 4661/// \param M 4662/// An immediate integer. The least significant bit determines how the values 4663/// from the two parameters are interleaved: \n 4664/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, 4665/// and bits [255:128] of \a V1 are copied to bits [255:128] of the 4666/// result. \n 4667/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the 4668/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the 4669/// result. 4670/// \returns A 256-bit vector of [8 x float] containing the interleaved values. 4671#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \ 4672 (__m256)__builtin_shufflevector( \ 4673 (__v8sf)(__m256)(V1), \ 4674 (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \ 4675 (((M) & 1) ? 0 : 8), \ 4676 (((M) & 1) ? 1 : 9), \ 4677 (((M) & 1) ? 2 : 10), \ 4678 (((M) & 1) ? 3 : 11), \ 4679 (((M) & 1) ? 
8 : 4), \ 4680 (((M) & 1) ? 9 : 5), \ 4681 (((M) & 1) ? 10 : 6), \ 4682 (((M) & 1) ? 11 : 7) );}) 4683 4684/// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating 4685/// a 256-bit vector of [4 x double] given in the first parameter, and then 4686/// replacing either the upper or the lower 128 bits with the contents of a 4687/// 128-bit vector of [2 x double] in the second parameter. 4688/// 4689/// The immediate integer parameter determines between the upper or the lower 4690/// 128 bits. 4691/// 4692/// \headerfile <x86intrin.h> 4693/// 4694/// \code 4695/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M); 4696/// \endcode 4697/// 4698/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4699/// 4700/// \param V1 4701/// A 256-bit vector of [4 x double]. This vector is copied to the result 4702/// first, and then either the upper or the lower 128 bits of the result will 4703/// be replaced by the contents of \a V2. 4704/// \param V2 4705/// A 128-bit vector of [2 x double]. The contents of this parameter are 4706/// written to either the upper or the lower 128 bits of the result depending 4707/// on the value of parameter \a M. 4708/// \param M 4709/// An immediate integer. The least significant bit determines how the values 4710/// from the two parameters are interleaved: \n 4711/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, 4712/// and bits [255:128] of \a V1 are copied to bits [255:128] of the 4713/// result. \n 4714/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the 4715/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the 4716/// result. 4717/// \returns A 256-bit vector of [4 x double] containing the interleaved values. 4718#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \ 4719 (__m256d)__builtin_shufflevector( \ 4720 (__v4df)(__m256d)(V1), \ 4721 (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \ 4722 (((M) & 1) ? 
0 : 4), \ 4723 (((M) & 1) ? 1 : 5), \ 4724 (((M) & 1) ? 4 : 2), \ 4725 (((M) & 1) ? 5 : 3) );}) 4726 4727/// \brief Constructs a new 256-bit integer vector by first duplicating a 4728/// 256-bit integer vector given in the first parameter, and then replacing 4729/// either the upper or the lower 128 bits with the contents of a 128-bit 4730/// integer vector in the second parameter. 4731/// 4732/// The immediate integer parameter determines between the upper or the lower 4733/// 128 bits. 4734/// 4735/// \headerfile <x86intrin.h> 4736/// 4737/// \code 4738/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M); 4739/// \endcode 4740/// 4741/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4742/// 4743/// \param V1 4744/// A 256-bit integer vector. This vector is copied to the result first, and 4745/// then either the upper or the lower 128 bits of the result will be 4746/// replaced by the contents of \a V2. 4747/// \param V2 4748/// A 128-bit integer vector. The contents of this parameter are written to 4749/// either the upper or the lower 128 bits of the result depending on the 4750/// value of parameter \a M. 4751/// \param M 4752/// An immediate integer. The least significant bit determines how the values 4753/// from the two parameters are interleaved: \n 4754/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, 4755/// and bits [255:128] of \a V1 are copied to bits [255:128] of the 4756/// result. \n 4757/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the 4758/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the 4759/// result. 4760/// \returns A 256-bit integer vector containing the interleaved values. 4761#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \ 4762 (__m256i)__builtin_shufflevector( \ 4763 (__v4di)(__m256i)(V1), \ 4764 (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \ 4765 (((M) & 1) ? 0 : 4), \ 4766 (((M) & 1) ? 
1 : 5), \ 4767 (((M) & 1) ? 4 : 2), \ 4768 (((M) & 1) ? 5 : 3) );}) 4769 4770/* 4771 Vector extract. 4772 We use macros rather than inlines because we only want to accept 4773 invocations where the immediate M is a constant expression. 4774*/ 4775/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector 4776/// of [8 x float], as determined by the immediate integer parameter, and 4777/// returns the extracted bits as a 128-bit vector of [4 x float]. 4778/// 4779/// \headerfile <x86intrin.h> 4780/// 4781/// \code 4782/// __m128 _mm256_extractf128_ps(__m256 V, const int M); 4783/// \endcode 4784/// 4785/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction. 4786/// 4787/// \param V 4788/// A 256-bit vector of [8 x float]. 4789/// \param M 4790/// An immediate integer. The least significant bit determines which bits are 4791/// extracted from the first parameter: \n 4792/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the 4793/// result. \n 4794/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. 4795/// \returns A 128-bit vector of [4 x float] containing the extracted bits. 4796#define _mm256_extractf128_ps(V, M) __extension__ ({ \ 4797 (__m128)__builtin_shufflevector( \ 4798 (__v8sf)(__m256)(V), \ 4799 (__v8sf)(_mm256_undefined_ps()), \ 4800 (((M) & 1) ? 4 : 0), \ 4801 (((M) & 1) ? 5 : 1), \ 4802 (((M) & 1) ? 6 : 2), \ 4803 (((M) & 1) ? 7 : 3) );}) 4804 4805/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector 4806/// of [4 x double], as determined by the immediate integer parameter, and 4807/// returns the extracted bits as a 128-bit vector of [2 x double]. 4808/// 4809/// \headerfile <x86intrin.h> 4810/// 4811/// \code 4812/// __m128d _mm256_extractf128_pd(__m256d V, const int M); 4813/// \endcode 4814/// 4815/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction. 4816/// 4817/// \param V 4818/// A 256-bit vector of [4 x double]. 
4819/// \param M 4820/// An immediate integer. The least significant bit determines which bits are 4821/// extracted from the first parameter: \n 4822/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the 4823/// result. \n 4824/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. 4825/// \returns A 128-bit vector of [2 x double] containing the extracted bits. 4826#define _mm256_extractf128_pd(V, M) __extension__ ({ \ 4827 (__m128d)__builtin_shufflevector( \ 4828 (__v4df)(__m256d)(V), \ 4829 (__v4df)(_mm256_undefined_pd()), \ 4830 (((M) & 1) ? 2 : 0), \ 4831 (((M) & 1) ? 3 : 1) );}) 4832 4833/// \brief Extracts either the upper or the lower 128 bits from a 256-bit 4834/// integer vector, as determined by the immediate integer parameter, and 4835/// returns the extracted bits as a 128-bit integer vector. 4836/// 4837/// \headerfile <x86intrin.h> 4838/// 4839/// \code 4840/// __m128i _mm256_extractf128_si256(__m256i V, const int M); 4841/// \endcode 4842/// 4843/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction. 4844/// 4845/// \param V 4846/// A 256-bit integer vector. 4847/// \param M 4848/// An immediate integer. The least significant bit determines which bits are 4849/// extracted from the first parameter: \n 4850/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the 4851/// result. \n 4852/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. 4853/// \returns A 128-bit integer vector containing the extracted bits. 4854#define _mm256_extractf128_si256(V, M) __extension__ ({ \ 4855 (__m128i)__builtin_shufflevector( \ 4856 (__v4di)(__m256i)(V), \ 4857 (__v4di)(_mm256_undefined_si256()), \ 4858 (((M) & 1) ? 2 : 0), \ 4859 (((M) & 1) ? 
3 : 1) );}) 4860 4861/* SIMD load ops (unaligned) */ 4862/// \brief Loads two 128-bit floating-point vectors of [4 x float] from 4863/// unaligned memory locations and constructs a 256-bit floating-point vector 4864/// of [8 x float] by concatenating the two 128-bit vectors. 4865/// 4866/// \headerfile <x86intrin.h> 4867/// 4868/// This intrinsic corresponds to load instructions followed by the 4869/// <c> VINSERTF128 </c> instruction. 4870/// 4871/// \param __addr_hi 4872/// A pointer to a 128-bit memory location containing 4 consecutive 4873/// single-precision floating-point values. These values are to be copied to 4874/// bits[255:128] of the result. The address of the memory location does not 4875/// have to be aligned. 4876/// \param __addr_lo 4877/// A pointer to a 128-bit memory location containing 4 consecutive 4878/// single-precision floating-point values. These values are to be copied to 4879/// bits[127:0] of the result. The address of the memory location does not 4880/// have to be aligned. 4881/// \returns A 256-bit floating-point vector of [8 x float] containing the 4882/// concatenated result. 4883static __inline __m256 __DEFAULT_FN_ATTRS 4884_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) 4885{ 4886 __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo)); 4887 return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1); 4888} 4889 4890/// \brief Loads two 128-bit floating-point vectors of [2 x double] from 4891/// unaligned memory locations and constructs a 256-bit floating-point vector 4892/// of [4 x double] by concatenating the two 128-bit vectors. 4893/// 4894/// \headerfile <x86intrin.h> 4895/// 4896/// This intrinsic corresponds to load instructions followed by the 4897/// <c> VINSERTF128 </c> instruction. 4898/// 4899/// \param __addr_hi 4900/// A pointer to a 128-bit memory location containing two consecutive 4901/// double-precision floating-point values. 
These values are to be copied to 4902/// bits[255:128] of the result. The address of the memory location does not 4903/// have to be aligned. 4904/// \param __addr_lo 4905/// A pointer to a 128-bit memory location containing two consecutive 4906/// double-precision floating-point values. These values are to be copied to 4907/// bits[127:0] of the result. The address of the memory location does not 4908/// have to be aligned. 4909/// \returns A 256-bit floating-point vector of [4 x double] containing the 4910/// concatenated result. 4911static __inline __m256d __DEFAULT_FN_ATTRS 4912_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) 4913{ 4914 __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo)); 4915 return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1); 4916} 4917 4918/// \brief Loads two 128-bit integer vectors from unaligned memory locations and 4919/// constructs a 256-bit integer vector by concatenating the two 128-bit 4920/// vectors. 4921/// 4922/// \headerfile <x86intrin.h> 4923/// 4924/// This intrinsic corresponds to load instructions followed by the 4925/// <c> VINSERTF128 </c> instruction. 4926/// 4927/// \param __addr_hi 4928/// A pointer to a 128-bit memory location containing a 128-bit integer 4929/// vector. This vector is to be copied to bits[255:128] of the result. The 4930/// address of the memory location does not have to be aligned. 4931/// \param __addr_lo 4932/// A pointer to a 128-bit memory location containing a 128-bit integer 4933/// vector. This vector is to be copied to bits[127:0] of the result. The 4934/// address of the memory location does not have to be aligned. 4935/// \returns A 256-bit integer vector containing the concatenated result. 
4936static __inline __m256i __DEFAULT_FN_ATTRS 4937_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo) 4938{ 4939 __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo)); 4940 return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1); 4941} 4942 4943/* SIMD store ops (unaligned) */ 4944/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point 4945/// vector of [8 x float] into two different unaligned memory locations. 4946/// 4947/// \headerfile <x86intrin.h> 4948/// 4949/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4950/// store instructions. 4951/// 4952/// \param __addr_hi 4953/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 4954/// copied to this memory location. The address of this memory location does 4955/// not have to be aligned. 4956/// \param __addr_lo 4957/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4958/// copied to this memory location. The address of this memory location does 4959/// not have to be aligned. 4960/// \param __a 4961/// A 256-bit floating-point vector of [8 x float]. 4962static __inline void __DEFAULT_FN_ATTRS 4963_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a) 4964{ 4965 __m128 __v128; 4966 4967 __v128 = _mm256_castps256_ps128(__a); 4968 _mm_storeu_ps(__addr_lo, __v128); 4969 __v128 = _mm256_extractf128_ps(__a, 1); 4970 _mm_storeu_ps(__addr_hi, __v128); 4971} 4972 4973/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point 4974/// vector of [4 x double] into two different unaligned memory locations. 4975/// 4976/// \headerfile <x86intrin.h> 4977/// 4978/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4979/// store instructions. 4980/// 4981/// \param __addr_hi 4982/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 4983/// copied to this memory location. 
The address of this memory location does 4984/// not have to be aligned. 4985/// \param __addr_lo 4986/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4987/// copied to this memory location. The address of this memory location does 4988/// not have to be aligned. 4989/// \param __a 4990/// A 256-bit floating-point vector of [4 x double]. 4991static __inline void __DEFAULT_FN_ATTRS 4992_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a) 4993{ 4994 __m128d __v128; 4995 4996 __v128 = _mm256_castpd256_pd128(__a); 4997 _mm_storeu_pd(__addr_lo, __v128); 4998 __v128 = _mm256_extractf128_pd(__a, 1); 4999 _mm_storeu_pd(__addr_hi, __v128); 5000} 5001 5002/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into 5003/// two different unaligned memory locations. 5004/// 5005/// \headerfile <x86intrin.h> 5006/// 5007/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 5008/// store instructions. 5009/// 5010/// \param __addr_hi 5011/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 5012/// copied to this memory location. The address of this memory location does 5013/// not have to be aligned. 5014/// \param __addr_lo 5015/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 5016/// copied to this memory location. The address of this memory location does 5017/// not have to be aligned. 5018/// \param __a 5019/// A 256-bit integer vector. 5020static __inline void __DEFAULT_FN_ATTRS 5021_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a) 5022{ 5023 __m128i __v128; 5024 5025 __v128 = _mm256_castsi256_si128(__a); 5026 _mm_storeu_si128(__addr_lo, __v128); 5027 __v128 = _mm256_extractf128_si256(__a, 1); 5028 _mm_storeu_si128(__addr_hi, __v128); 5029} 5030 5031/// \brief Constructs a 256-bit floating-point vector of [8 x float] by 5032/// concatenating two 128-bit floating-point vectors of [4 x float]. 
5033/// 5034/// \headerfile <x86intrin.h> 5035/// 5036/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5037/// 5038/// \param __hi 5039/// A 128-bit floating-point vector of [4 x float] to be copied to the upper 5040/// 128 bits of the result. 5041/// \param __lo 5042/// A 128-bit floating-point vector of [4 x float] to be copied to the lower 5043/// 128 bits of the result. 5044/// \returns A 256-bit floating-point vector of [8 x float] containing the 5045/// concatenated result. 5046static __inline __m256 __DEFAULT_FN_ATTRS 5047_mm256_set_m128 (__m128 __hi, __m128 __lo) 5048{ 5049 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7); 5050} 5051 5052/// \brief Constructs a 256-bit floating-point vector of [4 x double] by 5053/// concatenating two 128-bit floating-point vectors of [2 x double]. 5054/// 5055/// \headerfile <x86intrin.h> 5056/// 5057/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5058/// 5059/// \param __hi 5060/// A 128-bit floating-point vector of [2 x double] to be copied to the upper 5061/// 128 bits of the result. 5062/// \param __lo 5063/// A 128-bit floating-point vector of [2 x double] to be copied to the lower 5064/// 128 bits of the result. 5065/// \returns A 256-bit floating-point vector of [4 x double] containing the 5066/// concatenated result. 5067static __inline __m256d __DEFAULT_FN_ATTRS 5068_mm256_set_m128d (__m128d __hi, __m128d __lo) 5069{ 5070 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); 5071} 5072 5073/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit 5074/// integer vectors. 5075/// 5076/// \headerfile <x86intrin.h> 5077/// 5078/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5079/// 5080/// \param __hi 5081/// A 128-bit integer vector to be copied to the upper 128 bits of the 5082/// result. 
5083/// \param __lo 5084/// A 128-bit integer vector to be copied to the lower 128 bits of the 5085/// result. 5086/// \returns A 256-bit integer vector containing the concatenated result. 5087static __inline __m256i __DEFAULT_FN_ATTRS 5088_mm256_set_m128i (__m128i __hi, __m128i __lo) 5089{ 5090 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); 5091} 5092 5093/// \brief Constructs a 256-bit floating-point vector of [8 x float] by 5094/// concatenating two 128-bit floating-point vectors of [4 x float]. This is 5095/// similar to _mm256_set_m128, but the order of the input parameters is 5096/// swapped. 5097/// 5098/// \headerfile <x86intrin.h> 5099/// 5100/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5101/// 5102/// \param __lo 5103/// A 128-bit floating-point vector of [4 x float] to be copied to the lower 5104/// 128 bits of the result. 5105/// \param __hi 5106/// A 128-bit floating-point vector of [4 x float] to be copied to the upper 5107/// 128 bits of the result. 5108/// \returns A 256-bit floating-point vector of [8 x float] containing the 5109/// concatenated result. 5110static __inline __m256 __DEFAULT_FN_ATTRS 5111_mm256_setr_m128 (__m128 __lo, __m128 __hi) 5112{ 5113 return _mm256_set_m128(__hi, __lo); 5114} 5115 5116/// \brief Constructs a 256-bit floating-point vector of [4 x double] by 5117/// concatenating two 128-bit floating-point vectors of [2 x double]. This is 5118/// similar to _mm256_set_m128d, but the order of the input parameters is 5119/// swapped. 5120/// 5121/// \headerfile <x86intrin.h> 5122/// 5123/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5124/// 5125/// \param __lo 5126/// A 128-bit floating-point vector of [2 x double] to be copied to the lower 5127/// 128 bits of the result. 5128/// \param __hi 5129/// A 128-bit floating-point vector of [2 x double] to be copied to the upper 5130/// 128 bits of the result. 
5131/// \returns A 256-bit floating-point vector of [4 x double] containing the 5132/// concatenated result. 5133static __inline __m256d __DEFAULT_FN_ATTRS 5134_mm256_setr_m128d (__m128d __lo, __m128d __hi) 5135{ 5136 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); 5137} 5138 5139/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit 5140/// integer vectors. This is similar to _mm256_set_m128i, but the order of 5141/// the input parameters is swapped. 5142/// 5143/// \headerfile <x86intrin.h> 5144/// 5145/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5146/// 5147/// \param __lo 5148/// A 128-bit integer vector to be copied to the lower 128 bits of the 5149/// result. 5150/// \param __hi 5151/// A 128-bit integer vector to be copied to the upper 128 bits of the 5152/// result. 5153/// \returns A 256-bit integer vector containing the concatenated result. 5154static __inline __m256i __DEFAULT_FN_ATTRS 5155_mm256_setr_m128i (__m128i __lo, __m128i __hi) 5156{ 5157 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); 5158} 5159 5160#undef __DEFAULT_FN_ATTRS 5161 5162#endif /* __AVXINTRIN_H */ 5163