avxintrin.h revision 321369
1/*===---- avxintrin.h - AVX intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __IMMINTRIN_H 25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead." 
26#endif 27 28#ifndef __AVXINTRIN_H 29#define __AVXINTRIN_H 30 31typedef double __v4df __attribute__ ((__vector_size__ (32))); 32typedef float __v8sf __attribute__ ((__vector_size__ (32))); 33typedef long long __v4di __attribute__ ((__vector_size__ (32))); 34typedef int __v8si __attribute__ ((__vector_size__ (32))); 35typedef short __v16hi __attribute__ ((__vector_size__ (32))); 36typedef char __v32qi __attribute__ ((__vector_size__ (32))); 37 38/* Unsigned types */ 39typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32))); 40typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); 41typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); 42typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32))); 43 44/* We need an explicitly signed variant for char. Note that this shouldn't 45 * appear in the interface though. */ 46typedef signed char __v32qs __attribute__((__vector_size__(32))); 47 48typedef float __m256 __attribute__ ((__vector_size__ (32))); 49typedef double __m256d __attribute__((__vector_size__(32))); 50typedef long long __m256i __attribute__((__vector_size__(32))); 51 52/* Define the default attributes for the functions in this file. */ 53#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"))) 54 55/* Arithmetic */ 56/// \brief Adds two 256-bit vectors of [4 x double]. 57/// 58/// \headerfile <x86intrin.h> 59/// 60/// This intrinsic corresponds to the <c> VADDPD </c> instruction. 61/// 62/// \param __a 63/// A 256-bit vector of [4 x double] containing one of the source operands. 64/// \param __b 65/// A 256-bit vector of [4 x double] containing one of the source operands. 66/// \returns A 256-bit vector of [4 x double] containing the sums of both 67/// operands. 68static __inline __m256d __DEFAULT_FN_ATTRS 69_mm256_add_pd(__m256d __a, __m256d __b) 70{ 71 return (__m256d)((__v4df)__a+(__v4df)__b); 72} 73 74/// \brief Adds two 256-bit vectors of [8 x float]. 
75/// 76/// \headerfile <x86intrin.h> 77/// 78/// This intrinsic corresponds to the <c> VADDPS </c> instruction. 79/// 80/// \param __a 81/// A 256-bit vector of [8 x float] containing one of the source operands. 82/// \param __b 83/// A 256-bit vector of [8 x float] containing one of the source operands. 84/// \returns A 256-bit vector of [8 x float] containing the sums of both 85/// operands. 86static __inline __m256 __DEFAULT_FN_ATTRS 87_mm256_add_ps(__m256 __a, __m256 __b) 88{ 89 return (__m256)((__v8sf)__a+(__v8sf)__b); 90} 91 92/// \brief Subtracts two 256-bit vectors of [4 x double]. 93/// 94/// \headerfile <x86intrin.h> 95/// 96/// This intrinsic corresponds to the <c> VSUBPD </c> instruction. 97/// 98/// \param __a 99/// A 256-bit vector of [4 x double] containing the minuend. 100/// \param __b 101/// A 256-bit vector of [4 x double] containing the subtrahend. 102/// \returns A 256-bit vector of [4 x double] containing the differences between 103/// both operands. 104static __inline __m256d __DEFAULT_FN_ATTRS 105_mm256_sub_pd(__m256d __a, __m256d __b) 106{ 107 return (__m256d)((__v4df)__a-(__v4df)__b); 108} 109 110/// \brief Subtracts two 256-bit vectors of [8 x float]. 111/// 112/// \headerfile <x86intrin.h> 113/// 114/// This intrinsic corresponds to the <c> VSUBPS </c> instruction. 115/// 116/// \param __a 117/// A 256-bit vector of [8 x float] containing the minuend. 118/// \param __b 119/// A 256-bit vector of [8 x float] containing the subtrahend. 120/// \returns A 256-bit vector of [8 x float] containing the differences between 121/// both operands. 122static __inline __m256 __DEFAULT_FN_ATTRS 123_mm256_sub_ps(__m256 __a, __m256 __b) 124{ 125 return (__m256)((__v8sf)__a-(__v8sf)__b); 126} 127 128/// \brief Adds the even-indexed values and subtracts the odd-indexed values of 129/// two 256-bit vectors of [4 x double]. 130/// 131/// \headerfile <x86intrin.h> 132/// 133/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction. 
134/// 135/// \param __a 136/// A 256-bit vector of [4 x double] containing the left source operand. 137/// \param __b 138/// A 256-bit vector of [4 x double] containing the right source operand. 139/// \returns A 256-bit vector of [4 x double] containing the alternating sums 140/// and differences between both operands. 141static __inline __m256d __DEFAULT_FN_ATTRS 142_mm256_addsub_pd(__m256d __a, __m256d __b) 143{ 144 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b); 145} 146 147/// \brief Adds the even-indexed values and subtracts the odd-indexed values of 148/// two 256-bit vectors of [8 x float]. 149/// 150/// \headerfile <x86intrin.h> 151/// 152/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction. 153/// 154/// \param __a 155/// A 256-bit vector of [8 x float] containing the left source operand. 156/// \param __b 157/// A 256-bit vector of [8 x float] containing the right source operand. 158/// \returns A 256-bit vector of [8 x float] containing the alternating sums and 159/// differences between both operands. 160static __inline __m256 __DEFAULT_FN_ATTRS 161_mm256_addsub_ps(__m256 __a, __m256 __b) 162{ 163 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b); 164} 165 166/// \brief Divides two 256-bit vectors of [4 x double]. 167/// 168/// \headerfile <x86intrin.h> 169/// 170/// This intrinsic corresponds to the <c> VDIVPD </c> instruction. 171/// 172/// \param __a 173/// A 256-bit vector of [4 x double] containing the dividend. 174/// \param __b 175/// A 256-bit vector of [4 x double] containing the divisor. 176/// \returns A 256-bit vector of [4 x double] containing the quotients of both 177/// operands. 178static __inline __m256d __DEFAULT_FN_ATTRS 179_mm256_div_pd(__m256d __a, __m256d __b) 180{ 181 return (__m256d)((__v4df)__a/(__v4df)__b); 182} 183 184/// \brief Divides two 256-bit vectors of [8 x float]. 
185/// 186/// \headerfile <x86intrin.h> 187/// 188/// This intrinsic corresponds to the <c> VDIVPS </c> instruction. 189/// 190/// \param __a 191/// A 256-bit vector of [8 x float] containing the dividend. 192/// \param __b 193/// A 256-bit vector of [8 x float] containing the divisor. 194/// \returns A 256-bit vector of [8 x float] containing the quotients of both 195/// operands. 196static __inline __m256 __DEFAULT_FN_ATTRS 197_mm256_div_ps(__m256 __a, __m256 __b) 198{ 199 return (__m256)((__v8sf)__a/(__v8sf)__b); 200} 201 202/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater 203/// of each pair of values. 204/// 205/// \headerfile <x86intrin.h> 206/// 207/// This intrinsic corresponds to the <c> VMAXPD </c> instruction. 208/// 209/// \param __a 210/// A 256-bit vector of [4 x double] containing one of the operands. 211/// \param __b 212/// A 256-bit vector of [4 x double] containing one of the operands. 213/// \returns A 256-bit vector of [4 x double] containing the maximum values 214/// between both operands. 215static __inline __m256d __DEFAULT_FN_ATTRS 216_mm256_max_pd(__m256d __a, __m256d __b) 217{ 218 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b); 219} 220 221/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater 222/// of each pair of values. 223/// 224/// \headerfile <x86intrin.h> 225/// 226/// This intrinsic corresponds to the <c> VMAXPS </c> instruction. 227/// 228/// \param __a 229/// A 256-bit vector of [8 x float] containing one of the operands. 230/// \param __b 231/// A 256-bit vector of [8 x float] containing one of the operands. 232/// \returns A 256-bit vector of [8 x float] containing the maximum values 233/// between both operands. 
234static __inline __m256 __DEFAULT_FN_ATTRS 235_mm256_max_ps(__m256 __a, __m256 __b) 236{ 237 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b); 238} 239 240/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser 241/// of each pair of values. 242/// 243/// \headerfile <x86intrin.h> 244/// 245/// This intrinsic corresponds to the <c> VMINPD </c> instruction. 246/// 247/// \param __a 248/// A 256-bit vector of [4 x double] containing one of the operands. 249/// \param __b 250/// A 256-bit vector of [4 x double] containing one of the operands. 251/// \returns A 256-bit vector of [4 x double] containing the minimum values 252/// between both operands. 253static __inline __m256d __DEFAULT_FN_ATTRS 254_mm256_min_pd(__m256d __a, __m256d __b) 255{ 256 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b); 257} 258 259/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser 260/// of each pair of values. 261/// 262/// \headerfile <x86intrin.h> 263/// 264/// This intrinsic corresponds to the <c> VMINPS </c> instruction. 265/// 266/// \param __a 267/// A 256-bit vector of [8 x float] containing one of the operands. 268/// \param __b 269/// A 256-bit vector of [8 x float] containing one of the operands. 270/// \returns A 256-bit vector of [8 x float] containing the minimum values 271/// between both operands. 272static __inline __m256 __DEFAULT_FN_ATTRS 273_mm256_min_ps(__m256 __a, __m256 __b) 274{ 275 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b); 276} 277 278/// \brief Multiplies two 256-bit vectors of [4 x double]. 279/// 280/// \headerfile <x86intrin.h> 281/// 282/// This intrinsic corresponds to the <c> VMULPD </c> instruction. 283/// 284/// \param __a 285/// A 256-bit vector of [4 x double] containing one of the operands. 286/// \param __b 287/// A 256-bit vector of [4 x double] containing one of the operands. 
288/// \returns A 256-bit vector of [4 x double] containing the products of both 289/// operands. 290static __inline __m256d __DEFAULT_FN_ATTRS 291_mm256_mul_pd(__m256d __a, __m256d __b) 292{ 293 return (__m256d)((__v4df)__a * (__v4df)__b); 294} 295 296/// \brief Multiplies two 256-bit vectors of [8 x float]. 297/// 298/// \headerfile <x86intrin.h> 299/// 300/// This intrinsic corresponds to the <c> VMULPS </c> instruction. 301/// 302/// \param __a 303/// A 256-bit vector of [8 x float] containing one of the operands. 304/// \param __b 305/// A 256-bit vector of [8 x float] containing one of the operands. 306/// \returns A 256-bit vector of [8 x float] containing the products of both 307/// operands. 308static __inline __m256 __DEFAULT_FN_ATTRS 309_mm256_mul_ps(__m256 __a, __m256 __b) 310{ 311 return (__m256)((__v8sf)__a * (__v8sf)__b); 312} 313 314/// \brief Calculates the square roots of the values in a 256-bit vector of 315/// [4 x double]. 316/// 317/// \headerfile <x86intrin.h> 318/// 319/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction. 320/// 321/// \param __a 322/// A 256-bit vector of [4 x double]. 323/// \returns A 256-bit vector of [4 x double] containing the square roots of the 324/// values in the operand. 325static __inline __m256d __DEFAULT_FN_ATTRS 326_mm256_sqrt_pd(__m256d __a) 327{ 328 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a); 329} 330 331/// \brief Calculates the square roots of the values in a 256-bit vector of 332/// [8 x float]. 333/// 334/// \headerfile <x86intrin.h> 335/// 336/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction. 337/// 338/// \param __a 339/// A 256-bit vector of [8 x float]. 340/// \returns A 256-bit vector of [8 x float] containing the square roots of the 341/// values in the operand. 
342static __inline __m256 __DEFAULT_FN_ATTRS 343_mm256_sqrt_ps(__m256 __a) 344{ 345 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a); 346} 347 348/// \brief Calculates the reciprocal square roots of the values in a 256-bit 349/// vector of [8 x float]. 350/// 351/// \headerfile <x86intrin.h> 352/// 353/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction. 354/// 355/// \param __a 356/// A 256-bit vector of [8 x float]. 357/// \returns A 256-bit vector of [8 x float] containing the reciprocal square 358/// roots of the values in the operand. 359static __inline __m256 __DEFAULT_FN_ATTRS 360_mm256_rsqrt_ps(__m256 __a) 361{ 362 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a); 363} 364 365/// \brief Calculates the reciprocals of the values in a 256-bit vector of 366/// [8 x float]. 367/// 368/// \headerfile <x86intrin.h> 369/// 370/// This intrinsic corresponds to the <c> VRCPPS </c> instruction. 371/// 372/// \param __a 373/// A 256-bit vector of [8 x float]. 374/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the 375/// values in the operand. 376static __inline __m256 __DEFAULT_FN_ATTRS 377_mm256_rcp_ps(__m256 __a) 378{ 379 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a); 380} 381 382/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified 383/// by the byte operand. The source values are rounded to integer values and 384/// returned as 64-bit double-precision floating-point values. 385/// 386/// \headerfile <x86intrin.h> 387/// 388/// \code 389/// __m256d _mm256_round_pd(__m256d V, const int M); 390/// \endcode 391/// 392/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction. 393/// 394/// \param V 395/// A 256-bit vector of [4 x double]. 396/// \param M 397/// An integer value that specifies the rounding operation. \n 398/// Bits [7:4] are reserved. \n 399/// Bit [3] is a precision exception value: \n 400/// 0: A normal PE exception is used. 
///      \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* Expands to a single parenthesized expression rather than a GNU statement
 * expression, so it can be used wherever an ordinary expression is accepted. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))

/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* Expands to a single parenthesized expression rather than a GNU statement
 * expression, so it can be used wherever an ordinary expression is accepted. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))

/// \brief Rounds up the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded up to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_CEIL)

/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_FLOOR)

/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V) \
  _mm256_round_ps((V), _MM_FROUND_CEIL)

/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
501/// 502/// \headerfile <x86intrin.h> 503/// 504/// \code 505/// __m256 _mm256_floor_ps(__m256 V); 506/// \endcode 507/// 508/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. 509/// 510/// \param V 511/// A 256-bit vector of [8 x float]. 512/// \returns A 256-bit vector of [8 x float] containing the rounded down values. 513#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) 514 515/* Logical */ 516/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double]. 517/// 518/// \headerfile <x86intrin.h> 519/// 520/// This intrinsic corresponds to the <c> VANDPD </c> instruction. 521/// 522/// \param __a 523/// A 256-bit vector of [4 x double] containing one of the source operands. 524/// \param __b 525/// A 256-bit vector of [4 x double] containing one of the source operands. 526/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 527/// values between both operands. 528static __inline __m256d __DEFAULT_FN_ATTRS 529_mm256_and_pd(__m256d __a, __m256d __b) 530{ 531 return (__m256d)((__v4du)__a & (__v4du)__b); 532} 533 534/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float]. 535/// 536/// \headerfile <x86intrin.h> 537/// 538/// This intrinsic corresponds to the <c> VANDPS </c> instruction. 539/// 540/// \param __a 541/// A 256-bit vector of [8 x float] containing one of the source operands. 542/// \param __b 543/// A 256-bit vector of [8 x float] containing one of the source operands. 544/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 545/// values between both operands. 546static __inline __m256 __DEFAULT_FN_ATTRS 547_mm256_and_ps(__m256 __a, __m256 __b) 548{ 549 return (__m256)((__v8su)__a & (__v8su)__b); 550} 551 552/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using 553/// the one's complement of the values contained in the first source operand. 
554/// 555/// \headerfile <x86intrin.h> 556/// 557/// This intrinsic corresponds to the <c> VANDNPD </c> instruction. 558/// 559/// \param __a 560/// A 256-bit vector of [4 x double] containing the left source operand. The 561/// one's complement of this value is used in the bitwise AND. 562/// \param __b 563/// A 256-bit vector of [4 x double] containing the right source operand. 564/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 565/// values of the second operand and the one's complement of the first 566/// operand. 567static __inline __m256d __DEFAULT_FN_ATTRS 568_mm256_andnot_pd(__m256d __a, __m256d __b) 569{ 570 return (__m256d)(~(__v4du)__a & (__v4du)__b); 571} 572 573/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using 574/// the one's complement of the values contained in the first source operand. 575/// 576/// \headerfile <x86intrin.h> 577/// 578/// This intrinsic corresponds to the <c> VANDNPS </c> instruction. 579/// 580/// \param __a 581/// A 256-bit vector of [8 x float] containing the left source operand. The 582/// one's complement of this value is used in the bitwise AND. 583/// \param __b 584/// A 256-bit vector of [8 x float] containing the right source operand. 585/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 586/// values of the second operand and the one's complement of the first 587/// operand. 588static __inline __m256 __DEFAULT_FN_ATTRS 589_mm256_andnot_ps(__m256 __a, __m256 __b) 590{ 591 return (__m256)(~(__v8su)__a & (__v8su)__b); 592} 593 594/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double]. 595/// 596/// \headerfile <x86intrin.h> 597/// 598/// This intrinsic corresponds to the <c> VORPD </c> instruction. 599/// 600/// \param __a 601/// A 256-bit vector of [4 x double] containing one of the source operands. 602/// \param __b 603/// A 256-bit vector of [4 x double] containing one of the source operands. 
604/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the 605/// values between both operands. 606static __inline __m256d __DEFAULT_FN_ATTRS 607_mm256_or_pd(__m256d __a, __m256d __b) 608{ 609 return (__m256d)((__v4du)__a | (__v4du)__b); 610} 611 612/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float]. 613/// 614/// \headerfile <x86intrin.h> 615/// 616/// This intrinsic corresponds to the <c> VORPS </c> instruction. 617/// 618/// \param __a 619/// A 256-bit vector of [8 x float] containing one of the source operands. 620/// \param __b 621/// A 256-bit vector of [8 x float] containing one of the source operands. 622/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the 623/// values between both operands. 624static __inline __m256 __DEFAULT_FN_ATTRS 625_mm256_or_ps(__m256 __a, __m256 __b) 626{ 627 return (__m256)((__v8su)__a | (__v8su)__b); 628} 629 630/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double]. 631/// 632/// \headerfile <x86intrin.h> 633/// 634/// This intrinsic corresponds to the <c> VXORPD </c> instruction. 635/// 636/// \param __a 637/// A 256-bit vector of [4 x double] containing one of the source operands. 638/// \param __b 639/// A 256-bit vector of [4 x double] containing one of the source operands. 640/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the 641/// values between both operands. 642static __inline __m256d __DEFAULT_FN_ATTRS 643_mm256_xor_pd(__m256d __a, __m256d __b) 644{ 645 return (__m256d)((__v4du)__a ^ (__v4du)__b); 646} 647 648/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float]. 649/// 650/// \headerfile <x86intrin.h> 651/// 652/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 653/// 654/// \param __a 655/// A 256-bit vector of [8 x float] containing one of the source operands. 656/// \param __b 657/// A 256-bit vector of [8 x float] containing one of the source operands. 
658/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the 659/// values between both operands. 660static __inline __m256 __DEFAULT_FN_ATTRS 661_mm256_xor_ps(__m256 __a, __m256 __b) 662{ 663 return (__m256)((__v8su)__a ^ (__v8su)__b); 664} 665 666/* Horizontal arithmetic */ 667/// \brief Horizontally adds the adjacent pairs of values contained in two 668/// 256-bit vectors of [4 x double]. 669/// 670/// \headerfile <x86intrin.h> 671/// 672/// This intrinsic corresponds to the <c> VHADDPD </c> instruction. 673/// 674/// \param __a 675/// A 256-bit vector of [4 x double] containing one of the source operands. 676/// The horizontal sums of the values are returned in the even-indexed 677/// elements of a vector of [4 x double]. 678/// \param __b 679/// A 256-bit vector of [4 x double] containing one of the source operands. 680/// The horizontal sums of the values are returned in the odd-indexed 681/// elements of a vector of [4 x double]. 682/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of 683/// both operands. 684static __inline __m256d __DEFAULT_FN_ATTRS 685_mm256_hadd_pd(__m256d __a, __m256d __b) 686{ 687 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); 688} 689 690/// \brief Horizontally adds the adjacent pairs of values contained in two 691/// 256-bit vectors of [8 x float]. 692/// 693/// \headerfile <x86intrin.h> 694/// 695/// This intrinsic corresponds to the <c> VHADDPS </c> instruction. 696/// 697/// \param __a 698/// A 256-bit vector of [8 x float] containing one of the source operands. 699/// The horizontal sums of the values are returned in the elements with 700/// index 0, 1, 4, 5 of a vector of [8 x float]. 701/// \param __b 702/// A 256-bit vector of [8 x float] containing one of the source operands. 703/// The horizontal sums of the values are returned in the elements with 704/// index 2, 3, 6, 7 of a vector of [8 x float]. 
705/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of 706/// both operands. 707static __inline __m256 __DEFAULT_FN_ATTRS 708_mm256_hadd_ps(__m256 __a, __m256 __b) 709{ 710 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); 711} 712 713/// \brief Horizontally subtracts the adjacent pairs of values contained in two 714/// 256-bit vectors of [4 x double]. 715/// 716/// \headerfile <x86intrin.h> 717/// 718/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction. 719/// 720/// \param __a 721/// A 256-bit vector of [4 x double] containing one of the source operands. 722/// The horizontal differences between the values are returned in the 723/// even-indexed elements of a vector of [4 x double]. 724/// \param __b 725/// A 256-bit vector of [4 x double] containing one of the source operands. 726/// The horizontal differences between the values are returned in the 727/// odd-indexed elements of a vector of [4 x double]. 728/// \returns A 256-bit vector of [4 x double] containing the horizontal 729/// differences of both operands. 730static __inline __m256d __DEFAULT_FN_ATTRS 731_mm256_hsub_pd(__m256d __a, __m256d __b) 732{ 733 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); 734} 735 736/// \brief Horizontally subtracts the adjacent pairs of values contained in two 737/// 256-bit vectors of [8 x float]. 738/// 739/// \headerfile <x86intrin.h> 740/// 741/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction. 742/// 743/// \param __a 744/// A 256-bit vector of [8 x float] containing one of the source operands. 745/// The horizontal differences between the values are returned in the 746/// elements with index 0, 1, 4, 5 of a vector of [8 x float]. 747/// \param __b 748/// A 256-bit vector of [8 x float] containing one of the source operands. 749/// The horizontal differences between the values are returned in the 750/// elements with index 2, 3, 6, 7 of a vector of [8 x float]. 
751/// \returns A 256-bit vector of [8 x float] containing the horizontal 752/// differences of both operands. 753static __inline __m256 __DEFAULT_FN_ATTRS 754_mm256_hsub_ps(__m256 __a, __m256 __b) 755{ 756 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); 757} 758 759/* Vector permutations */ 760/// \brief Copies the values in a 128-bit vector of [2 x double] as specified 761/// by the 128-bit integer vector operand. 762/// 763/// \headerfile <x86intrin.h> 764/// 765/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 766/// 767/// \param __a 768/// A 128-bit vector of [2 x double]. 769/// \param __c 770/// A 128-bit integer vector operand specifying how the values are to be 771/// copied. \n 772/// Bit [1]: \n 773/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 774/// vector. \n 775/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 776/// returned vector. \n 777/// Bit [65]: \n 778/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 779/// returned vector. \n 780/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 781/// returned vector. 782/// \returns A 128-bit vector of [2 x double] containing the copied values. 783static __inline __m128d __DEFAULT_FN_ATTRS 784_mm_permutevar_pd(__m128d __a, __m128i __c) 785{ 786 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c); 787} 788 789/// \brief Copies the values in a 256-bit vector of [4 x double] as specified 790/// by the 256-bit integer vector operand. 791/// 792/// \headerfile <x86intrin.h> 793/// 794/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 795/// 796/// \param __a 797/// A 256-bit vector of [4 x double]. 798/// \param __c 799/// A 256-bit integer vector operand specifying how the values are to be 800/// copied. \n 801/// Bit [1]: \n 802/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 803/// vector. 
\n 804/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 805/// returned vector. \n 806/// Bit [65]: \n 807/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 808/// returned vector. \n 809/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 810/// returned vector. \n 811/// Bit [129]: \n 812/// 0: Bits [191:128] of the source are copied to bits [191:128] of the 813/// returned vector. \n 814/// 1: Bits [255:192] of the source are copied to bits [191:128] of the 815/// returned vector. \n 816/// Bit [193]: \n 817/// 0: Bits [191:128] of the source are copied to bits [255:192] of the 818/// returned vector. \n 819/// 1: Bits [255:192] of the source are copied to bits [255:192] of the 820/// returned vector. 821/// \returns A 256-bit vector of [4 x double] containing the copied values. 822static __inline __m256d __DEFAULT_FN_ATTRS 823_mm256_permutevar_pd(__m256d __a, __m256i __c) 824{ 825 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c); 826} 827 828/// \brief Copies the values stored in a 128-bit vector of [4 x float] as 829/// specified by the 128-bit integer vector operand. 830/// \headerfile <x86intrin.h> 831/// 832/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 833/// 834/// \param __a 835/// A 128-bit vector of [4 x float]. 836/// \param __c 837/// A 128-bit integer vector operand specifying how the values are to be 838/// copied. \n 839/// Bits [1:0]: \n 840/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 841/// returned vector. \n 842/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 843/// returned vector. \n 844/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 845/// returned vector. \n 846/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 847/// returned vector. \n 848/// Bits [33:32]: \n 849/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 850/// returned vector. 
\n 851/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 852/// returned vector. \n 853/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 854/// returned vector. \n 855/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 856/// returned vector. \n 857/// Bits [65:64]: \n 858/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 859/// returned vector. \n 860/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 861/// returned vector. \n 862/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 863/// returned vector. \n 864/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 865/// returned vector. \n 866/// Bits [97:96]: \n 867/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 868/// returned vector. \n 869/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 870/// returned vector. \n 871/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 872/// returned vector. \n 873/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 874/// returned vector. 875/// \returns A 128-bit vector of [4 x float] containing the copied values. 876static __inline __m128 __DEFAULT_FN_ATTRS 877_mm_permutevar_ps(__m128 __a, __m128i __c) 878{ 879 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c); 880} 881 882/// \brief Copies the values stored in a 256-bit vector of [8 x float] as 883/// specified by the 256-bit integer vector operand. 884/// 885/// \headerfile <x86intrin.h> 886/// 887/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 888/// 889/// \param __a 890/// A 256-bit vector of [8 x float]. 891/// \param __c 892/// A 256-bit integer vector operand specifying how the values are to be 893/// copied. \n 894/// Bits [1:0]: \n 895/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 896/// returned vector. 
\n 897/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 898/// returned vector. \n 899/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 900/// returned vector. \n 901/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 902/// returned vector. \n 903/// Bits [33:32]: \n 904/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 905/// returned vector. \n 906/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 907/// returned vector. \n 908/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 909/// returned vector. \n 910/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 911/// returned vector. \n 912/// Bits [65:64]: \n 913/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 914/// returned vector. \n 915/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 916/// returned vector. \n 917/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 918/// returned vector. \n 919/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 920/// returned vector. \n 921/// Bits [97:96]: \n 922/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 923/// returned vector. \n 924/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 925/// returned vector. \n 926/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 927/// returned vector. \n 928/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 929/// returned vector. \n 930/// Bits [129:128]: \n 931/// 00: Bits [159:128] of the source are copied to bits [159:128] of the 932/// returned vector. \n 933/// 01: Bits [191:160] of the source are copied to bits [159:128] of the 934/// returned vector. \n 935/// 10: Bits [223:192] of the source are copied to bits [159:128] of the 936/// returned vector. \n 937/// 11: Bits [255:224] of the source are copied to bits [159:128] of the 938/// returned vector. 
\n 939/// Bits [161:160]: \n 940/// 00: Bits [159:128] of the source are copied to bits [191:160] of the 941/// returned vector. \n 942/// 01: Bits [191:160] of the source are copied to bits [191:160] of the 943/// returned vector. \n 944/// 10: Bits [223:192] of the source are copied to bits [191:160] of the 945/// returned vector. \n 946/// 11: Bits [255:224] of the source are copied to bits [191:160] of the 947/// returned vector. \n 948/// Bits [193:192]: \n 949/// 00: Bits [159:128] of the source are copied to bits [223:192] of the 950/// returned vector. \n 951/// 01: Bits [191:160] of the source are copied to bits [223:192] of the 952/// returned vector. \n 953/// 10: Bits [223:192] of the source are copied to bits [223:192] of the 954/// returned vector. \n 955/// 11: Bits [255:224] of the source are copied to bits [223:192] of the 956/// returned vector. \n 957/// Bits [225:224]: \n 958/// 00: Bits [159:128] of the source are copied to bits [255:224] of the 959/// returned vector. \n 960/// 01: Bits [191:160] of the source are copied to bits [255:224] of the 961/// returned vector. \n 962/// 10: Bits [223:192] of the source are copied to bits [255:224] of the 963/// returned vector. \n 964/// 11: Bits [255:224] of the source are copied to bits [255:224] of the 965/// returned vector. 966/// \returns A 256-bit vector of [8 x float] containing the copied values. 967static __inline __m256 __DEFAULT_FN_ATTRS 968_mm256_permutevar_ps(__m256 __a, __m256i __c) 969{ 970 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c); 971} 972 973/// \brief Copies the values in a 128-bit vector of [2 x double] as specified 974/// by the immediate integer operand. 975/// 976/// \headerfile <x86intrin.h> 977/// 978/// \code 979/// __m128d _mm_permute_pd(__m128d A, const int C); 980/// \endcode 981/// 982/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 983/// 984/// \param A 985/// A 128-bit vector of [2 x double]. 
986/// \param C 987/// An immediate integer operand specifying how the values are to be 988/// copied. \n 989/// Bit [0]: \n 990/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 991/// vector. \n 992/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 993/// returned vector. \n 994/// Bit [1]: \n 995/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 996/// returned vector. \n 997/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 998/// returned vector. 999/// \returns A 128-bit vector of [2 x double] containing the copied values. 1000#define _mm_permute_pd(A, C) __extension__ ({ \ 1001 (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \ 1002 (__v2df)_mm_undefined_pd(), \ 1003 ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); }) 1004 1005/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by 1006/// the immediate integer operand. 1007/// 1008/// \headerfile <x86intrin.h> 1009/// 1010/// \code 1011/// __m256d _mm256_permute_pd(__m256d A, const int C); 1012/// \endcode 1013/// 1014/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 1015/// 1016/// \param A 1017/// A 256-bit vector of [4 x double]. 1018/// \param C 1019/// An immediate integer operand specifying how the values are to be 1020/// copied. \n 1021/// Bit [0]: \n 1022/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 1023/// vector. \n 1024/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 1025/// returned vector. \n 1026/// Bit [1]: \n 1027/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 1028/// returned vector. \n 1029/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 1030/// returned vector. \n 1031/// Bit [2]: \n 1032/// 0: Bits [191:128] of the source are copied to bits [191:128] of the 1033/// returned vector. \n 1034/// 1: Bits [255:192] of the source are copied to bits [191:128] of the 1035/// returned vector. 
\n 1036/// Bit [3]: \n 1037/// 0: Bits [191:128] of the source are copied to bits [255:192] of the 1038/// returned vector. \n 1039/// 1: Bits [255:192] of the source are copied to bits [255:192] of the 1040/// returned vector. 1041/// \returns A 256-bit vector of [4 x double] containing the copied values. 1042#define _mm256_permute_pd(A, C) __extension__ ({ \ 1043 (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \ 1044 (__v4df)_mm256_undefined_pd(), \ 1045 0 + (((C) >> 0) & 0x1), \ 1046 0 + (((C) >> 1) & 0x1), \ 1047 2 + (((C) >> 2) & 0x1), \ 1048 2 + (((C) >> 3) & 0x1)); }) 1049 1050/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by 1051/// the immediate integer operand. 1052/// 1053/// \headerfile <x86intrin.h> 1054/// 1055/// \code 1056/// __m128 _mm_permute_ps(__m128 A, const int C); 1057/// \endcode 1058/// 1059/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 1060/// 1061/// \param A 1062/// A 128-bit vector of [4 x float]. 1063/// \param C 1064/// An immediate integer operand specifying how the values are to be 1065/// copied. \n 1066/// Bits [1:0]: \n 1067/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 1068/// returned vector. \n 1069/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 1070/// returned vector. \n 1071/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 1072/// returned vector. \n 1073/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 1074/// returned vector. \n 1075/// Bits [3:2]: \n 1076/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 1077/// returned vector. \n 1078/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 1079/// returned vector. \n 1080/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 1081/// returned vector. \n 1082/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 1083/// returned vector. 
\n 1084/// Bits [5:4]: \n 1085/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 1086/// returned vector. \n 1087/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 1088/// returned vector. \n 1089/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 1090/// returned vector. \n 1091/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 1092/// returned vector. \n 1093/// Bits [7:6]: \n 1094/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 1095/// returned vector. \n 1096/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 1097/// returned vector. \n 1098/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 1099/// returned vector. \n 1100/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 1101/// returned vector. 1102/// \returns A 128-bit vector of [4 x float] containing the copied values. 1103#define _mm_permute_ps(A, C) __extension__ ({ \ 1104 (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \ 1105 (__v4sf)_mm_undefined_ps(), \ 1106 ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \ 1107 ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); }) 1108 1109/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by 1110/// the immediate integer operand. 1111/// 1112/// \headerfile <x86intrin.h> 1113/// 1114/// \code 1115/// __m256 _mm256_permute_ps(__m256 A, const int C); 1116/// \endcode 1117/// 1118/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 1119/// 1120/// \param A 1121/// A 256-bit vector of [8 x float]. 1122/// \param C 1123/// An immediate integer operand specifying how the values are to be 1124/// copied. \n 1125/// Bits [1:0]: \n 1126/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 1127/// returned vector. \n 1128/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 1129/// returned vector.
\n 1130/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 1131/// returned vector. \n 1132/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 1133/// returned vector. \n 1134/// Bits [3:2]: \n 1135/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 1136/// returned vector. \n 1137/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 1138/// returned vector. \n 1139/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 1140/// returned vector. \n 1141/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 1142/// returned vector. \n 1143/// Bits [5:4]: \n 1144/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 1145/// returned vector. \n 1146/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 1147/// returned vector. \n 1148/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 1149/// returned vector. \n 1150/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 1151/// returned vector. \n 1152/// Bits [7:6]: \n 1153/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 1154/// returned vector. \n 1155/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 1156/// returned vector. \n 1157/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 1158/// returned vector. \n 1159/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 1160/// returned vector. \n 1161/// Bits [1:0]: \n 1162/// 00: Bits [159:128] of the source are copied to bits [159:128] of the 1163/// returned vector. \n 1164/// 01: Bits [191:160] of the source are copied to bits [159:128] of the 1165/// returned vector. \n 1166/// 10: Bits [223:192] of the source are copied to bits [159:128] of the 1167/// returned vector. \n 1168/// 11: Bits [255:224] of the source are copied to bits [159:128] of the 1169/// returned vector.
\n 1170/// Bits [3:2]: \n 1171/// 00: Bits [159:128] of the source are copied to bits [191:160] of the 1172/// returned vector. \n 1173/// 01: Bits [191:160] of the source are copied to bits [191:160] of the 1174/// returned vector. \n 1175/// 10: Bits [223:192] of the source are copied to bits [191:160] of the 1176/// returned vector. \n 1177/// 11: Bits [255:224] of the source are copied to bits [191:160] of the 1178/// returned vector. \n 1179/// Bits [5:4]: \n 1180/// 00: Bits [159:128] of the source are copied to bits [223:192] of the 1181/// returned vector. \n 1182/// 01: Bits [191:160] of the source are copied to bits [223:192] of the 1183/// returned vector. \n 1184/// 10: Bits [223:192] of the source are copied to bits [223:192] of the 1185/// returned vector. \n 1186/// 11: Bits [255:224] of the source are copied to bits [223:192] of the 1187/// returned vector. \n 1188/// Bits [7:6]: \n 1189/// 00: Bits [159:128] of the source are copied to bits [255:224] of the 1190/// returned vector. \n 1191/// 01: Bits [191:160] of the source are copied to bits [255:224] of the 1192/// returned vector. \n 1193/// 10: Bits [223:192] of the source are copied to bits [255:224] of the 1194/// returned vector. \n 1195/// 11: Bits [255:224] of the source are copied to bits [255:224] of the 1196/// returned vector. 1197/// \returns A 256-bit vector of [8 x float] containing the copied values. 1198#define _mm256_permute_ps(A, C) __extension__ ({ \ 1199 (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \ 1200 (__v8sf)_mm256_undefined_ps(), \ 1201 0 + (((C) >> 0) & 0x3), \ 1202 0 + (((C) >> 2) & 0x3), \ 1203 0 + (((C) >> 4) & 0x3), \ 1204 0 + (((C) >> 6) & 0x3), \ 1205 4 + (((C) >> 0) & 0x3), \ 1206 4 + (((C) >> 2) & 0x3), \ 1207 4 + (((C) >> 4) & 0x3), \ 1208 4 + (((C) >> 6) & 0x3)); }) 1209 1210/// \brief Permutes 128-bit data values stored in two 256-bit vectors of 1211/// [4 x double], as specified by the immediate integer operand. 
1212/// 1213/// \headerfile <x86intrin.h> 1214/// 1215/// \code 1216/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M); 1217/// \endcode 1218/// 1219/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction. 1220/// 1221/// \param V1 1222/// A 256-bit vector of [4 x double]. 1223/// \param V2 1224/// A 256-bit vector of [4 x double]. 1225/// \param M 1226/// An immediate integer operand specifying how the values are to be 1227/// permuted. \n 1228/// Bits [1:0]: \n 1229/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the 1230/// destination. \n 1231/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the 1232/// destination. \n 1233/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the 1234/// destination. \n 1235/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the 1236/// destination. \n 1237/// Bits [5:4]: \n 1238/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the 1239/// destination. \n 1240/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the 1241/// destination. \n 1242/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the 1243/// destination. \n 1244/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the 1245/// destination. 1246/// \returns A 256-bit vector of [4 x double] containing the copied values. 1247#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \ 1248 (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \ 1249 (__v4df)(__m256d)(V2), (M)); }) 1250 1251/// \brief Permutes 128-bit data values stored in two 256-bit vectors of 1252/// [8 x float], as specified by the immediate integer operand. 1253/// 1254/// \headerfile <x86intrin.h> 1255/// 1256/// \code 1257/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M); 1258/// \endcode 1259/// 1260/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1261/// 1262/// \param V1 1263/// A 256-bit vector of [8 x float]. 1264/// \param V2 1265/// A 256-bit vector of [8 x float]. 1266/// \param M 1267/// An immediate integer operand specifying how the values are to be 1268/// permuted. \n 1269/// Bits [1:0]: \n 1270/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the 1271/// destination. \n 1272/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the 1273/// destination. \n 1274/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the 1275/// destination. \n 1276/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the 1277/// destination. \n 1278/// Bits [5:4]: \n 1279/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the 1280/// destination. \n 1281/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the 1282/// destination. \n 1283/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the 1284/// destination. \n 1285/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the 1286/// destination. 1287/// \returns A 256-bit vector of [8 x float] containing the copied values. 1288#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \ 1289 (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \ 1290 (__v8sf)(__m256)(V2), (M)); }) 1291 1292/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors, 1293/// as specified by the immediate integer operand. 1294/// 1295/// \headerfile <x86intrin.h> 1296/// 1297/// \code 1298/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M); 1299/// \endcode 1300/// 1301/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction. 1302/// 1303/// \param V1 1304/// A 256-bit integer vector. 1305/// \param V2 1306/// A 256-bit integer vector. 1307/// \param M 1308/// An immediate integer operand specifying how the values are to be copied. \n
1309/// Bits [1:0]: \n 1310/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the 1311/// destination. \n 1312/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the 1313/// destination. \n 1314/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the 1315/// destination. \n 1316/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the 1317/// destination. \n 1318/// Bits [5:4]: \n 1319/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the 1320/// destination. \n 1321/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the 1322/// destination. \n 1323/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the 1324/// destination. \n 1325/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the 1326/// destination. 1327/// \returns A 256-bit integer vector containing the copied values. 1328#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \ 1329 (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \ 1330 (__v8si)(__m256i)(V2), (M)); }) 1331 1332/* Vector Blend */ 1333/// \brief Merges 64-bit double-precision data values stored in either of the 1334/// two 256-bit vectors of [4 x double], as specified by the immediate 1335/// integer operand. 1336/// 1337/// \headerfile <x86intrin.h> 1338/// 1339/// \code 1340/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M); 1341/// \endcode 1342/// 1343/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction. 1344/// 1345/// \param V1 1346/// A 256-bit vector of [4 x double]. 1347/// \param V2 1348/// A 256-bit vector of [4 x double]. 1349/// \param M 1350/// An immediate integer operand, with mask bits [3:0] specifying how the 1351/// values are to be copied. The position of the mask bit corresponds to the 1352/// index of a copied value. 
When a mask bit is 0, the corresponding 64-bit 1353/// element in operand \a V1 is copied to the same position in the 1354/// destination. When a mask bit is 1, the corresponding 64-bit element in 1355/// operand \a V2 is copied to the same position in the destination. 1356/// \returns A 256-bit vector of [4 x double] containing the copied values. 1357#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \ 1358 (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \ 1359 (__v4df)(__m256d)(V2), \ 1360 (((M) & 0x01) ? 4 : 0), \ 1361 (((M) & 0x02) ? 5 : 1), \ 1362 (((M) & 0x04) ? 6 : 2), \ 1363 (((M) & 0x08) ? 7 : 3)); }) 1364 1365/// \brief Merges 32-bit single-precision data values stored in either of the 1366/// two 256-bit vectors of [8 x float], as specified by the immediate 1367/// integer operand. 1368/// 1369/// \headerfile <x86intrin.h> 1370/// 1371/// \code 1372/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M); 1373/// \endcode 1374/// 1375/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction. 1376/// 1377/// \param V1 1378/// A 256-bit vector of [8 x float]. 1379/// \param V2 1380/// A 256-bit vector of [8 x float]. 1381/// \param M 1382/// An immediate integer operand, with mask bits [7:0] specifying how the 1383/// values are to be copied. The position of the mask bit corresponds to the 1384/// index of a copied value. When a mask bit is 0, the corresponding 32-bit 1385/// element in operand \a V1 is copied to the same position in the 1386/// destination. When a mask bit is 1, the corresponding 32-bit element in 1387/// operand \a V2 is copied to the same position in the destination. 1388/// \returns A 256-bit vector of [8 x float] containing the copied values. 1389#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \ 1390 (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \ 1391 (__v8sf)(__m256)(V2), \ 1392 (((M) & 0x01) ? 8 : 0), \ 1393 (((M) & 0x02) ? 9 : 1), \ 1394 (((M) & 0x04) ? 10 : 2), \ 1395 (((M) & 0x08) ? 
11 : 3), \ 1396 (((M) & 0x10) ? 12 : 4), \ 1397 (((M) & 0x20) ? 13 : 5), \ 1398 (((M) & 0x40) ? 14 : 6), \ 1399 (((M) & 0x80) ? 15 : 7)); }) 1400 1401/// \brief Merges 64-bit double-precision data values stored in either of the 1402/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector 1403/// operand. 1404/// 1405/// \headerfile <x86intrin.h> 1406/// 1407/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction. 1408/// 1409/// \param __a 1410/// A 256-bit vector of [4 x double]. 1411/// \param __b 1412/// A 256-bit vector of [4 x double]. 1413/// \param __c 1414/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying 1415/// how the values are to be copied. The position of the mask bit corresponds 1416/// to the most significant bit of a copied value. When a mask bit is 0, the 1417/// corresponding 64-bit element in operand \a __a is copied to the same 1418/// position in the destination. When a mask bit is 1, the corresponding 1419/// 64-bit element in operand \a __b is copied to the same position in the 1420/// destination. 1421/// \returns A 256-bit vector of [4 x double] containing the copied values. 1422static __inline __m256d __DEFAULT_FN_ATTRS 1423_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) 1424{ 1425 return (__m256d)__builtin_ia32_blendvpd256( 1426 (__v4df)__a, (__v4df)__b, (__v4df)__c); 1427} 1428 1429/// \brief Merges 32-bit single-precision data values stored in either of the 1430/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector 1431/// operand. 1432/// 1433/// \headerfile <x86intrin.h> 1434/// 1435/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction. 1436/// 1437/// \param __a 1438/// A 256-bit vector of [8 x float]. 1439/// \param __b 1440/// A 256-bit vector of [8 x float]. 1441/// \param __c 1442/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63, 1443/// and 31 specifying how the values are to be copied. 
The position of the 1444/// mask bit corresponds to the most significant bit of a copied value. When 1445/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is 1446/// copied to the same position in the destination. When a mask bit is 1, the 1447/// corresponding 32-bit element in operand \a __b is copied to the same 1448/// position in the destination. 1449/// \returns A 256-bit vector of [8 x float] containing the copied values. 1450static __inline __m256 __DEFAULT_FN_ATTRS 1451_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) 1452{ 1453 return (__m256)__builtin_ia32_blendvps256( 1454 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c); 1455} 1456 1457/* Vector Dot Product */ 1458/// \brief Computes two dot products in parallel, using the lower and upper 1459/// halves of two [8 x float] vectors as input to the two computations, and 1460/// returning the two dot products in the lower and upper halves of the 1461/// [8 x float] result. 1462/// 1463/// The immediate integer operand controls which input elements will 1464/// contribute to the dot product, and where the final results are returned. 1465/// In general, for each dot product, the four corresponding elements of the 1466/// input vectors are multiplied; the first two and second two products are 1467/// summed, then the two sums are added to form the final result. 1468/// 1469/// \headerfile <x86intrin.h> 1470/// 1471/// \code 1472/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M); 1473/// \endcode 1474/// 1475/// This intrinsic corresponds to the <c> VDPPS </c> instruction. 1476/// 1477/// \param V1 1478/// A vector of [8 x float] values, treated as two [4 x float] vectors. 1479/// \param V2 1480/// A vector of [8 x float] values, treated as two [4 x float] vectors. 1481/// \param M 1482/// An immediate integer argument. 
Bits [7:4] determine which elements of 1483/// the input vectors are used, with bit [4] corresponding to the lowest 1484/// element and bit [7] corresponding to the highest element of each [4 x 1485/// float] subvector. If a bit is set, the corresponding elements from the 1486/// two input vectors are used as an input for dot product; otherwise that 1487/// input is treated as zero. Bits [3:0] determine which elements of the 1488/// result will receive a copy of the final dot product, with bit [0] 1489/// corresponding to the lowest element and bit [3] corresponding to the 1490/// highest element of each [4 x float] subvector. If a bit is set, the dot 1491/// product is returned in the corresponding element; otherwise that element 1492/// is set to zero. The bitmask is applied in the same way to each of the 1493/// two parallel dot product computations. 1494/// \returns A 256-bit vector of [8 x float] containing the two dot products. 1495#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \ 1496 (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \ 1497 (__v8sf)(__m256)(V2), (M)); }) 1498 1499/* Vector shuffle */ 1500/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as 1501/// specified by the immediate value operand. 1502/// 1503/// The four selected elements in each operand are copied to the destination 1504/// according to the bits specified in the immediate operand. The selected 1505/// elements from the first 256-bit operand are copied to bits [63:0] and 1506/// bits [191:128] of the destination, and the selected elements from the 1507/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of 1508/// the destination. For example, if bits [7:0] of the immediate operand 1509/// contain a value of 0xFF, the 256-bit destination vector would contain the 1510/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3]. 
1511/// 1512/// \headerfile <x86intrin.h> 1513/// 1514/// \code 1515/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask); 1516/// \endcode 1517/// 1518/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction. 1519/// 1520/// \param a 1521/// A 256-bit vector of [8 x float]. The four selected elements in this 1522/// operand are copied to bits [63:0] and bits [191:128] in the destination, 1523/// according to the bits specified in the immediate operand. 1524/// \param b 1525/// A 256-bit vector of [8 x float]. The four selected elements in this 1526/// operand are copied to bits [127:64] and bits [255:192] in the 1527/// destination, according to the bits specified in the immediate operand. 1528/// \param mask 1529/// An immediate value containing an 8-bit value specifying which elements to 1530/// copy from \a a and \a b. \n 1531/// Bits [3:0] specify the values copied from operand \a a. \n 1532/// Bits [7:4] specify the values copied from operand \a b. \n 1533/// The destinations within the 256-bit destination are assigned values as 1534/// follows, according to the bit value assignments described below: \n 1535/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the 1536/// destination. \n 1537/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the 1538/// destination. \n 1539/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the 1540/// destination. \n 1541/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in 1542/// the destination. \n 1543/// Bit value assignments: \n 1544/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n 1545/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n 1546/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n 1547/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
1548/// \returns A 256-bit vector of [8 x float] containing the shuffled values. 1549#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \ 1550 (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \ 1551 (__v8sf)(__m256)(b), \ 1552 0 + (((mask) >> 0) & 0x3), \ 1553 0 + (((mask) >> 2) & 0x3), \ 1554 8 + (((mask) >> 4) & 0x3), \ 1555 8 + (((mask) >> 6) & 0x3), \ 1556 4 + (((mask) >> 0) & 0x3), \ 1557 4 + (((mask) >> 2) & 0x3), \ 1558 12 + (((mask) >> 4) & 0x3), \ 1559 12 + (((mask) >> 6) & 0x3)); }) 1560 1561/// \brief Selects four double-precision values from the 256-bit operands of 1562/// [4 x double], as specified by the immediate value operand. 1563/// 1564/// The selected elements from the first 256-bit operand are copied to bits 1565/// [63:0] and bits [191:128] in the destination, and the selected elements 1566/// from the second 256-bit operand are copied to bits [127:64] and bits 1567/// [255:192] in the destination. For example, if bits [3:0] of the immediate 1568/// operand contain a value of 0xF, the 256-bit destination vector would 1569/// contain the following values: b[3], a[3], b[1], a[1]. 1570/// 1571/// \headerfile <x86intrin.h> 1572/// 1573/// \code 1574/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask); 1575/// \endcode 1576/// 1577/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction. 1578/// 1579/// \param a 1580/// A 256-bit vector of [4 x double]. 1581/// \param b 1582/// A 256-bit vector of [4 x double]. 1583/// \param mask 1584/// An immediate value containing 8-bit values specifying which elements to 1585/// copy from \a a and \a b: \n 1586/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the 1587/// destination. \n 1588/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the 1589/// destination. \n 1590/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the 1591/// destination. 
///    \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
      (__v4df)(__m256d)(a), (__v4df)(__m256d)(b), \
      /* indices 0-3 select from a, 4-7 from b; one mask bit per element */ \
      0 + (((mask) >> 0) & 0x1), 4 + (((mask) >> 1) & 0x1), \
      2 + (((mask) >> 2) & 0x1), 6 + (((mask) >> 3) & 0x1)); })

/* Compare */
/* Comparison predicates for the immediate operand of the cmp macros below.
 * Each value in 0x10-0x1f names the same relation as (value - 0x10) with the
 * opposite signaling behavior. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling) */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */

/// \brief Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [2 x double] vector consisting of two doubles corresponding to
///    the two comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmppd( \
      (__v2df)(__m128d)(a), \
      (__v2df)(__m128d)(b), \
      (c)); })

/// \brief Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [4 x float] vector consisting of four floats corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered
///    (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpps( \
      (__v4sf)(__m128)(a), \
      (__v4sf)(__m128)(b), \
      (c)); })

/// \brief Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [4 x double] vector consisting of four doubles corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  (__m256d)__builtin_ia32_cmppd256( \
      (__v4df)(__m256d)(a), \
      (__v4df)(__m256d)(b), \
      (c)); })

/// \brief Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [8 x float] vector consisting of eight floats corresponding to
///    the eight comparison results: zero if the comparison is false, and all
///    1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered
///    (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  (__m256)__builtin_ia32_cmpps256( \
      (__v8sf)(__m256)(a), \
      (__v8sf)(__m256)(b), \
      (c)); })

/// \brief Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    If the result is true, all 64 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmpsd( \
      (__v2df)(__m128d)(a), \
      (__v2df)(__m128d)(b), \
      (c)); })

/// \brief Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    If the result is true, all 32 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 :
Not-less-than (unordered, non-signaling) 1988/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) 1989/// 0x17 : Ordered (signaling) 1990/// 0x18 : Equal (unordered, signaling) 1991/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) 1992/// 0x1a : Not-greater-than (unordered, non-signaling) 1993/// 0x1b : False (ordered, signaling) 1994/// 0x1c : Not-equal (ordered, signaling) 1995/// 0x1d : Greater-than-or-equal (ordered, non-signaling) 1996/// 0x1e : Greater-than (ordered, non-signaling) 1997/// 0x1f : True (unordered, signaling) 1998/// \returns A 128-bit vector of [4 x float] containing the comparison results. 1999#define _mm_cmp_ss(a, b, c) __extension__ ({ \ 2000 (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \ 2001 (__v4sf)(__m128)(b), (c)); }) 2002 2003/// \brief Takes a [8 x i32] vector and returns the vector element value 2004/// indexed by the immediate constant operand. 2005/// 2006/// \headerfile <x86intrin.h> 2007/// 2008/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2009/// instruction. 2010/// 2011/// \param __a 2012/// A 256-bit vector of [8 x i32]. 2013/// \param __imm 2014/// An immediate integer operand with bits [2:0] determining which vector 2015/// element is extracted and returned. 2016/// \returns A 32-bit integer containing the extracted 32 bits of extended 2017/// packed data. 2018static __inline int __DEFAULT_FN_ATTRS 2019_mm256_extract_epi32(__m256i __a, const int __imm) 2020{ 2021 __v8si __b = (__v8si)__a; 2022 return __b[__imm & 7]; 2023} 2024 2025/// \brief Takes a [16 x i16] vector and returns the vector element value 2026/// indexed by the immediate constant operand. 2027/// 2028/// \headerfile <x86intrin.h> 2029/// 2030/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2031/// instruction. 2032/// 2033/// \param __a 2034/// A 256-bit integer vector of [16 x i16]. 
2035/// \param __imm 2036/// An immediate integer operand with bits [3:0] determining which vector 2037/// element is extracted and returned. 2038/// \returns A 32-bit integer containing the extracted 16 bits of zero extended 2039/// packed data. 2040static __inline int __DEFAULT_FN_ATTRS 2041_mm256_extract_epi16(__m256i __a, const int __imm) 2042{ 2043 __v16hi __b = (__v16hi)__a; 2044 return (unsigned short)__b[__imm & 15]; 2045} 2046 2047/// \brief Takes a [32 x i8] vector and returns the vector element value 2048/// indexed by the immediate constant operand. 2049/// 2050/// \headerfile <x86intrin.h> 2051/// 2052/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2053/// instruction. 2054/// 2055/// \param __a 2056/// A 256-bit integer vector of [32 x i8]. 2057/// \param __imm 2058/// An immediate integer operand with bits [4:0] determining which vector 2059/// element is extracted and returned. 2060/// \returns A 32-bit integer containing the extracted 8 bits of zero extended 2061/// packed data. 2062static __inline int __DEFAULT_FN_ATTRS 2063_mm256_extract_epi8(__m256i __a, const int __imm) 2064{ 2065 __v32qi __b = (__v32qi)__a; 2066 return (unsigned char)__b[__imm & 31]; 2067} 2068 2069#ifdef __x86_64__ 2070/// \brief Takes a [4 x i64] vector and returns the vector element value 2071/// indexed by the immediate constant operand. 2072/// 2073/// \headerfile <x86intrin.h> 2074/// 2075/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2076/// instruction. 2077/// 2078/// \param __a 2079/// A 256-bit integer vector of [4 x i64]. 2080/// \param __imm 2081/// An immediate integer operand with bits [1:0] determining which vector 2082/// element is extracted and returned. 2083/// \returns A 64-bit integer containing the extracted 64 bits of extended 2084/// packed data. 
2085static __inline long long __DEFAULT_FN_ATTRS 2086_mm256_extract_epi64(__m256i __a, const int __imm) 2087{ 2088 __v4di __b = (__v4di)__a; 2089 return __b[__imm & 3]; 2090} 2091#endif 2092 2093/// \brief Takes a [8 x i32] vector and replaces the vector element value 2094/// indexed by the immediate constant operand by a new value. Returns the 2095/// modified vector. 2096/// 2097/// \headerfile <x86intrin.h> 2098/// 2099/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2100/// instruction. 2101/// 2102/// \param __a 2103/// A vector of [8 x i32] to be used by the insert operation. 2104/// \param __b 2105/// An integer value. The replacement value for the insert operation. 2106/// \param __imm 2107/// An immediate integer specifying the index of the vector element to be 2108/// replaced. 2109/// \returns A copy of vector \a __a, after replacing its element indexed by 2110/// \a __imm with \a __b. 2111static __inline __m256i __DEFAULT_FN_ATTRS 2112_mm256_insert_epi32(__m256i __a, int __b, int const __imm) 2113{ 2114 __v8si __c = (__v8si)__a; 2115 __c[__imm & 7] = __b; 2116 return (__m256i)__c; 2117} 2118 2119 2120/// \brief Takes a [16 x i16] vector and replaces the vector element value 2121/// indexed by the immediate constant operand with a new value. Returns the 2122/// modified vector. 2123/// 2124/// \headerfile <x86intrin.h> 2125/// 2126/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2127/// instruction. 2128/// 2129/// \param __a 2130/// A vector of [16 x i16] to be used by the insert operation. 2131/// \param __b 2132/// An i16 integer value. The replacement value for the insert operation. 2133/// \param __imm 2134/// An immediate integer specifying the index of the vector element to be 2135/// replaced. 2136/// \returns A copy of vector \a __a, after replacing its element indexed by 2137/// \a __imm with \a __b. 
2138static __inline __m256i __DEFAULT_FN_ATTRS 2139_mm256_insert_epi16(__m256i __a, int __b, int const __imm) 2140{ 2141 __v16hi __c = (__v16hi)__a; 2142 __c[__imm & 15] = __b; 2143 return (__m256i)__c; 2144} 2145 2146/// \brief Takes a [32 x i8] vector and replaces the vector element value 2147/// indexed by the immediate constant operand with a new value. Returns the 2148/// modified vector. 2149/// 2150/// \headerfile <x86intrin.h> 2151/// 2152/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2153/// instruction. 2154/// 2155/// \param __a 2156/// A vector of [32 x i8] to be used by the insert operation. 2157/// \param __b 2158/// An i8 integer value. The replacement value for the insert operation. 2159/// \param __imm 2160/// An immediate integer specifying the index of the vector element to be 2161/// replaced. 2162/// \returns A copy of vector \a __a, after replacing its element indexed by 2163/// \a __imm with \a __b. 2164static __inline __m256i __DEFAULT_FN_ATTRS 2165_mm256_insert_epi8(__m256i __a, int __b, int const __imm) 2166{ 2167 __v32qi __c = (__v32qi)__a; 2168 __c[__imm & 31] = __b; 2169 return (__m256i)__c; 2170} 2171 2172#ifdef __x86_64__ 2173/// \brief Takes a [4 x i64] vector and replaces the vector element value 2174/// indexed by the immediate constant operand with a new value. Returns the 2175/// modified vector. 2176/// 2177/// \headerfile <x86intrin.h> 2178/// 2179/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2180/// instruction. 2181/// 2182/// \param __a 2183/// A vector of [4 x i64] to be used by the insert operation. 2184/// \param __b 2185/// A 64-bit integer value. The replacement value for the insert operation. 2186/// \param __imm 2187/// An immediate integer specifying the index of the vector element to be 2188/// replaced. 2189/// \returns A copy of vector \a __a, after replacing its element indexed by 2190/// \a __imm with \a __b. 
2191static __inline __m256i __DEFAULT_FN_ATTRS 2192_mm256_insert_epi64(__m256i __a, long long __b, int const __imm) 2193{ 2194 __v4di __c = (__v4di)__a; 2195 __c[__imm & 3] = __b; 2196 return (__m256i)__c; 2197} 2198#endif 2199 2200/* Conversion */ 2201/// \brief Converts a vector of [4 x i32] into a vector of [4 x double]. 2202/// 2203/// \headerfile <x86intrin.h> 2204/// 2205/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction. 2206/// 2207/// \param __a 2208/// A 128-bit integer vector of [4 x i32]. 2209/// \returns A 256-bit vector of [4 x double] containing the converted values. 2210static __inline __m256d __DEFAULT_FN_ATTRS 2211_mm256_cvtepi32_pd(__m128i __a) 2212{ 2213 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df); 2214} 2215 2216/// \brief Converts a vector of [8 x i32] into a vector of [8 x float]. 2217/// 2218/// \headerfile <x86intrin.h> 2219/// 2220/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction. 2221/// 2222/// \param __a 2223/// A 256-bit integer vector. 2224/// \returns A 256-bit vector of [8 x float] containing the converted values. 2225static __inline __m256 __DEFAULT_FN_ATTRS 2226_mm256_cvtepi32_ps(__m256i __a) 2227{ 2228 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a); 2229} 2230 2231/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of 2232/// [4 x float]. 2233/// 2234/// \headerfile <x86intrin.h> 2235/// 2236/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction. 2237/// 2238/// \param __a 2239/// A 256-bit vector of [4 x double]. 2240/// \returns A 128-bit vector of [4 x float] containing the converted values. 2241static __inline __m128 __DEFAULT_FN_ATTRS 2242_mm256_cvtpd_ps(__m256d __a) 2243{ 2244 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a); 2245} 2246 2247/// \brief Converts a vector of [8 x float] into a vector of [8 x i32]. 
2248/// 2249/// \headerfile <x86intrin.h> 2250/// 2251/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction. 2252/// 2253/// \param __a 2254/// A 256-bit vector of [8 x float]. 2255/// \returns A 256-bit integer vector containing the converted values. 2256static __inline __m256i __DEFAULT_FN_ATTRS 2257_mm256_cvtps_epi32(__m256 __a) 2258{ 2259 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a); 2260} 2261 2262/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 2263/// x double]. 2264/// 2265/// \headerfile <x86intrin.h> 2266/// 2267/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction. 2268/// 2269/// \param __a 2270/// A 128-bit vector of [4 x float]. 2271/// \returns A 256-bit vector of [4 x double] containing the converted values. 2272static __inline __m256d __DEFAULT_FN_ATTRS 2273_mm256_cvtps_pd(__m128 __a) 2274{ 2275 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df); 2276} 2277 2278/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 2279/// x i32], truncating the result by rounding towards zero when it is 2280/// inexact. 2281/// 2282/// \headerfile <x86intrin.h> 2283/// 2284/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction. 2285/// 2286/// \param __a 2287/// A 256-bit vector of [4 x double]. 2288/// \returns A 128-bit integer vector containing the converted values. 2289static __inline __m128i __DEFAULT_FN_ATTRS 2290_mm256_cvttpd_epi32(__m256d __a) 2291{ 2292 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a); 2293} 2294 2295/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 2296/// x i32]. When a conversion is inexact, the value returned is rounded 2297/// according to the rounding control bits in the MXCSR register. 2298/// 2299/// \headerfile <x86intrin.h> 2300/// 2301/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction. 
2302/// 2303/// \param __a 2304/// A 256-bit vector of [4 x double]. 2305/// \returns A 128-bit integer vector containing the converted values. 2306static __inline __m128i __DEFAULT_FN_ATTRS 2307_mm256_cvtpd_epi32(__m256d __a) 2308{ 2309 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a); 2310} 2311 2312/// \brief Converts a vector of [8 x float] into a vector of [8 x i32], 2313/// truncating the result by rounding towards zero when it is inexact. 2314/// 2315/// \headerfile <x86intrin.h> 2316/// 2317/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction. 2318/// 2319/// \param __a 2320/// A 256-bit vector of [8 x float]. 2321/// \returns A 256-bit integer vector containing the converted values. 2322static __inline __m256i __DEFAULT_FN_ATTRS 2323_mm256_cvttps_epi32(__m256 __a) 2324{ 2325 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a); 2326} 2327 2328/// \brief Returns the first element of the input vector of [4 x double]. 2329/// 2330/// \headerfile <avxintrin.h> 2331/// 2332/// This intrinsic is a utility function and does not correspond to a specific 2333/// instruction. 2334/// 2335/// \param __a 2336/// A 256-bit vector of [4 x double]. 2337/// \returns A 64 bit double containing the first element of the input vector. 2338static __inline double __DEFAULT_FN_ATTRS 2339_mm256_cvtsd_f64(__m256d __a) 2340{ 2341 return __a[0]; 2342} 2343 2344/// \brief Returns the first element of the input vector of [8 x i32]. 2345/// 2346/// \headerfile <avxintrin.h> 2347/// 2348/// This intrinsic is a utility function and does not correspond to a specific 2349/// instruction. 2350/// 2351/// \param __a 2352/// A 256-bit vector of [8 x i32]. 2353/// \returns A 32 bit integer containing the first element of the input vector. 2354static __inline int __DEFAULT_FN_ATTRS 2355_mm256_cvtsi256_si32(__m256i __a) 2356{ 2357 __v8si __b = (__v8si)__a; 2358 return __b[0]; 2359} 2360 2361/// \brief Returns the first element of the input vector of [8 x float]. 
2362/// 2363/// \headerfile <avxintrin.h> 2364/// 2365/// This intrinsic is a utility function and does not correspond to a specific 2366/// instruction. 2367/// 2368/// \param __a 2369/// A 256-bit vector of [8 x float]. 2370/// \returns A 32 bit float containing the first element of the input vector. 2371static __inline float __DEFAULT_FN_ATTRS 2372_mm256_cvtss_f32(__m256 __a) 2373{ 2374 return __a[0]; 2375} 2376 2377/* Vector replicate */ 2378/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit 2379/// vector of [8 x float] to float values in a 256-bit vector of 2380/// [8 x float]. 2381/// 2382/// \headerfile <x86intrin.h> 2383/// 2384/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction. 2385/// 2386/// \param __a 2387/// A 256-bit vector of [8 x float]. \n 2388/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of 2389/// the return value. \n 2390/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of 2391/// the return value. \n 2392/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the 2393/// return value. \n 2394/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the 2395/// return value. 2396/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2397/// values. 2398static __inline __m256 __DEFAULT_FN_ATTRS 2399_mm256_movehdup_ps(__m256 __a) 2400{ 2401 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7); 2402} 2403 2404/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit 2405/// vector of [8 x float] to float values in a 256-bit vector of [8 x float]. 2406/// 2407/// \headerfile <x86intrin.h> 2408/// 2409/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction. 2410/// 2411/// \param __a 2412/// A 256-bit vector of [8 x float]. \n 2413/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of 2414/// the return value. 
\n 2415/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of 2416/// the return value. \n 2417/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the 2418/// return value. \n 2419/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the 2420/// return value. 2421/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2422/// values. 2423static __inline __m256 __DEFAULT_FN_ATTRS 2424_mm256_moveldup_ps(__m256 __a) 2425{ 2426 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6); 2427} 2428 2429/// \brief Moves and duplicates double-precision floating point values from a 2430/// 256-bit vector of [4 x double] to double-precision values in a 256-bit 2431/// vector of [4 x double]. 2432/// 2433/// \headerfile <x86intrin.h> 2434/// 2435/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. 2436/// 2437/// \param __a 2438/// A 256-bit vector of [4 x double]. \n 2439/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the 2440/// return value. \n 2441/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of 2442/// the return value. 2443/// \returns A 256-bit vector of [4 x double] containing the moved and 2444/// duplicated values. 2445static __inline __m256d __DEFAULT_FN_ATTRS 2446_mm256_movedup_pd(__m256d __a) 2447{ 2448 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2); 2449} 2450 2451/* Unpack and Interleave */ 2452/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of 2453/// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2454/// 2455/// \headerfile <x86intrin.h> 2456/// 2457/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction. 2458/// 2459/// \param __a 2460/// A 256-bit floating-point vector of [4 x double]. \n 2461/// Bits [127:64] are written to bits [63:0] of the return value. 
\n 2462/// Bits [255:192] are written to bits [191:128] of the return value. \n 2463/// \param __b 2464/// A 256-bit floating-point vector of [4 x double]. \n 2465/// Bits [127:64] are written to bits [127:64] of the return value. \n 2466/// Bits [255:192] are written to bits [255:192] of the return value. \n 2467/// \returns A 256-bit vector of [4 x double] containing the interleaved values. 2468static __inline __m256d __DEFAULT_FN_ATTRS 2469_mm256_unpackhi_pd(__m256d __a, __m256d __b) 2470{ 2471 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2); 2472} 2473 2474/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of 2475/// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2476/// 2477/// \headerfile <x86intrin.h> 2478/// 2479/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction. 2480/// 2481/// \param __a 2482/// A 256-bit floating-point vector of [4 x double]. \n 2483/// Bits [63:0] are written to bits [63:0] of the return value. \n 2484/// Bits [191:128] are written to bits [191:128] of the return value. 2485/// \param __b 2486/// A 256-bit floating-point vector of [4 x double]. \n 2487/// Bits [63:0] are written to bits [127:64] of the return value. \n 2488/// Bits [191:128] are written to bits [255:192] of the return value. \n 2489/// \returns A 256-bit vector of [4 x double] containing the interleaved values. 2490static __inline __m256d __DEFAULT_FN_ATTRS 2491_mm256_unpacklo_pd(__m256d __a, __m256d __b) 2492{ 2493 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2); 2494} 2495 2496/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the 2497/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2498/// vector of [8 x float]. 2499/// 2500/// \headerfile <x86intrin.h> 2501/// 2502/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction. 2503/// 2504/// \param __a 2505/// A 256-bit vector of [8 x float]. 
\n 2506/// Bits [95:64] are written to bits [31:0] of the return value. \n 2507/// Bits [127:96] are written to bits [95:64] of the return value. \n 2508/// Bits [223:192] are written to bits [159:128] of the return value. \n 2509/// Bits [255:224] are written to bits [223:192] of the return value. 2510/// \param __b 2511/// A 256-bit vector of [8 x float]. \n 2512/// Bits [95:64] are written to bits [63:32] of the return value. \n 2513/// Bits [127:96] are written to bits [127:96] of the return value. \n 2514/// Bits [223:192] are written to bits [191:160] of the return value. \n 2515/// Bits [255:224] are written to bits [255:224] of the return value. 2516/// \returns A 256-bit vector of [8 x float] containing the interleaved values. 2517static __inline __m256 __DEFAULT_FN_ATTRS 2518_mm256_unpackhi_ps(__m256 __a, __m256 __b) 2519{ 2520 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); 2521} 2522 2523/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the 2524/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2525/// vector of [8 x float]. 2526/// 2527/// \headerfile <x86intrin.h> 2528/// 2529/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction. 2530/// 2531/// \param __a 2532/// A 256-bit vector of [8 x float]. \n 2533/// Bits [31:0] are written to bits [31:0] of the return value. \n 2534/// Bits [63:32] are written to bits [95:64] of the return value. \n 2535/// Bits [159:128] are written to bits [159:128] of the return value. \n 2536/// Bits [191:160] are written to bits [223:192] of the return value. 2537/// \param __b 2538/// A 256-bit vector of [8 x float]. \n 2539/// Bits [31:0] are written to bits [63:32] of the return value. \n 2540/// Bits [63:32] are written to bits [127:96] of the return value. \n 2541/// Bits [159:128] are written to bits [191:160] of the return value. \n 2542/// Bits [191:160] are written to bits [255:224] of the return value. 
2543/// \returns A 256-bit vector of [8 x float] containing the interleaved values. 2544static __inline __m256 __DEFAULT_FN_ATTRS 2545_mm256_unpacklo_ps(__m256 __a, __m256 __b) 2546{ 2547 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); 2548} 2549 2550/* Bit Test */ 2551/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2552/// element-by-element comparison of the double-precision element in the 2553/// first source vector and the corresponding element in the second source 2554/// vector. 2555/// 2556/// The EFLAGS register is updated as follows: \n 2557/// If there is at least one pair of double-precision elements where the 2558/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2559/// ZF flag is set to 1. \n 2560/// If there is at least one pair of double-precision elements where the 2561/// sign-bit of the first element is 0 and the sign-bit of the second element 2562/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2563/// This intrinsic returns the value of the ZF flag. 2564/// 2565/// \headerfile <x86intrin.h> 2566/// 2567/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2568/// 2569/// \param __a 2570/// A 128-bit vector of [2 x double]. 2571/// \param __b 2572/// A 128-bit vector of [2 x double]. 2573/// \returns the ZF flag in the EFLAGS register. 2574static __inline int __DEFAULT_FN_ATTRS 2575_mm_testz_pd(__m128d __a, __m128d __b) 2576{ 2577 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b); 2578} 2579 2580/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2581/// element-by-element comparison of the double-precision element in the 2582/// first source vector and the corresponding element in the second source 2583/// vector. 
2584/// 2585/// The EFLAGS register is updated as follows: \n 2586/// If there is at least one pair of double-precision elements where the 2587/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2588/// ZF flag is set to 1. \n 2589/// If there is at least one pair of double-precision elements where the 2590/// sign-bit of the first element is 0 and the sign-bit of the second element 2591/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2592/// This intrinsic returns the value of the CF flag. 2593/// 2594/// \headerfile <x86intrin.h> 2595/// 2596/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2597/// 2598/// \param __a 2599/// A 128-bit vector of [2 x double]. 2600/// \param __b 2601/// A 128-bit vector of [2 x double]. 2602/// \returns the CF flag in the EFLAGS register. 2603static __inline int __DEFAULT_FN_ATTRS 2604_mm_testc_pd(__m128d __a, __m128d __b) 2605{ 2606 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b); 2607} 2608 2609/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2610/// element-by-element comparison of the double-precision element in the 2611/// first source vector and the corresponding element in the second source 2612/// vector. 2613/// 2614/// The EFLAGS register is updated as follows: \n 2615/// If there is at least one pair of double-precision elements where the 2616/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2617/// ZF flag is set to 1. \n 2618/// If there is at least one pair of double-precision elements where the 2619/// sign-bit of the first element is 0 and the sign-bit of the second element 2620/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2621/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2622/// otherwise it returns 0. 2623/// 2624/// \headerfile <x86intrin.h> 2625/// 2626/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 
2627/// 2628/// \param __a 2629/// A 128-bit vector of [2 x double]. 2630/// \param __b 2631/// A 128-bit vector of [2 x double]. 2632/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2633static __inline int __DEFAULT_FN_ATTRS 2634_mm_testnzc_pd(__m128d __a, __m128d __b) 2635{ 2636 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b); 2637} 2638 2639/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2640/// element-by-element comparison of the single-precision element in the 2641/// first source vector and the corresponding element in the second source 2642/// vector. 2643/// 2644/// The EFLAGS register is updated as follows: \n 2645/// If there is at least one pair of single-precision elements where the 2646/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2647/// ZF flag is set to 1. \n 2648/// If there is at least one pair of single-precision elements where the 2649/// sign-bit of the first element is 0 and the sign-bit of the second element 2650/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2651/// This intrinsic returns the value of the ZF flag. 2652/// 2653/// \headerfile <x86intrin.h> 2654/// 2655/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2656/// 2657/// \param __a 2658/// A 128-bit vector of [4 x float]. 2659/// \param __b 2660/// A 128-bit vector of [4 x float]. 2661/// \returns the ZF flag. 2662static __inline int __DEFAULT_FN_ATTRS 2663_mm_testz_ps(__m128 __a, __m128 __b) 2664{ 2665 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b); 2666} 2667 2668/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2669/// element-by-element comparison of the single-precision element in the 2670/// first source vector and the corresponding element in the second source 2671/// vector. 
2672/// 2673/// The EFLAGS register is updated as follows: \n 2674/// If there is at least one pair of single-precision elements where the 2675/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2676/// ZF flag is set to 1. \n 2677/// If there is at least one pair of single-precision elements where the 2678/// sign-bit of the first element is 0 and the sign-bit of the second element 2679/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2680/// This intrinsic returns the value of the CF flag. 2681/// 2682/// \headerfile <x86intrin.h> 2683/// 2684/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2685/// 2686/// \param __a 2687/// A 128-bit vector of [4 x float]. 2688/// \param __b 2689/// A 128-bit vector of [4 x float]. 2690/// \returns the CF flag. 2691static __inline int __DEFAULT_FN_ATTRS 2692_mm_testc_ps(__m128 __a, __m128 __b) 2693{ 2694 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b); 2695} 2696 2697/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2698/// element-by-element comparison of the single-precision element in the 2699/// first source vector and the corresponding element in the second source 2700/// vector. 2701/// 2702/// The EFLAGS register is updated as follows: \n 2703/// If there is at least one pair of single-precision elements where the 2704/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2705/// ZF flag is set to 1. \n 2706/// If there is at least one pair of single-precision elements where the 2707/// sign-bit of the first element is 0 and the sign-bit of the second element 2708/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2709/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2710/// otherwise it returns 0. 2711/// 2712/// \headerfile <x86intrin.h> 2713/// 2714/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 
2715/// 2716/// \param __a 2717/// A 128-bit vector of [4 x float]. 2718/// \param __b 2719/// A 128-bit vector of [4 x float]. 2720/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2721static __inline int __DEFAULT_FN_ATTRS 2722_mm_testnzc_ps(__m128 __a, __m128 __b) 2723{ 2724 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b); 2725} 2726 2727/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2728/// element-by-element comparison of the double-precision elements in the 2729/// first source vector and the corresponding elements in the second source 2730/// vector. 2731/// 2732/// The EFLAGS register is updated as follows: \n 2733/// If there is at least one pair of double-precision elements where the 2734/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2735/// ZF flag is set to 1. \n 2736/// If there is at least one pair of double-precision elements where the 2737/// sign-bit of the first element is 0 and the sign-bit of the second element 2738/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2739/// This intrinsic returns the value of the ZF flag. 2740/// 2741/// \headerfile <x86intrin.h> 2742/// 2743/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2744/// 2745/// \param __a 2746/// A 256-bit vector of [4 x double]. 2747/// \param __b 2748/// A 256-bit vector of [4 x double]. 2749/// \returns the ZF flag. 2750static __inline int __DEFAULT_FN_ATTRS 2751_mm256_testz_pd(__m256d __a, __m256d __b) 2752{ 2753 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b); 2754} 2755 2756/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2757/// element-by-element comparison of the double-precision elements in the 2758/// first source vector and the corresponding elements in the second source 2759/// vector. 
2760/// 2761/// The EFLAGS register is updated as follows: \n 2762/// If there is at least one pair of double-precision elements where the 2763/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2764/// ZF flag is set to 1. \n 2765/// If there is at least one pair of double-precision elements where the 2766/// sign-bit of the first element is 0 and the sign-bit of the second element 2767/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2768/// This intrinsic returns the value of the CF flag. 2769/// 2770/// \headerfile <x86intrin.h> 2771/// 2772/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2773/// 2774/// \param __a 2775/// A 256-bit vector of [4 x double]. 2776/// \param __b 2777/// A 256-bit vector of [4 x double]. 2778/// \returns the CF flag. 2779static __inline int __DEFAULT_FN_ATTRS 2780_mm256_testc_pd(__m256d __a, __m256d __b) 2781{ 2782 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b); 2783} 2784 2785/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2786/// element-by-element comparison of the double-precision elements in the 2787/// first source vector and the corresponding elements in the second source 2788/// vector. 2789/// 2790/// The EFLAGS register is updated as follows: \n 2791/// If there is at least one pair of double-precision elements where the 2792/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2793/// ZF flag is set to 1. \n 2794/// If there is at least one pair of double-precision elements where the 2795/// sign-bit of the first element is 0 and the sign-bit of the second element 2796/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2797/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2798/// otherwise it returns 0. 2799/// 2800/// \headerfile <x86intrin.h> 2801/// 2802/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 
2803/// 2804/// \param __a 2805/// A 256-bit vector of [4 x double]. 2806/// \param __b 2807/// A 256-bit vector of [4 x double]. 2808/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2809static __inline int __DEFAULT_FN_ATTRS 2810_mm256_testnzc_pd(__m256d __a, __m256d __b) 2811{ 2812 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b); 2813} 2814 2815/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2816/// element-by-element comparison of the single-precision element in the 2817/// first source vector and the corresponding element in the second source 2818/// vector. 2819/// 2820/// The EFLAGS register is updated as follows: \n 2821/// If there is at least one pair of single-precision elements where the 2822/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2823/// ZF flag is set to 1. \n 2824/// If there is at least one pair of single-precision elements where the 2825/// sign-bit of the first element is 0 and the sign-bit of the second element 2826/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2827/// This intrinsic returns the value of the ZF flag. 2828/// 2829/// \headerfile <x86intrin.h> 2830/// 2831/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2832/// 2833/// \param __a 2834/// A 256-bit vector of [8 x float]. 2835/// \param __b 2836/// A 256-bit vector of [8 x float]. 2837/// \returns the ZF flag. 2838static __inline int __DEFAULT_FN_ATTRS 2839_mm256_testz_ps(__m256 __a, __m256 __b) 2840{ 2841 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b); 2842} 2843 2844/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2845/// element-by-element comparison of the single-precision element in the 2846/// first source vector and the corresponding element in the second source 2847/// vector. 
2848/// 2849/// The EFLAGS register is updated as follows: \n 2850/// If there is at least one pair of single-precision elements where the 2851/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2852/// ZF flag is set to 1. \n 2853/// If there is at least one pair of single-precision elements where the 2854/// sign-bit of the first element is 0 and the sign-bit of the second element 2855/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2856/// This intrinsic returns the value of the CF flag. 2857/// 2858/// \headerfile <x86intrin.h> 2859/// 2860/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2861/// 2862/// \param __a 2863/// A 256-bit vector of [8 x float]. 2864/// \param __b 2865/// A 256-bit vector of [8 x float]. 2866/// \returns the CF flag. 2867static __inline int __DEFAULT_FN_ATTRS 2868_mm256_testc_ps(__m256 __a, __m256 __b) 2869{ 2870 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b); 2871} 2872 2873/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2874/// element-by-element comparison of the single-precision elements in the 2875/// first source vector and the corresponding elements in the second source 2876/// vector. 2877/// 2878/// The EFLAGS register is updated as follows: \n 2879/// If there is at least one pair of single-precision elements where the 2880/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2881/// ZF flag is set to 1. \n 2882/// If there is at least one pair of single-precision elements where the 2883/// sign-bit of the first element is 0 and the sign-bit of the second element 2884/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2885/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2886/// otherwise it returns 0. 2887/// 2888/// \headerfile <x86intrin.h> 2889/// 2890/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 
2891/// 2892/// \param __a 2893/// A 256-bit vector of [8 x float]. 2894/// \param __b 2895/// A 256-bit vector of [8 x float]. 2896/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2897static __inline int __DEFAULT_FN_ATTRS 2898_mm256_testnzc_ps(__m256 __a, __m256 __b) 2899{ 2900 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b); 2901} 2902 2903/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2904/// of the two source vectors. 2905/// 2906/// The EFLAGS register is updated as follows: \n 2907/// If there is at least one pair of bits where both bits are 1, the ZF flag 2908/// is set to 0. Otherwise the ZF flag is set to 1. \n 2909/// If there is at least one pair of bits where the bit from the first source 2910/// vector is 0 and the bit from the second source vector is 1, the CF flag 2911/// is set to 0. Otherwise the CF flag is set to 1. \n 2912/// This intrinsic returns the value of the ZF flag. 2913/// 2914/// \headerfile <x86intrin.h> 2915/// 2916/// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2917/// 2918/// \param __a 2919/// A 256-bit integer vector. 2920/// \param __b 2921/// A 256-bit integer vector. 2922/// \returns the ZF flag. 2923static __inline int __DEFAULT_FN_ATTRS 2924_mm256_testz_si256(__m256i __a, __m256i __b) 2925{ 2926 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b); 2927} 2928 2929/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2930/// of the two source vectors. 2931/// 2932/// The EFLAGS register is updated as follows: \n 2933/// If there is at least one pair of bits where both bits are 1, the ZF flag 2934/// is set to 0. Otherwise the ZF flag is set to 1. \n 2935/// If there is at least one pair of bits where the bit from the first source 2936/// vector is 0 and the bit from the second source vector is 1, the CF flag 2937/// is set to 0. Otherwise the CF flag is set to 1. 
\n 2938/// This intrinsic returns the value of the CF flag. 2939/// 2940/// \headerfile <x86intrin.h> 2941/// 2942/// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2943/// 2944/// \param __a 2945/// A 256-bit integer vector. 2946/// \param __b 2947/// A 256-bit integer vector. 2948/// \returns the CF flag. 2949static __inline int __DEFAULT_FN_ATTRS 2950_mm256_testc_si256(__m256i __a, __m256i __b) 2951{ 2952 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b); 2953} 2954 2955/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2956/// of the two source vectors. 2957/// 2958/// The EFLAGS register is updated as follows: \n 2959/// If there is at least one pair of bits where both bits are 1, the ZF flag 2960/// is set to 0. Otherwise the ZF flag is set to 1. \n 2961/// If there is at least one pair of bits where the bit from the first source 2962/// vector is 0 and the bit from the second source vector is 1, the CF flag 2963/// is set to 0. Otherwise the CF flag is set to 1. \n 2964/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2965/// otherwise it returns 0. 2966/// 2967/// \headerfile <x86intrin.h> 2968/// 2969/// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2970/// 2971/// \param __a 2972/// A 256-bit integer vector. 2973/// \param __b 2974/// A 256-bit integer vector. 2975/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2976static __inline int __DEFAULT_FN_ATTRS 2977_mm256_testnzc_si256(__m256i __a, __m256i __b) 2978{ 2979 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b); 2980} 2981 2982/* Vector extract sign mask */ 2983/// \brief Extracts the sign bits of double-precision floating point elements 2984/// in a 256-bit vector of [4 x double] and writes them to the lower order 2985/// bits of the return value. 2986/// 2987/// \headerfile <x86intrin.h> 2988/// 2989/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction. 
2990/// 2991/// \param __a 2992/// A 256-bit vector of [4 x double] containing the double-precision 2993/// floating point values with sign bits to be extracted. 2994/// \returns The sign bits from the operand, written to bits [3:0]. 2995static __inline int __DEFAULT_FN_ATTRS 2996_mm256_movemask_pd(__m256d __a) 2997{ 2998 return __builtin_ia32_movmskpd256((__v4df)__a); 2999} 3000 3001/// \brief Extracts the sign bits of double-precision floating point elements 3002/// in a 256-bit vector of [8 x float] and writes them to the lower order 3003/// bits of the return value. 3004/// 3005/// \headerfile <x86intrin.h> 3006/// 3007/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction. 3008/// 3009/// \param __a 3010/// A 256-bit vector of [8 x float] containing the double-precision floating 3011/// point values with sign bits to be extracted. 3012/// \returns The sign bits from the operand, written to bits [7:0]. 3013static __inline int __DEFAULT_FN_ATTRS 3014_mm256_movemask_ps(__m256 __a) 3015{ 3016 return __builtin_ia32_movmskps256((__v8sf)__a); 3017} 3018 3019/* Vector __zero */ 3020/// \brief Zeroes the contents of all XMM or YMM registers. 3021/// 3022/// \headerfile <x86intrin.h> 3023/// 3024/// This intrinsic corresponds to the <c> VZEROALL </c> instruction. 3025static __inline void __DEFAULT_FN_ATTRS 3026_mm256_zeroall(void) 3027{ 3028 __builtin_ia32_vzeroall(); 3029} 3030 3031/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers. 3032/// 3033/// \headerfile <x86intrin.h> 3034/// 3035/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction. 3036static __inline void __DEFAULT_FN_ATTRS 3037_mm256_zeroupper(void) 3038{ 3039 __builtin_ia32_vzeroupper(); 3040} 3041 3042/* Vector load with broadcast */ 3043/// \brief Loads a scalar single-precision floating point value from the 3044/// specified address pointed to by \a __a and broadcasts it to the elements 3045/// of a [4 x float] vector. 
3046/// 3047/// \headerfile <x86intrin.h> 3048/// 3049/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 3050/// 3051/// \param __a 3052/// The single-precision floating point value to be broadcast. 3053/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set 3054/// equal to the broadcast value. 3055static __inline __m128 __DEFAULT_FN_ATTRS 3056_mm_broadcast_ss(float const *__a) 3057{ 3058 float __f = *__a; 3059 return (__m128)(__v4sf){ __f, __f, __f, __f }; 3060} 3061 3062/// \brief Loads a scalar double-precision floating point value from the 3063/// specified address pointed to by \a __a and broadcasts it to the elements 3064/// of a [4 x double] vector. 3065/// 3066/// \headerfile <x86intrin.h> 3067/// 3068/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction. 3069/// 3070/// \param __a 3071/// The double-precision floating point value to be broadcast. 3072/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set 3073/// equal to the broadcast value. 3074static __inline __m256d __DEFAULT_FN_ATTRS 3075_mm256_broadcast_sd(double const *__a) 3076{ 3077 double __d = *__a; 3078 return (__m256d)(__v4df){ __d, __d, __d, __d }; 3079} 3080 3081/// \brief Loads a scalar single-precision floating point value from the 3082/// specified address pointed to by \a __a and broadcasts it to the elements 3083/// of a [8 x float] vector. 3084/// 3085/// \headerfile <x86intrin.h> 3086/// 3087/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 3088/// 3089/// \param __a 3090/// The single-precision floating point value to be broadcast. 3091/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set 3092/// equal to the broadcast value. 
3093static __inline __m256 __DEFAULT_FN_ATTRS 3094_mm256_broadcast_ss(float const *__a) 3095{ 3096 float __f = *__a; 3097 return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f }; 3098} 3099 3100/// \brief Loads the data from a 128-bit vector of [2 x double] from the 3101/// specified address pointed to by \a __a and broadcasts it to 128-bit 3102/// elements in a 256-bit vector of [4 x double]. 3103/// 3104/// \headerfile <x86intrin.h> 3105/// 3106/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 3107/// 3108/// \param __a 3109/// The 128-bit vector of [2 x double] to be broadcast. 3110/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set 3111/// equal to the broadcast value. 3112static __inline __m256d __DEFAULT_FN_ATTRS 3113_mm256_broadcast_pd(__m128d const *__a) 3114{ 3115 return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a); 3116} 3117 3118/// \brief Loads the data from a 128-bit vector of [4 x float] from the 3119/// specified address pointed to by \a __a and broadcasts it to 128-bit 3120/// elements in a 256-bit vector of [8 x float]. 3121/// 3122/// \headerfile <x86intrin.h> 3123/// 3124/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 3125/// 3126/// \param __a 3127/// The 128-bit vector of [4 x float] to be broadcast. 3128/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set 3129/// equal to the broadcast value. 3130static __inline __m256 __DEFAULT_FN_ATTRS 3131_mm256_broadcast_ps(__m128 const *__a) 3132{ 3133 return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a); 3134} 3135 3136/* SIMD load ops */ 3137/// \brief Loads 4 double-precision floating point values from a 32-byte aligned 3138/// memory location pointed to by \a __p into a vector of [4 x double]. 3139/// 3140/// \headerfile <x86intrin.h> 3141/// 3142/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 
3143/// 3144/// \param __p 3145/// A 32-byte aligned pointer to a memory location containing 3146/// double-precision floating point values. 3147/// \returns A 256-bit vector of [4 x double] containing the moved values. 3148static __inline __m256d __DEFAULT_FN_ATTRS 3149_mm256_load_pd(double const *__p) 3150{ 3151 return *(__m256d *)__p; 3152} 3153 3154/// \brief Loads 8 single-precision floating point values from a 32-byte aligned 3155/// memory location pointed to by \a __p into a vector of [8 x float]. 3156/// 3157/// \headerfile <x86intrin.h> 3158/// 3159/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 3160/// 3161/// \param __p 3162/// A 32-byte aligned pointer to a memory location containing float values. 3163/// \returns A 256-bit vector of [8 x float] containing the moved values. 3164static __inline __m256 __DEFAULT_FN_ATTRS 3165_mm256_load_ps(float const *__p) 3166{ 3167 return *(__m256 *)__p; 3168} 3169 3170/// \brief Loads 4 double-precision floating point values from an unaligned 3171/// memory location pointed to by \a __p into a vector of [4 x double]. 3172/// 3173/// \headerfile <x86intrin.h> 3174/// 3175/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 3176/// 3177/// \param __p 3178/// A pointer to a memory location containing double-precision floating 3179/// point values. 3180/// \returns A 256-bit vector of [4 x double] containing the moved values. 3181static __inline __m256d __DEFAULT_FN_ATTRS 3182_mm256_loadu_pd(double const *__p) 3183{ 3184 struct __loadu_pd { 3185 __m256d __v; 3186 } __attribute__((__packed__, __may_alias__)); 3187 return ((struct __loadu_pd*)__p)->__v; 3188} 3189 3190/// \brief Loads 8 single-precision floating point values from an unaligned 3191/// memory location pointed to by \a __p into a vector of [8 x float]. 3192/// 3193/// \headerfile <x86intrin.h> 3194/// 3195/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 
3196/// 3197/// \param __p 3198/// A pointer to a memory location containing single-precision floating 3199/// point values. 3200/// \returns A 256-bit vector of [8 x float] containing the moved values. 3201static __inline __m256 __DEFAULT_FN_ATTRS 3202_mm256_loadu_ps(float const *__p) 3203{ 3204 struct __loadu_ps { 3205 __m256 __v; 3206 } __attribute__((__packed__, __may_alias__)); 3207 return ((struct __loadu_ps*)__p)->__v; 3208} 3209 3210/// \brief Loads 256 bits of integer data from a 32-byte aligned memory 3211/// location pointed to by \a __p into elements of a 256-bit integer vector. 3212/// 3213/// \headerfile <x86intrin.h> 3214/// 3215/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3216/// 3217/// \param __p 3218/// A 32-byte aligned pointer to a 256-bit integer vector containing integer 3219/// values. 3220/// \returns A 256-bit integer vector containing the moved values. 3221static __inline __m256i __DEFAULT_FN_ATTRS 3222_mm256_load_si256(__m256i const *__p) 3223{ 3224 return *__p; 3225} 3226 3227/// \brief Loads 256 bits of integer data from an unaligned memory location 3228/// pointed to by \a __p into a 256-bit integer vector. 3229/// 3230/// \headerfile <x86intrin.h> 3231/// 3232/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 3233/// 3234/// \param __p 3235/// A pointer to a 256-bit integer vector containing integer values. 3236/// \returns A 256-bit integer vector containing the moved values. 3237static __inline __m256i __DEFAULT_FN_ATTRS 3238_mm256_loadu_si256(__m256i const *__p) 3239{ 3240 struct __loadu_si256 { 3241 __m256i __v; 3242 } __attribute__((__packed__, __may_alias__)); 3243 return ((struct __loadu_si256*)__p)->__v; 3244} 3245 3246/// \brief Loads 256 bits of integer data from an unaligned memory location 3247/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may 3248/// perform better than \c _mm256_loadu_si256 when the data crosses a cache 3249/// line boundary. 
3250/// 3251/// \headerfile <x86intrin.h> 3252/// 3253/// This intrinsic corresponds to the <c> VLDDQU </c> instruction. 3254/// 3255/// \param __p 3256/// A pointer to a 256-bit integer vector containing integer values. 3257/// \returns A 256-bit integer vector containing the moved values. 3258static __inline __m256i __DEFAULT_FN_ATTRS 3259_mm256_lddqu_si256(__m256i const *__p) 3260{ 3261 return (__m256i)__builtin_ia32_lddqu256((char const *)__p); 3262} 3263 3264/* SIMD store ops */ 3265/// \brief Stores double-precision floating point values from a 256-bit vector 3266/// of [4 x double] to a 32-byte aligned memory location pointed to by 3267/// \a __p. 3268/// 3269/// \headerfile <x86intrin.h> 3270/// 3271/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 3272/// 3273/// \param __p 3274/// A 32-byte aligned pointer to a memory location that will receive the 3275/// double-precision floaing point values. 3276/// \param __a 3277/// A 256-bit vector of [4 x double] containing the values to be moved. 3278static __inline void __DEFAULT_FN_ATTRS 3279_mm256_store_pd(double *__p, __m256d __a) 3280{ 3281 *(__m256d *)__p = __a; 3282} 3283 3284/// \brief Stores single-precision floating point values from a 256-bit vector 3285/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p. 3286/// 3287/// \headerfile <x86intrin.h> 3288/// 3289/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 3290/// 3291/// \param __p 3292/// A 32-byte aligned pointer to a memory location that will receive the 3293/// float values. 3294/// \param __a 3295/// A 256-bit vector of [8 x float] containing the values to be moved. 3296static __inline void __DEFAULT_FN_ATTRS 3297_mm256_store_ps(float *__p, __m256 __a) 3298{ 3299 *(__m256 *)__p = __a; 3300} 3301 3302/// \brief Stores double-precision floating point values from a 256-bit vector 3303/// of [4 x double] to an unaligned memory location pointed to by \a __p. 
3304/// 3305/// \headerfile <x86intrin.h> 3306/// 3307/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 3308/// 3309/// \param __p 3310/// A pointer to a memory location that will receive the double-precision 3311/// floating point values. 3312/// \param __a 3313/// A 256-bit vector of [4 x double] containing the values to be moved. 3314static __inline void __DEFAULT_FN_ATTRS 3315_mm256_storeu_pd(double *__p, __m256d __a) 3316{ 3317 struct __storeu_pd { 3318 __m256d __v; 3319 } __attribute__((__packed__, __may_alias__)); 3320 ((struct __storeu_pd*)__p)->__v = __a; 3321} 3322 3323/// \brief Stores single-precision floating point values from a 256-bit vector 3324/// of [8 x float] to an unaligned memory location pointed to by \a __p. 3325/// 3326/// \headerfile <x86intrin.h> 3327/// 3328/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 3329/// 3330/// \param __p 3331/// A pointer to a memory location that will receive the float values. 3332/// \param __a 3333/// A 256-bit vector of [8 x float] containing the values to be moved. 3334static __inline void __DEFAULT_FN_ATTRS 3335_mm256_storeu_ps(float *__p, __m256 __a) 3336{ 3337 struct __storeu_ps { 3338 __m256 __v; 3339 } __attribute__((__packed__, __may_alias__)); 3340 ((struct __storeu_ps*)__p)->__v = __a; 3341} 3342 3343/// \brief Stores integer values from a 256-bit integer vector to a 32-byte 3344/// aligned memory location pointed to by \a __p. 3345/// 3346/// \headerfile <x86intrin.h> 3347/// 3348/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3349/// 3350/// \param __p 3351/// A 32-byte aligned pointer to a memory location that will receive the 3352/// integer values. 3353/// \param __a 3354/// A 256-bit integer vector containing the values to be moved. 
3355static __inline void __DEFAULT_FN_ATTRS 3356_mm256_store_si256(__m256i *__p, __m256i __a) 3357{ 3358 *__p = __a; 3359} 3360 3361/// \brief Stores integer values from a 256-bit integer vector to an unaligned 3362/// memory location pointed to by \a __p. 3363/// 3364/// \headerfile <x86intrin.h> 3365/// 3366/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 3367/// 3368/// \param __p 3369/// A pointer to a memory location that will receive the integer values. 3370/// \param __a 3371/// A 256-bit integer vector containing the values to be moved. 3372static __inline void __DEFAULT_FN_ATTRS 3373_mm256_storeu_si256(__m256i *__p, __m256i __a) 3374{ 3375 struct __storeu_si256 { 3376 __m256i __v; 3377 } __attribute__((__packed__, __may_alias__)); 3378 ((struct __storeu_si256*)__p)->__v = __a; 3379} 3380 3381/* Conditional load ops */ 3382/// \brief Conditionally loads double-precision floating point elements from a 3383/// memory location pointed to by \a __p into a 128-bit vector of 3384/// [2 x double], depending on the mask bits associated with each data 3385/// element. 3386/// 3387/// \headerfile <x86intrin.h> 3388/// 3389/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3390/// 3391/// \param __p 3392/// A pointer to a memory location that contains the double-precision 3393/// floating point values. 3394/// \param __m 3395/// A 128-bit integer vector containing the mask. The most significant bit of 3396/// each data element represents the mask bits. If a mask bit is zero, the 3397/// corresponding value in the memory location is not loaded and the 3398/// corresponding field in the return value is set to zero. 3399/// \returns A 128-bit vector of [2 x double] containing the loaded values. 
3400static __inline __m128d __DEFAULT_FN_ATTRS 3401_mm_maskload_pd(double const *__p, __m128i __m) 3402{ 3403 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m); 3404} 3405 3406/// \brief Conditionally loads double-precision floating point elements from a 3407/// memory location pointed to by \a __p into a 256-bit vector of 3408/// [4 x double], depending on the mask bits associated with each data 3409/// element. 3410/// 3411/// \headerfile <x86intrin.h> 3412/// 3413/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3414/// 3415/// \param __p 3416/// A pointer to a memory location that contains the double-precision 3417/// floating point values. 3418/// \param __m 3419/// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3420/// significant bit of each quadword element represents the mask bits. If a 3421/// mask bit is zero, the corresponding value in the memory location is not 3422/// loaded and the corresponding field in the return value is set to zero. 3423/// \returns A 256-bit vector of [4 x double] containing the loaded values. 3424static __inline __m256d __DEFAULT_FN_ATTRS 3425_mm256_maskload_pd(double const *__p, __m256i __m) 3426{ 3427 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p, 3428 (__v4di)__m); 3429} 3430 3431/// \brief Conditionally loads single-precision floating point elements from a 3432/// memory location pointed to by \a __p into a 128-bit vector of 3433/// [4 x float], depending on the mask bits associated with each data 3434/// element. 3435/// 3436/// \headerfile <x86intrin.h> 3437/// 3438/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3439/// 3440/// \param __p 3441/// A pointer to a memory location that contains the single-precision 3442/// floating point values. 3443/// \param __m 3444/// A 128-bit integer vector containing the mask. The most significant bit of 3445/// each data element represents the mask bits. 
If a mask bit is zero, the 3446/// corresponding value in the memory location is not loaded and the 3447/// corresponding field in the return value is set to zero. 3448/// \returns A 128-bit vector of [4 x float] containing the loaded values. 3449static __inline __m128 __DEFAULT_FN_ATTRS 3450_mm_maskload_ps(float const *__p, __m128i __m) 3451{ 3452 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m); 3453} 3454 3455/// \brief Conditionally loads single-precision floating point elements from a 3456/// memory location pointed to by \a __p into a 256-bit vector of 3457/// [8 x float], depending on the mask bits associated with each data 3458/// element. 3459/// 3460/// \headerfile <x86intrin.h> 3461/// 3462/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3463/// 3464/// \param __p 3465/// A pointer to a memory location that contains the single-precision 3466/// floating point values. 3467/// \param __m 3468/// A 256-bit integer vector of [8 x dword] containing the mask. The most 3469/// significant bit of each dword element represents the mask bits. If a mask 3470/// bit is zero, the corresponding value in the memory location is not loaded 3471/// and the corresponding field in the return value is set to zero. 3472/// \returns A 256-bit vector of [8 x float] containing the loaded values. 3473static __inline __m256 __DEFAULT_FN_ATTRS 3474_mm256_maskload_ps(float const *__p, __m256i __m) 3475{ 3476 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m); 3477} 3478 3479/* Conditional store ops */ 3480/// \brief Moves single-precision floating point values from a 256-bit vector 3481/// of [8 x float] to a memory location pointed to by \a __p, according to 3482/// the specified mask. 3483/// 3484/// \headerfile <x86intrin.h> 3485/// 3486/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3487/// 3488/// \param __p 3489/// A pointer to a memory location that will receive the float values. 
3490/// \param __m 3491/// A 256-bit integer vector of [8 x dword] containing the mask. The most 3492/// significant bit of each dword element in the mask vector represents the 3493/// mask bits. If a mask bit is zero, the corresponding value from vector 3494/// \a __a is not stored and the corresponding field in the memory location 3495/// pointed to by \a __p is not changed. 3496/// \param __a 3497/// A 256-bit vector of [8 x float] containing the values to be stored. 3498static __inline void __DEFAULT_FN_ATTRS 3499_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a) 3500{ 3501 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a); 3502} 3503 3504/// \brief Moves double-precision values from a 128-bit vector of [2 x double] 3505/// to a memory location pointed to by \a __p, according to the specified 3506/// mask. 3507/// 3508/// \headerfile <x86intrin.h> 3509/// 3510/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3511/// 3512/// \param __p 3513/// A pointer to a memory location that will receive the float values. 3514/// \param __m 3515/// A 128-bit integer vector containing the mask. The most significant bit of 3516/// each field in the mask vector represents the mask bits. If a mask bit is 3517/// zero, the corresponding value from vector \a __a is not stored and the 3518/// corresponding field in the memory location pointed to by \a __p is not 3519/// changed. 3520/// \param __a 3521/// A 128-bit vector of [2 x double] containing the values to be stored. 3522static __inline void __DEFAULT_FN_ATTRS 3523_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a) 3524{ 3525 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a); 3526} 3527 3528/// \brief Moves double-precision values from a 256-bit vector of [4 x double] 3529/// to a memory location pointed to by \a __p, according to the specified 3530/// mask. 
3531/// 3532/// \headerfile <x86intrin.h> 3533/// 3534/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3535/// 3536/// \param __p 3537/// A pointer to a memory location that will receive the float values. 3538/// \param __m 3539/// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3540/// significant bit of each quadword element in the mask vector represents 3541/// the mask bits. If a mask bit is zero, the corresponding value from vector 3542/// __a is not stored and the corresponding field in the memory location 3543/// pointed to by \a __p is not changed. 3544/// \param __a 3545/// A 256-bit vector of [4 x double] containing the values to be stored. 3546static __inline void __DEFAULT_FN_ATTRS 3547_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a) 3548{ 3549 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a); 3550} 3551 3552/// \brief Moves single-precision floating point values from a 128-bit vector 3553/// of [4 x float] to a memory location pointed to by \a __p, according to 3554/// the specified mask. 3555/// 3556/// \headerfile <x86intrin.h> 3557/// 3558/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3559/// 3560/// \param __p 3561/// A pointer to a memory location that will receive the float values. 3562/// \param __m 3563/// A 128-bit integer vector containing the mask. The most significant bit of 3564/// each field in the mask vector represents the mask bits. If a mask bit is 3565/// zero, the corresponding value from vector __a is not stored and the 3566/// corresponding field in the memory location pointed to by \a __p is not 3567/// changed. 3568/// \param __a 3569/// A 128-bit vector of [4 x float] containing the values to be stored. 
3570static __inline void __DEFAULT_FN_ATTRS 3571_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) 3572{ 3573 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a); 3574} 3575 3576/* Cacheability support ops */ 3577/// \brief Moves integer data from a 256-bit integer vector to a 32-byte 3578/// aligned memory location. To minimize caching, the data is flagged as 3579/// non-temporal (unlikely to be used again soon). 3580/// 3581/// \headerfile <x86intrin.h> 3582/// 3583/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction. 3584/// 3585/// \param __a 3586/// A pointer to a 32-byte aligned memory location that will receive the 3587/// integer values. 3588/// \param __b 3589/// A 256-bit integer vector containing the values to be moved. 3590static __inline void __DEFAULT_FN_ATTRS 3591_mm256_stream_si256(__m256i *__a, __m256i __b) 3592{ 3593 __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a); 3594} 3595 3596/// \brief Moves double-precision values from a 256-bit vector of [4 x double] 3597/// to a 32-byte aligned memory location. To minimize caching, the data is 3598/// flagged as non-temporal (unlikely to be used again soon). 3599/// 3600/// \headerfile <x86intrin.h> 3601/// 3602/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction. 3603/// 3604/// \param __a 3605/// A pointer to a 32-byte aligned memory location that will receive the 3606/// double-precision floating-point values. 3607/// \param __b 3608/// A 256-bit vector of [4 x double] containing the values to be moved. 3609static __inline void __DEFAULT_FN_ATTRS 3610_mm256_stream_pd(double *__a, __m256d __b) 3611{ 3612 __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a); 3613} 3614 3615/// \brief Moves single-precision floating point values from a 256-bit vector 3616/// of [8 x float] to a 32-byte aligned memory location. To minimize 3617/// caching, the data is flagged as non-temporal (unlikely to be used again 3618/// soon). 
3619/// 3620/// \headerfile <x86intrin.h> 3621/// 3622/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction. 3623/// 3624/// \param __p 3625/// A pointer to a 32-byte aligned memory location that will receive the 3626/// single-precision floating point values. 3627/// \param __a 3628/// A 256-bit vector of [8 x float] containing the values to be moved. 3629static __inline void __DEFAULT_FN_ATTRS 3630_mm256_stream_ps(float *__p, __m256 __a) 3631{ 3632 __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p); 3633} 3634 3635/* Create vectors */ 3636/// \brief Create a 256-bit vector of [4 x double] with undefined values. 3637/// 3638/// \headerfile <x86intrin.h> 3639/// 3640/// This intrinsic has no corresponding instruction. 3641/// 3642/// \returns A 256-bit vector of [4 x double] containing undefined values. 3643static __inline__ __m256d __DEFAULT_FN_ATTRS 3644_mm256_undefined_pd(void) 3645{ 3646 return (__m256d)__builtin_ia32_undef256(); 3647} 3648 3649/// \brief Create a 256-bit vector of [8 x float] with undefined values. 3650/// 3651/// \headerfile <x86intrin.h> 3652/// 3653/// This intrinsic has no corresponding instruction. 3654/// 3655/// \returns A 256-bit vector of [8 x float] containing undefined values. 3656static __inline__ __m256 __DEFAULT_FN_ATTRS 3657_mm256_undefined_ps(void) 3658{ 3659 return (__m256)__builtin_ia32_undef256(); 3660} 3661 3662/// \brief Create a 256-bit integer vector with undefined values. 3663/// 3664/// \headerfile <x86intrin.h> 3665/// 3666/// This intrinsic has no corresponding instruction. 3667/// 3668/// \returns A 256-bit integer vector containing undefined values. 3669static __inline__ __m256i __DEFAULT_FN_ATTRS 3670_mm256_undefined_si256(void) 3671{ 3672 return (__m256i)__builtin_ia32_undef256(); 3673} 3674 3675/// \brief Constructs a 256-bit floating-point vector of [4 x double] 3676/// initialized with the specified double-precision floating-point values. 
3677/// 3678/// \headerfile <x86intrin.h> 3679/// 3680/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3681/// instruction. 3682/// 3683/// \param __a 3684/// A double-precision floating-point value used to initialize bits [255:192] 3685/// of the result. 3686/// \param __b 3687/// A double-precision floating-point value used to initialize bits [191:128] 3688/// of the result. 3689/// \param __c 3690/// A double-precision floating-point value used to initialize bits [127:64] 3691/// of the result. 3692/// \param __d 3693/// A double-precision floating-point value used to initialize bits [63:0] 3694/// of the result. 3695/// \returns An initialized 256-bit floating-point vector of [4 x double]. 3696static __inline __m256d __DEFAULT_FN_ATTRS 3697_mm256_set_pd(double __a, double __b, double __c, double __d) 3698{ 3699 return (__m256d){ __d, __c, __b, __a }; 3700} 3701 3702/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized 3703/// with the specified single-precision floating-point values. 3704/// 3705/// \headerfile <x86intrin.h> 3706/// 3707/// This intrinsic is a utility function and does not correspond to a specific 3708/// instruction. 3709/// 3710/// \param __a 3711/// A single-precision floating-point value used to initialize bits [255:224] 3712/// of the result. 3713/// \param __b 3714/// A single-precision floating-point value used to initialize bits [223:192] 3715/// of the result. 3716/// \param __c 3717/// A single-precision floating-point value used to initialize bits [191:160] 3718/// of the result. 3719/// \param __d 3720/// A single-precision floating-point value used to initialize bits [159:128] 3721/// of the result. 3722/// \param __e 3723/// A single-precision floating-point value used to initialize bits [127:96] 3724/// of the result. 3725/// \param __f 3726/// A single-precision floating-point value used to initialize bits [95:64] 3727/// of the result. 
3728/// \param __g 3729/// A single-precision floating-point value used to initialize bits [63:32] 3730/// of the result. 3731/// \param __h 3732/// A single-precision floating-point value used to initialize bits [31:0] 3733/// of the result. 3734/// \returns An initialized 256-bit floating-point vector of [8 x float]. 3735static __inline __m256 __DEFAULT_FN_ATTRS 3736_mm256_set_ps(float __a, float __b, float __c, float __d, 3737 float __e, float __f, float __g, float __h) 3738{ 3739 return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; 3740} 3741 3742/// \brief Constructs a 256-bit integer vector initialized with the specified 3743/// 32-bit integral values. 3744/// 3745/// \headerfile <x86intrin.h> 3746/// 3747/// This intrinsic is a utility function and does not correspond to a specific 3748/// instruction. 3749/// 3750/// \param __i0 3751/// A 32-bit integral value used to initialize bits [255:224] of the result. 3752/// \param __i1 3753/// A 32-bit integral value used to initialize bits [223:192] of the result. 3754/// \param __i2 3755/// A 32-bit integral value used to initialize bits [191:160] of the result. 3756/// \param __i3 3757/// A 32-bit integral value used to initialize bits [159:128] of the result. 3758/// \param __i4 3759/// A 32-bit integral value used to initialize bits [127:96] of the result. 3760/// \param __i5 3761/// A 32-bit integral value used to initialize bits [95:64] of the result. 3762/// \param __i6 3763/// A 32-bit integral value used to initialize bits [63:32] of the result. 3764/// \param __i7 3765/// A 32-bit integral value used to initialize bits [31:0] of the result. 3766/// \returns An initialized 256-bit integer vector. 
3767static __inline __m256i __DEFAULT_FN_ATTRS 3768_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, 3769 int __i4, int __i5, int __i6, int __i7) 3770{ 3771 return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; 3772} 3773 3774/// \brief Constructs a 256-bit integer vector initialized with the specified 3775/// 16-bit integral values. 3776/// 3777/// \headerfile <x86intrin.h> 3778/// 3779/// This intrinsic is a utility function and does not correspond to a specific 3780/// instruction. 3781/// 3782/// \param __w15 3783/// A 16-bit integral value used to initialize bits [255:240] of the result. 3784/// \param __w14 3785/// A 16-bit integral value used to initialize bits [239:224] of the result. 3786/// \param __w13 3787/// A 16-bit integral value used to initialize bits [223:208] of the result. 3788/// \param __w12 3789/// A 16-bit integral value used to initialize bits [207:192] of the result. 3790/// \param __w11 3791/// A 16-bit integral value used to initialize bits [191:176] of the result. 3792/// \param __w10 3793/// A 16-bit integral value used to initialize bits [175:160] of the result. 3794/// \param __w09 3795/// A 16-bit integral value used to initialize bits [159:144] of the result. 3796/// \param __w08 3797/// A 16-bit integral value used to initialize bits [143:128] of the result. 3798/// \param __w07 3799/// A 16-bit integral value used to initialize bits [127:112] of the result. 3800/// \param __w06 3801/// A 16-bit integral value used to initialize bits [111:96] of the result. 3802/// \param __w05 3803/// A 16-bit integral value used to initialize bits [95:80] of the result. 3804/// \param __w04 3805/// A 16-bit integral value used to initialize bits [79:64] of the result. 3806/// \param __w03 3807/// A 16-bit integral value used to initialize bits [63:48] of the result. 3808/// \param __w02 3809/// A 16-bit integral value used to initialize bits [47:32] of the result. 
3810/// \param __w01 3811/// A 16-bit integral value used to initialize bits [31:16] of the result. 3812/// \param __w00 3813/// A 16-bit integral value used to initialize bits [15:0] of the result. 3814/// \returns An initialized 256-bit integer vector. 3815static __inline __m256i __DEFAULT_FN_ATTRS 3816_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, 3817 short __w11, short __w10, short __w09, short __w08, 3818 short __w07, short __w06, short __w05, short __w04, 3819 short __w03, short __w02, short __w01, short __w00) 3820{ 3821 return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06, 3822 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 }; 3823} 3824 3825/// \brief Constructs a 256-bit integer vector initialized with the specified 3826/// 8-bit integral values. 3827/// 3828/// \headerfile <x86intrin.h> 3829/// 3830/// This intrinsic is a utility function and does not correspond to a specific 3831/// instruction. 3832/// 3833/// \param __b31 3834/// An 8-bit integral value used to initialize bits [255:248] of the result. 3835/// \param __b30 3836/// An 8-bit integral value used to initialize bits [247:240] of the result. 3837/// \param __b29 3838/// An 8-bit integral value used to initialize bits [239:232] of the result. 3839/// \param __b28 3840/// An 8-bit integral value used to initialize bits [231:224] of the result. 3841/// \param __b27 3842/// An 8-bit integral value used to initialize bits [223:216] of the result. 3843/// \param __b26 3844/// An 8-bit integral value used to initialize bits [215:208] of the result. 3845/// \param __b25 3846/// An 8-bit integral value used to initialize bits [207:200] of the result. 3847/// \param __b24 3848/// An 8-bit integral value used to initialize bits [199:192] of the result. 3849/// \param __b23 3850/// An 8-bit integral value used to initialize bits [191:184] of the result. 
3851/// \param __b22 3852/// An 8-bit integral value used to initialize bits [183:176] of the result. 3853/// \param __b21 3854/// An 8-bit integral value used to initialize bits [175:168] of the result. 3855/// \param __b20 3856/// An 8-bit integral value used to initialize bits [167:160] of the result. 3857/// \param __b19 3858/// An 8-bit integral value used to initialize bits [159:152] of the result. 3859/// \param __b18 3860/// An 8-bit integral value used to initialize bits [151:144] of the result. 3861/// \param __b17 3862/// An 8-bit integral value used to initialize bits [143:136] of the result. 3863/// \param __b16 3864/// An 8-bit integral value used to initialize bits [135:128] of the result. 3865/// \param __b15 3866/// An 8-bit integral value used to initialize bits [127:120] of the result. 3867/// \param __b14 3868/// An 8-bit integral value used to initialize bits [119:112] of the result. 3869/// \param __b13 3870/// An 8-bit integral value used to initialize bits [111:104] of the result. 3871/// \param __b12 3872/// An 8-bit integral value used to initialize bits [103:96] of the result. 3873/// \param __b11 3874/// An 8-bit integral value used to initialize bits [95:88] of the result. 3875/// \param __b10 3876/// An 8-bit integral value used to initialize bits [87:80] of the result. 3877/// \param __b09 3878/// An 8-bit integral value used to initialize bits [79:72] of the result. 3879/// \param __b08 3880/// An 8-bit integral value used to initialize bits [71:64] of the result. 3881/// \param __b07 3882/// An 8-bit integral value used to initialize bits [63:56] of the result. 3883/// \param __b06 3884/// An 8-bit integral value used to initialize bits [55:48] of the result. 3885/// \param __b05 3886/// An 8-bit integral value used to initialize bits [47:40] of the result. 3887/// \param __b04 3888/// An 8-bit integral value used to initialize bits [39:32] of the result. 
3889/// \param __b03 3890/// An 8-bit integral value used to initialize bits [31:24] of the result. 3891/// \param __b02 3892/// An 8-bit integral value used to initialize bits [23:16] of the result. 3893/// \param __b01 3894/// An 8-bit integral value used to initialize bits [15:8] of the result. 3895/// \param __b00 3896/// An 8-bit integral value used to initialize bits [7:0] of the result. 3897/// \returns An initialized 256-bit integer vector. 3898static __inline __m256i __DEFAULT_FN_ATTRS 3899_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, 3900 char __b27, char __b26, char __b25, char __b24, 3901 char __b23, char __b22, char __b21, char __b20, 3902 char __b19, char __b18, char __b17, char __b16, 3903 char __b15, char __b14, char __b13, char __b12, 3904 char __b11, char __b10, char __b09, char __b08, 3905 char __b07, char __b06, char __b05, char __b04, 3906 char __b03, char __b02, char __b01, char __b00) 3907{ 3908 return (__m256i)(__v32qi){ 3909 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, 3910 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, 3911 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, 3912 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31 3913 }; 3914} 3915 3916/// \brief Constructs a 256-bit integer vector initialized with the specified 3917/// 64-bit integral values. 3918/// 3919/// \headerfile <x86intrin.h> 3920/// 3921/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> 3922/// instruction. 3923/// 3924/// \param __a 3925/// A 64-bit integral value used to initialize bits [255:192] of the result. 3926/// \param __b 3927/// A 64-bit integral value used to initialize bits [191:128] of the result. 3928/// \param __c 3929/// A 64-bit integral value used to initialize bits [127:64] of the result. 3930/// \param __d 3931/// A 64-bit integral value used to initialize bits [63:0] of the result. 3932/// \returns An initialized 256-bit integer vector. 
3933static __inline __m256i __DEFAULT_FN_ATTRS 3934_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) 3935{ 3936 return (__m256i)(__v4di){ __d, __c, __b, __a }; 3937} 3938 3939/* Create vectors with elements in reverse order */ 3940/// \brief Constructs a 256-bit floating-point vector of [4 x double], 3941/// initialized in reverse order with the specified double-precision 3942/// floating-point values. 3943/// 3944/// \headerfile <x86intrin.h> 3945/// 3946/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3947/// instruction. 3948/// 3949/// \param __a 3950/// A double-precision floating-point value used to initialize bits [63:0] 3951/// of the result. 3952/// \param __b 3953/// A double-precision floating-point value used to initialize bits [127:64] 3954/// of the result. 3955/// \param __c 3956/// A double-precision floating-point value used to initialize bits [191:128] 3957/// of the result. 3958/// \param __d 3959/// A double-precision floating-point value used to initialize bits [255:192] 3960/// of the result. 3961/// \returns An initialized 256-bit floating-point vector of [4 x double]. 3962static __inline __m256d __DEFAULT_FN_ATTRS 3963_mm256_setr_pd(double __a, double __b, double __c, double __d) 3964{ 3965 return (__m256d){ __a, __b, __c, __d }; 3966} 3967 3968/// \brief Constructs a 256-bit floating-point vector of [8 x float], 3969/// initialized in reverse order with the specified single-precision 3970/// float-point values. 3971/// 3972/// \headerfile <x86intrin.h> 3973/// 3974/// This intrinsic is a utility function and does not correspond to a specific 3975/// instruction. 3976/// 3977/// \param __a 3978/// A single-precision floating-point value used to initialize bits [31:0] 3979/// of the result. 3980/// \param __b 3981/// A single-precision floating-point value used to initialize bits [63:32] 3982/// of the result. 
3983/// \param __c 3984/// A single-precision floating-point value used to initialize bits [95:64] 3985/// of the result. 3986/// \param __d 3987/// A single-precision floating-point value used to initialize bits [127:96] 3988/// of the result. 3989/// \param __e 3990/// A single-precision floating-point value used to initialize bits [159:128] 3991/// of the result. 3992/// \param __f 3993/// A single-precision floating-point value used to initialize bits [191:160] 3994/// of the result. 3995/// \param __g 3996/// A single-precision floating-point value used to initialize bits [223:192] 3997/// of the result. 3998/// \param __h 3999/// A single-precision floating-point value used to initialize bits [255:224] 4000/// of the result. 4001/// \returns An initialized 256-bit floating-point vector of [8 x float]. 4002static __inline __m256 __DEFAULT_FN_ATTRS 4003_mm256_setr_ps(float __a, float __b, float __c, float __d, 4004 float __e, float __f, float __g, float __h) 4005{ 4006 return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h }; 4007} 4008 4009/// \brief Constructs a 256-bit integer vector, initialized in reverse order 4010/// with the specified 32-bit integral values. 4011/// 4012/// \headerfile <x86intrin.h> 4013/// 4014/// This intrinsic is a utility function and does not correspond to a specific 4015/// instruction. 4016/// 4017/// \param __i0 4018/// A 32-bit integral value used to initialize bits [31:0] of the result. 4019/// \param __i1 4020/// A 32-bit integral value used to initialize bits [63:32] of the result. 4021/// \param __i2 4022/// A 32-bit integral value used to initialize bits [95:64] of the result. 4023/// \param __i3 4024/// A 32-bit integral value used to initialize bits [127:96] of the result. 4025/// \param __i4 4026/// A 32-bit integral value used to initialize bits [159:128] of the result. 4027/// \param __i5 4028/// A 32-bit integral value used to initialize bits [191:160] of the result. 
4029/// \param __i6 4030/// A 32-bit integral value used to initialize bits [223:192] of the result. 4031/// \param __i7 4032/// A 32-bit integral value used to initialize bits [255:224] of the result. 4033/// \returns An initialized 256-bit integer vector. 4034static __inline __m256i __DEFAULT_FN_ATTRS 4035_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, 4036 int __i4, int __i5, int __i6, int __i7) 4037{ 4038 return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 }; 4039} 4040 4041/// \brief Constructs a 256-bit integer vector, initialized in reverse order 4042/// with the specified 16-bit integral values. 4043/// 4044/// \headerfile <x86intrin.h> 4045/// 4046/// This intrinsic is a utility function and does not correspond to a specific 4047/// instruction. 4048/// 4049/// \param __w15 4050/// A 16-bit integral value used to initialize bits [15:0] of the result. 4051/// \param __w14 4052/// A 16-bit integral value used to initialize bits [31:16] of the result. 4053/// \param __w13 4054/// A 16-bit integral value used to initialize bits [47:32] of the result. 4055/// \param __w12 4056/// A 16-bit integral value used to initialize bits [63:48] of the result. 4057/// \param __w11 4058/// A 16-bit integral value used to initialize bits [79:64] of the result. 4059/// \param __w10 4060/// A 16-bit integral value used to initialize bits [95:80] of the result. 4061/// \param __w09 4062/// A 16-bit integral value used to initialize bits [111:96] of the result. 4063/// \param __w08 4064/// A 16-bit integral value used to initialize bits [127:112] of the result. 4065/// \param __w07 4066/// A 16-bit integral value used to initialize bits [143:128] of the result. 4067/// \param __w06 4068/// A 16-bit integral value used to initialize bits [159:144] of the result. 4069/// \param __w05 4070/// A 16-bit integral value used to initialize bits [175:160] of the result. 
4071/// \param __w04 4072/// A 16-bit integral value used to initialize bits [191:176] of the result. 4073/// \param __w03 4074/// A 16-bit integral value used to initialize bits [207:192] of the result. 4075/// \param __w02 4076/// A 16-bit integral value used to initialize bits [223:208] of the result. 4077/// \param __w01 4078/// A 16-bit integral value used to initialize bits [239:224] of the result. 4079/// \param __w00 4080/// A 16-bit integral value used to initialize bits [255:240] of the result. 4081/// \returns An initialized 256-bit integer vector. 4082static __inline __m256i __DEFAULT_FN_ATTRS 4083_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, 4084 short __w11, short __w10, short __w09, short __w08, 4085 short __w07, short __w06, short __w05, short __w04, 4086 short __w03, short __w02, short __w01, short __w00) 4087{ 4088 return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09, 4089 __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 }; 4090} 4091 4092/// \brief Constructs a 256-bit integer vector, initialized in reverse order 4093/// with the specified 8-bit integral values. 4094/// 4095/// \headerfile <x86intrin.h> 4096/// 4097/// This intrinsic is a utility function and does not correspond to a specific 4098/// instruction. 4099/// 4100/// \param __b31 4101/// An 8-bit integral value used to initialize bits [7:0] of the result. 4102/// \param __b30 4103/// An 8-bit integral value used to initialize bits [15:8] of the result. 4104/// \param __b29 4105/// An 8-bit integral value used to initialize bits [23:16] of the result. 4106/// \param __b28 4107/// An 8-bit integral value used to initialize bits [31:24] of the result. 4108/// \param __b27 4109/// An 8-bit integral value used to initialize bits [39:32] of the result. 4110/// \param __b26 4111/// An 8-bit integral value used to initialize bits [47:40] of the result. 
4112/// \param __b25 4113/// An 8-bit integral value used to initialize bits [55:48] of the result. 4114/// \param __b24 4115/// An 8-bit integral value used to initialize bits [63:56] of the result. 4116/// \param __b23 4117/// An 8-bit integral value used to initialize bits [71:64] of the result. 4118/// \param __b22 4119/// An 8-bit integral value used to initialize bits [79:72] of the result. 4120/// \param __b21 4121/// An 8-bit integral value used to initialize bits [87:80] of the result. 4122/// \param __b20 4123/// An 8-bit integral value used to initialize bits [95:88] of the result. 4124/// \param __b19 4125/// An 8-bit integral value used to initialize bits [103:96] of the result. 4126/// \param __b18 4127/// An 8-bit integral value used to initialize bits [111:104] of the result. 4128/// \param __b17 4129/// An 8-bit integral value used to initialize bits [119:112] of the result. 4130/// \param __b16 4131/// An 8-bit integral value used to initialize bits [127:120] of the result. 4132/// \param __b15 4133/// An 8-bit integral value used to initialize bits [135:128] of the result. 4134/// \param __b14 4135/// An 8-bit integral value used to initialize bits [143:136] of the result. 4136/// \param __b13 4137/// An 8-bit integral value used to initialize bits [151:144] of the result. 4138/// \param __b12 4139/// An 8-bit integral value used to initialize bits [159:152] of the result. 4140/// \param __b11 4141/// An 8-bit integral value used to initialize bits [167:160] of the result. 4142/// \param __b10 4143/// An 8-bit integral value used to initialize bits [175:168] of the result. 4144/// \param __b09 4145/// An 8-bit integral value used to initialize bits [183:176] of the result. 4146/// \param __b08 4147/// An 8-bit integral value used to initialize bits [191:184] of the result. 4148/// \param __b07 4149/// An 8-bit integral value used to initialize bits [199:192] of the result. 
4150/// \param __b06 4151/// An 8-bit integral value used to initialize bits [207:200] of the result. 4152/// \param __b05 4153/// An 8-bit integral value used to initialize bits [215:208] of the result. 4154/// \param __b04 4155/// An 8-bit integral value used to initialize bits [223:216] of the result. 4156/// \param __b03 4157/// An 8-bit integral value used to initialize bits [231:224] of the result. 4158/// \param __b02 4159/// An 8-bit integral value used to initialize bits [239:232] of the result. 4160/// \param __b01 4161/// An 8-bit integral value used to initialize bits [247:240] of the result. 4162/// \param __b00 4163/// An 8-bit integral value used to initialize bits [255:248] of the result. 4164/// \returns An initialized 256-bit integer vector. 4165static __inline __m256i __DEFAULT_FN_ATTRS 4166_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, 4167 char __b27, char __b26, char __b25, char __b24, 4168 char __b23, char __b22, char __b21, char __b20, 4169 char __b19, char __b18, char __b17, char __b16, 4170 char __b15, char __b14, char __b13, char __b12, 4171 char __b11, char __b10, char __b09, char __b08, 4172 char __b07, char __b06, char __b05, char __b04, 4173 char __b03, char __b02, char __b01, char __b00) 4174{ 4175 return (__m256i)(__v32qi){ 4176 __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24, 4177 __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16, 4178 __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08, 4179 __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 }; 4180} 4181 4182/// \brief Constructs a 256-bit integer vector, initialized in reverse order 4183/// with the specified 64-bit integral values. 4184/// 4185/// \headerfile <x86intrin.h> 4186/// 4187/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> 4188/// instruction. 4189/// 4190/// \param __a 4191/// A 64-bit integral value used to initialize bits [63:0] of the result. 
4192/// \param __b 4193/// A 64-bit integral value used to initialize bits [127:64] of the result. 4194/// \param __c 4195/// A 64-bit integral value used to initialize bits [191:128] of the result. 4196/// \param __d 4197/// A 64-bit integral value used to initialize bits [255:192] of the result. 4198/// \returns An initialized 256-bit integer vector. 4199static __inline __m256i __DEFAULT_FN_ATTRS 4200_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) 4201{ 4202 return (__m256i)(__v4di){ __a, __b, __c, __d }; 4203} 4204 4205/* Create vectors with repeated elements */ 4206/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each 4207/// of the four double-precision floating-point vector elements set to the 4208/// specified double-precision floating-point value. 4209/// 4210/// \headerfile <x86intrin.h> 4211/// 4212/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 4213/// 4214/// \param __w 4215/// A double-precision floating-point value used to initialize each vector 4216/// element of the result. 4217/// \returns An initialized 256-bit floating-point vector of [4 x double]. 4218static __inline __m256d __DEFAULT_FN_ATTRS 4219_mm256_set1_pd(double __w) 4220{ 4221 return (__m256d){ __w, __w, __w, __w }; 4222} 4223 4224/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each 4225/// of the eight single-precision floating-point vector elements set to the 4226/// specified single-precision floating-point value. 4227/// 4228/// \headerfile <x86intrin.h> 4229/// 4230/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4231/// instruction. 4232/// 4233/// \param __w 4234/// A single-precision floating-point value used to initialize each vector 4235/// element of the result. 4236/// \returns An initialized 256-bit floating-point vector of [8 x float]. 
4237static __inline __m256 __DEFAULT_FN_ATTRS 4238_mm256_set1_ps(float __w) 4239{ 4240 return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w }; 4241} 4242 4243/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the 4244/// 32-bit integral vector elements set to the specified 32-bit integral 4245/// value. 4246/// 4247/// \headerfile <x86intrin.h> 4248/// 4249/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4250/// instruction. 4251/// 4252/// \param __i 4253/// A 32-bit integral value used to initialize each vector element of the 4254/// result. 4255/// \returns An initialized 256-bit integer vector of [8 x i32]. 4256static __inline __m256i __DEFAULT_FN_ATTRS 4257_mm256_set1_epi32(int __i) 4258{ 4259 return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i }; 4260} 4261 4262/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the 4263/// 16-bit integral vector elements set to the specified 16-bit integral 4264/// value. 4265/// 4266/// \headerfile <x86intrin.h> 4267/// 4268/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4269/// 4270/// \param __w 4271/// A 16-bit integral value used to initialize each vector element of the 4272/// result. 4273/// \returns An initialized 256-bit integer vector of [16 x i16]. 4274static __inline __m256i __DEFAULT_FN_ATTRS 4275_mm256_set1_epi16(short __w) 4276{ 4277 return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w, 4278 __w, __w, __w, __w, __w, __w }; 4279} 4280 4281/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the 4282/// 8-bit integral vector elements set to the specified 8-bit integral value. 4283/// 4284/// \headerfile <x86intrin.h> 4285/// 4286/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4287/// 4288/// \param __b 4289/// An 8-bit integral value used to initialize each vector element of the 4290/// result. 
4291/// \returns An initialized 256-bit integer vector of [32 x i8]. 4292static __inline __m256i __DEFAULT_FN_ATTRS 4293_mm256_set1_epi8(char __b) 4294{ 4295 return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 4296 __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 4297 __b, __b, __b, __b, __b, __b, __b }; 4298} 4299 4300/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the 4301/// 64-bit integral vector elements set to the specified 64-bit integral 4302/// value. 4303/// 4304/// \headerfile <x86intrin.h> 4305/// 4306/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 4307/// 4308/// \param __q 4309/// A 64-bit integral value used to initialize each vector element of the 4310/// result. 4311/// \returns An initialized 256-bit integer vector of [4 x i64]. 4312static __inline __m256i __DEFAULT_FN_ATTRS 4313_mm256_set1_epi64x(long long __q) 4314{ 4315 return (__m256i)(__v4di){ __q, __q, __q, __q }; 4316} 4317 4318/* Create __zeroed vectors */ 4319/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all 4320/// vector elements initialized to zero. 4321/// 4322/// \headerfile <x86intrin.h> 4323/// 4324/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4325/// 4326/// \returns A 256-bit vector of [4 x double] with all elements set to zero. 4327static __inline __m256d __DEFAULT_FN_ATTRS 4328_mm256_setzero_pd(void) 4329{ 4330 return (__m256d){ 0, 0, 0, 0 }; 4331} 4332 4333/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all 4334/// vector elements initialized to zero. 4335/// 4336/// \headerfile <x86intrin.h> 4337/// 4338/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4339/// 4340/// \returns A 256-bit vector of [8 x float] with all elements set to zero. 
4341static __inline __m256 __DEFAULT_FN_ATTRS 4342_mm256_setzero_ps(void) 4343{ 4344 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; 4345} 4346 4347/// \brief Constructs a 256-bit integer vector initialized to zero. 4348/// 4349/// \headerfile <x86intrin.h> 4350/// 4351/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4352/// 4353/// \returns A 256-bit integer vector initialized to zero. 4354static __inline __m256i __DEFAULT_FN_ATTRS 4355_mm256_setzero_si256(void) 4356{ 4357 return (__m256i){ 0LL, 0LL, 0LL, 0LL }; 4358} 4359 4360/* Cast between vector types */ 4361/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit 4362/// floating-point vector of [8 x float]. 4363/// 4364/// \headerfile <x86intrin.h> 4365/// 4366/// This intrinsic has no corresponding instruction. 4367/// 4368/// \param __a 4369/// A 256-bit floating-point vector of [4 x double]. 4370/// \returns A 256-bit floating-point vector of [8 x float] containing the same 4371/// bitwise pattern as the parameter. 4372static __inline __m256 __DEFAULT_FN_ATTRS 4373_mm256_castpd_ps(__m256d __a) 4374{ 4375 return (__m256)__a; 4376} 4377 4378/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit 4379/// integer vector. 4380/// 4381/// \headerfile <x86intrin.h> 4382/// 4383/// This intrinsic has no corresponding instruction. 4384/// 4385/// \param __a 4386/// A 256-bit floating-point vector of [4 x double]. 4387/// \returns A 256-bit integer vector containing the same bitwise pattern as the 4388/// parameter. 4389static __inline __m256i __DEFAULT_FN_ATTRS 4390_mm256_castpd_si256(__m256d __a) 4391{ 4392 return (__m256i)__a; 4393} 4394 4395/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit 4396/// floating-point vector of [4 x double]. 4397/// 4398/// \headerfile <x86intrin.h> 4399/// 4400/// This intrinsic has no corresponding instruction. 4401/// 4402/// \param __a 4403/// A 256-bit floating-point vector of [8 x float]. 
4404/// \returns A 256-bit floating-point vector of [4 x double] containing the same 4405/// bitwise pattern as the parameter. 4406static __inline __m256d __DEFAULT_FN_ATTRS 4407_mm256_castps_pd(__m256 __a) 4408{ 4409 return (__m256d)__a; 4410} 4411 4412/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit 4413/// integer vector. 4414/// 4415/// \headerfile <x86intrin.h> 4416/// 4417/// This intrinsic has no corresponding instruction. 4418/// 4419/// \param __a 4420/// A 256-bit floating-point vector of [8 x float]. 4421/// \returns A 256-bit integer vector containing the same bitwise pattern as the 4422/// parameter. 4423static __inline __m256i __DEFAULT_FN_ATTRS 4424_mm256_castps_si256(__m256 __a) 4425{ 4426 return (__m256i)__a; 4427} 4428 4429/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector 4430/// of [8 x float]. 4431/// 4432/// \headerfile <x86intrin.h> 4433/// 4434/// This intrinsic has no corresponding instruction. 4435/// 4436/// \param __a 4437/// A 256-bit integer vector. 4438/// \returns A 256-bit floating-point vector of [8 x float] containing the same 4439/// bitwise pattern as the parameter. 4440static __inline __m256 __DEFAULT_FN_ATTRS 4441_mm256_castsi256_ps(__m256i __a) 4442{ 4443 return (__m256)__a; 4444} 4445 4446/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector 4447/// of [4 x double]. 4448/// 4449/// \headerfile <x86intrin.h> 4450/// 4451/// This intrinsic has no corresponding instruction. 4452/// 4453/// \param __a 4454/// A 256-bit integer vector. 4455/// \returns A 256-bit floating-point vector of [4 x double] containing the same 4456/// bitwise pattern as the parameter. 4457static __inline __m256d __DEFAULT_FN_ATTRS 4458_mm256_castsi256_pd(__m256i __a) 4459{ 4460 return (__m256d)__a; 4461} 4462 4463/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of 4464/// [4 x double] as a 128-bit floating-point vector of [2 x double]. 
4465/// 4466/// \headerfile <x86intrin.h> 4467/// 4468/// This intrinsic has no corresponding instruction. 4469/// 4470/// \param __a 4471/// A 256-bit floating-point vector of [4 x double]. 4472/// \returns A 128-bit floating-point vector of [2 x double] containing the 4473/// lower 128 bits of the parameter. 4474static __inline __m128d __DEFAULT_FN_ATTRS 4475_mm256_castpd256_pd128(__m256d __a) 4476{ 4477 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1); 4478} 4479 4480/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of 4481/// [8 x float] as a 128-bit floating-point vector of [4 x float]. 4482/// 4483/// \headerfile <x86intrin.h> 4484/// 4485/// This intrinsic has no corresponding instruction. 4486/// 4487/// \param __a 4488/// A 256-bit floating-point vector of [8 x float]. 4489/// \returns A 128-bit floating-point vector of [4 x float] containing the 4490/// lower 128 bits of the parameter. 4491static __inline __m128 __DEFAULT_FN_ATTRS 4492_mm256_castps256_ps128(__m256 __a) 4493{ 4494 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3); 4495} 4496 4497/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector. 4498/// 4499/// \headerfile <x86intrin.h> 4500/// 4501/// This intrinsic has no corresponding instruction. 4502/// 4503/// \param __a 4504/// A 256-bit integer vector. 4505/// \returns A 128-bit integer vector containing the lower 128 bits of the 4506/// parameter. 4507static __inline __m128i __DEFAULT_FN_ATTRS 4508_mm256_castsi256_si128(__m256i __a) 4509{ 4510 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1); 4511} 4512 4513/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a 4514/// 128-bit floating-point vector of [2 x double]. 4515/// 4516/// The lower 128 bits contain the value of the source vector. The contents 4517/// of the upper 128 bits are undefined. 
4518/// 4519/// \headerfile <x86intrin.h> 4520/// 4521/// This intrinsic has no corresponding instruction. 4522/// 4523/// \param __a 4524/// A 128-bit vector of [2 x double]. 4525/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits 4526/// contain the value of the parameter. The contents of the upper 128 bits 4527/// are undefined. 4528static __inline __m256d __DEFAULT_FN_ATTRS 4529_mm256_castpd128_pd256(__m128d __a) 4530{ 4531 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1); 4532} 4533 4534/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a 4535/// 128-bit floating-point vector of [4 x float]. 4536/// 4537/// The lower 128 bits contain the value of the source vector. The contents 4538/// of the upper 128 bits are undefined. 4539/// 4540/// \headerfile <x86intrin.h> 4541/// 4542/// This intrinsic has no corresponding instruction. 4543/// 4544/// \param __a 4545/// A 128-bit vector of [4 x float]. 4546/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits 4547/// contain the value of the parameter. The contents of the upper 128 bits 4548/// are undefined. 4549static __inline __m256 __DEFAULT_FN_ATTRS 4550_mm256_castps128_ps256(__m128 __a) 4551{ 4552 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1); 4553} 4554 4555/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector. 4556/// 4557/// The lower 128 bits contain the value of the source vector. The contents 4558/// of the upper 128 bits are undefined. 4559/// 4560/// \headerfile <x86intrin.h> 4561/// 4562/// This intrinsic has no corresponding instruction. 4563/// 4564/// \param __a 4565/// A 128-bit integer vector. 4566/// \returns A 256-bit integer vector. The lower 128 bits contain the value of 4567/// the parameter. The contents of the upper 128 bits are undefined. 
4568static __inline __m256i __DEFAULT_FN_ATTRS 4569_mm256_castsi128_si256(__m128i __a) 4570{ 4571 return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1); 4572} 4573 4574/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a 4575/// 128-bit floating-point vector of [2 x double]. The lower 128 bits 4576/// contain the value of the source vector. The upper 128 bits are set 4577/// to zero. 4578/// 4579/// \headerfile <x86intrin.h> 4580/// 4581/// This intrinsic has no corresponding instruction. 4582/// 4583/// \param __a 4584/// A 128-bit vector of [2 x double]. 4585/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits 4586/// contain the value of the parameter. The upper 128 bits are set to zero. 4587static __inline __m256d __DEFAULT_FN_ATTRS 4588_mm256_zextpd128_pd256(__m128d __a) 4589{ 4590 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3); 4591} 4592 4593/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a 4594/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain 4595/// the value of the source vector. The upper 128 bits are set to zero. 4596/// 4597/// \headerfile <x86intrin.h> 4598/// 4599/// This intrinsic has no corresponding instruction. 4600/// 4601/// \param __a 4602/// A 128-bit vector of [4 x float]. 4603/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits 4604/// contain the value of the parameter. The upper 128 bits are set to zero. 4605static __inline __m256 __DEFAULT_FN_ATTRS 4606_mm256_zextps128_ps256(__m128 __a) 4607{ 4608 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7); 4609} 4610 4611/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector. 4612/// The lower 128 bits contain the value of the source vector. The upper 4613/// 128 bits are set to zero. 
4614/// 4615/// \headerfile <x86intrin.h> 4616/// 4617/// This intrinsic has no corresponding instruction. 4618/// 4619/// \param __a 4620/// A 128-bit integer vector. 4621/// \returns A 256-bit integer vector. The lower 128 bits contain the value of 4622/// the parameter. The upper 128 bits are set to zero. 4623static __inline __m256i __DEFAULT_FN_ATTRS 4624_mm256_zextsi128_si256(__m128i __a) 4625{ 4626 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3); 4627} 4628 4629/* 4630 Vector insert. 4631 We use macros rather than inlines because we only want to accept 4632 invocations where the immediate M is a constant expression. 4633*/ 4634/// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating 4635/// a 256-bit vector of [8 x float] given in the first parameter, and then 4636/// replacing either the upper or the lower 128 bits with the contents of a 4637/// 128-bit vector of [4 x float] in the second parameter. 4638/// 4639/// The immediate integer parameter determines between the upper or the lower 4640/// 128 bits. 4641/// 4642/// \headerfile <x86intrin.h> 4643/// 4644/// \code 4645/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M); 4646/// \endcode 4647/// 4648/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4649/// 4650/// \param V1 4651/// A 256-bit vector of [8 x float]. This vector is copied to the result 4652/// first, and then either the upper or the lower 128 bits of the result will 4653/// be replaced by the contents of \a V2. 4654/// \param V2 4655/// A 128-bit vector of [4 x float]. The contents of this parameter are 4656/// written to either the upper or the lower 128 bits of the result depending 4657/// on the value of parameter \a M. 4658/// \param M 4659/// An immediate integer. 
///    The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
  /* Indices 0-7 select elements of V1; 8-11 select the widened V2. */ \
  (__m256)__builtin_shufflevector( \
    (__v8sf)(__m256)(V1), \
    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
    ((((M) & 1) == 0) ?  8 :  0), \
    ((((M) & 1) == 0) ?  9 :  1), \
    ((((M) & 1) == 0) ? 10 :  2), \
    ((((M) & 1) == 0) ? 11 :  3), \
    ((((M) & 1) == 0) ?  4 :  8), \
    ((((M) & 1) == 0) ?  5 :  9), \
    ((((M) & 1) == 0) ?  6 : 10), \
    ((((M) & 1) == 0) ?  7 : 11) );})

/// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating
///    a 256-bit vector of [4 x double] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [2 x double] in the second parameter.
///
///    The immediate integer parameter determines between the upper or the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [2 x double].
///    The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
  /* Indices 0-3 select elements of V1; 4-5 select the widened V2. */ \
  (__m256d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V1), \
    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
    ((((M) & 1) == 0) ? 4 : 0), \
    ((((M) & 1) == 0) ? 5 : 1), \
    ((((M) & 1) == 0) ? 2 : 4), \
    ((((M) & 1) == 0) ? 3 : 5) );})

/// \brief Constructs a new 256-bit integer vector by first duplicating a
///    256-bit integer vector given in the first parameter, and then replacing
///    either the upper or the lower 128 bits with the contents of a 128-bit
///    integer vector in the second parameter.
///
///    The immediate integer parameter determines between the upper or the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. This vector is copied to the result first, and
///    then either the upper or the lower 128 bits of the result will be
///    replaced by the contents of \a V2.
/// \param V2
///    A 128-bit integer vector.
///    The contents of this parameter are written to
///    either the upper or the lower 128 bits of the result depending on the
///    value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
  /* Indices 0-3 select elements of V1; 4-5 select the widened V2. */ \
  (__m256i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V1), \
    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
    ((((M) & 1) == 0) ? 4 : 0), \
    ((((M) & 1) == 0) ? 5 : 1), \
    ((((M) & 1) == 0) ? 2 : 4), \
    ((((M) & 1) == 0) ? 3 : 5) );})

/*
   Vector extract.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [8 x float], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result.
///    \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
/* All four selected indices are in the range 0-7, i.e. they refer to V; the
   second shuffle operand is only a placeholder and is never selected. */
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
  (__m128)__builtin_shufflevector( \
    (__v8sf)(__m256)(V), \
    (__v8sf)(_mm256_undefined_ps()), \
    (((M) & 1) ? 4 : 0), \
    (((M) & 1) ? 5 : 1), \
    (((M) & 1) ? 6 : 2), \
    (((M) & 1) ? 7 : 3) );})

/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [4 x double], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
/* Both selected indices are in the range 0-3, i.e. they refer to V; the
   second shuffle operand is only a placeholder and is never selected. */
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
  (__m128d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V), \
    (__v4df)(_mm256_undefined_pd()), \
    (((M) & 1) ? 2 : 0), \
    (((M) & 1) ? 3 : 1) );})

/// \brief Extracts either the upper or the lower 128 bits from a 256-bit
///    integer vector, as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
/* Both selected indices are in the range 0-3, i.e. they refer to the 64-bit
   elements of V; the second shuffle operand is only a placeholder and is
   never selected. */
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V), \
    (__v4di)(_mm256_undefined_si256()), \
    (((M) & 1) ? 2 : 0), \
    (((M) & 1) ? 3 : 1) );})

/* SIMD load ops (unaligned) */
/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
///    unaligned memory locations and constructs a 256-bit floating-point vector
///    of [8 x float] by concatenating the two 128-bit vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to load instructions followed by the
///   <c> VINSERTF128 </c> instruction.
///
/// \param __addr_hi
///    A pointer to a 128-bit memory location containing 4 consecutive
///    single-precision floating-point values. These values are to be copied to
///    bits [255:128] of the result. The address of the memory location does not
///    have to be aligned.
/// \param __addr_lo
///    A pointer to a 128-bit memory location containing 4 consecutive
///    single-precision floating-point values. These values are to be copied to
///    bits [127:0] of the result. The address of the memory location does not
///    have to be aligned.
4878/// \returns A 256-bit floating-point vector of [8 x float] containing the 4879/// concatenated result. 4880static __inline __m256 __DEFAULT_FN_ATTRS 4881_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) 4882{ 4883 __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo)); 4884 return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1); 4885} 4886 4887/// \brief Loads two 128-bit floating-point vectors of [2 x double] from 4888/// unaligned memory locations and constructs a 256-bit floating-point vector 4889/// of [4 x double] by concatenating the two 128-bit vectors. 4890/// 4891/// \headerfile <x86intrin.h> 4892/// 4893/// This intrinsic corresponds to load instructions followed by the 4894/// <c> VINSERTF128 </c> instruction. 4895/// 4896/// \param __addr_hi 4897/// A pointer to a 128-bit memory location containing two consecutive 4898/// double-precision floating-point values. These values are to be copied to 4899/// bits[255:128] of the result. The address of the memory location does not 4900/// have to be aligned. 4901/// \param __addr_lo 4902/// A pointer to a 128-bit memory location containing two consecutive 4903/// double-precision floating-point values. These values are to be copied to 4904/// bits[127:0] of the result. The address of the memory location does not 4905/// have to be aligned. 4906/// \returns A 256-bit floating-point vector of [4 x double] containing the 4907/// concatenated result. 4908static __inline __m256d __DEFAULT_FN_ATTRS 4909_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) 4910{ 4911 __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo)); 4912 return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1); 4913} 4914 4915/// \brief Loads two 128-bit integer vectors from unaligned memory locations and 4916/// constructs a 256-bit integer vector by concatenating the two 128-bit 4917/// vectors. 
4918/// 4919/// \headerfile <x86intrin.h> 4920/// 4921/// This intrinsic corresponds to load instructions followed by the 4922/// <c> VINSERTF128 </c> instruction. 4923/// 4924/// \param __addr_hi 4925/// A pointer to a 128-bit memory location containing a 128-bit integer 4926/// vector. This vector is to be copied to bits[255:128] of the result. The 4927/// address of the memory location does not have to be aligned. 4928/// \param __addr_lo 4929/// A pointer to a 128-bit memory location containing a 128-bit integer 4930/// vector. This vector is to be copied to bits[127:0] of the result. The 4931/// address of the memory location does not have to be aligned. 4932/// \returns A 256-bit integer vector containing the concatenated result. 4933static __inline __m256i __DEFAULT_FN_ATTRS 4934_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo) 4935{ 4936 __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo)); 4937 return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1); 4938} 4939 4940/* SIMD store ops (unaligned) */ 4941/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point 4942/// vector of [8 x float] into two different unaligned memory locations. 4943/// 4944/// \headerfile <x86intrin.h> 4945/// 4946/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4947/// store instructions. 4948/// 4949/// \param __addr_hi 4950/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 4951/// copied to this memory location. The address of this memory location does 4952/// not have to be aligned. 4953/// \param __addr_lo 4954/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4955/// copied to this memory location. The address of this memory location does 4956/// not have to be aligned. 4957/// \param __a 4958/// A 256-bit floating-point vector of [8 x float]. 
4959static __inline void __DEFAULT_FN_ATTRS 4960_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a) 4961{ 4962 __m128 __v128; 4963 4964 __v128 = _mm256_castps256_ps128(__a); 4965 _mm_storeu_ps(__addr_lo, __v128); 4966 __v128 = _mm256_extractf128_ps(__a, 1); 4967 _mm_storeu_ps(__addr_hi, __v128); 4968} 4969 4970/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point 4971/// vector of [4 x double] into two different unaligned memory locations. 4972/// 4973/// \headerfile <x86intrin.h> 4974/// 4975/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4976/// store instructions. 4977/// 4978/// \param __addr_hi 4979/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 4980/// copied to this memory location. The address of this memory location does 4981/// not have to be aligned. 4982/// \param __addr_lo 4983/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4984/// copied to this memory location. The address of this memory location does 4985/// not have to be aligned. 4986/// \param __a 4987/// A 256-bit floating-point vector of [4 x double]. 4988static __inline void __DEFAULT_FN_ATTRS 4989_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a) 4990{ 4991 __m128d __v128; 4992 4993 __v128 = _mm256_castpd256_pd128(__a); 4994 _mm_storeu_pd(__addr_lo, __v128); 4995 __v128 = _mm256_extractf128_pd(__a, 1); 4996 _mm_storeu_pd(__addr_hi, __v128); 4997} 4998 4999/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into 5000/// two different unaligned memory locations. 5001/// 5002/// \headerfile <x86intrin.h> 5003/// 5004/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 5005/// store instructions. 5006/// 5007/// \param __addr_hi 5008/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 5009/// copied to this memory location. 
The address of this memory location does 5010/// not have to be aligned. 5011/// \param __addr_lo 5012/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 5013/// copied to this memory location. The address of this memory location does 5014/// not have to be aligned. 5015/// \param __a 5016/// A 256-bit integer vector. 5017static __inline void __DEFAULT_FN_ATTRS 5018_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a) 5019{ 5020 __m128i __v128; 5021 5022 __v128 = _mm256_castsi256_si128(__a); 5023 _mm_storeu_si128(__addr_lo, __v128); 5024 __v128 = _mm256_extractf128_si256(__a, 1); 5025 _mm_storeu_si128(__addr_hi, __v128); 5026} 5027 5028/// \brief Constructs a 256-bit floating-point vector of [8 x float] by 5029/// concatenating two 128-bit floating-point vectors of [4 x float]. 5030/// 5031/// \headerfile <x86intrin.h> 5032/// 5033/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5034/// 5035/// \param __hi 5036/// A 128-bit floating-point vector of [4 x float] to be copied to the upper 5037/// 128 bits of the result. 5038/// \param __lo 5039/// A 128-bit floating-point vector of [4 x float] to be copied to the lower 5040/// 128 bits of the result. 5041/// \returns A 256-bit floating-point vector of [8 x float] containing the 5042/// concatenated result. 5043static __inline __m256 __DEFAULT_FN_ATTRS 5044_mm256_set_m128 (__m128 __hi, __m128 __lo) 5045{ 5046 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7); 5047} 5048 5049/// \brief Constructs a 256-bit floating-point vector of [4 x double] by 5050/// concatenating two 128-bit floating-point vectors of [2 x double]. 5051/// 5052/// \headerfile <x86intrin.h> 5053/// 5054/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5055/// 5056/// \param __hi 5057/// A 128-bit floating-point vector of [2 x double] to be copied to the upper 5058/// 128 bits of the result. 
5059/// \param __lo 5060/// A 128-bit floating-point vector of [2 x double] to be copied to the lower 5061/// 128 bits of the result. 5062/// \returns A 256-bit floating-point vector of [4 x double] containing the 5063/// concatenated result. 5064static __inline __m256d __DEFAULT_FN_ATTRS 5065_mm256_set_m128d (__m128d __hi, __m128d __lo) 5066{ 5067 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); 5068} 5069 5070/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit 5071/// integer vectors. 5072/// 5073/// \headerfile <x86intrin.h> 5074/// 5075/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5076/// 5077/// \param __hi 5078/// A 128-bit integer vector to be copied to the upper 128 bits of the 5079/// result. 5080/// \param __lo 5081/// A 128-bit integer vector to be copied to the lower 128 bits of the 5082/// result. 5083/// \returns A 256-bit integer vector containing the concatenated result. 5084static __inline __m256i __DEFAULT_FN_ATTRS 5085_mm256_set_m128i (__m128i __hi, __m128i __lo) 5086{ 5087 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); 5088} 5089 5090/// \brief Constructs a 256-bit floating-point vector of [8 x float] by 5091/// concatenating two 128-bit floating-point vectors of [4 x float]. This is 5092/// similar to _mm256_set_m128, but the order of the input parameters is 5093/// swapped. 5094/// 5095/// \headerfile <x86intrin.h> 5096/// 5097/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5098/// 5099/// \param __lo 5100/// A 128-bit floating-point vector of [4 x float] to be copied to the lower 5101/// 128 bits of the result. 5102/// \param __hi 5103/// A 128-bit floating-point vector of [4 x float] to be copied to the upper 5104/// 128 bits of the result. 5105/// \returns A 256-bit floating-point vector of [8 x float] containing the 5106/// concatenated result. 
5107static __inline __m256 __DEFAULT_FN_ATTRS 5108_mm256_setr_m128 (__m128 __lo, __m128 __hi) 5109{ 5110 return _mm256_set_m128(__hi, __lo); 5111} 5112 5113/// \brief Constructs a 256-bit floating-point vector of [4 x double] by 5114/// concatenating two 128-bit floating-point vectors of [2 x double]. This is 5115/// similar to _mm256_set_m128d, but the order of the input parameters is 5116/// swapped. 5117/// 5118/// \headerfile <x86intrin.h> 5119/// 5120/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5121/// 5122/// \param __lo 5123/// A 128-bit floating-point vector of [2 x double] to be copied to the lower 5124/// 128 bits of the result. 5125/// \param __hi 5126/// A 128-bit floating-point vector of [2 x double] to be copied to the upper 5127/// 128 bits of the result. 5128/// \returns A 256-bit floating-point vector of [4 x double] containing the 5129/// concatenated result. 5130static __inline __m256d __DEFAULT_FN_ATTRS 5131_mm256_setr_m128d (__m128d __lo, __m128d __hi) 5132{ 5133 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); 5134} 5135 5136/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit 5137/// integer vectors. This is similar to _mm256_set_m128i, but the order of 5138/// the input parameters is swapped. 5139/// 5140/// \headerfile <x86intrin.h> 5141/// 5142/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5143/// 5144/// \param __lo 5145/// A 128-bit integer vector to be copied to the lower 128 bits of the 5146/// result. 5147/// \param __hi 5148/// A 128-bit integer vector to be copied to the upper 128 bits of the 5149/// result. 5150/// \returns A 256-bit integer vector containing the concatenated result. 5151static __inline __m256i __DEFAULT_FN_ATTRS 5152_mm256_setr_m128i (__m128i __lo, __m128i __hi) 5153{ 5154 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); 5155} 5156 5157#undef __DEFAULT_FN_ATTRS 5158 5159#endif /* __AVXINTRIN_H */ 5160