/* avxintrin.h revision 314564 */
195003Smux/*===---- avxintrin.h - AVX intrinsics -------------------------------------=== 295041Sru * 395003Smux * Permission is hereby granted, free of charge, to any person obtaining a copy 495003Smux * of this software and associated documentation files (the "Software"), to deal 595003Smux * in the Software without restriction, including without limitation the rights 695003Smux * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 795003Smux * copies of the Software, and to permit persons to whom the Software is 895003Smux * furnished to do so, subject to the following conditions: 995003Smux * 1095003Smux * The above copyright notice and this permission notice shall be included in 1195003Smux * all copies or substantial portions of the Software. 1295003Smux * 1395003Smux * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1495003Smux * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1595003Smux * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 1695003Smux * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1795003Smux * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 1895003Smux * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 1995003Smux * THE SOFTWARE. 2095003Smux * 2195003Smux *===-----------------------------------------------------------------------=== 2295003Smux */ 2395003Smux 2495003Smux#ifndef __IMMINTRIN_H 2595003Smux#error "Never use <avxintrin.h> directly; include <immintrin.h> instead." 
2695003Smux#endif 2795003Smux 2895003Smux#ifndef __AVXINTRIN_H 2995003Smux#define __AVXINTRIN_H 3095003Smux 3195003Smuxtypedef double __v4df __attribute__ ((__vector_size__ (32))); 3295003Smuxtypedef float __v8sf __attribute__ ((__vector_size__ (32))); 3395003Smuxtypedef long long __v4di __attribute__ ((__vector_size__ (32))); 3495003Smuxtypedef int __v8si __attribute__ ((__vector_size__ (32))); 3595003Smuxtypedef short __v16hi __attribute__ ((__vector_size__ (32))); 3695003Smuxtypedef char __v32qi __attribute__ ((__vector_size__ (32))); 3795003Smux 3895003Smux/* Unsigned types */ 3995003Smuxtypedef unsigned long long __v4du __attribute__ ((__vector_size__ (32))); 4095003Smuxtypedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); 4195003Smuxtypedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); 4295003Smuxtypedef unsigned char __v32qu __attribute__ ((__vector_size__ (32))); 4395003Smux 44108028Sru/* We need an explicitly signed variant for char. Note that this shouldn't 4595003Smux * appear in the interface though. */ 4695003Smuxtypedef signed char __v32qs __attribute__((__vector_size__(32))); 4795003Smux 4895003Smuxtypedef float __m256 __attribute__ ((__vector_size__ (32))); 4995003Smuxtypedef double __m256d __attribute__((__vector_size__(32))); 5095003Smuxtypedef long long __m256i __attribute__((__vector_size__(32))); 51108087Sru 5295041Sru/* Define the default attributes for the functions in this file. */ 5395003Smux#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"))) 5495003Smux 5595041Sru/* Arithmetic */ 5695041Sru/// \brief Adds two 256-bit vectors of [4 x double]. 5795003Smux/// 5895003Smux/// \headerfile <x86intrin.h> 5995003Smux/// 6095003Smux/// This intrinsic corresponds to the <c> VADDPD </c> instruction. 6195003Smux/// 6295003Smux/// \param __a 6395003Smux/// A 256-bit vector of [4 x double] containing one of the source operands. 
6495003Smux/// \param __b 6595041Sru/// A 256-bit vector of [4 x double] containing one of the source operands. 6695003Smux/// \returns A 256-bit vector of [4 x double] containing the sums of both 6795003Smux/// operands. 6895003Smuxstatic __inline __m256d __DEFAULT_FN_ATTRS 6995041Sru_mm256_add_pd(__m256d __a, __m256d __b) 7095041Sru{ 7195003Smux return (__m256d)((__v4df)__a+(__v4df)__b); 7295003Smux} 7395003Smux 7495003Smux/// \brief Adds two 256-bit vectors of [8 x float]. 7595003Smux/// 7695003Smux/// \headerfile <x86intrin.h> 7795003Smux/// 7895003Smux/// This intrinsic corresponds to the <c> VADDPS </c> instruction. 7995003Smux/// 8095003Smux/// \param __a 8195003Smux/// A 256-bit vector of [8 x float] containing one of the source operands. 8295003Smux/// \param __b 8395003Smux/// A 256-bit vector of [8 x float] containing one of the source operands. 8495003Smux/// \returns A 256-bit vector of [8 x float] containing the sums of both 8595003Smux/// operands. 8695003Smuxstatic __inline __m256 __DEFAULT_FN_ATTRS 8795003Smux_mm256_add_ps(__m256 __a, __m256 __b) 88108087Sru{ 8995003Smux return (__m256)((__v8sf)__a+(__v8sf)__b); 9095003Smux} 9195003Smux 9295003Smux/// \brief Subtracts two 256-bit vectors of [4 x double]. 9395003Smux/// 9495003Smux/// \headerfile <x86intrin.h> 95108087Sru/// 9695003Smux/// This intrinsic corresponds to the <c> VSUBPD </c> instruction. 9795003Smux/// 9895003Smux/// \param __a 9995003Smux/// A 256-bit vector of [4 x double] containing the minuend. 10095003Smux/// \param __b 10195003Smux/// A 256-bit vector of [4 x double] containing the subtrahend. 10295003Smux/// \returns A 256-bit vector of [4 x double] containing the differences between 10395003Smux/// both operands. 
10495003Smuxstatic __inline __m256d __DEFAULT_FN_ATTRS 10595041Sru_mm256_sub_pd(__m256d __a, __m256d __b) 10695003Smux{ 10795003Smux return (__m256d)((__v4df)__a-(__v4df)__b); 10895003Smux} 10995003Smux 11095003Smux/// \brief Subtracts two 256-bit vectors of [8 x float]. 11195003Smux/// 11295003Smux/// \headerfile <x86intrin.h> 11395003Smux/// 11495003Smux/// This intrinsic corresponds to the <c> VSUBPS </c> instruction. 11595003Smux/// 116108087Sru/// \param __a 11795003Smux/// A 256-bit vector of [8 x float] containing the minuend. 11895003Smux/// \param __b 11995003Smux/// A 256-bit vector of [8 x float] containing the subtrahend. 12095003Smux/// \returns A 256-bit vector of [8 x float] containing the differences between 12195003Smux/// both operands. 122108028Srustatic __inline __m256 __DEFAULT_FN_ATTRS 12395003Smux_mm256_sub_ps(__m256 __a, __m256 __b) 12495003Smux{ 12595041Sru return (__m256)((__v8sf)__a-(__v8sf)__b); 12695003Smux} 12795003Smux 12895003Smux/// \brief Adds the even-indexed values and subtracts the odd-indexed values of 12995003Smux/// two 256-bit vectors of [4 x double]. 13095003Smux/// 13195003Smux/// \headerfile <x86intrin.h> 13295041Sru/// 13395041Sru/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction. 13495003Smux/// 13595003Smux/// \param __a 13695003Smux/// A 256-bit vector of [4 x double] containing the left source operand. 137108028Sru/// \param __b 13895003Smux/// A 256-bit vector of [4 x double] containing the right source operand. 139108028Sru/// \returns A 256-bit vector of [4 x double] containing the alternating sums 14095003Smux/// and differences between both operands. 
14195003Smuxstatic __inline __m256d __DEFAULT_FN_ATTRS 14295003Smux_mm256_addsub_pd(__m256d __a, __m256d __b) 143108087Sru{ 14495003Smux return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b); 145108087Sru} 14695003Smux 14795003Smux/// \brief Adds the even-indexed values and subtracts the odd-indexed values of 14895003Smux/// two 256-bit vectors of [8 x float]. 14995003Smux/// 15095003Smux/// \headerfile <x86intrin.h> 15195003Smux/// 15295003Smux/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction. 15395003Smux/// 15495003Smux/// \param __a 15595003Smux/// A 256-bit vector of [8 x float] containing the left source operand. 15695003Smux/// \param __b 15795003Smux/// A 256-bit vector of [8 x float] containing the right source operand. 15895041Sru/// \returns A 256-bit vector of [8 x float] containing the alternating sums and 15995003Smux/// differences between both operands. 16095003Smuxstatic __inline __m256 __DEFAULT_FN_ATTRS 16195041Sru_mm256_addsub_ps(__m256 __a, __m256 __b) 16295003Smux{ 16395003Smux return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b); 16495003Smux} 16595003Smux 16695003Smux/// \brief Divides two 256-bit vectors of [4 x double]. 16795003Smux/// 16895003Smux/// \headerfile <x86intrin.h> 16995003Smux/// 17095003Smux/// This intrinsic corresponds to the <c> VDIVPD </c> instruction. 17195003Smux/// 17295003Smux/// \param __a 173147700Shmp/// A 256-bit vector of [4 x double] containing the dividend. 17495003Smux/// \param __b 17595003Smux/// A 256-bit vector of [4 x double] containing the divisor. 17695041Sru/// \returns A 256-bit vector of [4 x double] containing the quotients of both 17795003Smux/// operands. 17895003Smuxstatic __inline __m256d __DEFAULT_FN_ATTRS 17995003Smux_mm256_div_pd(__m256d __a, __m256d __b) 180{ 181 return (__m256d)((__v4df)__a/(__v4df)__b); 182} 183 184/// \brief Divides two 256-bit vectors of [8 x float]. 
185/// 186/// \headerfile <x86intrin.h> 187/// 188/// This intrinsic corresponds to the <c> VDIVPS </c> instruction. 189/// 190/// \param __a 191/// A 256-bit vector of [8 x float] containing the dividend. 192/// \param __b 193/// A 256-bit vector of [8 x float] containing the divisor. 194/// \returns A 256-bit vector of [8 x float] containing the quotients of both 195/// operands. 196static __inline __m256 __DEFAULT_FN_ATTRS 197_mm256_div_ps(__m256 __a, __m256 __b) 198{ 199 return (__m256)((__v8sf)__a/(__v8sf)__b); 200} 201 202/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater 203/// of each pair of values. 204/// 205/// \headerfile <x86intrin.h> 206/// 207/// This intrinsic corresponds to the <c> VMAXPD </c> instruction. 208/// 209/// \param __a 210/// A 256-bit vector of [4 x double] containing one of the operands. 211/// \param __b 212/// A 256-bit vector of [4 x double] containing one of the operands. 213/// \returns A 256-bit vector of [4 x double] containing the maximum values 214/// between both operands. 215static __inline __m256d __DEFAULT_FN_ATTRS 216_mm256_max_pd(__m256d __a, __m256d __b) 217{ 218 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b); 219} 220 221/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater 222/// of each pair of values. 223/// 224/// \headerfile <x86intrin.h> 225/// 226/// This intrinsic corresponds to the <c> VMAXPS </c> instruction. 227/// 228/// \param __a 229/// A 256-bit vector of [8 x float] containing one of the operands. 230/// \param __b 231/// A 256-bit vector of [8 x float] containing one of the operands. 232/// \returns A 256-bit vector of [8 x float] containing the maximum values 233/// between both operands. 
234static __inline __m256 __DEFAULT_FN_ATTRS 235_mm256_max_ps(__m256 __a, __m256 __b) 236{ 237 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b); 238} 239 240/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser 241/// of each pair of values. 242/// 243/// \headerfile <x86intrin.h> 244/// 245/// This intrinsic corresponds to the <c> VMINPD </c> instruction. 246/// 247/// \param __a 248/// A 256-bit vector of [4 x double] containing one of the operands. 249/// \param __b 250/// A 256-bit vector of [4 x double] containing one of the operands. 251/// \returns A 256-bit vector of [4 x double] containing the minimum values 252/// between both operands. 253static __inline __m256d __DEFAULT_FN_ATTRS 254_mm256_min_pd(__m256d __a, __m256d __b) 255{ 256 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b); 257} 258 259/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser 260/// of each pair of values. 261/// 262/// \headerfile <x86intrin.h> 263/// 264/// This intrinsic corresponds to the <c> VMINPS </c> instruction. 265/// 266/// \param __a 267/// A 256-bit vector of [8 x float] containing one of the operands. 268/// \param __b 269/// A 256-bit vector of [8 x float] containing one of the operands. 270/// \returns A 256-bit vector of [8 x float] containing the minimum values 271/// between both operands. 272static __inline __m256 __DEFAULT_FN_ATTRS 273_mm256_min_ps(__m256 __a, __m256 __b) 274{ 275 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b); 276} 277 278/// \brief Multiplies two 256-bit vectors of [4 x double]. 279/// 280/// \headerfile <x86intrin.h> 281/// 282/// This intrinsic corresponds to the <c> VMULPD </c> instruction. 283/// 284/// \param __a 285/// A 256-bit vector of [4 x double] containing one of the operands. 286/// \param __b 287/// A 256-bit vector of [4 x double] containing one of the operands. 
288/// \returns A 256-bit vector of [4 x double] containing the products of both 289/// operands. 290static __inline __m256d __DEFAULT_FN_ATTRS 291_mm256_mul_pd(__m256d __a, __m256d __b) 292{ 293 return (__m256d)((__v4df)__a * (__v4df)__b); 294} 295 296/// \brief Multiplies two 256-bit vectors of [8 x float]. 297/// 298/// \headerfile <x86intrin.h> 299/// 300/// This intrinsic corresponds to the <c> VMULPS </c> instruction. 301/// 302/// \param __a 303/// A 256-bit vector of [8 x float] containing one of the operands. 304/// \param __b 305/// A 256-bit vector of [8 x float] containing one of the operands. 306/// \returns A 256-bit vector of [8 x float] containing the products of both 307/// operands. 308static __inline __m256 __DEFAULT_FN_ATTRS 309_mm256_mul_ps(__m256 __a, __m256 __b) 310{ 311 return (__m256)((__v8sf)__a * (__v8sf)__b); 312} 313 314/// \brief Calculates the square roots of the values in a 256-bit vector of 315/// [4 x double]. 316/// 317/// \headerfile <x86intrin.h> 318/// 319/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction. 320/// 321/// \param __a 322/// A 256-bit vector of [4 x double]. 323/// \returns A 256-bit vector of [4 x double] containing the square roots of the 324/// values in the operand. 325static __inline __m256d __DEFAULT_FN_ATTRS 326_mm256_sqrt_pd(__m256d __a) 327{ 328 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a); 329} 330 331/// \brief Calculates the square roots of the values in a 256-bit vector of 332/// [8 x float]. 333/// 334/// \headerfile <x86intrin.h> 335/// 336/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction. 337/// 338/// \param __a 339/// A 256-bit vector of [8 x float]. 340/// \returns A 256-bit vector of [8 x float] containing the square roots of the 341/// values in the operand. 
342static __inline __m256 __DEFAULT_FN_ATTRS 343_mm256_sqrt_ps(__m256 __a) 344{ 345 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a); 346} 347 348/// \brief Calculates the reciprocal square roots of the values in a 256-bit 349/// vector of [8 x float]. 350/// 351/// \headerfile <x86intrin.h> 352/// 353/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction. 354/// 355/// \param __a 356/// A 256-bit vector of [8 x float]. 357/// \returns A 256-bit vector of [8 x float] containing the reciprocal square 358/// roots of the values in the operand. 359static __inline __m256 __DEFAULT_FN_ATTRS 360_mm256_rsqrt_ps(__m256 __a) 361{ 362 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a); 363} 364 365/// \brief Calculates the reciprocals of the values in a 256-bit vector of 366/// [8 x float]. 367/// 368/// \headerfile <x86intrin.h> 369/// 370/// This intrinsic corresponds to the <c> VRCPPS </c> instruction. 371/// 372/// \param __a 373/// A 256-bit vector of [8 x float]. 374/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the 375/// values in the operand. 376static __inline __m256 __DEFAULT_FN_ATTRS 377_mm256_rcp_ps(__m256 __a) 378{ 379 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a); 380} 381 382/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified 383/// by the byte operand. The source values are rounded to integer values and 384/// returned as 64-bit double-precision floating-point values. 385/// 386/// \headerfile <x86intrin.h> 387/// 388/// \code 389/// __m256d _mm256_round_pd(__m256d V, const int M); 390/// \endcode 391/// 392/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction. 393/// 394/// \param V 395/// A 256-bit vector of [4 x double]. 396/// \param M 397/// An integer value that specifies the rounding operation. \n 398/// Bits [7:4] are reserved. \n 399/// Bit [3] is a precision exception value: \n 400/// 0: A normal PE exception is used. 
\n 401/// 1: The PE field is not updated. \n 402/// Bit [2] is the rounding control source: \n 403/// 0: Use bits [1:0] of \a M. \n 404/// 1: Use the current MXCSR setting. \n 405/// Bits [1:0] contain the rounding control definition: \n 406/// 00: Nearest. \n 407/// 01: Downward (toward negative infinity). \n 408/// 10: Upward (toward positive infinity). \n 409/// 11: Truncated. 410/// \returns A 256-bit vector of [4 x double] containing the rounded values. 411#define _mm256_round_pd(V, M) __extension__ ({ \ 412 (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); }) 413 414/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as 415/// specified by the byte operand. The source values are rounded to integer 416/// values and returned as floating-point values. 417/// 418/// \headerfile <x86intrin.h> 419/// 420/// \code 421/// __m256 _mm256_round_ps(__m256 V, const int M); 422/// \endcode 423/// 424/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. 425/// 426/// \param V 427/// A 256-bit vector of [8 x float]. 428/// \param M 429/// An integer value that specifies the rounding operation. \n 430/// Bits [7:4] are reserved. \n 431/// Bit [3] is a precision exception value: \n 432/// 0: A normal PE exception is used. \n 433/// 1: The PE field is not updated. \n 434/// Bit [2] is the rounding control source: \n 435/// 0: Use bits [1:0] of \a M. \n 436/// 1: Use the current MXCSR setting. \n 437/// Bits [1:0] contain the rounding control definition: \n 438/// 00: Nearest. \n 439/// 01: Downward (toward negative infinity). \n 440/// 10: Upward (toward positive infinity). \n 441/// 11: Truncated. 442/// \returns A 256-bit vector of [8 x float] containing the rounded values. 443#define _mm256_round_ps(V, M) __extension__ ({ \ 444 (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); }) 445 446/// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. 
The 447/// source values are rounded up to integer values and returned as 64-bit 448/// double-precision floating-point values. 449/// 450/// \headerfile <x86intrin.h> 451/// 452/// \code 453/// __m256d _mm256_ceil_pd(__m256d V); 454/// \endcode 455/// 456/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction. 457/// 458/// \param V 459/// A 256-bit vector of [4 x double]. 460/// \returns A 256-bit vector of [4 x double] containing the rounded up values. 461#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL) 462 463/// \brief Rounds down the values stored in a 256-bit vector of [4 x double]. 464/// The source values are rounded down to integer values and returned as 465/// 64-bit double-precision floating-point values. 466/// 467/// \headerfile <x86intrin.h> 468/// 469/// \code 470/// __m256d _mm256_floor_pd(__m256d V); 471/// \endcode 472/// 473/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction. 474/// 475/// \param V 476/// A 256-bit vector of [4 x double]. 477/// \returns A 256-bit vector of [4 x double] containing the rounded down 478/// values. 479#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR) 480 481/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The 482/// source values are rounded up to integer values and returned as 483/// floating-point values. 484/// 485/// \headerfile <x86intrin.h> 486/// 487/// \code 488/// __m256 _mm256_ceil_ps(__m256 V); 489/// \endcode 490/// 491/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. 492/// 493/// \param V 494/// A 256-bit vector of [8 x float]. 495/// \returns A 256-bit vector of [8 x float] containing the rounded up values. 496#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) 497 498/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The 499/// source values are rounded down to integer values and returned as 500/// floating-point values. 
501/// 502/// \headerfile <x86intrin.h> 503/// 504/// \code 505/// __m256 _mm256_floor_ps(__m256 V); 506/// \endcode 507/// 508/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. 509/// 510/// \param V 511/// A 256-bit vector of [8 x float]. 512/// \returns A 256-bit vector of [8 x float] containing the rounded down values. 513#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) 514 515/* Logical */ 516/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double]. 517/// 518/// \headerfile <x86intrin.h> 519/// 520/// This intrinsic corresponds to the <c> VANDPD </c> instruction. 521/// 522/// \param __a 523/// A 256-bit vector of [4 x double] containing one of the source operands. 524/// \param __b 525/// A 256-bit vector of [4 x double] containing one of the source operands. 526/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 527/// values between both operands. 528static __inline __m256d __DEFAULT_FN_ATTRS 529_mm256_and_pd(__m256d __a, __m256d __b) 530{ 531 return (__m256d)((__v4du)__a & (__v4du)__b); 532} 533 534/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float]. 535/// 536/// \headerfile <x86intrin.h> 537/// 538/// This intrinsic corresponds to the <c> VANDPS </c> instruction. 539/// 540/// \param __a 541/// A 256-bit vector of [8 x float] containing one of the source operands. 542/// \param __b 543/// A 256-bit vector of [8 x float] containing one of the source operands. 544/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 545/// values between both operands. 546static __inline __m256 __DEFAULT_FN_ATTRS 547_mm256_and_ps(__m256 __a, __m256 __b) 548{ 549 return (__m256)((__v8su)__a & (__v8su)__b); 550} 551 552/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using 553/// the one's complement of the values contained in the first source operand. 
554/// 555/// \headerfile <x86intrin.h> 556/// 557/// This intrinsic corresponds to the <c> VANDNPD </c> instruction. 558/// 559/// \param __a 560/// A 256-bit vector of [4 x double] containing the left source operand. The 561/// one's complement of this value is used in the bitwise AND. 562/// \param __b 563/// A 256-bit vector of [4 x double] containing the right source operand. 564/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 565/// values of the second operand and the one's complement of the first 566/// operand. 567static __inline __m256d __DEFAULT_FN_ATTRS 568_mm256_andnot_pd(__m256d __a, __m256d __b) 569{ 570 return (__m256d)(~(__v4du)__a & (__v4du)__b); 571} 572 573/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using 574/// the one's complement of the values contained in the first source operand. 575/// 576/// \headerfile <x86intrin.h> 577/// 578/// This intrinsic corresponds to the <c> VANDNPS </c> instruction. 579/// 580/// \param __a 581/// A 256-bit vector of [8 x float] containing the left source operand. The 582/// one's complement of this value is used in the bitwise AND. 583/// \param __b 584/// A 256-bit vector of [8 x float] containing the right source operand. 585/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 586/// values of the second operand and the one's complement of the first 587/// operand. 588static __inline __m256 __DEFAULT_FN_ATTRS 589_mm256_andnot_ps(__m256 __a, __m256 __b) 590{ 591 return (__m256)(~(__v8su)__a & (__v8su)__b); 592} 593 594/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double]. 595/// 596/// \headerfile <x86intrin.h> 597/// 598/// This intrinsic corresponds to the <c> VORPD </c> instruction. 599/// 600/// \param __a 601/// A 256-bit vector of [4 x double] containing one of the source operands. 602/// \param __b 603/// A 256-bit vector of [4 x double] containing one of the source operands. 
604/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the 605/// values between both operands. 606static __inline __m256d __DEFAULT_FN_ATTRS 607_mm256_or_pd(__m256d __a, __m256d __b) 608{ 609 return (__m256d)((__v4du)__a | (__v4du)__b); 610} 611 612/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float]. 613/// 614/// \headerfile <x86intrin.h> 615/// 616/// This intrinsic corresponds to the <c> VORPS </c> instruction. 617/// 618/// \param __a 619/// A 256-bit vector of [8 x float] containing one of the source operands. 620/// \param __b 621/// A 256-bit vector of [8 x float] containing one of the source operands. 622/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the 623/// values between both operands. 624static __inline __m256 __DEFAULT_FN_ATTRS 625_mm256_or_ps(__m256 __a, __m256 __b) 626{ 627 return (__m256)((__v8su)__a | (__v8su)__b); 628} 629 630/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double]. 631/// 632/// \headerfile <x86intrin.h> 633/// 634/// This intrinsic corresponds to the <c> VXORPD </c> instruction. 635/// 636/// \param __a 637/// A 256-bit vector of [4 x double] containing one of the source operands. 638/// \param __b 639/// A 256-bit vector of [4 x double] containing one of the source operands. 640/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the 641/// values between both operands. 642static __inline __m256d __DEFAULT_FN_ATTRS 643_mm256_xor_pd(__m256d __a, __m256d __b) 644{ 645 return (__m256d)((__v4du)__a ^ (__v4du)__b); 646} 647 648/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float]. 649/// 650/// \headerfile <x86intrin.h> 651/// 652/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 653/// 654/// \param __a 655/// A 256-bit vector of [8 x float] containing one of the source operands. 656/// \param __b 657/// A 256-bit vector of [8 x float] containing one of the source operands. 
658/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the 659/// values between both operands. 660static __inline __m256 __DEFAULT_FN_ATTRS 661_mm256_xor_ps(__m256 __a, __m256 __b) 662{ 663 return (__m256)((__v8su)__a ^ (__v8su)__b); 664} 665 666/* Horizontal arithmetic */ 667/// \brief Horizontally adds the adjacent pairs of values contained in two 668/// 256-bit vectors of [4 x double]. 669/// 670/// \headerfile <x86intrin.h> 671/// 672/// This intrinsic corresponds to the <c> VHADDPD </c> instruction. 673/// 674/// \param __a 675/// A 256-bit vector of [4 x double] containing one of the source operands. 676/// The horizontal sums of the values are returned in the even-indexed 677/// elements of a vector of [4 x double]. 678/// \param __b 679/// A 256-bit vector of [4 x double] containing one of the source operands. 680/// The horizontal sums of the values are returned in the odd-indexed 681/// elements of a vector of [4 x double]. 682/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of 683/// both operands. 684static __inline __m256d __DEFAULT_FN_ATTRS 685_mm256_hadd_pd(__m256d __a, __m256d __b) 686{ 687 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); 688} 689 690/// \brief Horizontally adds the adjacent pairs of values contained in two 691/// 256-bit vectors of [8 x float]. 692/// 693/// \headerfile <x86intrin.h> 694/// 695/// This intrinsic corresponds to the <c> VHADDPS </c> instruction. 696/// 697/// \param __a 698/// A 256-bit vector of [8 x float] containing one of the source operands. 699/// The horizontal sums of the values are returned in the elements with 700/// index 0, 1, 4, 5 of a vector of [8 x float]. 701/// \param __b 702/// A 256-bit vector of [8 x float] containing one of the source operands. 703/// The horizontal sums of the values are returned in the elements with 704/// index 2, 3, 6, 7 of a vector of [8 x float]. 
705/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of 706/// both operands. 707static __inline __m256 __DEFAULT_FN_ATTRS 708_mm256_hadd_ps(__m256 __a, __m256 __b) 709{ 710 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); 711} 712 713/// \brief Horizontally subtracts the adjacent pairs of values contained in two 714/// 256-bit vectors of [4 x double]. 715/// 716/// \headerfile <x86intrin.h> 717/// 718/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction. 719/// 720/// \param __a 721/// A 256-bit vector of [4 x double] containing one of the source operands. 722/// The horizontal differences between the values are returned in the 723/// even-indexed elements of a vector of [4 x double]. 724/// \param __b 725/// A 256-bit vector of [4 x double] containing one of the source operands. 726/// The horizontal differences between the values are returned in the 727/// odd-indexed elements of a vector of [4 x double]. 728/// \returns A 256-bit vector of [4 x double] containing the horizontal 729/// differences of both operands. 730static __inline __m256d __DEFAULT_FN_ATTRS 731_mm256_hsub_pd(__m256d __a, __m256d __b) 732{ 733 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); 734} 735 736/// \brief Horizontally subtracts the adjacent pairs of values contained in two 737/// 256-bit vectors of [8 x float]. 738/// 739/// \headerfile <x86intrin.h> 740/// 741/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction. 742/// 743/// \param __a 744/// A 256-bit vector of [8 x float] containing one of the source operands. 745/// The horizontal differences between the values are returned in the 746/// elements with index 0, 1, 4, 5 of a vector of [8 x float]. 747/// \param __b 748/// A 256-bit vector of [8 x float] containing one of the source operands. 749/// The horizontal differences between the values are returned in the 750/// elements with index 2, 3, 6, 7 of a vector of [8 x float]. 
751/// \returns A 256-bit vector of [8 x float] containing the horizontal 752/// differences of both operands. 753static __inline __m256 __DEFAULT_FN_ATTRS 754_mm256_hsub_ps(__m256 __a, __m256 __b) 755{ 756 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); 757} 758 759/* Vector permutations */ 760/// \brief Copies the values in a 128-bit vector of [2 x double] as specified 761/// by the 128-bit integer vector operand. 762/// 763/// \headerfile <x86intrin.h> 764/// 765/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 766/// 767/// \param __a 768/// A 128-bit vector of [2 x double]. 769/// \param __c 770/// A 128-bit integer vector operand specifying how the values are to be 771/// copied. \n 772/// Bit [1]: \n 773/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 774/// vector. \n 775/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 776/// returned vector. \n 777/// Bit [65]: \n 778/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 779/// returned vector. \n 780/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 781/// returned vector. 782/// \returns A 128-bit vector of [2 x double] containing the copied values. 783static __inline __m128d __DEFAULT_FN_ATTRS 784_mm_permutevar_pd(__m128d __a, __m128i __c) 785{ 786 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c); 787} 788 789/// \brief Copies the values in a 256-bit vector of [4 x double] as specified 790/// by the 256-bit integer vector operand. 791/// 792/// \headerfile <x86intrin.h> 793/// 794/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 795/// 796/// \param __a 797/// A 256-bit vector of [4 x double]. 798/// \param __c 799/// A 256-bit integer vector operand specifying how the values are to be 800/// copied. \n 801/// Bit [1]: \n 802/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 803/// vector. 
\n 804/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 805/// returned vector. \n 806/// Bit [65]: \n 807/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 808/// returned vector. \n 809/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 810/// returned vector. \n 811/// Bit [129]: \n 812/// 0: Bits [191:128] of the source are copied to bits [191:128] of the 813/// returned vector. \n 814/// 1: Bits [255:192] of the source are copied to bits [191:128] of the 815/// returned vector. \n 816/// Bit [193]: \n 817/// 0: Bits [191:128] of the source are copied to bits [255:192] of the 818/// returned vector. \n 819/// 1: Bits [255:192] of the source are copied to bits [255:192] of the 820/// returned vector. 821/// \returns A 256-bit vector of [4 x double] containing the copied values. 822static __inline __m256d __DEFAULT_FN_ATTRS 823_mm256_permutevar_pd(__m256d __a, __m256i __c) 824{ 825 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c); 826} 827 828/// \brief Copies the values stored in a 128-bit vector of [4 x float] as 829/// specified by the 128-bit integer vector operand. 830/// \headerfile <x86intrin.h> 831/// 832/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 833/// 834/// \param __a 835/// A 128-bit vector of [4 x float]. 836/// \param __c 837/// A 128-bit integer vector operand specifying how the values are to be 838/// copied. \n 839/// Bits [1:0]: \n 840/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 841/// returned vector. \n 842/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 843/// returned vector. \n 844/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 845/// returned vector. \n 846/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 847/// returned vector. \n 848/// Bits [33:32]: \n 849/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 850/// returned vector. 
\n 851/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 852/// returned vector. \n 853/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 854/// returned vector. \n 855/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 856/// returned vector. \n 857/// Bits [65:64]: \n 858/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 859/// returned vector. \n 860/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 861/// returned vector. \n 862/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 863/// returned vector. \n 864/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 865/// returned vector. \n 866/// Bits [97:96]: \n 867/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 868/// returned vector. \n 869/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 870/// returned vector. \n 871/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 872/// returned vector. \n 873/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 874/// returned vector. 875/// \returns A 128-bit vector of [4 x float] containing the copied values. 876static __inline __m128 __DEFAULT_FN_ATTRS 877_mm_permutevar_ps(__m128 __a, __m128i __c) 878{ 879 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c); 880} 881 882/// \brief Copies the values stored in a 256-bit vector of [8 x float] as 883/// specified by the 256-bit integer vector operand. 884/// 885/// \headerfile <x86intrin.h> 886/// 887/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 888/// 889/// \param __a 890/// A 256-bit vector of [8 x float]. 891/// \param __c 892/// A 256-bit integer vector operand specifying how the values are to be 893/// copied. \n 894/// Bits [1:0]: \n 895/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 896/// returned vector. 
\n 897/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 898/// returned vector. \n 899/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 900/// returned vector. \n 901/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 902/// returned vector. \n 903/// Bits [33:32]: \n 904/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 905/// returned vector. \n 906/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 907/// returned vector. \n 908/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 909/// returned vector. \n 910/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 911/// returned vector. \n 912/// Bits [65:64]: \n 913/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 914/// returned vector. \n 915/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 916/// returned vector. \n 917/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 918/// returned vector. \n 919/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 920/// returned vector. \n 921/// Bits [97:96]: \n 922/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 923/// returned vector. \n 924/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 925/// returned vector. \n 926/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 927/// returned vector. \n 928/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 929/// returned vector. \n 930/// Bits [129:128]: \n 931/// 00: Bits [159:128] of the source are copied to bits [159:128] of the 932/// returned vector. \n 933/// 01: Bits [191:160] of the source are copied to bits [159:128] of the 934/// returned vector. \n 935/// 10: Bits [223:192] of the source are copied to bits [159:128] of the 936/// returned vector. \n 937/// 11: Bits [255:224] of the source are copied to bits [159:128] of the 938/// returned vector. 
\n 939/// Bits [161:160]: \n 940/// 00: Bits [159:128] of the source are copied to bits [191:160] of the 941/// returned vector. \n 942/// 01: Bits [191:160] of the source are copied to bits [191:160] of the 943/// returned vector. \n 944/// 10: Bits [223:192] of the source are copied to bits [191:160] of the 945/// returned vector. \n 946/// 11: Bits [255:224] of the source are copied to bits [191:160] of the 947/// returned vector. \n 948/// Bits [193:192]: \n 949/// 00: Bits [159:128] of the source are copied to bits [223:192] of the 950/// returned vector. \n 951/// 01: Bits [191:160] of the source are copied to bits [223:192] of the 952/// returned vector. \n 953/// 10: Bits [223:192] of the source are copied to bits [223:192] of the 954/// returned vector. \n 955/// 11: Bits [255:224] of the source are copied to bits [223:192] of the 956/// returned vector. \n 957/// Bits [225:224]: \n 958/// 00: Bits [159:128] of the source are copied to bits [255:224] of the 959/// returned vector. \n 960/// 01: Bits [191:160] of the source are copied to bits [255:224] of the 961/// returned vector. \n 962/// 10: Bits [223:192] of the source are copied to bits [255:224] of the 963/// returned vector. \n 964/// 11: Bits [255:224] of the source are copied to bits [255:224] of the 965/// returned vector. 966/// \returns A 256-bit vector of [8 x float] containing the copied values. 967static __inline __m256 __DEFAULT_FN_ATTRS 968_mm256_permutevar_ps(__m256 __a, __m256i __c) 969{ 970 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c); 971} 972 973/// \brief Copies the values in a 128-bit vector of [2 x double] as specified 974/// by the immediate integer operand. 975/// 976/// \headerfile <x86intrin.h> 977/// 978/// \code 979/// __m128d _mm_permute_pd(__m128d A, const int C); 980/// \endcode 981/// 982/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 983/// 984/// \param A 985/// A 128-bit vector of [2 x double]. 
986/// \param C 987/// An immediate integer operand specifying how the values are to be 988/// copied. \n 989/// Bit [0]: \n 990/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 991/// vector. \n 992/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 993/// returned vector. \n 994/// Bit [1]: \n 995/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 996/// returned vector. \n 997/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 998/// returned vector. 999/// \returns A 128-bit vector of [2 x double] containing the copied values. 1000#define _mm_permute_pd(A, C) __extension__ ({ \ 1001 (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \ 1002 (__v2df)_mm_undefined_pd(), \ 1003 ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); }) 1004 1005/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by 1006/// the immediate integer operand. 1007/// 1008/// \headerfile <x86intrin.h> 1009/// 1010/// \code 1011/// __m256d _mm256_permute_pd(__m256d A, const int C); 1012/// \endcode 1013/// 1014/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 1015/// 1016/// \param A 1017/// A 256-bit vector of [4 x double]. 1018/// \param C 1019/// An immediate integer operand specifying how the values are to be 1020/// copied. \n 1021/// Bit [0]: \n 1022/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 1023/// vector. \n 1024/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 1025/// returned vector. \n 1026/// Bit [1]: \n 1027/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 1028/// returned vector. \n 1029/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 1030/// returned vector. \n 1031/// Bit [2]: \n 1032/// 0: Bits [191:128] of the source are copied to bits [191:128] of the 1033/// returned vector. \n 1034/// 1: Bits [255:192] of the source are copied to bits [191:128] of the 1035/// returned vector. 
\n 1036/// Bit [3]: \n 1037/// 0: Bits [191:128] of the source are copied to bits [255:192] of the 1038/// returned vector. \n 1039/// 1: Bits [255:192] of the source are copied to bits [255:192] of the 1040/// returned vector. 1041/// \returns A 256-bit vector of [4 x double] containing the copied values. 1042#define _mm256_permute_pd(A, C) __extension__ ({ \ 1043 (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \ 1044 (__v4df)_mm256_undefined_pd(), \ 1045 0 + (((C) >> 0) & 0x1), \ 1046 0 + (((C) >> 1) & 0x1), \ 1047 2 + (((C) >> 2) & 0x1), \ 1048 2 + (((C) >> 3) & 0x1)); }) 1049 1050/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by 1051/// the immediate integer operand. 1052/// 1053/// \headerfile <x86intrin.h> 1054/// 1055/// \code 1056/// __m128 _mm_permute_ps(__m128 A, const int C); 1057/// \endcode 1058/// 1059/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 1060/// 1061/// \param A 1062/// A 128-bit vector of [4 x float]. 1063/// \param C 1064/// An immediate integer operand specifying how the values are to be 1065/// copied. \n 1066/// Bits [1:0]: \n 1067/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 1068/// returned vector. \n 1069/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 1070/// returned vector. \n 1071/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 1072/// returned vector. \n 1073/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 1074/// returned vector. \n 1075/// Bits [3:2]: \n 1076/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 1077/// returned vector. \n 1078/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 1079/// returned vector. \n 1080/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 1081/// returned vector. \n 1082/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 1083/// returned vector. 
\n 1084/// Bits [5:4]: \n 1085/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 1086/// returned vector. \n 1087/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 1088/// returned vector. \n 1089/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 1090/// returned vector. \n 1091/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 1092/// returned vector. \n 1093/// Bits [7:6]: \n 1094/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 1095/// returned vector. \n 1096/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 1097/// returned vector. \n 1098/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 1099/// returned vector. \n 1100/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 1101/// returned vector. 1102/// \returns A 128-bit vector of [4 x float] containing the copied values. 1103#define _mm_permute_ps(A, C) __extension__ ({ \ 1104 (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \ 1105 (__v4sf)_mm_undefined_ps(), \ 1106 ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \ 1107 ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); }) 1108 1109/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by 1110/// the immediate integer operand. 1111/// 1112/// \headerfile <x86intrin.h> 1113/// 1114/// \code 1115/// __m256 _mm256_permute_ps(__m256 A, const int C); 1116/// \endcode 1117/// 1118/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 1119/// 1120/// \param A 1121/// A 256-bit vector of [8 x float]. 1122/// \param C 1123/// An immediate integer operand specifying how the values are to be 1124/// copied. \n 1125/// Bits [1:0]: \n 1126/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 1127/// returned vector. \n 1128/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 1129/// returned vector.
\n 1130/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 1131/// returned vector. \n 1132/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 1133/// returned vector. \n 1134/// Bits [3:2]: \n 1135/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 1136/// returned vector. \n 1137/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 1138/// returned vector. \n 1139/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 1140/// returned vector. \n 1141/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 1142/// returned vector. \n 1143/// Bits [5:4]: \n 1144/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 1145/// returned vector. \n 1146/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 1147/// returned vector. \n 1148/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 1149/// returned vector. \n 1150/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 1151/// returned vector. \n 1152/// Bits [7:6]: \n 1153/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 1154/// returned vector. \n 1155/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 1156/// returned vector. \n 1157/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 1158/// returned vector. \n 1159/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 1160/// returned vector. \n 1161/// Bits [1:0]: \n 1162/// 00: Bits [159:128] of the source are copied to bits [159:128] of the 1163/// returned vector. \n 1164/// 01: Bits [191:160] of the source are copied to bits [159:128] of the 1165/// returned vector. \n 1166/// 10: Bits [223:192] of the source are copied to bits [159:128] of the 1167/// returned vector. \n 1168/// 11: Bits [255:224] of the source are copied to bits [159:128] of the 1169/// returned vector.
\n 1170/// Bits [3:2]: \n 1171/// 00: Bits [159:128] of the source are copied to bits [191:160] of the 1172/// returned vector. \n 1173/// 01: Bits [191:160] of the source are copied to bits [191:160] of the 1174/// returned vector. \n 1175/// 10: Bits [223:192] of the source are copied to bits [191:160] of the 1176/// returned vector. \n 1177/// 11: Bits [255:224] of the source are copied to bits [191:160] of the 1178/// returned vector. \n 1179/// Bits [5:4]: \n 1180/// 00: Bits [159:128] of the source are copied to bits [223:192] of the 1181/// returned vector. \n 1182/// 01: Bits [191:160] of the source are copied to bits [223:192] of the 1183/// returned vector. \n 1184/// 10: Bits [223:192] of the source are copied to bits [223:192] of the 1185/// returned vector. \n 1186/// 11: Bits [255:224] of the source are copied to bits [223:192] of the 1187/// returned vector. \n 1188/// Bits [7:6]: \n 1189/// 00: Bits [159:128] of the source are copied to bits [255:224] of the 1190/// returned vector. \n 1191/// 01: Bits [191:160] of the source are copied to bits [255:224] of the 1192/// returned vector. \n 1193/// 10: Bits [223:192] of the source are copied to bits [255:224] of the 1194/// returned vector. \n 1195/// 11: Bits [255:224] of the source are copied to bits [255:224] of the 1196/// returned vector. 1197/// \returns A 256-bit vector of [8 x float] containing the copied values. 1198#define _mm256_permute_ps(A, C) __extension__ ({ \ 1199 (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \ 1200 (__v8sf)_mm256_undefined_ps(), \ 1201 0 + (((C) >> 0) & 0x3), \ 1202 0 + (((C) >> 2) & 0x3), \ 1203 0 + (((C) >> 4) & 0x3), \ 1204 0 + (((C) >> 6) & 0x3), \ 1205 4 + (((C) >> 0) & 0x3), \ 1206 4 + (((C) >> 2) & 0x3), \ 1207 4 + (((C) >> 4) & 0x3), \ 1208 4 + (((C) >> 6) & 0x3)); }) 1209 1210/// \brief Permutes 128-bit data values stored in two 256-bit vectors of 1211/// [4 x double], as specified by the immediate integer operand. 
1212/// 1213/// \headerfile <x86intrin.h> 1214/// 1215/// \code 1216/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M); 1217/// \endcode 1218/// 1219/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction. 1220/// 1221/// \param V1 1222/// A 256-bit vector of [4 x double]. 1223/// \param V2 1224/// A 256-bit vector of [4 x double]. 1225/// \param M 1226/// An immediate integer operand specifying how the values are to be 1227/// permuted. \n 1228/// Bits [1:0]: \n 1229/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the 1230/// destination. \n 1231/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the 1232/// destination. \n 1233/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the 1234/// destination. \n 1235/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the 1236/// destination. \n 1237/// Bits [5:4]: \n 1238/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the 1239/// destination. \n 1240/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the 1241/// destination. \n 1242/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the 1243/// destination. \n 1244/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the 1245/// destination. 1246/// \returns A 256-bit vector of [4 x double] containing the copied values. 1247#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \ 1248 (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \ 1249 (__v4df)(__m256d)(V2), (M)); }) 1250 1251/// \brief Permutes 128-bit data values stored in two 256-bit vectors of 1252/// [8 x float], as specified by the immediate integer operand. 1253/// 1254/// \headerfile <x86intrin.h> 1255/// 1256/// \code 1257/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M); 1258/// \endcode 1259/// 1260/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1261/// 1262/// \param V1 1263/// A 256-bit vector of [8 x float]. 1264/// \param V2 1265/// A 256-bit vector of [8 x float]. 1266/// \param M 1267/// An immediate integer operand specifying how the values are to be 1268/// permuted. \n 1269/// Bits [1:0]: \n 1270/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the 1271/// destination. \n 1272/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the 1273/// destination. \n 1274/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the 1275/// destination. \n 1276/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the 1277/// destination. \n 1278/// Bits [5:4]: \n 1279/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the 1280/// destination. \n 1281/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the 1282/// destination. \n 1283/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the 1284/// destination. \n 1285/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the 1286/// destination. 1287/// \returns A 256-bit vector of [8 x float] containing the copied values. 1288#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \ 1289 (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \ 1290 (__v8sf)(__m256)(V2), (M)); }) 1291 1292/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors, 1293/// as specified by the immediate integer operand. 1294/// 1295/// \headerfile <x86intrin.h> 1296/// 1297/// \code 1298/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M); 1299/// \endcode 1300/// 1301/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction. 1302/// 1303/// \param V1 1304/// A 256-bit integer vector. 1305/// \param V2 1306/// A 256-bit integer vector. 1307/// \param M 1308/// An immediate integer operand specifying how the values are to be copied. \n
1309/// Bits [1:0]: \n 1310/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the 1311/// destination. \n 1312/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the 1313/// destination. \n 1314/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the 1315/// destination. \n 1316/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the 1317/// destination. \n 1318/// Bits [5:4]: \n 1319/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the 1320/// destination. \n 1321/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the 1322/// destination. \n 1323/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the 1324/// destination. \n 1325/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the 1326/// destination. 1327/// \returns A 256-bit integer vector containing the copied values. 1328#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \ 1329 (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \ 1330 (__v8si)(__m256i)(V2), (M)); }) 1331 1332/* Vector Blend */ 1333/// \brief Merges 64-bit double-precision data values stored in either of the 1334/// two 256-bit vectors of [4 x double], as specified by the immediate 1335/// integer operand. 1336/// 1337/// \headerfile <x86intrin.h> 1338/// 1339/// \code 1340/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M); 1341/// \endcode 1342/// 1343/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction. 1344/// 1345/// \param V1 1346/// A 256-bit vector of [4 x double]. 1347/// \param V2 1348/// A 256-bit vector of [4 x double]. 1349/// \param M 1350/// An immediate integer operand, with mask bits [3:0] specifying how the 1351/// values are to be copied. The position of the mask bit corresponds to the 1352/// index of a copied value. 
When a mask bit is 0, the corresponding 64-bit 1353/// element in operand \a V1 is copied to the same position in the 1354/// destination. When a mask bit is 1, the corresponding 64-bit element in 1355/// operand \a V2 is copied to the same position in the destination. 1356/// \returns A 256-bit vector of [4 x double] containing the copied values. 1357#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \ 1358 (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \ 1359 (__v4df)(__m256d)(V2), \ 1360 (((M) & 0x01) ? 4 : 0), \ 1361 (((M) & 0x02) ? 5 : 1), \ 1362 (((M) & 0x04) ? 6 : 2), \ 1363 (((M) & 0x08) ? 7 : 3)); }) 1364 1365/// \brief Merges 32-bit single-precision data values stored in either of the 1366/// two 256-bit vectors of [8 x float], as specified by the immediate 1367/// integer operand. 1368/// 1369/// \headerfile <x86intrin.h> 1370/// 1371/// \code 1372/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M); 1373/// \endcode 1374/// 1375/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction. 1376/// 1377/// \param V1 1378/// A 256-bit vector of [8 x float]. 1379/// \param V2 1380/// A 256-bit vector of [8 x float]. 1381/// \param M 1382/// An immediate integer operand, with mask bits [7:0] specifying how the 1383/// values are to be copied. The position of the mask bit corresponds to the 1384/// index of a copied value. When a mask bit is 0, the corresponding 32-bit 1385/// element in operand \a V1 is copied to the same position in the 1386/// destination. When a mask bit is 1, the corresponding 32-bit element in 1387/// operand \a V2 is copied to the same position in the destination. 1388/// \returns A 256-bit vector of [8 x float] containing the copied values. 1389#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \ 1390 (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \ 1391 (__v8sf)(__m256)(V2), \ 1392 (((M) & 0x01) ? 8 : 0), \ 1393 (((M) & 0x02) ? 9 : 1), \ 1394 (((M) & 0x04) ? 10 : 2), \ 1395 (((M) & 0x08) ? 
11 : 3), \ 1396 (((M) & 0x10) ? 12 : 4), \ 1397 (((M) & 0x20) ? 13 : 5), \ 1398 (((M) & 0x40) ? 14 : 6), \ 1399 (((M) & 0x80) ? 15 : 7)); }) 1400 1401/// \brief Merges 64-bit double-precision data values stored in either of the 1402/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector 1403/// operand. 1404/// 1405/// \headerfile <x86intrin.h> 1406/// 1407/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction. 1408/// 1409/// \param __a 1410/// A 256-bit vector of [4 x double]. 1411/// \param __b 1412/// A 256-bit vector of [4 x double]. 1413/// \param __c 1414/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying 1415/// how the values are to be copied. The position of the mask bit corresponds 1416/// to the most significant bit of a copied value. When a mask bit is 0, the 1417/// corresponding 64-bit element in operand \a __a is copied to the same 1418/// position in the destination. When a mask bit is 1, the corresponding 1419/// 64-bit element in operand \a __b is copied to the same position in the 1420/// destination. 1421/// \returns A 256-bit vector of [4 x double] containing the copied values. 1422static __inline __m256d __DEFAULT_FN_ATTRS 1423_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) 1424{ 1425 return (__m256d)__builtin_ia32_blendvpd256( 1426 (__v4df)__a, (__v4df)__b, (__v4df)__c); 1427} 1428 1429/// \brief Merges 32-bit single-precision data values stored in either of the 1430/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector 1431/// operand. 1432/// 1433/// \headerfile <x86intrin.h> 1434/// 1435/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction. 1436/// 1437/// \param __a 1438/// A 256-bit vector of [8 x float]. 1439/// \param __b 1440/// A 256-bit vector of [8 x float]. 1441/// \param __c 1442/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63, 1443/// and 31 specifying how the values are to be copied. 
The position of the 1444/// mask bit corresponds to the most significant bit of a copied value. When 1445/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is 1446/// copied to the same position in the destination. When a mask bit is 1, the 1447/// corresponding 32-bit element in operand \a __b is copied to the same 1448/// position in the destination. 1449/// \returns A 256-bit vector of [8 x float] containing the copied values. 1450static __inline __m256 __DEFAULT_FN_ATTRS 1451_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) 1452{ 1453 return (__m256)__builtin_ia32_blendvps256( 1454 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c); 1455} 1456 1457/* Vector Dot Product */ 1458/// \brief Computes two dot products in parallel, using the lower and upper 1459/// halves of two [8 x float] vectors as input to the two computations, and 1460/// returning the two dot products in the lower and upper halves of the 1461/// [8 x float] result. The immediate integer operand controls which input 1462/// elements will contribute to the dot product, and where the final results 1463/// are returned. In general, for each dot product, the four corresponding 1464/// elements of the input vectors are multiplied; the first two and second 1465/// two products are summed, then the two sums are added to form the final 1466/// result. 1467/// 1468/// \headerfile <x86intrin.h> 1469/// 1470/// \code 1471/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M); 1472/// \endcode 1473/// 1474/// This intrinsic corresponds to the <c> VDPPS </c> instruction. 1475/// 1476/// \param V1 1477/// A vector of [8 x float] values, treated as two [4 x float] vectors. 1478/// \param V2 1479/// A vector of [8 x float] values, treated as two [4 x float] vectors. 1480/// \param M 1481/// An immediate integer argument. 
Bits [7:4] determine which elements of 1482/// the input vectors are used, with bit [4] corresponding to the lowest 1483/// element and bit [7] corresponding to the highest element of each [4 x 1484/// float] subvector. If a bit is set, the corresponding elements from the 1485/// two input vectors are used as an input for dot product; otherwise that 1486/// input is treated as zero. Bits [3:0] determine which elements of the 1487/// result will receive a copy of the final dot product, with bit [0] 1488/// corresponding to the lowest element and bit [3] corresponding to the 1489/// highest element of each [4 x float] subvector. If a bit is set, the dot 1490/// product is returned in the corresponding element; otherwise that element 1491/// is set to zero. The bitmask is applied in the same way to each of the 1492/// two parallel dot product computations. 1493/// \returns A 256-bit vector of [8 x float] containing the two dot products. 1494#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \ 1495 (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \ 1496 (__v8sf)(__m256)(V2), (M)); }) 1497 1498/* Vector shuffle */ 1499/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as 1500/// specified by the immediate value operand. The four selected elements in 1501/// each operand are copied to the destination according to the bits 1502/// specified in the immediate operand. The selected elements from the first 1503/// 256-bit operand are copied to bits [63:0] and bits [191:128] of the 1504/// destination, and the selected elements from the second 256-bit operand 1505/// are copied to bits [127:64] and bits [255:192] of the destination. For 1506/// example, if bits [7:0] of the immediate operand contain a value of 0xFF, 1507/// the 256-bit destination vector would contain the following values: b[7], 1508/// b[7], a[7], a[7], b[3], b[3], a[3], a[3]. 
1509/// 1510/// \headerfile <x86intrin.h> 1511/// 1512/// \code 1513/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask); 1514/// \endcode 1515/// 1516/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction. 1517/// 1518/// \param a 1519/// A 256-bit vector of [8 x float]. The four selected elements in this 1520/// operand are copied to bits [63:0] and bits [191:128] in the destination, 1521/// according to the bits specified in the immediate operand. 1522/// \param b 1523/// A 256-bit vector of [8 x float]. The four selected elements in this 1524/// operand are copied to bits [127:64] and bits [255:192] in the 1525/// destination, according to the bits specified in the immediate operand. 1526/// \param mask 1527/// An immediate value containing an 8-bit value specifying which elements to 1528/// copy from \a a and \a b. \n 1529/// Bits [3:0] specify the values copied from operand \a a. \n 1530/// Bits [7:4] specify the values copied from operand \a b. \n 1531/// The destinations within the 256-bit destination are assigned values as 1532/// follows, according to the bit value assignments described below: \n 1533/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the 1534/// destination. \n 1535/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the 1536/// destination. \n 1537/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the 1538/// destination. \n 1539/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in 1540/// the destination. \n 1541/// Bit value assignments: \n 1542/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n 1543/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n 1544/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n 1545/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
1546/// \returns A 256-bit vector of [8 x float] containing the shuffled values. 1547#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \ 1548 (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \ 1549 (__v8sf)(__m256)(b), \ 1550 0 + (((mask) >> 0) & 0x3), \ 1551 0 + (((mask) >> 2) & 0x3), \ 1552 8 + (((mask) >> 4) & 0x3), \ 1553 8 + (((mask) >> 6) & 0x3), \ 1554 4 + (((mask) >> 0) & 0x3), \ 1555 4 + (((mask) >> 2) & 0x3), \ 1556 12 + (((mask) >> 4) & 0x3), \ 1557 12 + (((mask) >> 6) & 0x3)); }) 1558 1559/// \brief Selects four double-precision values from the 256-bit operands of 1560/// [4 x double], as specified by the immediate value operand. The selected 1561/// elements from the first 256-bit operand are copied to bits [63:0] and 1562/// bits [191:128] in the destination, and the selected elements from the 1563/// second 256-bit operand are copied to bits [127:64] and bits [255:192] in 1564/// the destination. For example, if bits [3:0] of the immediate operand 1565/// contain a value of 0xF, the 256-bit destination vector would contain the 1566/// following values: b[3], a[3], b[1], a[1]. 1567/// 1568/// \headerfile <x86intrin.h> 1569/// 1570/// \code 1571/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask); 1572/// \endcode 1573/// 1574/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction. 1575/// 1576/// \param a 1577/// A 256-bit vector of [4 x double]. 1578/// \param b 1579/// A 256-bit vector of [4 x double]. 1580/// \param mask 1581/// An immediate value containing 8-bit values specifying which elements to 1582/// copy from \a a and \a b: \n 1583/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the 1584/// destination. \n 1585/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the 1586/// destination. \n 1587/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the 1588/// destination. 
\n 1589/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the 1590/// destination. \n 1591/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the 1592/// destination. \n 1593/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the 1594/// destination. \n 1595/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the 1596/// destination. \n 1597/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the 1598/// destination. 1599/// \returns A 256-bit vector of [4 x double] containing the shuffled values. 1600#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \ 1601 (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \ 1602 (__v4df)(__m256d)(b), \ 1603 0 + (((mask) >> 0) & 0x1), \ 1604 4 + (((mask) >> 1) & 0x1), \ 1605 2 + (((mask) >> 2) & 0x1), \ 1606 6 + (((mask) >> 3) & 0x1)); }) 1607 1608/* Compare */ 1609#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ 1610#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ 1611#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ 1612#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */ 1613#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ 1614#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ 1615#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ 1616#define _CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ 1617#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ 1618#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */ 1619#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ 1620#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ 1621#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ 1622#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */ 1623#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */ 1624#define _CMP_TRUE_UQ 0x0f /* 
True (unordered, non-signaling) */ 1625#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ 1626#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */ 1627#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */ 1628#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */ 1629#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ 1630#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ 1631#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */ 1632#define _CMP_ORD_S 0x17 /* Ordered (signaling) */ 1633#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ 1634#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */ 1635#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ 1636#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ 1637#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ 1638#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */ 1639#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ 1640#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */ 1641 1642/// \brief Compares each of the corresponding double-precision values of two 1643/// 128-bit vectors of [2 x double], using the operation specified by the 1644/// immediate integer operand. Returns a [2 x double] vector consisting of 1645/// two doubles corresponding to the two comparison results: zero if the 1646/// comparison is false, and all 1's if the comparison is true. 1647/// 1648/// \headerfile <x86intrin.h> 1649/// 1650/// \code 1651/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c); 1652/// \endcode 1653/// 1654/// This intrinsic corresponds to the <c> VCMPPD </c> instruction. 1655/// 1656/// \param a 1657/// A 128-bit vector of [2 x double]. 1658/// \param b 1659/// A 128-bit vector of [2 x double]. 
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each value matches one of the \c _CMP_* constants
///    defined in this header: \n
///    00h, 10h: Equal (ordered) \n
///    01h, 11h: Less than (ordered) \n
///    02h, 12h: Less than or equal (ordered) \n
///    03h, 13h: Unordered \n
///    04h, 14h: Not equal (unordered) \n
///    05h, 15h: Not less than (unordered) \n
///    06h, 16h: Not less than or equal (unordered) \n
///    07h, 17h: Ordered \n
///    08h, 18h: Equal (unordered) \n
///    09h, 19h: Not greater than or equal (unordered) \n
///    0Ah, 1Ah: Not greater than (unordered) \n
///    0Bh, 1Bh: False \n
///    0Ch, 1Ch: Not equal (ordered) \n
///    0Dh, 1Dh: Greater than or equal (ordered) \n
///    0Eh, 1Eh: Greater than (ordered) \n
///    0Fh, 1Fh: True \n
///    The two values on each row differ only in whether the comparison is
///    signaling or non-signaling.
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                (__v2df)(__m128d)(b), (c)); })

/// \brief Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand. Returns a [4 x float] vector consisting of four floats
///    corresponding to the four comparison results: zero if the comparison is
///    false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each value matches one of the \c _CMP_* constants
///    defined in this header: \n
///    00h, 10h: Equal (ordered) \n
///    01h, 11h: Less than (ordered) \n
///    02h, 12h: Less than or equal (ordered) \n
///    03h, 13h: Unordered \n
///    04h, 14h: Not equal (unordered) \n
///    05h, 15h: Not less than (unordered) \n
///    06h, 16h: Not less than or equal (unordered) \n
///    07h, 17h: Ordered \n
///    08h, 18h: Equal (unordered) \n
///    09h, 19h: Not greater than or equal (unordered) \n
///    0Ah, 1Ah: Not greater than (unordered) \n
///    0Bh, 1Bh: False \n
///    0Ch, 1Ch: Not equal (ordered) \n
///    0Dh, 1Dh: Greater than or equal (ordered) \n
///    0Eh, 1Eh: Greater than (ordered) \n
///    0Fh, 1Fh: True \n
///    The two values on each row differ only in whether the comparison is
///    signaling or non-signaling.
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                               (__v4sf)(__m128)(b), (c)); })

/// \brief Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand. Returns a [4 x double] vector consisting of
///    four doubles corresponding to the four comparison results: zero if the
///    comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each value matches one of the \c _CMP_* constants
///    defined in this header: \n
///    00h, 10h: Equal (ordered) \n
///    01h, 11h: Less than (ordered) \n
///    02h, 12h: Less than or equal (ordered) \n
///    03h, 13h: Unordered \n
///    04h, 14h: Not equal (unordered) \n
///    05h, 15h: Not less than (unordered) \n
///    06h, 16h: Not less than or equal (unordered) \n
///    07h, 17h: Ordered \n
///    08h, 18h: Equal (unordered) \n
///    09h, 19h: Not greater than or equal (unordered) \n
///    0Ah, 1Ah: Not greater than (unordered) \n
///    0Bh, 1Bh: False \n
///    0Ch, 1Ch: Not equal (ordered) \n
///    0Dh, 1Dh: Greater than or equal (ordered) \n
///    0Eh, 1Eh: Greater than (ordered) \n
///    0Fh, 1Fh: True \n
///    The two values on each row differ only in whether the comparison is
///    signaling or non-signaling.
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                   (__v4df)(__m256d)(b), (c)); })

/// \brief Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand. Returns a [8 x float] vector consisting of eight floats
///    corresponding to the eight comparison results: zero if the comparison is
///    false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each value matches one of the \c _CMP_* constants
///    defined in this header: \n
///    00h, 10h: Equal (ordered) \n
///    01h, 11h: Less than (ordered) \n
///    02h, 12h: Less than or equal (ordered) \n
///    03h, 13h: Unordered \n
///    04h, 14h: Not equal (unordered) \n
///    05h, 15h: Not less than (unordered) \n
///    06h, 16h: Not less than or equal (unordered) \n
///    07h, 17h: Ordered \n
///    08h, 18h: Equal (unordered) \n
///    09h, 19h: Not greater than or equal (unordered) \n
///    0Ah, 1Ah: Not greater than (unordered) \n
///    0Bh, 1Bh: False \n
///    0Ch, 1Ch: Not equal (ordered) \n
///    0Dh, 1Dh: Greater than or equal (ordered) \n
///    0Eh, 1Eh: Greater than (ordered) \n
///    0Fh, 1Fh: True \n
///    The two values on each row differ only in whether the comparison is
///    signaling or non-signaling.
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                  (__v8sf)(__m256)(b), (c)); })

/// \brief Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand. If the result is true, all 64 bits of the
///    destination vector are set; otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use. Each value matches one of the \c _CMP_* constants
///    defined in this header: \n
///    00h, 10h: Equal (ordered) \n
///    01h, 11h: Less than (ordered) \n
///    02h, 12h: Less than or equal (ordered) \n
///    03h, 13h: Unordered \n
///    04h, 14h: Not equal (unordered) \n
///    05h, 15h: Not less than (unordered) \n
///    06h, 16h: Not less than or equal (unordered) \n
///    07h, 17h: Ordered \n
///    08h, 18h: Equal (unordered) \n
///    09h, 19h: Not greater than or equal (unordered) \n
///    0Ah, 1Ah: Not greater than (unordered) \n
///    0Bh, 1Bh: False \n
///    0Ch, 1Ch: Not equal (ordered) \n
///    0Dh, 1Dh: Greater than or equal (ordered) \n
///    0Eh, 1Eh: Greater than (ordered) \n
///    0Fh, 1Fh: True \n
///    The two values on each row differ only in whether the comparison is
///    signaling or non-signaling.
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                (__v2df)(__m128d)(b), (c)); })

/// \brief Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand. If the result is true, all 32 bits of the destination
///    vector are set; otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
1843/// \param c 1844/// An immediate integer operand, with bits [4:0] specifying which comparison 1845/// operation to use: \n 1846/// 00h, 08h, 10h, 18h: Equal \n 1847/// 01h, 09h, 11h, 19h: Less than \n 1848/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal 1849/// (swapped operands) \n 1850/// 03h, 0Bh, 13h, 1Bh: Unordered \n 1851/// 04h, 0Ch, 14h, 1Ch: Not equal \n 1852/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than 1853/// (swapped operands) \n 1854/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal 1855/// (swapped operands) \n 1856/// 07h, 0Fh, 17h, 1Fh: Ordered 1857/// \returns A 128-bit vector of [4 x float] containing the comparison results. 1858#define _mm_cmp_ss(a, b, c) __extension__ ({ \ 1859 (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \ 1860 (__v4sf)(__m128)(b), (c)); }) 1861 1862/// \brief Takes a [8 x i32] vector and returns the vector element value 1863/// indexed by the immediate constant operand. 1864/// 1865/// \headerfile <x86intrin.h> 1866/// 1867/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 1868/// instruction. 1869/// 1870/// \param __a 1871/// A 256-bit vector of [8 x i32]. 1872/// \param __imm 1873/// An immediate integer operand with bits [2:0] determining which vector 1874/// element is extracted and returned. 1875/// \returns A 32-bit integer containing the extracted 32 bits of extended 1876/// packed data. 1877static __inline int __DEFAULT_FN_ATTRS 1878_mm256_extract_epi32(__m256i __a, const int __imm) 1879{ 1880 __v8si __b = (__v8si)__a; 1881 return __b[__imm & 7]; 1882} 1883 1884/// \brief Takes a [16 x i16] vector and returns the vector element value 1885/// indexed by the immediate constant operand. 1886/// 1887/// \headerfile <x86intrin.h> 1888/// 1889/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 1890/// instruction. 1891/// 1892/// \param __a 1893/// A 256-bit integer vector of [16 x i16]. 
1894/// \param __imm 1895/// An immediate integer operand with bits [3:0] determining which vector 1896/// element is extracted and returned. 1897/// \returns A 32-bit integer containing the extracted 16 bits of zero extended 1898/// packed data. 1899static __inline int __DEFAULT_FN_ATTRS 1900_mm256_extract_epi16(__m256i __a, const int __imm) 1901{ 1902 __v16hi __b = (__v16hi)__a; 1903 return (unsigned short)__b[__imm & 15]; 1904} 1905 1906/// \brief Takes a [32 x i8] vector and returns the vector element value 1907/// indexed by the immediate constant operand. 1908/// 1909/// \headerfile <x86intrin.h> 1910/// 1911/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 1912/// instruction. 1913/// 1914/// \param __a 1915/// A 256-bit integer vector of [32 x i8]. 1916/// \param __imm 1917/// An immediate integer operand with bits [4:0] determining which vector 1918/// element is extracted and returned. 1919/// \returns A 32-bit integer containing the extracted 8 bits of zero extended 1920/// packed data. 1921static __inline int __DEFAULT_FN_ATTRS 1922_mm256_extract_epi8(__m256i __a, const int __imm) 1923{ 1924 __v32qi __b = (__v32qi)__a; 1925 return (unsigned char)__b[__imm & 31]; 1926} 1927 1928#ifdef __x86_64__ 1929/// \brief Takes a [4 x i64] vector and returns the vector element value 1930/// indexed by the immediate constant operand. 1931/// 1932/// \headerfile <x86intrin.h> 1933/// 1934/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 1935/// instruction. 1936/// 1937/// \param __a 1938/// A 256-bit integer vector of [4 x i64]. 1939/// \param __imm 1940/// An immediate integer operand with bits [1:0] determining which vector 1941/// element is extracted and returned. 1942/// \returns A 64-bit integer containing the extracted 64 bits of extended 1943/// packed data. 
1944static __inline long long __DEFAULT_FN_ATTRS 1945_mm256_extract_epi64(__m256i __a, const int __imm) 1946{ 1947 __v4di __b = (__v4di)__a; 1948 return __b[__imm & 3]; 1949} 1950#endif 1951 1952/// \brief Takes a [8 x i32] vector and replaces the vector element value 1953/// indexed by the immediate constant operand by a new value. Returns the 1954/// modified vector. 1955/// 1956/// \headerfile <x86intrin.h> 1957/// 1958/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 1959/// instruction. 1960/// 1961/// \param __a 1962/// A vector of [8 x i32] to be used by the insert operation. 1963/// \param __b 1964/// An integer value. The replacement value for the insert operation. 1965/// \param __imm 1966/// An immediate integer specifying the index of the vector element to be 1967/// replaced. 1968/// \returns A copy of vector \a __a, after replacing its element indexed by 1969/// \a __imm with \a __b. 1970static __inline __m256i __DEFAULT_FN_ATTRS 1971_mm256_insert_epi32(__m256i __a, int __b, int const __imm) 1972{ 1973 __v8si __c = (__v8si)__a; 1974 __c[__imm & 7] = __b; 1975 return (__m256i)__c; 1976} 1977 1978 1979/// \brief Takes a [16 x i16] vector and replaces the vector element value 1980/// indexed by the immediate constant operand with a new value. Returns the 1981/// modified vector. 1982/// 1983/// \headerfile <x86intrin.h> 1984/// 1985/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 1986/// instruction. 1987/// 1988/// \param __a 1989/// A vector of [16 x i16] to be used by the insert operation. 1990/// \param __b 1991/// An i16 integer value. The replacement value for the insert operation. 1992/// \param __imm 1993/// An immediate integer specifying the index of the vector element to be 1994/// replaced. 1995/// \returns A copy of vector \a __a, after replacing its element indexed by 1996/// \a __imm with \a __b. 
1997static __inline __m256i __DEFAULT_FN_ATTRS 1998_mm256_insert_epi16(__m256i __a, int __b, int const __imm) 1999{ 2000 __v16hi __c = (__v16hi)__a; 2001 __c[__imm & 15] = __b; 2002 return (__m256i)__c; 2003} 2004 2005/// \brief Takes a [32 x i8] vector and replaces the vector element value 2006/// indexed by the immediate constant operand with a new value. Returns the 2007/// modified vector. 2008/// 2009/// \headerfile <x86intrin.h> 2010/// 2011/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2012/// instruction. 2013/// 2014/// \param __a 2015/// A vector of [32 x i8] to be used by the insert operation. 2016/// \param __b 2017/// An i8 integer value. The replacement value for the insert operation. 2018/// \param __imm 2019/// An immediate integer specifying the index of the vector element to be 2020/// replaced. 2021/// \returns A copy of vector \a __a, after replacing its element indexed by 2022/// \a __imm with \a __b. 2023static __inline __m256i __DEFAULT_FN_ATTRS 2024_mm256_insert_epi8(__m256i __a, int __b, int const __imm) 2025{ 2026 __v32qi __c = (__v32qi)__a; 2027 __c[__imm & 31] = __b; 2028 return (__m256i)__c; 2029} 2030 2031#ifdef __x86_64__ 2032/// \brief Takes a [4 x i64] vector and replaces the vector element value 2033/// indexed by the immediate constant operand with a new value. Returns the 2034/// modified vector. 2035/// 2036/// \headerfile <x86intrin.h> 2037/// 2038/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2039/// instruction. 2040/// 2041/// \param __a 2042/// A vector of [4 x i64] to be used by the insert operation. 2043/// \param __b 2044/// A 64-bit integer value. The replacement value for the insert operation. 2045/// \param __imm 2046/// An immediate integer specifying the index of the vector element to be 2047/// replaced. 2048/// \returns A copy of vector \a __a, after replacing its element indexed by 2049/// \a __imm with \a __b. 
2050static __inline __m256i __DEFAULT_FN_ATTRS 2051_mm256_insert_epi64(__m256i __a, long long __b, int const __imm) 2052{ 2053 __v4di __c = (__v4di)__a; 2054 __c[__imm & 3] = __b; 2055 return (__m256i)__c; 2056} 2057#endif 2058 2059/* Conversion */ 2060/// \brief Converts a vector of [4 x i32] into a vector of [4 x double]. 2061/// 2062/// \headerfile <x86intrin.h> 2063/// 2064/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction. 2065/// 2066/// \param __a 2067/// A 128-bit integer vector of [4 x i32]. 2068/// \returns A 256-bit vector of [4 x double] containing the converted values. 2069static __inline __m256d __DEFAULT_FN_ATTRS 2070_mm256_cvtepi32_pd(__m128i __a) 2071{ 2072 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df); 2073} 2074 2075/// \brief Converts a vector of [8 x i32] into a vector of [8 x float]. 2076/// 2077/// \headerfile <x86intrin.h> 2078/// 2079/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction. 2080/// 2081/// \param __a 2082/// A 256-bit integer vector. 2083/// \returns A 256-bit vector of [8 x float] containing the converted values. 2084static __inline __m256 __DEFAULT_FN_ATTRS 2085_mm256_cvtepi32_ps(__m256i __a) 2086{ 2087 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a); 2088} 2089 2090/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of 2091/// [4 x float]. 2092/// 2093/// \headerfile <x86intrin.h> 2094/// 2095/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction. 2096/// 2097/// \param __a 2098/// A 256-bit vector of [4 x double]. 2099/// \returns A 128-bit vector of [4 x float] containing the converted values. 2100static __inline __m128 __DEFAULT_FN_ATTRS 2101_mm256_cvtpd_ps(__m256d __a) 2102{ 2103 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a); 2104} 2105 2106/// \brief Converts a vector of [8 x float] into a vector of [8 x i32]. 
2107/// 2108/// \headerfile <x86intrin.h> 2109/// 2110/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction. 2111/// 2112/// \param __a 2113/// A 256-bit vector of [8 x float]. 2114/// \returns A 256-bit integer vector containing the converted values. 2115static __inline __m256i __DEFAULT_FN_ATTRS 2116_mm256_cvtps_epi32(__m256 __a) 2117{ 2118 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a); 2119} 2120 2121/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 2122/// x double]. 2123/// 2124/// \headerfile <x86intrin.h> 2125/// 2126/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction. 2127/// 2128/// \param __a 2129/// A 128-bit vector of [4 x float]. 2130/// \returns A 256-bit vector of [4 x double] containing the converted values. 2131static __inline __m256d __DEFAULT_FN_ATTRS 2132_mm256_cvtps_pd(__m128 __a) 2133{ 2134 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df); 2135} 2136 2137/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 2138/// x i32], truncating the result by rounding towards zero when it is 2139/// inexact. 2140/// 2141/// \headerfile <x86intrin.h> 2142/// 2143/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction. 2144/// 2145/// \param __a 2146/// A 256-bit vector of [4 x double]. 2147/// \returns A 128-bit integer vector containing the converted values. 2148static __inline __m128i __DEFAULT_FN_ATTRS 2149_mm256_cvttpd_epi32(__m256d __a) 2150{ 2151 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a); 2152} 2153 2154/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 2155/// x i32]. When a conversion is inexact, the value returned is rounded 2156/// according to the rounding control bits in the MXCSR register. 2157/// 2158/// \headerfile <x86intrin.h> 2159/// 2160/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction. 
2161/// 2162/// \param __a 2163/// A 256-bit vector of [4 x double]. 2164/// \returns A 128-bit integer vector containing the converted values. 2165static __inline __m128i __DEFAULT_FN_ATTRS 2166_mm256_cvtpd_epi32(__m256d __a) 2167{ 2168 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a); 2169} 2170 2171/// \brief Converts a vector of [8 x float] into a vector of [8 x i32], 2172/// truncating the result by rounding towards zero when it is inexact. 2173/// 2174/// \headerfile <x86intrin.h> 2175/// 2176/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction. 2177/// 2178/// \param __a 2179/// A 256-bit vector of [8 x float]. 2180/// \returns A 256-bit integer vector containing the converted values. 2181static __inline __m256i __DEFAULT_FN_ATTRS 2182_mm256_cvttps_epi32(__m256 __a) 2183{ 2184 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a); 2185} 2186 2187static __inline double __DEFAULT_FN_ATTRS 2188_mm256_cvtsd_f64(__m256d __a) 2189{ 2190 return __a[0]; 2191} 2192 2193static __inline int __DEFAULT_FN_ATTRS 2194_mm256_cvtsi256_si32(__m256i __a) 2195{ 2196 __v8si __b = (__v8si)__a; 2197 return __b[0]; 2198} 2199 2200static __inline float __DEFAULT_FN_ATTRS 2201_mm256_cvtss_f32(__m256 __a) 2202{ 2203 return __a[0]; 2204} 2205 2206/* Vector replicate */ 2207/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit 2208/// vector of [8 x float] to float values in a 256-bit vector of 2209/// [8 x float]. 2210/// 2211/// \headerfile <x86intrin.h> 2212/// 2213/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction. 2214/// 2215/// \param __a 2216/// A 256-bit vector of [8 x float]. \n 2217/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of 2218/// the return value. \n 2219/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of 2220/// the return value. \n 2221/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the 2222/// return value. 
\n 2223/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the 2224/// return value. 2225/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2226/// values. 2227static __inline __m256 __DEFAULT_FN_ATTRS 2228_mm256_movehdup_ps(__m256 __a) 2229{ 2230 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7); 2231} 2232 2233/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit 2234/// vector of [8 x float] to float values in a 256-bit vector of [8 x float]. 2235/// 2236/// \headerfile <x86intrin.h> 2237/// 2238/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction. 2239/// 2240/// \param __a 2241/// A 256-bit vector of [8 x float]. \n 2242/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of 2243/// the return value. \n 2244/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of 2245/// the return value. \n 2246/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the 2247/// return value. \n 2248/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the 2249/// return value. 2250/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2251/// values. 2252static __inline __m256 __DEFAULT_FN_ATTRS 2253_mm256_moveldup_ps(__m256 __a) 2254{ 2255 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6); 2256} 2257 2258/// \brief Moves and duplicates double-precision floating point values from a 2259/// 256-bit vector of [4 x double] to double-precision values in a 256-bit 2260/// vector of [4 x double]. 2261/// 2262/// \headerfile <x86intrin.h> 2263/// 2264/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. 2265/// 2266/// \param __a 2267/// A 256-bit vector of [4 x double]. \n 2268/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the 2269/// return value. 
\n 2270/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of 2271/// the return value. 2272/// \returns A 256-bit vector of [4 x double] containing the moved and 2273/// duplicated values. 2274static __inline __m256d __DEFAULT_FN_ATTRS 2275_mm256_movedup_pd(__m256d __a) 2276{ 2277 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2); 2278} 2279 2280/* Unpack and Interleave */ 2281/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of 2282/// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2283/// 2284/// \headerfile <x86intrin.h> 2285/// 2286/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction. 2287/// 2288/// \param __a 2289/// A 256-bit floating-point vector of [4 x double]. \n 2290/// Bits [127:64] are written to bits [63:0] of the return value. \n 2291/// Bits [255:192] are written to bits [191:128] of the return value. \n 2292/// \param __b 2293/// A 256-bit floating-point vector of [4 x double]. \n 2294/// Bits [127:64] are written to bits [127:64] of the return value. \n 2295/// Bits [255:192] are written to bits [255:192] of the return value. \n 2296/// \returns A 256-bit vector of [4 x double] containing the interleaved values. 2297static __inline __m256d __DEFAULT_FN_ATTRS 2298_mm256_unpackhi_pd(__m256d __a, __m256d __b) 2299{ 2300 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2); 2301} 2302 2303/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of 2304/// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2305/// 2306/// \headerfile <x86intrin.h> 2307/// 2308/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction. 2309/// 2310/// \param __a 2311/// A 256-bit floating-point vector of [4 x double]. \n 2312/// Bits [63:0] are written to bits [63:0] of the return value. \n 2313/// Bits [191:128] are written to bits [191:128] of the return value. 
2314/// \param __b 2315/// A 256-bit floating-point vector of [4 x double]. \n 2316/// Bits [63:0] are written to bits [127:64] of the return value. \n 2317/// Bits [191:128] are written to bits [255:192] of the return value. \n 2318/// \returns A 256-bit vector of [4 x double] containing the interleaved values. 2319static __inline __m256d __DEFAULT_FN_ATTRS 2320_mm256_unpacklo_pd(__m256d __a, __m256d __b) 2321{ 2322 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2); 2323} 2324 2325/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the 2326/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2327/// vector of [8 x float]. 2328/// 2329/// \headerfile <x86intrin.h> 2330/// 2331/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction. 2332/// 2333/// \param __a 2334/// A 256-bit vector of [8 x float]. \n 2335/// Bits [95:64] are written to bits [31:0] of the return value. \n 2336/// Bits [127:96] are written to bits [95:64] of the return value. \n 2337/// Bits [223:192] are written to bits [159:128] of the return value. \n 2338/// Bits [255:224] are written to bits [223:192] of the return value. 2339/// \param __b 2340/// A 256-bit vector of [8 x float]. \n 2341/// Bits [95:64] are written to bits [63:32] of the return value. \n 2342/// Bits [127:96] are written to bits [127:96] of the return value. \n 2343/// Bits [223:192] are written to bits [191:160] of the return value. \n 2344/// Bits [255:224] are written to bits [255:224] of the return value. 2345/// \returns A 256-bit vector of [8 x float] containing the interleaved values. 
2346static __inline __m256 __DEFAULT_FN_ATTRS 2347_mm256_unpackhi_ps(__m256 __a, __m256 __b) 2348{ 2349 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); 2350} 2351 2352/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the 2353/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2354/// vector of [8 x float]. 2355/// 2356/// \headerfile <x86intrin.h> 2357/// 2358/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction. 2359/// 2360/// \param __a 2361/// A 256-bit vector of [8 x float]. \n 2362/// Bits [31:0] are written to bits [31:0] of the return value. \n 2363/// Bits [63:32] are written to bits [95:64] of the return value. \n 2364/// Bits [159:128] are written to bits [159:128] of the return value. \n 2365/// Bits [191:160] are written to bits [223:192] of the return value. 2366/// \param __b 2367/// A 256-bit vector of [8 x float]. \n 2368/// Bits [31:0] are written to bits [63:32] of the return value. \n 2369/// Bits [63:32] are written to bits [127:96] of the return value. \n 2370/// Bits [159:128] are written to bits [191:160] of the return value. \n 2371/// Bits [191:160] are written to bits [255:224] of the return value. 2372/// \returns A 256-bit vector of [8 x float] containing the interleaved values. 2373static __inline __m256 __DEFAULT_FN_ATTRS 2374_mm256_unpacklo_ps(__m256 __a, __m256 __b) 2375{ 2376 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); 2377} 2378 2379/* Bit Test */ 2380/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2381/// element-by-element comparison of the double-precision element in the 2382/// first source vector and the corresponding element in the second source 2383/// vector. 
The EFLAGS register is updated as follows: \n 2384/// If there is at least one pair of double-precision elements where the 2385/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2386/// ZF flag is set to 1. \n 2387/// If there is at least one pair of double-precision elements where the 2388/// sign-bit of the first element is 0 and the sign-bit of the second element 2389/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2390/// This intrinsic returns the value of the ZF flag. 2391/// 2392/// \headerfile <x86intrin.h> 2393/// 2394/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2395/// 2396/// \param __a 2397/// A 128-bit vector of [2 x double]. 2398/// \param __b 2399/// A 128-bit vector of [2 x double]. 2400/// \returns the ZF flag in the EFLAGS register. 2401static __inline int __DEFAULT_FN_ATTRS 2402_mm_testz_pd(__m128d __a, __m128d __b) 2403{ 2404 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b); 2405} 2406 2407/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2408/// element-by-element comparison of the double-precision element in the 2409/// first source vector and the corresponding element in the second source 2410/// vector. The EFLAGS register is updated as follows: \n 2411/// If there is at least one pair of double-precision elements where the 2412/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2413/// ZF flag is set to 1. \n 2414/// If there is at least one pair of double-precision elements where the 2415/// sign-bit of the first element is 0 and the sign-bit of the second element 2416/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2417/// This intrinsic returns the value of the CF flag. 2418/// 2419/// \headerfile <x86intrin.h> 2420/// 2421/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2422/// 2423/// \param __a 2424/// A 128-bit vector of [2 x double]. 
2425/// \param __b 2426/// A 128-bit vector of [2 x double]. 2427/// \returns the CF flag in the EFLAGS register. 2428static __inline int __DEFAULT_FN_ATTRS 2429_mm_testc_pd(__m128d __a, __m128d __b) 2430{ 2431 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b); 2432} 2433 2434/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2435/// element-by-element comparison of the double-precision element in the 2436/// first source vector and the corresponding element in the second source 2437/// vector. The EFLAGS register is updated as follows: \n 2438/// If there is at least one pair of double-precision elements where the 2439/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2440/// ZF flag is set to 1. \n 2441/// If there is at least one pair of double-precision elements where the 2442/// sign-bit of the first element is 0 and the sign-bit of the second element 2443/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2444/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2445/// otherwise it returns 0. 2446/// 2447/// \headerfile <x86intrin.h> 2448/// 2449/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2450/// 2451/// \param __a 2452/// A 128-bit vector of [2 x double]. 2453/// \param __b 2454/// A 128-bit vector of [2 x double]. 2455/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2456static __inline int __DEFAULT_FN_ATTRS 2457_mm_testnzc_pd(__m128d __a, __m128d __b) 2458{ 2459 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b); 2460} 2461 2462/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2463/// element-by-element comparison of the single-precision element in the 2464/// first source vector and the corresponding element in the second source 2465/// vector. 
The EFLAGS register is updated as follows: \n 2466/// If there is at least one pair of single-precision elements where the 2467/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2468/// ZF flag is set to 1. \n 2469/// If there is at least one pair of single-precision elements where the 2470/// sign-bit of the first element is 0 and the sign-bit of the second element 2471/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2472/// This intrinsic returns the value of the ZF flag. 2473/// 2474/// \headerfile <x86intrin.h> 2475/// 2476/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2477/// 2478/// \param __a 2479/// A 128-bit vector of [4 x float]. 2480/// \param __b 2481/// A 128-bit vector of [4 x float]. 2482/// \returns the ZF flag. 2483static __inline int __DEFAULT_FN_ATTRS 2484_mm_testz_ps(__m128 __a, __m128 __b) 2485{ 2486 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b); 2487} 2488 2489/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2490/// element-by-element comparison of the single-precision element in the 2491/// first source vector and the corresponding element in the second source 2492/// vector. The EFLAGS register is updated as follows: \n 2493/// If there is at least one pair of single-precision elements where the 2494/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2495/// ZF flag is set to 1. \n 2496/// If there is at least one pair of single-precision elements where the 2497/// sign-bit of the first element is 0 and the sign-bit of the second element 2498/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2499/// This intrinsic returns the value of the CF flag. 2500/// 2501/// \headerfile <x86intrin.h> 2502/// 2503/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2504/// 2505/// \param __a 2506/// A 128-bit vector of [4 x float]. 
2507/// \param __b 2508/// A 128-bit vector of [4 x float]. 2509/// \returns the CF flag. 2510static __inline int __DEFAULT_FN_ATTRS 2511_mm_testc_ps(__m128 __a, __m128 __b) 2512{ 2513 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b); 2514} 2515 2516/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2517/// element-by-element comparison of the single-precision element in the 2518/// first source vector and the corresponding element in the second source 2519/// vector. The EFLAGS register is updated as follows: \n 2520/// If there is at least one pair of single-precision elements where the 2521/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2522/// ZF flag is set to 1. \n 2523/// If there is at least one pair of single-precision elements where the 2524/// sign-bit of the first element is 0 and the sign-bit of the second element 2525/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2526/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2527/// otherwise it returns 0. 2528/// 2529/// \headerfile <x86intrin.h> 2530/// 2531/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2532/// 2533/// \param __a 2534/// A 128-bit vector of [4 x float]. 2535/// \param __b 2536/// A 128-bit vector of [4 x float]. 2537/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2538static __inline int __DEFAULT_FN_ATTRS 2539_mm_testnzc_ps(__m128 __a, __m128 __b) 2540{ 2541 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b); 2542} 2543 2544/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2545/// element-by-element comparison of the double-precision elements in the 2546/// first source vector and the corresponding elements in the second source 2547/// vector. 
The EFLAGS register is updated as follows: \n 2548/// If there is at least one pair of double-precision elements where the 2549/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2550/// ZF flag is set to 1. \n 2551/// If there is at least one pair of double-precision elements where the 2552/// sign-bit of the first element is 0 and the sign-bit of the second element 2553/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2554/// This intrinsic returns the value of the ZF flag. 2555/// 2556/// \headerfile <x86intrin.h> 2557/// 2558/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2559/// 2560/// \param __a 2561/// A 256-bit vector of [4 x double]. 2562/// \param __b 2563/// A 256-bit vector of [4 x double]. 2564/// \returns the ZF flag. 2565static __inline int __DEFAULT_FN_ATTRS 2566_mm256_testz_pd(__m256d __a, __m256d __b) 2567{ 2568 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b); 2569} 2570 2571/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2572/// element-by-element comparison of the double-precision elements in the 2573/// first source vector and the corresponding elements in the second source 2574/// vector. The EFLAGS register is updated as follows: \n 2575/// If there is at least one pair of double-precision elements where the 2576/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2577/// ZF flag is set to 1. \n 2578/// If there is at least one pair of double-precision elements where the 2579/// sign-bit of the first element is 0 and the sign-bit of the second element 2580/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2581/// This intrinsic returns the value of the CF flag. 2582/// 2583/// \headerfile <x86intrin.h> 2584/// 2585/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2586/// 2587/// \param __a 2588/// A 256-bit vector of [4 x double]. 
2589/// \param __b 2590/// A 256-bit vector of [4 x double]. 2591/// \returns the CF flag. 2592static __inline int __DEFAULT_FN_ATTRS 2593_mm256_testc_pd(__m256d __a, __m256d __b) 2594{ 2595 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b); 2596} 2597 2598/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2599/// element-by-element comparison of the double-precision elements in the 2600/// first source vector and the corresponding elements in the second source 2601/// vector. The EFLAGS register is updated as follows: \n 2602/// If there is at least one pair of double-precision elements where the 2603/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2604/// ZF flag is set to 1. \n 2605/// If there is at least one pair of double-precision elements where the 2606/// sign-bit of the first element is 0 and the sign-bit of the second element 2607/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2608/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2609/// otherwise it returns 0. 2610/// 2611/// \headerfile <x86intrin.h> 2612/// 2613/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2614/// 2615/// \param __a 2616/// A 256-bit vector of [4 x double]. 2617/// \param __b 2618/// A 256-bit vector of [4 x double]. 2619/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2620static __inline int __DEFAULT_FN_ATTRS 2621_mm256_testnzc_pd(__m256d __a, __m256d __b) 2622{ 2623 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b); 2624} 2625 2626/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2627/// element-by-element comparison of the single-precision element in the 2628/// first source vector and the corresponding element in the second source 2629/// vector. 
The EFLAGS register is updated as follows: \n 2630/// If there is at least one pair of single-precision elements where the 2631/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2632/// ZF flag is set to 1. \n 2633/// If there is at least one pair of single-precision elements where the 2634/// sign-bit of the first element is 0 and the sign-bit of the second element 2635/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2636/// This intrinsic returns the value of the ZF flag. 2637/// 2638/// \headerfile <x86intrin.h> 2639/// 2640/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2641/// 2642/// \param __a 2643/// A 256-bit vector of [8 x float]. 2644/// \param __b 2645/// A 256-bit vector of [8 x float]. 2646/// \returns the ZF flag. 2647static __inline int __DEFAULT_FN_ATTRS 2648_mm256_testz_ps(__m256 __a, __m256 __b) 2649{ 2650 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b); 2651} 2652 2653/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2654/// element-by-element comparison of the single-precision element in the 2655/// first source vector and the corresponding element in the second source 2656/// vector. The EFLAGS register is updated as follows: \n 2657/// If there is at least one pair of single-precision elements where the 2658/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2659/// ZF flag is set to 1. \n 2660/// If there is at least one pair of single-precision elements where the 2661/// sign-bit of the first element is 0 and the sign-bit of the second element 2662/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2663/// This intrinsic returns the value of the CF flag. 2664/// 2665/// \headerfile <x86intrin.h> 2666/// 2667/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2668/// 2669/// \param __a 2670/// A 256-bit vector of [8 x float]. 
2671/// \param __b 2672/// A 256-bit vector of [8 x float]. 2673/// \returns the CF flag. 2674static __inline int __DEFAULT_FN_ATTRS 2675_mm256_testc_ps(__m256 __a, __m256 __b) 2676{ 2677 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b); 2678} 2679 2680/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2681/// element-by-element comparison of the single-precision elements in the 2682/// first source vector and the corresponding elements in the second source 2683/// vector. The EFLAGS register is updated as follows: \n 2684/// If there is at least one pair of single-precision elements where the 2685/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2686/// ZF flag is set to 1. \n 2687/// If there is at least one pair of single-precision elements where the 2688/// sign-bit of the first element is 0 and the sign-bit of the second element 2689/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2690/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2691/// otherwise it returns 0. 2692/// 2693/// \headerfile <x86intrin.h> 2694/// 2695/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2696/// 2697/// \param __a 2698/// A 256-bit vector of [8 x float]. 2699/// \param __b 2700/// A 256-bit vector of [8 x float]. 2701/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2702static __inline int __DEFAULT_FN_ATTRS 2703_mm256_testnzc_ps(__m256 __a, __m256 __b) 2704{ 2705 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b); 2706} 2707 2708/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2709/// of the two source vectors and update the EFLAGS register as follows: \n 2710/// If there is at least one pair of bits where both bits are 1, the ZF flag 2711/// is set to 0. Otherwise the ZF flag is set to 1. 
\n 2712/// If there is at least one pair of bits where the bit from the first source 2713/// vector is 0 and the bit from the second source vector is 1, the CF flag 2714/// is set to 0. Otherwise the CF flag is set to 1. \n 2715/// This intrinsic returns the value of the ZF flag. 2716/// 2717/// \headerfile <x86intrin.h> 2718/// 2719/// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2720/// 2721/// \param __a 2722/// A 256-bit integer vector. 2723/// \param __b 2724/// A 256-bit integer vector. 2725/// \returns the ZF flag. 2726static __inline int __DEFAULT_FN_ATTRS 2727_mm256_testz_si256(__m256i __a, __m256i __b) 2728{ 2729 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b); 2730} 2731 2732/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2733/// of the two source vectors and update the EFLAGS register as follows: \n 2734/// If there is at least one pair of bits where both bits are 1, the ZF flag 2735/// is set to 0. Otherwise the ZF flag is set to 1. \n 2736/// If there is at least one pair of bits where the bit from the first source 2737/// vector is 0 and the bit from the second source vector is 1, the CF flag 2738/// is set to 0. Otherwise the CF flag is set to 1. \n 2739/// This intrinsic returns the value of the CF flag. 2740/// 2741/// \headerfile <x86intrin.h> 2742/// 2743/// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2744/// 2745/// \param __a 2746/// A 256-bit integer vector. 2747/// \param __b 2748/// A 256-bit integer vector. 2749/// \returns the CF flag. 
2750static __inline int __DEFAULT_FN_ATTRS 2751_mm256_testc_si256(__m256i __a, __m256i __b) 2752{ 2753 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b); 2754} 2755 2756/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2757/// of the two source vectors and update the EFLAGS register as follows: \n 2758/// If there is at least one pair of bits where both bits are 1, the ZF flag 2759/// is set to 0. Otherwise the ZF flag is set to 1. \n 2760/// If there is at least one pair of bits where the bit from the first source 2761/// vector is 0 and the bit from the second source vector is 1, the CF flag 2762/// is set to 0. Otherwise the CF flag is set to 1. \n 2763/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2764/// otherwise it returns 0. 2765/// 2766/// \headerfile <x86intrin.h> 2767/// 2768/// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2769/// 2770/// \param __a 2771/// A 256-bit integer vector. 2772/// \param __b 2773/// A 256-bit integer vector. 2774/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2775static __inline int __DEFAULT_FN_ATTRS 2776_mm256_testnzc_si256(__m256i __a, __m256i __b) 2777{ 2778 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b); 2779} 2780 2781/* Vector extract sign mask */ 2782/// \brief Extracts the sign bits of double-precision floating point elements 2783/// in a 256-bit vector of [4 x double] and writes them to the lower order 2784/// bits of the return value. 2785/// 2786/// \headerfile <x86intrin.h> 2787/// 2788/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction. 2789/// 2790/// \param __a 2791/// A 256-bit vector of [4 x double] containing the double-precision 2792/// floating point values with sign bits to be extracted. 2793/// \returns The sign bits from the operand, written to bits [3:0]. 
2794static __inline int __DEFAULT_FN_ATTRS 2795_mm256_movemask_pd(__m256d __a) 2796{ 2797 return __builtin_ia32_movmskpd256((__v4df)__a); 2798} 2799 2800/// \brief Extracts the sign bits of double-precision floating point elements 2801/// in a 256-bit vector of [8 x float] and writes them to the lower order 2802/// bits of the return value. 2803/// 2804/// \headerfile <x86intrin.h> 2805/// 2806/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction. 2807/// 2808/// \param __a 2809/// A 256-bit vector of [8 x float] containing the double-precision floating 2810/// point values with sign bits to be extracted. 2811/// \returns The sign bits from the operand, written to bits [7:0]. 2812static __inline int __DEFAULT_FN_ATTRS 2813_mm256_movemask_ps(__m256 __a) 2814{ 2815 return __builtin_ia32_movmskps256((__v8sf)__a); 2816} 2817 2818/* Vector __zero */ 2819/// \brief Zeroes the contents of all XMM or YMM registers. 2820/// 2821/// \headerfile <x86intrin.h> 2822/// 2823/// This intrinsic corresponds to the <c> VZEROALL </c> instruction. 2824static __inline void __DEFAULT_FN_ATTRS 2825_mm256_zeroall(void) 2826{ 2827 __builtin_ia32_vzeroall(); 2828} 2829 2830/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers. 2831/// 2832/// \headerfile <x86intrin.h> 2833/// 2834/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction. 2835static __inline void __DEFAULT_FN_ATTRS 2836_mm256_zeroupper(void) 2837{ 2838 __builtin_ia32_vzeroupper(); 2839} 2840 2841/* Vector load with broadcast */ 2842/// \brief Loads a scalar single-precision floating point value from the 2843/// specified address pointed to by \a __a and broadcasts it to the elements 2844/// of a [4 x float] vector. 2845/// 2846/// \headerfile <x86intrin.h> 2847/// 2848/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 2849/// 2850/// \param __a 2851/// The single-precision floating point value to be broadcast. 
2852/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set 2853/// equal to the broadcast value. 2854static __inline __m128 __DEFAULT_FN_ATTRS 2855_mm_broadcast_ss(float const *__a) 2856{ 2857 float __f = *__a; 2858 return (__m128)(__v4sf){ __f, __f, __f, __f }; 2859} 2860 2861/// \brief Loads a scalar double-precision floating point value from the 2862/// specified address pointed to by \a __a and broadcasts it to the elements 2863/// of a [4 x double] vector. 2864/// 2865/// \headerfile <x86intrin.h> 2866/// 2867/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction. 2868/// 2869/// \param __a 2870/// The double-precision floating point value to be broadcast. 2871/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set 2872/// equal to the broadcast value. 2873static __inline __m256d __DEFAULT_FN_ATTRS 2874_mm256_broadcast_sd(double const *__a) 2875{ 2876 double __d = *__a; 2877 return (__m256d)(__v4df){ __d, __d, __d, __d }; 2878} 2879 2880/// \brief Loads a scalar single-precision floating point value from the 2881/// specified address pointed to by \a __a and broadcasts it to the elements 2882/// of a [8 x float] vector. 2883/// 2884/// \headerfile <x86intrin.h> 2885/// 2886/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 2887/// 2888/// \param __a 2889/// The single-precision floating point value to be broadcast. 2890/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set 2891/// equal to the broadcast value. 2892static __inline __m256 __DEFAULT_FN_ATTRS 2893_mm256_broadcast_ss(float const *__a) 2894{ 2895 float __f = *__a; 2896 return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f }; 2897} 2898 2899/// \brief Loads the data from a 128-bit vector of [2 x double] from the 2900/// specified address pointed to by \a __a and broadcasts it to 128-bit 2901/// elements in a 256-bit vector of [4 x double]. 
2902/// 2903/// \headerfile <x86intrin.h> 2904/// 2905/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 2906/// 2907/// \param __a 2908/// The 128-bit vector of [2 x double] to be broadcast. 2909/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set 2910/// equal to the broadcast value. 2911static __inline __m256d __DEFAULT_FN_ATTRS 2912_mm256_broadcast_pd(__m128d const *__a) 2913{ 2914 return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a); 2915} 2916 2917/// \brief Loads the data from a 128-bit vector of [4 x float] from the 2918/// specified address pointed to by \a __a and broadcasts it to 128-bit 2919/// elements in a 256-bit vector of [8 x float]. 2920/// 2921/// \headerfile <x86intrin.h> 2922/// 2923/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 2924/// 2925/// \param __a 2926/// The 128-bit vector of [4 x float] to be broadcast. 2927/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set 2928/// equal to the broadcast value. 2929static __inline __m256 __DEFAULT_FN_ATTRS 2930_mm256_broadcast_ps(__m128 const *__a) 2931{ 2932 return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a); 2933} 2934 2935/* SIMD load ops */ 2936/// \brief Loads 4 double-precision floating point values from a 32-byte aligned 2937/// memory location pointed to by \a __p into a vector of [4 x double]. 2938/// 2939/// \headerfile <x86intrin.h> 2940/// 2941/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 2942/// 2943/// \param __p 2944/// A 32-byte aligned pointer to a memory location containing 2945/// double-precision floating point values. 2946/// \returns A 256-bit vector of [4 x double] containing the moved values. 
2947static __inline __m256d __DEFAULT_FN_ATTRS 2948_mm256_load_pd(double const *__p) 2949{ 2950 return *(__m256d *)__p; 2951} 2952 2953/// \brief Loads 8 single-precision floating point values from a 32-byte aligned 2954/// memory location pointed to by \a __p into a vector of [8 x float]. 2955/// 2956/// \headerfile <x86intrin.h> 2957/// 2958/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 2959/// 2960/// \param __p 2961/// A 32-byte aligned pointer to a memory location containing float values. 2962/// \returns A 256-bit vector of [8 x float] containing the moved values. 2963static __inline __m256 __DEFAULT_FN_ATTRS 2964_mm256_load_ps(float const *__p) 2965{ 2966 return *(__m256 *)__p; 2967} 2968 2969/// \brief Loads 4 double-precision floating point values from an unaligned 2970/// memory location pointed to by \a __p into a vector of [4 x double]. 2971/// 2972/// \headerfile <x86intrin.h> 2973/// 2974/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 2975/// 2976/// \param __p 2977/// A pointer to a memory location containing double-precision floating 2978/// point values. 2979/// \returns A 256-bit vector of [4 x double] containing the moved values. 2980static __inline __m256d __DEFAULT_FN_ATTRS 2981_mm256_loadu_pd(double const *__p) 2982{ 2983 struct __loadu_pd { 2984 __m256d __v; 2985 } __attribute__((__packed__, __may_alias__)); 2986 return ((struct __loadu_pd*)__p)->__v; 2987} 2988 2989/// \brief Loads 8 single-precision floating point values from an unaligned 2990/// memory location pointed to by \a __p into a vector of [8 x float]. 2991/// 2992/// \headerfile <x86intrin.h> 2993/// 2994/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 2995/// 2996/// \param __p 2997/// A pointer to a memory location containing single-precision floating 2998/// point values. 2999/// \returns A 256-bit vector of [8 x float] containing the moved values. 
3000static __inline __m256 __DEFAULT_FN_ATTRS 3001_mm256_loadu_ps(float const *__p) 3002{ 3003 struct __loadu_ps { 3004 __m256 __v; 3005 } __attribute__((__packed__, __may_alias__)); 3006 return ((struct __loadu_ps*)__p)->__v; 3007} 3008 3009/// \brief Loads 256 bits of integer data from a 32-byte aligned memory 3010/// location pointed to by \a __p into elements of a 256-bit integer vector. 3011/// 3012/// \headerfile <x86intrin.h> 3013/// 3014/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3015/// 3016/// \param __p 3017/// A 32-byte aligned pointer to a 256-bit integer vector containing integer 3018/// values. 3019/// \returns A 256-bit integer vector containing the moved values. 3020static __inline __m256i __DEFAULT_FN_ATTRS 3021_mm256_load_si256(__m256i const *__p) 3022{ 3023 return *__p; 3024} 3025 3026/// \brief Loads 256 bits of integer data from an unaligned memory location 3027/// pointed to by \a __p into a 256-bit integer vector. 3028/// 3029/// \headerfile <x86intrin.h> 3030/// 3031/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 3032/// 3033/// \param __p 3034/// A pointer to a 256-bit integer vector containing integer values. 3035/// \returns A 256-bit integer vector containing the moved values. 3036static __inline __m256i __DEFAULT_FN_ATTRS 3037_mm256_loadu_si256(__m256i const *__p) 3038{ 3039 struct __loadu_si256 { 3040 __m256i __v; 3041 } __attribute__((__packed__, __may_alias__)); 3042 return ((struct __loadu_si256*)__p)->__v; 3043} 3044 3045/// \brief Loads 256 bits of integer data from an unaligned memory location 3046/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may 3047/// perform better than \c _mm256_loadu_si256 when the data crosses a cache 3048/// line boundary. 3049/// 3050/// \headerfile <x86intrin.h> 3051/// 3052/// This intrinsic corresponds to the <c> VLDDQU </c> instruction. 
3053/// 3054/// \param __p 3055/// A pointer to a 256-bit integer vector containing integer values. 3056/// \returns A 256-bit integer vector containing the moved values. 3057static __inline __m256i __DEFAULT_FN_ATTRS 3058_mm256_lddqu_si256(__m256i const *__p) 3059{ 3060 return (__m256i)__builtin_ia32_lddqu256((char const *)__p); 3061} 3062 3063/* SIMD store ops */ 3064/// \brief Stores double-precision floating point values from a 256-bit vector 3065/// of [4 x double] to a 32-byte aligned memory location pointed to by 3066/// \a __p. 3067/// 3068/// \headerfile <x86intrin.h> 3069/// 3070/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 3071/// 3072/// \param __p 3073/// A 32-byte aligned pointer to a memory location that will receive the 3074/// double-precision floaing point values. 3075/// \param __a 3076/// A 256-bit vector of [4 x double] containing the values to be moved. 3077static __inline void __DEFAULT_FN_ATTRS 3078_mm256_store_pd(double *__p, __m256d __a) 3079{ 3080 *(__m256d *)__p = __a; 3081} 3082 3083/// \brief Stores single-precision floating point values from a 256-bit vector 3084/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p. 3085/// 3086/// \headerfile <x86intrin.h> 3087/// 3088/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 3089/// 3090/// \param __p 3091/// A 32-byte aligned pointer to a memory location that will receive the 3092/// float values. 3093/// \param __a 3094/// A 256-bit vector of [8 x float] containing the values to be moved. 3095static __inline void __DEFAULT_FN_ATTRS 3096_mm256_store_ps(float *__p, __m256 __a) 3097{ 3098 *(__m256 *)__p = __a; 3099} 3100 3101/// \brief Stores double-precision floating point values from a 256-bit vector 3102/// of [4 x double] to an unaligned memory location pointed to by \a __p. 3103/// 3104/// \headerfile <x86intrin.h> 3105/// 3106/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 
3107/// 3108/// \param __p 3109/// A pointer to a memory location that will receive the double-precision 3110/// floating point values. 3111/// \param __a 3112/// A 256-bit vector of [4 x double] containing the values to be moved. 3113static __inline void __DEFAULT_FN_ATTRS 3114_mm256_storeu_pd(double *__p, __m256d __a) 3115{ 3116 struct __storeu_pd { 3117 __m256d __v; 3118 } __attribute__((__packed__, __may_alias__)); 3119 ((struct __storeu_pd*)__p)->__v = __a; 3120} 3121 3122/// \brief Stores single-precision floating point values from a 256-bit vector 3123/// of [8 x float] to an unaligned memory location pointed to by \a __p. 3124/// 3125/// \headerfile <x86intrin.h> 3126/// 3127/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 3128/// 3129/// \param __p 3130/// A pointer to a memory location that will receive the float values. 3131/// \param __a 3132/// A 256-bit vector of [8 x float] containing the values to be moved. 3133static __inline void __DEFAULT_FN_ATTRS 3134_mm256_storeu_ps(float *__p, __m256 __a) 3135{ 3136 struct __storeu_ps { 3137 __m256 __v; 3138 } __attribute__((__packed__, __may_alias__)); 3139 ((struct __storeu_ps*)__p)->__v = __a; 3140} 3141 3142/// \brief Stores integer values from a 256-bit integer vector to a 32-byte 3143/// aligned memory location pointed to by \a __p. 3144/// 3145/// \headerfile <x86intrin.h> 3146/// 3147/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3148/// 3149/// \param __p 3150/// A 32-byte aligned pointer to a memory location that will receive the 3151/// integer values. 3152/// \param __a 3153/// A 256-bit integer vector containing the values to be moved. 3154static __inline void __DEFAULT_FN_ATTRS 3155_mm256_store_si256(__m256i *__p, __m256i __a) 3156{ 3157 *__p = __a; 3158} 3159 3160/// \brief Stores integer values from a 256-bit integer vector to an unaligned 3161/// memory location pointed to by \a __p. 
3162/// 3163/// \headerfile <x86intrin.h> 3164/// 3165/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 3166/// 3167/// \param __p 3168/// A pointer to a memory location that will receive the integer values. 3169/// \param __a 3170/// A 256-bit integer vector containing the values to be moved. 3171static __inline void __DEFAULT_FN_ATTRS 3172_mm256_storeu_si256(__m256i *__p, __m256i __a) 3173{ 3174 struct __storeu_si256 { 3175 __m256i __v; 3176 } __attribute__((__packed__, __may_alias__)); 3177 ((struct __storeu_si256*)__p)->__v = __a; 3178} 3179 3180/* Conditional load ops */ 3181/// \brief Conditionally loads double-precision floating point elements from a 3182/// memory location pointed to by \a __p into a 128-bit vector of 3183/// [2 x double], depending on the mask bits associated with each data 3184/// element. 3185/// 3186/// \headerfile <x86intrin.h> 3187/// 3188/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3189/// 3190/// \param __p 3191/// A pointer to a memory location that contains the double-precision 3192/// floating point values. 3193/// \param __m 3194/// A 128-bit integer vector containing the mask. The most significant bit of 3195/// each data element represents the mask bits. If a mask bit is zero, the 3196/// corresponding value in the memory location is not loaded and the 3197/// corresponding field in the return value is set to zero. 3198/// \returns A 128-bit vector of [2 x double] containing the loaded values. 3199static __inline __m128d __DEFAULT_FN_ATTRS 3200_mm_maskload_pd(double const *__p, __m128i __m) 3201{ 3202 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m); 3203} 3204 3205/// \brief Conditionally loads double-precision floating point elements from a 3206/// memory location pointed to by \a __p into a 256-bit vector of 3207/// [4 x double], depending on the mask bits associated with each data 3208/// element. 
3209/// 3210/// \headerfile <x86intrin.h> 3211/// 3212/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3213/// 3214/// \param __p 3215/// A pointer to a memory location that contains the double-precision 3216/// floating point values. 3217/// \param __m 3218/// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3219/// significant bit of each quadword element represents the mask bits. If a 3220/// mask bit is zero, the corresponding value in the memory location is not 3221/// loaded and the corresponding field in the return value is set to zero. 3222/// \returns A 256-bit vector of [4 x double] containing the loaded values. 3223static __inline __m256d __DEFAULT_FN_ATTRS 3224_mm256_maskload_pd(double const *__p, __m256i __m) 3225{ 3226 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p, 3227 (__v4di)__m); 3228} 3229 3230/// \brief Conditionally loads single-precision floating point elements from a 3231/// memory location pointed to by \a __p into a 128-bit vector of 3232/// [4 x float], depending on the mask bits associated with each data 3233/// element. 3234/// 3235/// \headerfile <x86intrin.h> 3236/// 3237/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3238/// 3239/// \param __p 3240/// A pointer to a memory location that contains the single-precision 3241/// floating point values. 3242/// \param __m 3243/// A 128-bit integer vector containing the mask. The most significant bit of 3244/// each data element represents the mask bits. If a mask bit is zero, the 3245/// corresponding value in the memory location is not loaded and the 3246/// corresponding field in the return value is set to zero. 3247/// \returns A 128-bit vector of [4 x float] containing the loaded values. 
3248static __inline __m128 __DEFAULT_FN_ATTRS 3249_mm_maskload_ps(float const *__p, __m128i __m) 3250{ 3251 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m); 3252} 3253 3254/// \brief Conditionally loads single-precision floating point elements from a 3255/// memory location pointed to by \a __p into a 256-bit vector of 3256/// [8 x float], depending on the mask bits associated with each data 3257/// element. 3258/// 3259/// \headerfile <x86intrin.h> 3260/// 3261/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3262/// 3263/// \param __p 3264/// A pointer to a memory location that contains the single-precision 3265/// floating point values. 3266/// \param __m 3267/// A 256-bit integer vector of [8 x dword] containing the mask. The most 3268/// significant bit of each dword element represents the mask bits. If a mask 3269/// bit is zero, the corresponding value in the memory location is not loaded 3270/// and the corresponding field in the return value is set to zero. 3271/// \returns A 256-bit vector of [8 x float] containing the loaded values. 3272static __inline __m256 __DEFAULT_FN_ATTRS 3273_mm256_maskload_ps(float const *__p, __m256i __m) 3274{ 3275 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m); 3276} 3277 3278/* Conditional store ops */ 3279/// \brief Moves single-precision floating point values from a 256-bit vector 3280/// of [8 x float] to a memory location pointed to by \a __p, according to 3281/// the specified mask. 3282/// 3283/// \headerfile <x86intrin.h> 3284/// 3285/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3286/// 3287/// \param __p 3288/// A pointer to a memory location that will receive the float values. 3289/// \param __m 3290/// A 256-bit integer vector of [8 x dword] containing the mask. The most 3291/// significant bit of each dword element in the mask vector represents the 3292/// mask bits. 
If a mask bit is zero, the corresponding value from vector 3293/// \a __a is not stored and the corresponding field in the memory location 3294/// pointed to by \a __p is not changed. 3295/// \param __a 3296/// A 256-bit vector of [8 x float] containing the values to be stored. 3297static __inline void __DEFAULT_FN_ATTRS 3298_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a) 3299{ 3300 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a); 3301} 3302 3303/// \brief Moves double-precision values from a 128-bit vector of [2 x double] 3304/// to a memory location pointed to by \a __p, according to the specified 3305/// mask. 3306/// 3307/// \headerfile <x86intrin.h> 3308/// 3309/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3310/// 3311/// \param __p 3312/// A pointer to a memory location that will receive the float values. 3313/// \param __m 3314/// A 128-bit integer vector containing the mask. The most significant bit of 3315/// each field in the mask vector represents the mask bits. If a mask bit is 3316/// zero, the corresponding value from vector \a __a is not stored and the 3317/// corresponding field in the memory location pointed to by \a __p is not 3318/// changed. 3319/// \param __a 3320/// A 128-bit vector of [2 x double] containing the values to be stored. 3321static __inline void __DEFAULT_FN_ATTRS 3322_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a) 3323{ 3324 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a); 3325} 3326 3327/// \brief Moves double-precision values from a 256-bit vector of [4 x double] 3328/// to a memory location pointed to by \a __p, according to the specified 3329/// mask. 3330/// 3331/// \headerfile <x86intrin.h> 3332/// 3333/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3334/// 3335/// \param __p 3336/// A pointer to a memory location that will receive the float values. 
3337/// \param __m 3338/// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3339/// significant bit of each quadword element in the mask vector represents 3340/// the mask bits. If a mask bit is zero, the corresponding value from vector 3341/// __a is not stored and the corresponding field in the memory location 3342/// pointed to by \a __p is not changed. 3343/// \param __a 3344/// A 256-bit vector of [4 x double] containing the values to be stored. 3345static __inline void __DEFAULT_FN_ATTRS 3346_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a) 3347{ 3348 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a); 3349} 3350 3351/// \brief Moves single-precision floating point values from a 128-bit vector 3352/// of [4 x float] to a memory location pointed to by \a __p, according to 3353/// the specified mask. 3354/// 3355/// \headerfile <x86intrin.h> 3356/// 3357/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3358/// 3359/// \param __p 3360/// A pointer to a memory location that will receive the float values. 3361/// \param __m 3362/// A 128-bit integer vector containing the mask. The most significant bit of 3363/// each field in the mask vector represents the mask bits. If a mask bit is 3364/// zero, the corresponding value from vector __a is not stored and the 3365/// corresponding field in the memory location pointed to by \a __p is not 3366/// changed. 3367/// \param __a 3368/// A 128-bit vector of [4 x float] containing the values to be stored. 3369static __inline void __DEFAULT_FN_ATTRS 3370_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) 3371{ 3372 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a); 3373} 3374 3375/* Cacheability support ops */ 3376/// \brief Moves integer data from a 256-bit integer vector to a 32-byte 3377/// aligned memory location. To minimize caching, the data is flagged as 3378/// non-temporal (unlikely to be used again soon). 
3379/// 3380/// \headerfile <x86intrin.h> 3381/// 3382/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction. 3383/// 3384/// \param __a 3385/// A pointer to a 32-byte aligned memory location that will receive the 3386/// integer values. 3387/// \param __b 3388/// A 256-bit integer vector containing the values to be moved. 3389static __inline void __DEFAULT_FN_ATTRS 3390_mm256_stream_si256(__m256i *__a, __m256i __b) 3391{ 3392 __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a); 3393} 3394 3395/// \brief Moves double-precision values from a 256-bit vector of [4 x double] 3396/// to a 32-byte aligned memory location. To minimize caching, the data is 3397/// flagged as non-temporal (unlikely to be used again soon). 3398/// 3399/// \headerfile <x86intrin.h> 3400/// 3401/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction. 3402/// 3403/// \param __a 3404/// A pointer to a 32-byte aligned memory location that will receive the 3405/// integer values. 3406/// \param __b 3407/// A 256-bit vector of [4 x double] containing the values to be moved. 3408static __inline void __DEFAULT_FN_ATTRS 3409_mm256_stream_pd(double *__a, __m256d __b) 3410{ 3411 __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a); 3412} 3413 3414/// \brief Moves single-precision floating point values from a 256-bit vector 3415/// of [8 x float] to a 32-byte aligned memory location. To minimize 3416/// caching, the data is flagged as non-temporal (unlikely to be used again 3417/// soon). 3418/// 3419/// \headerfile <x86intrin.h> 3420/// 3421/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction. 3422/// 3423/// \param __p 3424/// A pointer to a 32-byte aligned memory location that will receive the 3425/// single-precision floating point values. 3426/// \param __a 3427/// A 256-bit vector of [8 x float] containing the values to be moved. 
3428static __inline void __DEFAULT_FN_ATTRS 3429_mm256_stream_ps(float *__p, __m256 __a) 3430{ 3431 __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p); 3432} 3433 3434/* Create vectors */ 3435/// \brief Create a 256-bit vector of [4 x double] with undefined values. 3436/// 3437/// \headerfile <x86intrin.h> 3438/// 3439/// This intrinsic has no corresponding instruction. 3440/// 3441/// \returns A 256-bit vector of [4 x double] containing undefined values. 3442static __inline__ __m256d __DEFAULT_FN_ATTRS 3443_mm256_undefined_pd(void) 3444{ 3445 return (__m256d)__builtin_ia32_undef256(); 3446} 3447 3448/// \brief Create a 256-bit vector of [8 x float] with undefined values. 3449/// 3450/// \headerfile <x86intrin.h> 3451/// 3452/// This intrinsic has no corresponding instruction. 3453/// 3454/// \returns A 256-bit vector of [8 x float] containing undefined values. 3455static __inline__ __m256 __DEFAULT_FN_ATTRS 3456_mm256_undefined_ps(void) 3457{ 3458 return (__m256)__builtin_ia32_undef256(); 3459} 3460 3461/// \brief Create a 256-bit integer vector with undefined values. 3462/// 3463/// \headerfile <x86intrin.h> 3464/// 3465/// This intrinsic has no corresponding instruction. 3466/// 3467/// \returns A 256-bit integer vector containing undefined values. 3468static __inline__ __m256i __DEFAULT_FN_ATTRS 3469_mm256_undefined_si256(void) 3470{ 3471 return (__m256i)__builtin_ia32_undef256(); 3472} 3473 3474/// \brief Constructs a 256-bit floating-point vector of [4 x double] 3475/// initialized with the specified double-precision floating-point values. 3476/// 3477/// \headerfile <x86intrin.h> 3478/// 3479/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3480/// instruction. 3481/// 3482/// \param __a 3483/// A double-precision floating-point value used to initialize bits [255:192] 3484/// of the result. 3485/// \param __b 3486/// A double-precision floating-point value used to initialize bits [191:128] 3487/// of the result. 
3488/// \param __c 3489/// A double-precision floating-point value used to initialize bits [127:64] 3490/// of the result. 3491/// \param __d 3492/// A double-precision floating-point value used to initialize bits [63:0] 3493/// of the result. 3494/// \returns An initialized 256-bit floating-point vector of [4 x double]. 3495static __inline __m256d __DEFAULT_FN_ATTRS 3496_mm256_set_pd(double __a, double __b, double __c, double __d) 3497{ 3498 return (__m256d){ __d, __c, __b, __a }; 3499} 3500 3501/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized 3502/// with the specified single-precision floating-point values. 3503/// 3504/// \headerfile <x86intrin.h> 3505/// 3506/// This intrinsic is a utility function and does not correspond to a specific 3507/// instruction. 3508/// 3509/// \param __a 3510/// A single-precision floating-point value used to initialize bits [255:224] 3511/// of the result. 3512/// \param __b 3513/// A single-precision floating-point value used to initialize bits [223:192] 3514/// of the result. 3515/// \param __c 3516/// A single-precision floating-point value used to initialize bits [191:160] 3517/// of the result. 3518/// \param __d 3519/// A single-precision floating-point value used to initialize bits [159:128] 3520/// of the result. 3521/// \param __e 3522/// A single-precision floating-point value used to initialize bits [127:96] 3523/// of the result. 3524/// \param __f 3525/// A single-precision floating-point value used to initialize bits [95:64] 3526/// of the result. 3527/// \param __g 3528/// A single-precision floating-point value used to initialize bits [63:32] 3529/// of the result. 3530/// \param __h 3531/// A single-precision floating-point value used to initialize bits [31:0] 3532/// of the result. 3533/// \returns An initialized 256-bit floating-point vector of [8 x float]. 
3534static __inline __m256 __DEFAULT_FN_ATTRS 3535_mm256_set_ps(float __a, float __b, float __c, float __d, 3536 float __e, float __f, float __g, float __h) 3537{ 3538 return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; 3539} 3540 3541/// \brief Constructs a 256-bit integer vector initialized with the specified 3542/// 32-bit integral values. 3543/// 3544/// \headerfile <x86intrin.h> 3545/// 3546/// This intrinsic is a utility function and does not correspond to a specific 3547/// instruction. 3548/// 3549/// \param __i0 3550/// A 32-bit integral value used to initialize bits [255:224] of the result. 3551/// \param __i1 3552/// A 32-bit integral value used to initialize bits [223:192] of the result. 3553/// \param __i2 3554/// A 32-bit integral value used to initialize bits [191:160] of the result. 3555/// \param __i3 3556/// A 32-bit integral value used to initialize bits [159:128] of the result. 3557/// \param __i4 3558/// A 32-bit integral value used to initialize bits [127:96] of the result. 3559/// \param __i5 3560/// A 32-bit integral value used to initialize bits [95:64] of the result. 3561/// \param __i6 3562/// A 32-bit integral value used to initialize bits [63:32] of the result. 3563/// \param __i7 3564/// A 32-bit integral value used to initialize bits [31:0] of the result. 3565/// \returns An initialized 256-bit integer vector. 3566static __inline __m256i __DEFAULT_FN_ATTRS 3567_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, 3568 int __i4, int __i5, int __i6, int __i7) 3569{ 3570 return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; 3571} 3572 3573/// \brief Constructs a 256-bit integer vector initialized with the specified 3574/// 16-bit integral values. 3575/// 3576/// \headerfile <x86intrin.h> 3577/// 3578/// This intrinsic is a utility function and does not correspond to a specific 3579/// instruction. 3580/// 3581/// \param __w15 3582/// A 16-bit integral value used to initialize bits [255:240] of the result. 
3583/// \param __w14 3584/// A 16-bit integral value used to initialize bits [239:224] of the result. 3585/// \param __w13 3586/// A 16-bit integral value used to initialize bits [223:208] of the result. 3587/// \param __w12 3588/// A 16-bit integral value used to initialize bits [207:192] of the result. 3589/// \param __w11 3590/// A 16-bit integral value used to initialize bits [191:176] of the result. 3591/// \param __w10 3592/// A 16-bit integral value used to initialize bits [175:160] of the result. 3593/// \param __w09 3594/// A 16-bit integral value used to initialize bits [159:144] of the result. 3595/// \param __w08 3596/// A 16-bit integral value used to initialize bits [143:128] of the result. 3597/// \param __w07 3598/// A 16-bit integral value used to initialize bits [127:112] of the result. 3599/// \param __w06 3600/// A 16-bit integral value used to initialize bits [111:96] of the result. 3601/// \param __w05 3602/// A 16-bit integral value used to initialize bits [95:80] of the result. 3603/// \param __w04 3604/// A 16-bit integral value used to initialize bits [79:64] of the result. 3605/// \param __w03 3606/// A 16-bit integral value used to initialize bits [63:48] of the result. 3607/// \param __w02 3608/// A 16-bit integral value used to initialize bits [47:32] of the result. 3609/// \param __w01 3610/// A 16-bit integral value used to initialize bits [31:16] of the result. 3611/// \param __w00 3612/// A 16-bit integral value used to initialize bits [15:0] of the result. 3613/// \returns An initialized 256-bit integer vector. 
3614static __inline __m256i __DEFAULT_FN_ATTRS 3615_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, 3616 short __w11, short __w10, short __w09, short __w08, 3617 short __w07, short __w06, short __w05, short __w04, 3618 short __w03, short __w02, short __w01, short __w00) 3619{ 3620 return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06, 3621 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 }; 3622} 3623 3624/// \brief Constructs a 256-bit integer vector initialized with the specified 3625/// 8-bit integral values. 3626/// 3627/// \headerfile <x86intrin.h> 3628/// 3629/// This intrinsic is a utility function and does not correspond to a specific 3630/// instruction. 3631/// 3632/// \param __b31 3633/// An 8-bit integral value used to initialize bits [255:248] of the result. 3634/// \param __b30 3635/// An 8-bit integral value used to initialize bits [247:240] of the result. 3636/// \param __b29 3637/// An 8-bit integral value used to initialize bits [239:232] of the result. 3638/// \param __b28 3639/// An 8-bit integral value used to initialize bits [231:224] of the result. 3640/// \param __b27 3641/// An 8-bit integral value used to initialize bits [223:216] of the result. 3642/// \param __b26 3643/// An 8-bit integral value used to initialize bits [215:208] of the result. 3644/// \param __b25 3645/// An 8-bit integral value used to initialize bits [207:200] of the result. 3646/// \param __b24 3647/// An 8-bit integral value used to initialize bits [199:192] of the result. 3648/// \param __b23 3649/// An 8-bit integral value used to initialize bits [191:184] of the result. 3650/// \param __b22 3651/// An 8-bit integral value used to initialize bits [183:176] of the result. 3652/// \param __b21 3653/// An 8-bit integral value used to initialize bits [175:168] of the result. 3654/// \param __b20 3655/// An 8-bit integral value used to initialize bits [167:160] of the result. 
3656/// \param __b19 3657/// An 8-bit integral value used to initialize bits [159:152] of the result. 3658/// \param __b18 3659/// An 8-bit integral value used to initialize bits [151:144] of the result. 3660/// \param __b17 3661/// An 8-bit integral value used to initialize bits [143:136] of the result. 3662/// \param __b16 3663/// An 8-bit integral value used to initialize bits [135:128] of the result. 3664/// \param __b15 3665/// An 8-bit integral value used to initialize bits [127:120] of the result. 3666/// \param __b14 3667/// An 8-bit integral value used to initialize bits [119:112] of the result. 3668/// \param __b13 3669/// An 8-bit integral value used to initialize bits [111:104] of the result. 3670/// \param __b12 3671/// An 8-bit integral value used to initialize bits [103:96] of the result. 3672/// \param __b11 3673/// An 8-bit integral value used to initialize bits [95:88] of the result. 3674/// \param __b10 3675/// An 8-bit integral value used to initialize bits [87:80] of the result. 3676/// \param __b09 3677/// An 8-bit integral value used to initialize bits [79:72] of the result. 3678/// \param __b08 3679/// An 8-bit integral value used to initialize bits [71:64] of the result. 3680/// \param __b07 3681/// An 8-bit integral value used to initialize bits [63:56] of the result. 3682/// \param __b06 3683/// An 8-bit integral value used to initialize bits [55:48] of the result. 3684/// \param __b05 3685/// An 8-bit integral value used to initialize bits [47:40] of the result. 3686/// \param __b04 3687/// An 8-bit integral value used to initialize bits [39:32] of the result. 3688/// \param __b03 3689/// An 8-bit integral value used to initialize bits [31:24] of the result. 3690/// \param __b02 3691/// An 8-bit integral value used to initialize bits [23:16] of the result. 3692/// \param __b01 3693/// An 8-bit integral value used to initialize bits [15:8] of the result. 
3694/// \param __b00 3695/// An 8-bit integral value used to initialize bits [7:0] of the result. 3696/// \returns An initialized 256-bit integer vector. 3697static __inline __m256i __DEFAULT_FN_ATTRS 3698_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, 3699 char __b27, char __b26, char __b25, char __b24, 3700 char __b23, char __b22, char __b21, char __b20, 3701 char __b19, char __b18, char __b17, char __b16, 3702 char __b15, char __b14, char __b13, char __b12, 3703 char __b11, char __b10, char __b09, char __b08, 3704 char __b07, char __b06, char __b05, char __b04, 3705 char __b03, char __b02, char __b01, char __b00) 3706{ 3707 return (__m256i)(__v32qi){ 3708 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, 3709 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, 3710 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, 3711 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31 3712 }; 3713} 3714 3715/// \brief Constructs a 256-bit integer vector initialized with the specified 3716/// 64-bit integral values. 3717/// 3718/// \headerfile <x86intrin.h> 3719/// 3720/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> 3721/// instruction. 3722/// 3723/// \param __a 3724/// A 64-bit integral value used to initialize bits [255:192] of the result. 3725/// \param __b 3726/// A 64-bit integral value used to initialize bits [191:128] of the result. 3727/// \param __c 3728/// A 64-bit integral value used to initialize bits [127:64] of the result. 3729/// \param __d 3730/// A 64-bit integral value used to initialize bits [63:0] of the result. 3731/// \returns An initialized 256-bit integer vector. 
3732static __inline __m256i __DEFAULT_FN_ATTRS 3733_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) 3734{ 3735 return (__m256i)(__v4di){ __d, __c, __b, __a }; 3736} 3737 3738/* Create vectors with elements in reverse order */ 3739/// \brief Constructs a 256-bit floating-point vector of [4 x double], 3740/// initialized in reverse order with the specified double-precision 3741/// floating-point values. 3742/// 3743/// \headerfile <x86intrin.h> 3744/// 3745/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3746/// instruction. 3747/// 3748/// \param __a 3749/// A double-precision floating-point value used to initialize bits [63:0] 3750/// of the result. 3751/// \param __b 3752/// A double-precision floating-point value used to initialize bits [127:64] 3753/// of the result. 3754/// \param __c 3755/// A double-precision floating-point value used to initialize bits [191:128] 3756/// of the result. 3757/// \param __d 3758/// A double-precision floating-point value used to initialize bits [255:192] 3759/// of the result. 3760/// \returns An initialized 256-bit floating-point vector of [4 x double]. 3761static __inline __m256d __DEFAULT_FN_ATTRS 3762_mm256_setr_pd(double __a, double __b, double __c, double __d) 3763{ 3764 return (__m256d){ __a, __b, __c, __d }; 3765} 3766 3767/// \brief Constructs a 256-bit floating-point vector of [8 x float], 3768/// initialized in reverse order with the specified single-precision 3769/// float-point values. 3770/// 3771/// \headerfile <x86intrin.h> 3772/// 3773/// This intrinsic is a utility function and does not correspond to a specific 3774/// instruction. 3775/// 3776/// \param __a 3777/// A single-precision floating-point value used to initialize bits [31:0] 3778/// of the result. 3779/// \param __b 3780/// A single-precision floating-point value used to initialize bits [63:32] 3781/// of the result. 
3782/// \param __c 3783/// A single-precision floating-point value used to initialize bits [95:64] 3784/// of the result. 3785/// \param __d 3786/// A single-precision floating-point value used to initialize bits [127:96] 3787/// of the result. 3788/// \param __e 3789/// A single-precision floating-point value used to initialize bits [159:128] 3790/// of the result. 3791/// \param __f 3792/// A single-precision floating-point value used to initialize bits [191:160] 3793/// of the result. 3794/// \param __g 3795/// A single-precision floating-point value used to initialize bits [223:192] 3796/// of the result. 3797/// \param __h 3798/// A single-precision floating-point value used to initialize bits [255:224] 3799/// of the result. 3800/// \returns An initialized 256-bit floating-point vector of [8 x float]. 3801static __inline __m256 __DEFAULT_FN_ATTRS 3802_mm256_setr_ps(float __a, float __b, float __c, float __d, 3803 float __e, float __f, float __g, float __h) 3804{ 3805 return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h }; 3806} 3807 3808/// \brief Constructs a 256-bit integer vector, initialized in reverse order 3809/// with the specified 32-bit integral values. 3810/// 3811/// \headerfile <x86intrin.h> 3812/// 3813/// This intrinsic is a utility function and does not correspond to a specific 3814/// instruction. 3815/// 3816/// \param __i0 3817/// A 32-bit integral value used to initialize bits [31:0] of the result. 3818/// \param __i1 3819/// A 32-bit integral value used to initialize bits [63:32] of the result. 3820/// \param __i2 3821/// A 32-bit integral value used to initialize bits [95:64] of the result. 3822/// \param __i3 3823/// A 32-bit integral value used to initialize bits [127:96] of the result. 3824/// \param __i4 3825/// A 32-bit integral value used to initialize bits [159:128] of the result. 3826/// \param __i5 3827/// A 32-bit integral value used to initialize bits [191:160] of the result. 
3828/// \param __i6 3829/// A 32-bit integral value used to initialize bits [223:192] of the result. 3830/// \param __i7 3831/// A 32-bit integral value used to initialize bits [255:224] of the result. 3832/// \returns An initialized 256-bit integer vector. 3833static __inline __m256i __DEFAULT_FN_ATTRS 3834_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, 3835 int __i4, int __i5, int __i6, int __i7) 3836{ 3837 return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 }; 3838} 3839 3840/// \brief Constructs a 256-bit integer vector, initialized in reverse order 3841/// with the specified 16-bit integral values. 3842/// 3843/// \headerfile <x86intrin.h> 3844/// 3845/// This intrinsic is a utility function and does not correspond to a specific 3846/// instruction. 3847/// 3848/// \param __w15 3849/// A 16-bit integral value used to initialize bits [15:0] of the result. 3850/// \param __w14 3851/// A 16-bit integral value used to initialize bits [31:16] of the result. 3852/// \param __w13 3853/// A 16-bit integral value used to initialize bits [47:32] of the result. 3854/// \param __w12 3855/// A 16-bit integral value used to initialize bits [63:48] of the result. 3856/// \param __w11 3857/// A 16-bit integral value used to initialize bits [79:64] of the result. 3858/// \param __w10 3859/// A 16-bit integral value used to initialize bits [95:80] of the result. 3860/// \param __w09 3861/// A 16-bit integral value used to initialize bits [111:96] of the result. 3862/// \param __w08 3863/// A 16-bit integral value used to initialize bits [127:112] of the result. 3864/// \param __w07 3865/// A 16-bit integral value used to initialize bits [143:128] of the result. 3866/// \param __w06 3867/// A 16-bit integral value used to initialize bits [159:144] of the result. 3868/// \param __w05 3869/// A 16-bit integral value used to initialize bits [175:160] of the result. 
3870/// \param __w04 3871/// A 16-bit integral value used to initialize bits [191:176] of the result. 3872/// \param __w03 3873/// A 16-bit integral value used to initialize bits [207:192] of the result. 3874/// \param __w02 3875/// A 16-bit integral value used to initialize bits [223:208] of the result. 3876/// \param __w01 3877/// A 16-bit integral value used to initialize bits [239:224] of the result. 3878/// \param __w00 3879/// A 16-bit integral value used to initialize bits [255:240] of the result. 3880/// \returns An initialized 256-bit integer vector. 3881static __inline __m256i __DEFAULT_FN_ATTRS 3882_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, 3883 short __w11, short __w10, short __w09, short __w08, 3884 short __w07, short __w06, short __w05, short __w04, 3885 short __w03, short __w02, short __w01, short __w00) 3886{ 3887 return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09, 3888 __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 }; 3889} 3890 3891/// \brief Constructs a 256-bit integer vector, initialized in reverse order 3892/// with the specified 8-bit integral values. 3893/// 3894/// \headerfile <x86intrin.h> 3895/// 3896/// This intrinsic is a utility function and does not correspond to a specific 3897/// instruction. 3898/// 3899/// \param __b31 3900/// An 8-bit integral value used to initialize bits [7:0] of the result. 3901/// \param __b30 3902/// An 8-bit integral value used to initialize bits [15:8] of the result. 3903/// \param __b29 3904/// An 8-bit integral value used to initialize bits [23:16] of the result. 3905/// \param __b28 3906/// An 8-bit integral value used to initialize bits [31:24] of the result. 3907/// \param __b27 3908/// An 8-bit integral value used to initialize bits [39:32] of the result. 3909/// \param __b26 3910/// An 8-bit integral value used to initialize bits [47:40] of the result. 
/// \param __b25
///    An 8-bit integral value used to initialize bits [55:48] of the result.
/// \param __b24
///    An 8-bit integral value used to initialize bits [63:56] of the result.
/// \param __b23
///    An 8-bit integral value used to initialize bits [71:64] of the result.
/// \param __b22
///    An 8-bit integral value used to initialize bits [79:72] of the result.
/// \param __b21
///    An 8-bit integral value used to initialize bits [87:80] of the result.
/// \param __b20
///    An 8-bit integral value used to initialize bits [95:88] of the result.
/// \param __b19
///    An 8-bit integral value used to initialize bits [103:96] of the result.
/// \param __b18
///    An 8-bit integral value used to initialize bits [111:104] of the result.
/// \param __b17
///    An 8-bit integral value used to initialize bits [119:112] of the result.
/// \param __b16
///    An 8-bit integral value used to initialize bits [127:120] of the result.
/// \param __b15
///    An 8-bit integral value used to initialize bits [135:128] of the result.
/// \param __b14
///    An 8-bit integral value used to initialize bits [143:136] of the result.
/// \param __b13
///    An 8-bit integral value used to initialize bits [151:144] of the result.
/// \param __b12
///    An 8-bit integral value used to initialize bits [159:152] of the result.
/// \param __b11
///    An 8-bit integral value used to initialize bits [167:160] of the result.
/// \param __b10
///    An 8-bit integral value used to initialize bits [175:168] of the result.
/// \param __b09
///    An 8-bit integral value used to initialize bits [183:176] of the result.
/// \param __b08
///    An 8-bit integral value used to initialize bits [191:184] of the result.
/// \param __b07
///    An 8-bit integral value used to initialize bits [199:192] of the result.
3949/// \param __b06 3950/// An 8-bit integral value used to initialize bits [207:200] of the result. 3951/// \param __b05 3952/// An 8-bit integral value used to initialize bits [215:208] of the result. 3953/// \param __b04 3954/// An 8-bit integral value used to initialize bits [223:216] of the result. 3955/// \param __b03 3956/// An 8-bit integral value used to initialize bits [231:224] of the result. 3957/// \param __b02 3958/// An 8-bit integral value used to initialize bits [239:232] of the result. 3959/// \param __b01 3960/// An 8-bit integral value used to initialize bits [247:240] of the result. 3961/// \param __b00 3962/// An 8-bit integral value used to initialize bits [255:248] of the result. 3963/// \returns An initialized 256-bit integer vector. 3964static __inline __m256i __DEFAULT_FN_ATTRS 3965_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, 3966 char __b27, char __b26, char __b25, char __b24, 3967 char __b23, char __b22, char __b21, char __b20, 3968 char __b19, char __b18, char __b17, char __b16, 3969 char __b15, char __b14, char __b13, char __b12, 3970 char __b11, char __b10, char __b09, char __b08, 3971 char __b07, char __b06, char __b05, char __b04, 3972 char __b03, char __b02, char __b01, char __b00) 3973{ 3974 return (__m256i)(__v32qi){ 3975 __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24, 3976 __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16, 3977 __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08, 3978 __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 }; 3979} 3980 3981/// \brief Constructs a 256-bit integer vector, initialized in reverse order 3982/// with the specified 64-bit integral values. 3983/// 3984/// \headerfile <x86intrin.h> 3985/// 3986/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> 3987/// instruction. 3988/// 3989/// \param __a 3990/// A 64-bit integral value used to initialize bits [63:0] of the result. 
3991/// \param __b 3992/// A 64-bit integral value used to initialize bits [127:64] of the result. 3993/// \param __c 3994/// A 64-bit integral value used to initialize bits [191:128] of the result. 3995/// \param __d 3996/// A 64-bit integral value used to initialize bits [255:192] of the result. 3997/// \returns An initialized 256-bit integer vector. 3998static __inline __m256i __DEFAULT_FN_ATTRS 3999_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) 4000{ 4001 return (__m256i)(__v4di){ __a, __b, __c, __d }; 4002} 4003 4004/* Create vectors with repeated elements */ 4005/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each 4006/// of the four double-precision floating-point vector elements set to the 4007/// specified double-precision floating-point value. 4008/// 4009/// \headerfile <x86intrin.h> 4010/// 4011/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 4012/// 4013/// \param __w 4014/// A double-precision floating-point value used to initialize each vector 4015/// element of the result. 4016/// \returns An initialized 256-bit floating-point vector of [4 x double]. 4017static __inline __m256d __DEFAULT_FN_ATTRS 4018_mm256_set1_pd(double __w) 4019{ 4020 return (__m256d){ __w, __w, __w, __w }; 4021} 4022 4023/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each 4024/// of the eight single-precision floating-point vector elements set to the 4025/// specified single-precision floating-point value. 4026/// 4027/// \headerfile <x86intrin.h> 4028/// 4029/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4030/// instruction. 4031/// 4032/// \param __w 4033/// A single-precision floating-point value used to initialize each vector 4034/// element of the result. 4035/// \returns An initialized 256-bit floating-point vector of [8 x float]. 
4036static __inline __m256 __DEFAULT_FN_ATTRS 4037_mm256_set1_ps(float __w) 4038{ 4039 return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w }; 4040} 4041 4042/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the 4043/// 32-bit integral vector elements set to the specified 32-bit integral 4044/// value. 4045/// 4046/// \headerfile <x86intrin.h> 4047/// 4048/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4049/// instruction. 4050/// 4051/// \param __i 4052/// A 32-bit integral value used to initialize each vector element of the 4053/// result. 4054/// \returns An initialized 256-bit integer vector of [8 x i32]. 4055static __inline __m256i __DEFAULT_FN_ATTRS 4056_mm256_set1_epi32(int __i) 4057{ 4058 return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i }; 4059} 4060 4061/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the 4062/// 16-bit integral vector elements set to the specified 16-bit integral 4063/// value. 4064/// 4065/// \headerfile <x86intrin.h> 4066/// 4067/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4068/// 4069/// \param __w 4070/// A 16-bit integral value used to initialize each vector element of the 4071/// result. 4072/// \returns An initialized 256-bit integer vector of [16 x i16]. 4073static __inline __m256i __DEFAULT_FN_ATTRS 4074_mm256_set1_epi16(short __w) 4075{ 4076 return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w, 4077 __w, __w, __w, __w, __w, __w }; 4078} 4079 4080/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the 4081/// 8-bit integral vector elements set to the specified 8-bit integral value. 4082/// 4083/// \headerfile <x86intrin.h> 4084/// 4085/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4086/// 4087/// \param __b 4088/// An 8-bit integral value used to initialize each vector element of the 4089/// result. 
4090/// \returns An initialized 256-bit integer vector of [32 x i8]. 4091static __inline __m256i __DEFAULT_FN_ATTRS 4092_mm256_set1_epi8(char __b) 4093{ 4094 return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 4095 __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 4096 __b, __b, __b, __b, __b, __b, __b }; 4097} 4098 4099/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the 4100/// 64-bit integral vector elements set to the specified 64-bit integral 4101/// value. 4102/// 4103/// \headerfile <x86intrin.h> 4104/// 4105/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 4106/// 4107/// \param __q 4108/// A 64-bit integral value used to initialize each vector element of the 4109/// result. 4110/// \returns An initialized 256-bit integer vector of [4 x i64]. 4111static __inline __m256i __DEFAULT_FN_ATTRS 4112_mm256_set1_epi64x(long long __q) 4113{ 4114 return (__m256i)(__v4di){ __q, __q, __q, __q }; 4115} 4116 4117/* Create __zeroed vectors */ 4118/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all 4119/// vector elements initialized to zero. 4120/// 4121/// \headerfile <x86intrin.h> 4122/// 4123/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4124/// 4125/// \returns A 256-bit vector of [4 x double] with all elements set to zero. 4126static __inline __m256d __DEFAULT_FN_ATTRS 4127_mm256_setzero_pd(void) 4128{ 4129 return (__m256d){ 0, 0, 0, 0 }; 4130} 4131 4132/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all 4133/// vector elements initialized to zero. 4134/// 4135/// \headerfile <x86intrin.h> 4136/// 4137/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4138/// 4139/// \returns A 256-bit vector of [8 x float] with all elements set to zero. 
4140static __inline __m256 __DEFAULT_FN_ATTRS 4141_mm256_setzero_ps(void) 4142{ 4143 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; 4144} 4145 4146/// \brief Constructs a 256-bit integer vector initialized to zero. 4147/// 4148/// \headerfile <x86intrin.h> 4149/// 4150/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4151/// 4152/// \returns A 256-bit integer vector initialized to zero. 4153static __inline __m256i __DEFAULT_FN_ATTRS 4154_mm256_setzero_si256(void) 4155{ 4156 return (__m256i){ 0LL, 0LL, 0LL, 0LL }; 4157} 4158 4159/* Cast between vector types */ 4160/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit 4161/// floating-point vector of [8 x float]. 4162/// 4163/// \headerfile <x86intrin.h> 4164/// 4165/// This intrinsic has no corresponding instruction. 4166/// 4167/// \param __a 4168/// A 256-bit floating-point vector of [4 x double]. 4169/// \returns A 256-bit floating-point vector of [8 x float] containing the same 4170/// bitwise pattern as the parameter. 4171static __inline __m256 __DEFAULT_FN_ATTRS 4172_mm256_castpd_ps(__m256d __a) 4173{ 4174 return (__m256)__a; 4175} 4176 4177/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit 4178/// integer vector. 4179/// 4180/// \headerfile <x86intrin.h> 4181/// 4182/// This intrinsic has no corresponding instruction. 4183/// 4184/// \param __a 4185/// A 256-bit floating-point vector of [4 x double]. 4186/// \returns A 256-bit integer vector containing the same bitwise pattern as the 4187/// parameter. 4188static __inline __m256i __DEFAULT_FN_ATTRS 4189_mm256_castpd_si256(__m256d __a) 4190{ 4191 return (__m256i)__a; 4192} 4193 4194/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit 4195/// floating-point vector of [4 x double]. 4196/// 4197/// \headerfile <x86intrin.h> 4198/// 4199/// This intrinsic has no corresponding instruction. 4200/// 4201/// \param __a 4202/// A 256-bit floating-point vector of [8 x float]. 
4203/// \returns A 256-bit floating-point vector of [4 x double] containing the same 4204/// bitwise pattern as the parameter. 4205static __inline __m256d __DEFAULT_FN_ATTRS 4206_mm256_castps_pd(__m256 __a) 4207{ 4208 return (__m256d)__a; 4209} 4210 4211/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit 4212/// integer vector. 4213/// 4214/// \headerfile <x86intrin.h> 4215/// 4216/// This intrinsic has no corresponding instruction. 4217/// 4218/// \param __a 4219/// A 256-bit floating-point vector of [8 x float]. 4220/// \returns A 256-bit integer vector containing the same bitwise pattern as the 4221/// parameter. 4222static __inline __m256i __DEFAULT_FN_ATTRS 4223_mm256_castps_si256(__m256 __a) 4224{ 4225 return (__m256i)__a; 4226} 4227 4228/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector 4229/// of [8 x float]. 4230/// 4231/// \headerfile <x86intrin.h> 4232/// 4233/// This intrinsic has no corresponding instruction. 4234/// 4235/// \param __a 4236/// A 256-bit integer vector. 4237/// \returns A 256-bit floating-point vector of [8 x float] containing the same 4238/// bitwise pattern as the parameter. 4239static __inline __m256 __DEFAULT_FN_ATTRS 4240_mm256_castsi256_ps(__m256i __a) 4241{ 4242 return (__m256)__a; 4243} 4244 4245/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector 4246/// of [4 x double]. 4247/// 4248/// \headerfile <x86intrin.h> 4249/// 4250/// This intrinsic has no corresponding instruction. 4251/// 4252/// \param __a 4253/// A 256-bit integer vector. 4254/// \returns A 256-bit floating-point vector of [4 x double] containing the same 4255/// bitwise pattern as the parameter. 4256static __inline __m256d __DEFAULT_FN_ATTRS 4257_mm256_castsi256_pd(__m256i __a) 4258{ 4259 return (__m256d)__a; 4260} 4261 4262/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of 4263/// [4 x double] as a 128-bit floating-point vector of [2 x double]. 
4264/// 4265/// \headerfile <x86intrin.h> 4266/// 4267/// This intrinsic has no corresponding instruction. 4268/// 4269/// \param __a 4270/// A 256-bit floating-point vector of [4 x double]. 4271/// \returns A 128-bit floating-point vector of [2 x double] containing the 4272/// lower 128 bits of the parameter. 4273static __inline __m128d __DEFAULT_FN_ATTRS 4274_mm256_castpd256_pd128(__m256d __a) 4275{ 4276 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1); 4277} 4278 4279/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of 4280/// [8 x float] as a 128-bit floating-point vector of [4 x float]. 4281/// 4282/// \headerfile <x86intrin.h> 4283/// 4284/// This intrinsic has no corresponding instruction. 4285/// 4286/// \param __a 4287/// A 256-bit floating-point vector of [8 x float]. 4288/// \returns A 128-bit floating-point vector of [4 x float] containing the 4289/// lower 128 bits of the parameter. 4290static __inline __m128 __DEFAULT_FN_ATTRS 4291_mm256_castps256_ps128(__m256 __a) 4292{ 4293 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3); 4294} 4295 4296/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector. 4297/// 4298/// \headerfile <x86intrin.h> 4299/// 4300/// This intrinsic has no corresponding instruction. 4301/// 4302/// \param __a 4303/// A 256-bit integer vector. 4304/// \returns A 128-bit integer vector containing the lower 128 bits of the 4305/// parameter. 4306static __inline __m128i __DEFAULT_FN_ATTRS 4307_mm256_castsi256_si128(__m256i __a) 4308{ 4309 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1); 4310} 4311 4312/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a 4313/// 128-bit floating-point vector of [2 x double]. The lower 128 bits 4314/// contain the value of the source vector. The contents of the upper 128 4315/// bits are undefined. 
4316/// 4317/// \headerfile <x86intrin.h> 4318/// 4319/// This intrinsic has no corresponding instruction. 4320/// 4321/// \param __a 4322/// A 128-bit vector of [2 x double]. 4323/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits 4324/// contain the value of the parameter. The contents of the upper 128 bits 4325/// are undefined. 4326static __inline __m256d __DEFAULT_FN_ATTRS 4327_mm256_castpd128_pd256(__m128d __a) 4328{ 4329 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1); 4330} 4331 4332/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a 4333/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain 4334/// the value of the source vector. The contents of the upper 128 bits are 4335/// undefined. 4336/// 4337/// \headerfile <x86intrin.h> 4338/// 4339/// This intrinsic has no corresponding instruction. 4340/// 4341/// \param __a 4342/// A 128-bit vector of [4 x float]. 4343/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits 4344/// contain the value of the parameter. The contents of the upper 128 bits 4345/// are undefined. 4346static __inline __m256 __DEFAULT_FN_ATTRS 4347_mm256_castps128_ps256(__m128 __a) 4348{ 4349 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1); 4350} 4351 4352/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector. 4353/// The lower 128 bits contain the value of the source vector. The contents 4354/// of the upper 128 bits are undefined. 4355/// 4356/// \headerfile <x86intrin.h> 4357/// 4358/// This intrinsic has no corresponding instruction. 4359/// 4360/// \param __a 4361/// A 128-bit integer vector. 4362/// \returns A 256-bit integer vector. The lower 128 bits contain the value of 4363/// the parameter. The contents of the upper 128 bits are undefined. 
4364static __inline __m256i __DEFAULT_FN_ATTRS 4365_mm256_castsi128_si256(__m128i __a) 4366{ 4367 return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1); 4368} 4369 4370/* 4371 Vector insert. 4372 We use macros rather than inlines because we only want to accept 4373 invocations where the immediate M is a constant expression. 4374*/ 4375/// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating 4376/// a 256-bit vector of [8 x float] given in the first parameter, and then 4377/// replacing either the upper or the lower 128 bits with the contents of a 4378/// 128-bit vector of [4 x float] in the second parameter. The immediate 4379/// integer parameter determines between the upper or the lower 128 bits. 4380/// 4381/// \headerfile <x86intrin.h> 4382/// 4383/// \code 4384/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M); 4385/// \endcode 4386/// 4387/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4388/// 4389/// \param V1 4390/// A 256-bit vector of [8 x float]. This vector is copied to the result 4391/// first, and then either the upper or the lower 128 bits of the result will 4392/// be replaced by the contents of \a V2. 4393/// \param V2 4394/// A 128-bit vector of [4 x float]. The contents of this parameter are 4395/// written to either the upper or the lower 128 bits of the result depending 4396/// on the value of parameter \a M. 4397/// \param M 4398/// An immediate integer. The least significant bit determines how the values 4399/// from the two parameters are interleaved: \n 4400/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, 4401/// and bits [255:128] of \a V1 are copied to bits [255:128] of the 4402/// result. \n 4403/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the 4404/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the 4405/// result. 
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
    (__v8sf)(__m256)(V1), \
    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
    (((M) & 1) ?  0 :  8), (((M) & 1) ?  1 :  9), \
    (((M) & 1) ?  2 : 10), (((M) & 1) ?  3 : 11), \
    (((M) & 1) ?  8 :  4), (((M) & 1) ?  9 :  5), \
    (((M) & 1) ? 10 :  6), (((M) & 1) ? 11 :  7) );})

/// \brief Produces a new 256-bit vector of [4 x double] that is a copy of the
///    256-bit vector of [4 x double] given in the first parameter, with either
///    its upper or its lower 128 bits replaced by the 128-bit vector of
///    [2 x double] given in the second parameter. The immediate integer
///    parameter selects which half is replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V1), \
    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
    (((M) & 1) ? 0 : 4), (((M) & 1) ? 1 : 5), \
    (((M) & 1) ? 4 : 2), (((M) & 1) ? 5 : 3) );})

/// \brief Produces a new 256-bit integer vector that is a copy of the 256-bit
///    integer vector given in the first parameter, with either its upper or
///    its lower 128 bits replaced by the 128-bit integer vector given in the
///    second parameter. The immediate integer parameter selects which half is
///    replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. This vector is copied to the result first, and
///    then either the upper or the lower 128 bits of the result will be
///    replaced by the contents of \a V2.
/// \param V2
///    A 128-bit integer vector. The contents of this parameter are written to
///    either the upper or the lower 128 bits of the result depending on the
///    value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V1), \
    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
    (((M) & 1) ? 0 : 4), (((M) & 1) ? 1 : 5), \
    (((M) & 1) ? 4 : 2), (((M) & 1) ? 5 : 3) );})

/*
   Vector extract.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// \brief Returns either the upper or the lower 128 bits of a 256-bit vector
///    of [8 x float], as selected by the immediate integer parameter, as a
///    128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
  (__m128)__builtin_shufflevector( \
    (__v8sf)(__m256)(V), \
    (__v8sf)(_mm256_undefined_ps()), \
    (((M) & 1) ? 4 : 0), (((M) & 1) ? 5 : 1), \
    (((M) & 1) ? 6 : 2), (((M) & 1) ? 7 : 3) );})

/// \brief Returns either the upper or the lower 128 bits of a 256-bit vector
///    of [4 x double], as selected by the immediate integer parameter, as a
///    128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
  (__m128d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V), \
    (__v4df)(_mm256_undefined_pd()), \
    (((M) & 1) ? 2 : 0), (((M) & 1) ? 3 : 1) );})

/// \brief Returns either the upper or the lower 128 bits of a 256-bit integer
///    vector, as selected by the immediate integer parameter, as a 128-bit
///    integer vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V), \
    (__v4di)(_mm256_undefined_si256()), \
    (((M) & 1) ? 2 : 0), (((M) & 1) ? 3 : 1) );})

/* SIMD load ops (unaligned) */
/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
///    unaligned memory locations and concatenates them into a 256-bit
///    floating-point vector of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to load instructions followed by the
///   <c> VINSERTF128 </c> instruction.
///
/// \param __addr_hi
///    A pointer to a 128-bit memory location containing 4 consecutive
///    single-precision floating-point values. These values are to be copied to
///    bits[255:128] of the result. The address of the memory location does not
///    have to be aligned.
/// \param __addr_lo
///    A pointer to a 128-bit memory location containing 4 consecutive
///    single-precision floating-point values. These values are to be copied to
///    bits[127:0] of the result. The address of the memory location does not
///    have to be aligned.
/// \returns A 256-bit floating-point vector of [8 x float] containing the
///    concatenated result.
4615static __inline __m256 __DEFAULT_FN_ATTRS 4616_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) 4617{ 4618 __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo)); 4619 return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1); 4620} 4621 4622/// \brief Loads two 128-bit floating-point vectors of [2 x double] from 4623/// unaligned memory locations and constructs a 256-bit floating-point vector 4624/// of [4 x double] by concatenating the two 128-bit vectors. 4625/// 4626/// \headerfile <x86intrin.h> 4627/// 4628/// This intrinsic corresponds to load instructions followed by the 4629/// <c> VINSERTF128 </c> instruction. 4630/// 4631/// \param __addr_hi 4632/// A pointer to a 128-bit memory location containing two consecutive 4633/// double-precision floating-point values. These values are to be copied to 4634/// bits[255:128] of the result. The address of the memory location does not 4635/// have to be aligned. 4636/// \param __addr_lo 4637/// A pointer to a 128-bit memory location containing two consecutive 4638/// double-precision floating-point values. These values are to be copied to 4639/// bits[127:0] of the result. The address of the memory location does not 4640/// have to be aligned. 4641/// \returns A 256-bit floating-point vector of [4 x double] containing the 4642/// concatenated result. 4643static __inline __m256d __DEFAULT_FN_ATTRS 4644_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) 4645{ 4646 __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo)); 4647 return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1); 4648} 4649 4650/// \brief Loads two 128-bit integer vectors from unaligned memory locations and 4651/// constructs a 256-bit integer vector by concatenating the two 128-bit 4652/// vectors. 4653/// 4654/// \headerfile <x86intrin.h> 4655/// 4656/// This intrinsic corresponds to load instructions followed by the 4657/// <c> VINSERTF128 </c> instruction. 
4658/// 4659/// \param __addr_hi 4660/// A pointer to a 128-bit memory location containing a 128-bit integer 4661/// vector. This vector is to be copied to bits[255:128] of the result. The 4662/// address of the memory location does not have to be aligned. 4663/// \param __addr_lo 4664/// A pointer to a 128-bit memory location containing a 128-bit integer 4665/// vector. This vector is to be copied to bits[127:0] of the result. The 4666/// address of the memory location does not have to be aligned. 4667/// \returns A 256-bit integer vector containing the concatenated result. 4668static __inline __m256i __DEFAULT_FN_ATTRS 4669_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo) 4670{ 4671 __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo)); 4672 return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1); 4673} 4674 4675/* SIMD store ops (unaligned) */ 4676/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point 4677/// vector of [8 x float] into two different unaligned memory locations. 4678/// 4679/// \headerfile <x86intrin.h> 4680/// 4681/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4682/// store instructions. 4683/// 4684/// \param __addr_hi 4685/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 4686/// copied to this memory location. The address of this memory location does 4687/// not have to be aligned. 4688/// \param __addr_lo 4689/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4690/// copied to this memory location. The address of this memory location does 4691/// not have to be aligned. 4692/// \param __a 4693/// A 256-bit floating-point vector of [8 x float]. 
4694static __inline void __DEFAULT_FN_ATTRS 4695_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a) 4696{ 4697 __m128 __v128; 4698 4699 __v128 = _mm256_castps256_ps128(__a); 4700 _mm_storeu_ps(__addr_lo, __v128); 4701 __v128 = _mm256_extractf128_ps(__a, 1); 4702 _mm_storeu_ps(__addr_hi, __v128); 4703} 4704 4705/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point 4706/// vector of [4 x double] into two different unaligned memory locations. 4707/// 4708/// \headerfile <x86intrin.h> 4709/// 4710/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4711/// store instructions. 4712/// 4713/// \param __addr_hi 4714/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 4715/// copied to this memory location. The address of this memory location does 4716/// not have to be aligned. 4717/// \param __addr_lo 4718/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4719/// copied to this memory location. The address of this memory location does 4720/// not have to be aligned. 4721/// \param __a 4722/// A 256-bit floating-point vector of [4 x double]. 4723static __inline void __DEFAULT_FN_ATTRS 4724_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a) 4725{ 4726 __m128d __v128; 4727 4728 __v128 = _mm256_castpd256_pd128(__a); 4729 _mm_storeu_pd(__addr_lo, __v128); 4730 __v128 = _mm256_extractf128_pd(__a, 1); 4731 _mm_storeu_pd(__addr_hi, __v128); 4732} 4733 4734/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into 4735/// two different unaligned memory locations. 4736/// 4737/// \headerfile <x86intrin.h> 4738/// 4739/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4740/// store instructions. 4741/// 4742/// \param __addr_hi 4743/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 4744/// copied to this memory location. 
The address of this memory location does 4745/// not have to be aligned. 4746/// \param __addr_lo 4747/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4748/// copied to this memory location. The address of this memory location does 4749/// not have to be aligned. 4750/// \param __a 4751/// A 256-bit integer vector. 4752static __inline void __DEFAULT_FN_ATTRS 4753_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a) 4754{ 4755 __m128i __v128; 4756 4757 __v128 = _mm256_castsi256_si128(__a); 4758 _mm_storeu_si128(__addr_lo, __v128); 4759 __v128 = _mm256_extractf128_si256(__a, 1); 4760 _mm_storeu_si128(__addr_hi, __v128); 4761} 4762 4763/// \brief Constructs a 256-bit floating-point vector of [8 x float] by 4764/// concatenating two 128-bit floating-point vectors of [4 x float]. 4765/// 4766/// \headerfile <x86intrin.h> 4767/// 4768/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4769/// 4770/// \param __hi 4771/// A 128-bit floating-point vector of [4 x float] to be copied to the upper 4772/// 128 bits of the result. 4773/// \param __lo 4774/// A 128-bit floating-point vector of [4 x float] to be copied to the lower 4775/// 128 bits of the result. 4776/// \returns A 256-bit floating-point vector of [8 x float] containing the 4777/// concatenated result. 4778static __inline __m256 __DEFAULT_FN_ATTRS 4779_mm256_set_m128 (__m128 __hi, __m128 __lo) 4780{ 4781 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7); 4782} 4783 4784/// \brief Constructs a 256-bit floating-point vector of [4 x double] by 4785/// concatenating two 128-bit floating-point vectors of [2 x double]. 4786/// 4787/// \headerfile <x86intrin.h> 4788/// 4789/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4790/// 4791/// \param __hi 4792/// A 128-bit floating-point vector of [2 x double] to be copied to the upper 4793/// 128 bits of the result. 
4794/// \param __lo 4795/// A 128-bit floating-point vector of [2 x double] to be copied to the lower 4796/// 128 bits of the result. 4797/// \returns A 256-bit floating-point vector of [4 x double] containing the 4798/// concatenated result. 4799static __inline __m256d __DEFAULT_FN_ATTRS 4800_mm256_set_m128d (__m128d __hi, __m128d __lo) 4801{ 4802 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); 4803} 4804 4805/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit 4806/// integer vectors. 4807/// 4808/// \headerfile <x86intrin.h> 4809/// 4810/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4811/// 4812/// \param __hi 4813/// A 128-bit integer vector to be copied to the upper 128 bits of the 4814/// result. 4815/// \param __lo 4816/// A 128-bit integer vector to be copied to the lower 128 bits of the 4817/// result. 4818/// \returns A 256-bit integer vector containing the concatenated result. 4819static __inline __m256i __DEFAULT_FN_ATTRS 4820_mm256_set_m128i (__m128i __hi, __m128i __lo) 4821{ 4822 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); 4823} 4824 4825/// \brief Constructs a 256-bit floating-point vector of [8 x float] by 4826/// concatenating two 128-bit floating-point vectors of [4 x float]. This is 4827/// similar to _mm256_set_m128, but the order of the input parameters is 4828/// swapped. 4829/// 4830/// \headerfile <x86intrin.h> 4831/// 4832/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4833/// 4834/// \param __lo 4835/// A 128-bit floating-point vector of [4 x float] to be copied to the lower 4836/// 128 bits of the result. 4837/// \param __hi 4838/// A 128-bit floating-point vector of [4 x float] to be copied to the upper 4839/// 128 bits of the result. 4840/// \returns A 256-bit floating-point vector of [8 x float] containing the 4841/// concatenated result. 
4842static __inline __m256 __DEFAULT_FN_ATTRS 4843_mm256_setr_m128 (__m128 __lo, __m128 __hi) 4844{ 4845 return _mm256_set_m128(__hi, __lo); 4846} 4847 4848/// \brief Constructs a 256-bit floating-point vector of [4 x double] by 4849/// concatenating two 128-bit floating-point vectors of [2 x double]. This is 4850/// similar to _mm256_set_m128d, but the order of the input parameters is 4851/// swapped. 4852/// 4853/// \headerfile <x86intrin.h> 4854/// 4855/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4856/// 4857/// \param __lo 4858/// A 128-bit floating-point vector of [2 x double] to be copied to the lower 4859/// 128 bits of the result. 4860/// \param __hi 4861/// A 128-bit floating-point vector of [2 x double] to be copied to the upper 4862/// 128 bits of the result. 4863/// \returns A 256-bit floating-point vector of [4 x double] containing the 4864/// concatenated result. 4865static __inline __m256d __DEFAULT_FN_ATTRS 4866_mm256_setr_m128d (__m128d __lo, __m128d __hi) 4867{ 4868 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); 4869} 4870 4871/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit 4872/// integer vectors. This is similar to _mm256_set_m128i, but the order of 4873/// the input parameters is swapped. 4874/// 4875/// \headerfile <x86intrin.h> 4876/// 4877/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4878/// 4879/// \param __lo 4880/// A 128-bit integer vector to be copied to the lower 128 bits of the 4881/// result. 4882/// \param __hi 4883/// A 128-bit integer vector to be copied to the upper 128 bits of the 4884/// result. 4885/// \returns A 256-bit integer vector containing the concatenated result. 4886static __inline __m256i __DEFAULT_FN_ATTRS 4887_mm256_setr_m128i (__m128i __lo, __m128i __hi) 4888{ 4889 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); 4890} 4891 4892#undef __DEFAULT_FN_ATTRS 4893 4894#endif /* __AVXINTRIN_H */ 4895