1/*===---- avxintrin.h - AVX intrinsics -------------------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10#ifndef __IMMINTRIN_H 11#error "Never use <avxintrin.h> directly; include <immintrin.h> instead." 12#endif 13 14#ifndef __AVXINTRIN_H 15#define __AVXINTRIN_H 16 17typedef double __v4df __attribute__ ((__vector_size__ (32))); 18typedef float __v8sf __attribute__ ((__vector_size__ (32))); 19typedef long long __v4di __attribute__ ((__vector_size__ (32))); 20typedef int __v8si __attribute__ ((__vector_size__ (32))); 21typedef short __v16hi __attribute__ ((__vector_size__ (32))); 22typedef char __v32qi __attribute__ ((__vector_size__ (32))); 23 24/* Unsigned types */ 25typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32))); 26typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); 27typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); 28typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32))); 29 30/* We need an explicitly signed variant for char. Note that this shouldn't 31 * appear in the interface though. */ 32typedef signed char __v32qs __attribute__((__vector_size__(32))); 33 34typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32))); 35typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32))); 36typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32))); 37 38typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1))); 39typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1))); 40typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1))); 41 42/* Define the default attributes for the functions in this file. 
*/ 43#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256))) 44#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128))) 45 46/* Arithmetic */ 47/// Adds two 256-bit vectors of [4 x double]. 48/// 49/// \headerfile <x86intrin.h> 50/// 51/// This intrinsic corresponds to the <c> VADDPD </c> instruction. 52/// 53/// \param __a 54/// A 256-bit vector of [4 x double] containing one of the source operands. 55/// \param __b 56/// A 256-bit vector of [4 x double] containing one of the source operands. 57/// \returns A 256-bit vector of [4 x double] containing the sums of both 58/// operands. 59static __inline __m256d __DEFAULT_FN_ATTRS 60_mm256_add_pd(__m256d __a, __m256d __b) 61{ 62 return (__m256d)((__v4df)__a+(__v4df)__b); 63} 64 65/// Adds two 256-bit vectors of [8 x float]. 66/// 67/// \headerfile <x86intrin.h> 68/// 69/// This intrinsic corresponds to the <c> VADDPS </c> instruction. 70/// 71/// \param __a 72/// A 256-bit vector of [8 x float] containing one of the source operands. 73/// \param __b 74/// A 256-bit vector of [8 x float] containing one of the source operands. 75/// \returns A 256-bit vector of [8 x float] containing the sums of both 76/// operands. 77static __inline __m256 __DEFAULT_FN_ATTRS 78_mm256_add_ps(__m256 __a, __m256 __b) 79{ 80 return (__m256)((__v8sf)__a+(__v8sf)__b); 81} 82 83/// Subtracts two 256-bit vectors of [4 x double]. 84/// 85/// \headerfile <x86intrin.h> 86/// 87/// This intrinsic corresponds to the <c> VSUBPD </c> instruction. 88/// 89/// \param __a 90/// A 256-bit vector of [4 x double] containing the minuend. 91/// \param __b 92/// A 256-bit vector of [4 x double] containing the subtrahend. 93/// \returns A 256-bit vector of [4 x double] containing the differences between 94/// both operands. 
95static __inline __m256d __DEFAULT_FN_ATTRS 96_mm256_sub_pd(__m256d __a, __m256d __b) 97{ 98 return (__m256d)((__v4df)__a-(__v4df)__b); 99} 100 101/// Subtracts two 256-bit vectors of [8 x float]. 102/// 103/// \headerfile <x86intrin.h> 104/// 105/// This intrinsic corresponds to the <c> VSUBPS </c> instruction. 106/// 107/// \param __a 108/// A 256-bit vector of [8 x float] containing the minuend. 109/// \param __b 110/// A 256-bit vector of [8 x float] containing the subtrahend. 111/// \returns A 256-bit vector of [8 x float] containing the differences between 112/// both operands. 113static __inline __m256 __DEFAULT_FN_ATTRS 114_mm256_sub_ps(__m256 __a, __m256 __b) 115{ 116 return (__m256)((__v8sf)__a-(__v8sf)__b); 117} 118 119/// Adds the even-indexed values and subtracts the odd-indexed values of 120/// two 256-bit vectors of [4 x double]. 121/// 122/// \headerfile <x86intrin.h> 123/// 124/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction. 125/// 126/// \param __a 127/// A 256-bit vector of [4 x double] containing the left source operand. 128/// \param __b 129/// A 256-bit vector of [4 x double] containing the right source operand. 130/// \returns A 256-bit vector of [4 x double] containing the alternating sums 131/// and differences between both operands. 132static __inline __m256d __DEFAULT_FN_ATTRS 133_mm256_addsub_pd(__m256d __a, __m256d __b) 134{ 135 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b); 136} 137 138/// Adds the even-indexed values and subtracts the odd-indexed values of 139/// two 256-bit vectors of [8 x float]. 140/// 141/// \headerfile <x86intrin.h> 142/// 143/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction. 144/// 145/// \param __a 146/// A 256-bit vector of [8 x float] containing the left source operand. 147/// \param __b 148/// A 256-bit vector of [8 x float] containing the right source operand. 
149/// \returns A 256-bit vector of [8 x float] containing the alternating sums and 150/// differences between both operands. 151static __inline __m256 __DEFAULT_FN_ATTRS 152_mm256_addsub_ps(__m256 __a, __m256 __b) 153{ 154 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b); 155} 156 157/// Divides two 256-bit vectors of [4 x double]. 158/// 159/// \headerfile <x86intrin.h> 160/// 161/// This intrinsic corresponds to the <c> VDIVPD </c> instruction. 162/// 163/// \param __a 164/// A 256-bit vector of [4 x double] containing the dividend. 165/// \param __b 166/// A 256-bit vector of [4 x double] containing the divisor. 167/// \returns A 256-bit vector of [4 x double] containing the quotients of both 168/// operands. 169static __inline __m256d __DEFAULT_FN_ATTRS 170_mm256_div_pd(__m256d __a, __m256d __b) 171{ 172 return (__m256d)((__v4df)__a/(__v4df)__b); 173} 174 175/// Divides two 256-bit vectors of [8 x float]. 176/// 177/// \headerfile <x86intrin.h> 178/// 179/// This intrinsic corresponds to the <c> VDIVPS </c> instruction. 180/// 181/// \param __a 182/// A 256-bit vector of [8 x float] containing the dividend. 183/// \param __b 184/// A 256-bit vector of [8 x float] containing the divisor. 185/// \returns A 256-bit vector of [8 x float] containing the quotients of both 186/// operands. 187static __inline __m256 __DEFAULT_FN_ATTRS 188_mm256_div_ps(__m256 __a, __m256 __b) 189{ 190 return (__m256)((__v8sf)__a/(__v8sf)__b); 191} 192 193/// Compares two 256-bit vectors of [4 x double] and returns the greater 194/// of each pair of values. 195/// 196/// \headerfile <x86intrin.h> 197/// 198/// This intrinsic corresponds to the <c> VMAXPD </c> instruction. 199/// 200/// \param __a 201/// A 256-bit vector of [4 x double] containing one of the operands. 202/// \param __b 203/// A 256-bit vector of [4 x double] containing one of the operands. 204/// \returns A 256-bit vector of [4 x double] containing the maximum values 205/// between both operands. 
206static __inline __m256d __DEFAULT_FN_ATTRS 207_mm256_max_pd(__m256d __a, __m256d __b) 208{ 209 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b); 210} 211 212/// Compares two 256-bit vectors of [8 x float] and returns the greater 213/// of each pair of values. 214/// 215/// \headerfile <x86intrin.h> 216/// 217/// This intrinsic corresponds to the <c> VMAXPS </c> instruction. 218/// 219/// \param __a 220/// A 256-bit vector of [8 x float] containing one of the operands. 221/// \param __b 222/// A 256-bit vector of [8 x float] containing one of the operands. 223/// \returns A 256-bit vector of [8 x float] containing the maximum values 224/// between both operands. 225static __inline __m256 __DEFAULT_FN_ATTRS 226_mm256_max_ps(__m256 __a, __m256 __b) 227{ 228 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b); 229} 230 231/// Compares two 256-bit vectors of [4 x double] and returns the lesser 232/// of each pair of values. 233/// 234/// \headerfile <x86intrin.h> 235/// 236/// This intrinsic corresponds to the <c> VMINPD </c> instruction. 237/// 238/// \param __a 239/// A 256-bit vector of [4 x double] containing one of the operands. 240/// \param __b 241/// A 256-bit vector of [4 x double] containing one of the operands. 242/// \returns A 256-bit vector of [4 x double] containing the minimum values 243/// between both operands. 244static __inline __m256d __DEFAULT_FN_ATTRS 245_mm256_min_pd(__m256d __a, __m256d __b) 246{ 247 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b); 248} 249 250/// Compares two 256-bit vectors of [8 x float] and returns the lesser 251/// of each pair of values. 252/// 253/// \headerfile <x86intrin.h> 254/// 255/// This intrinsic corresponds to the <c> VMINPS </c> instruction. 256/// 257/// \param __a 258/// A 256-bit vector of [8 x float] containing one of the operands. 259/// \param __b 260/// A 256-bit vector of [8 x float] containing one of the operands. 
261/// \returns A 256-bit vector of [8 x float] containing the minimum values 262/// between both operands. 263static __inline __m256 __DEFAULT_FN_ATTRS 264_mm256_min_ps(__m256 __a, __m256 __b) 265{ 266 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b); 267} 268 269/// Multiplies two 256-bit vectors of [4 x double]. 270/// 271/// \headerfile <x86intrin.h> 272/// 273/// This intrinsic corresponds to the <c> VMULPD </c> instruction. 274/// 275/// \param __a 276/// A 256-bit vector of [4 x double] containing one of the operands. 277/// \param __b 278/// A 256-bit vector of [4 x double] containing one of the operands. 279/// \returns A 256-bit vector of [4 x double] containing the products of both 280/// operands. 281static __inline __m256d __DEFAULT_FN_ATTRS 282_mm256_mul_pd(__m256d __a, __m256d __b) 283{ 284 return (__m256d)((__v4df)__a * (__v4df)__b); 285} 286 287/// Multiplies two 256-bit vectors of [8 x float]. 288/// 289/// \headerfile <x86intrin.h> 290/// 291/// This intrinsic corresponds to the <c> VMULPS </c> instruction. 292/// 293/// \param __a 294/// A 256-bit vector of [8 x float] containing one of the operands. 295/// \param __b 296/// A 256-bit vector of [8 x float] containing one of the operands. 297/// \returns A 256-bit vector of [8 x float] containing the products of both 298/// operands. 299static __inline __m256 __DEFAULT_FN_ATTRS 300_mm256_mul_ps(__m256 __a, __m256 __b) 301{ 302 return (__m256)((__v8sf)__a * (__v8sf)__b); 303} 304 305/// Calculates the square roots of the values in a 256-bit vector of 306/// [4 x double]. 307/// 308/// \headerfile <x86intrin.h> 309/// 310/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction. 311/// 312/// \param __a 313/// A 256-bit vector of [4 x double]. 314/// \returns A 256-bit vector of [4 x double] containing the square roots of the 315/// values in the operand. 
316static __inline __m256d __DEFAULT_FN_ATTRS 317_mm256_sqrt_pd(__m256d __a) 318{ 319 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a); 320} 321 322/// Calculates the square roots of the values in a 256-bit vector of 323/// [8 x float]. 324/// 325/// \headerfile <x86intrin.h> 326/// 327/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction. 328/// 329/// \param __a 330/// A 256-bit vector of [8 x float]. 331/// \returns A 256-bit vector of [8 x float] containing the square roots of the 332/// values in the operand. 333static __inline __m256 __DEFAULT_FN_ATTRS 334_mm256_sqrt_ps(__m256 __a) 335{ 336 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a); 337} 338 339/// Calculates the reciprocal square roots of the values in a 256-bit 340/// vector of [8 x float]. 341/// 342/// \headerfile <x86intrin.h> 343/// 344/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction. 345/// 346/// \param __a 347/// A 256-bit vector of [8 x float]. 348/// \returns A 256-bit vector of [8 x float] containing the reciprocal square 349/// roots of the values in the operand. 350static __inline __m256 __DEFAULT_FN_ATTRS 351_mm256_rsqrt_ps(__m256 __a) 352{ 353 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a); 354} 355 356/// Calculates the reciprocals of the values in a 256-bit vector of 357/// [8 x float]. 358/// 359/// \headerfile <x86intrin.h> 360/// 361/// This intrinsic corresponds to the <c> VRCPPS </c> instruction. 362/// 363/// \param __a 364/// A 256-bit vector of [8 x float]. 365/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the 366/// values in the operand. 367static __inline __m256 __DEFAULT_FN_ATTRS 368_mm256_rcp_ps(__m256 __a) 369{ 370 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a); 371} 372 373/// Rounds the values in a 256-bit vector of [4 x double] as specified 374/// by the byte operand. The source values are rounded to integer values and 375/// returned as 64-bit double-precision floating-point values. 
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* The whole expansion is parenthesized so that the leading cast applies to the
 * builtin's result even when the macro is followed by a postfix operator
 * (for example a subscript). */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))

/// Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* Fully parenthesized for the same reason as _mm256_round_pd. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))

/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V) (_mm256_round_pd((V), _MM_FROUND_CEIL))

/// Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) (_mm256_round_pd((V), _MM_FROUND_FLOOR))

/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
486/// \returns A 256-bit vector of [8 x float] containing the rounded up values. 487#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) 488 489/// Rounds down the values stored in a 256-bit vector of [8 x float]. The 490/// source values are rounded down to integer values and returned as 491/// floating-point values. 492/// 493/// \headerfile <x86intrin.h> 494/// 495/// \code 496/// __m256 _mm256_floor_ps(__m256 V); 497/// \endcode 498/// 499/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. 500/// 501/// \param V 502/// A 256-bit vector of [8 x float]. 503/// \returns A 256-bit vector of [8 x float] containing the rounded down values. 504#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) 505 506/* Logical */ 507/// Performs a bitwise AND of two 256-bit vectors of [4 x double]. 508/// 509/// \headerfile <x86intrin.h> 510/// 511/// This intrinsic corresponds to the <c> VANDPD </c> instruction. 512/// 513/// \param __a 514/// A 256-bit vector of [4 x double] containing one of the source operands. 515/// \param __b 516/// A 256-bit vector of [4 x double] containing one of the source operands. 517/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 518/// values between both operands. 519static __inline __m256d __DEFAULT_FN_ATTRS 520_mm256_and_pd(__m256d __a, __m256d __b) 521{ 522 return (__m256d)((__v4du)__a & (__v4du)__b); 523} 524 525/// Performs a bitwise AND of two 256-bit vectors of [8 x float]. 526/// 527/// \headerfile <x86intrin.h> 528/// 529/// This intrinsic corresponds to the <c> VANDPS </c> instruction. 530/// 531/// \param __a 532/// A 256-bit vector of [8 x float] containing one of the source operands. 533/// \param __b 534/// A 256-bit vector of [8 x float] containing one of the source operands. 535/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 536/// values between both operands. 
537static __inline __m256 __DEFAULT_FN_ATTRS 538_mm256_and_ps(__m256 __a, __m256 __b) 539{ 540 return (__m256)((__v8su)__a & (__v8su)__b); 541} 542 543/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using 544/// the one's complement of the values contained in the first source operand. 545/// 546/// \headerfile <x86intrin.h> 547/// 548/// This intrinsic corresponds to the <c> VANDNPD </c> instruction. 549/// 550/// \param __a 551/// A 256-bit vector of [4 x double] containing the left source operand. The 552/// one's complement of this value is used in the bitwise AND. 553/// \param __b 554/// A 256-bit vector of [4 x double] containing the right source operand. 555/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 556/// values of the second operand and the one's complement of the first 557/// operand. 558static __inline __m256d __DEFAULT_FN_ATTRS 559_mm256_andnot_pd(__m256d __a, __m256d __b) 560{ 561 return (__m256d)(~(__v4du)__a & (__v4du)__b); 562} 563 564/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using 565/// the one's complement of the values contained in the first source operand. 566/// 567/// \headerfile <x86intrin.h> 568/// 569/// This intrinsic corresponds to the <c> VANDNPS </c> instruction. 570/// 571/// \param __a 572/// A 256-bit vector of [8 x float] containing the left source operand. The 573/// one's complement of this value is used in the bitwise AND. 574/// \param __b 575/// A 256-bit vector of [8 x float] containing the right source operand. 576/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 577/// values of the second operand and the one's complement of the first 578/// operand. 579static __inline __m256 __DEFAULT_FN_ATTRS 580_mm256_andnot_ps(__m256 __a, __m256 __b) 581{ 582 return (__m256)(~(__v8su)__a & (__v8su)__b); 583} 584 585/// Performs a bitwise OR of two 256-bit vectors of [4 x double]. 
586/// 587/// \headerfile <x86intrin.h> 588/// 589/// This intrinsic corresponds to the <c> VORPD </c> instruction. 590/// 591/// \param __a 592/// A 256-bit vector of [4 x double] containing one of the source operands. 593/// \param __b 594/// A 256-bit vector of [4 x double] containing one of the source operands. 595/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the 596/// values between both operands. 597static __inline __m256d __DEFAULT_FN_ATTRS 598_mm256_or_pd(__m256d __a, __m256d __b) 599{ 600 return (__m256d)((__v4du)__a | (__v4du)__b); 601} 602 603/// Performs a bitwise OR of two 256-bit vectors of [8 x float]. 604/// 605/// \headerfile <x86intrin.h> 606/// 607/// This intrinsic corresponds to the <c> VORPS </c> instruction. 608/// 609/// \param __a 610/// A 256-bit vector of [8 x float] containing one of the source operands. 611/// \param __b 612/// A 256-bit vector of [8 x float] containing one of the source operands. 613/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the 614/// values between both operands. 615static __inline __m256 __DEFAULT_FN_ATTRS 616_mm256_or_ps(__m256 __a, __m256 __b) 617{ 618 return (__m256)((__v8su)__a | (__v8su)__b); 619} 620 621/// Performs a bitwise XOR of two 256-bit vectors of [4 x double]. 622/// 623/// \headerfile <x86intrin.h> 624/// 625/// This intrinsic corresponds to the <c> VXORPD </c> instruction. 626/// 627/// \param __a 628/// A 256-bit vector of [4 x double] containing one of the source operands. 629/// \param __b 630/// A 256-bit vector of [4 x double] containing one of the source operands. 631/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the 632/// values between both operands. 633static __inline __m256d __DEFAULT_FN_ATTRS 634_mm256_xor_pd(__m256d __a, __m256d __b) 635{ 636 return (__m256d)((__v4du)__a ^ (__v4du)__b); 637} 638 639/// Performs a bitwise XOR of two 256-bit vectors of [8 x float]. 
640/// 641/// \headerfile <x86intrin.h> 642/// 643/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 644/// 645/// \param __a 646/// A 256-bit vector of [8 x float] containing one of the source operands. 647/// \param __b 648/// A 256-bit vector of [8 x float] containing one of the source operands. 649/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the 650/// values between both operands. 651static __inline __m256 __DEFAULT_FN_ATTRS 652_mm256_xor_ps(__m256 __a, __m256 __b) 653{ 654 return (__m256)((__v8su)__a ^ (__v8su)__b); 655} 656 657/* Horizontal arithmetic */ 658/// Horizontally adds the adjacent pairs of values contained in two 659/// 256-bit vectors of [4 x double]. 660/// 661/// \headerfile <x86intrin.h> 662/// 663/// This intrinsic corresponds to the <c> VHADDPD </c> instruction. 664/// 665/// \param __a 666/// A 256-bit vector of [4 x double] containing one of the source operands. 667/// The horizontal sums of the values are returned in the even-indexed 668/// elements of a vector of [4 x double]. 669/// \param __b 670/// A 256-bit vector of [4 x double] containing one of the source operands. 671/// The horizontal sums of the values are returned in the odd-indexed 672/// elements of a vector of [4 x double]. 673/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of 674/// both operands. 675static __inline __m256d __DEFAULT_FN_ATTRS 676_mm256_hadd_pd(__m256d __a, __m256d __b) 677{ 678 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); 679} 680 681/// Horizontally adds the adjacent pairs of values contained in two 682/// 256-bit vectors of [8 x float]. 683/// 684/// \headerfile <x86intrin.h> 685/// 686/// This intrinsic corresponds to the <c> VHADDPS </c> instruction. 687/// 688/// \param __a 689/// A 256-bit vector of [8 x float] containing one of the source operands. 
690/// The horizontal sums of the values are returned in the elements with 691/// index 0, 1, 4, 5 of a vector of [8 x float]. 692/// \param __b 693/// A 256-bit vector of [8 x float] containing one of the source operands. 694/// The horizontal sums of the values are returned in the elements with 695/// index 2, 3, 6, 7 of a vector of [8 x float]. 696/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of 697/// both operands. 698static __inline __m256 __DEFAULT_FN_ATTRS 699_mm256_hadd_ps(__m256 __a, __m256 __b) 700{ 701 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); 702} 703 704/// Horizontally subtracts the adjacent pairs of values contained in two 705/// 256-bit vectors of [4 x double]. 706/// 707/// \headerfile <x86intrin.h> 708/// 709/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction. 710/// 711/// \param __a 712/// A 256-bit vector of [4 x double] containing one of the source operands. 713/// The horizontal differences between the values are returned in the 714/// even-indexed elements of a vector of [4 x double]. 715/// \param __b 716/// A 256-bit vector of [4 x double] containing one of the source operands. 717/// The horizontal differences between the values are returned in the 718/// odd-indexed elements of a vector of [4 x double]. 719/// \returns A 256-bit vector of [4 x double] containing the horizontal 720/// differences of both operands. 721static __inline __m256d __DEFAULT_FN_ATTRS 722_mm256_hsub_pd(__m256d __a, __m256d __b) 723{ 724 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); 725} 726 727/// Horizontally subtracts the adjacent pairs of values contained in two 728/// 256-bit vectors of [8 x float]. 729/// 730/// \headerfile <x86intrin.h> 731/// 732/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction. 733/// 734/// \param __a 735/// A 256-bit vector of [8 x float] containing one of the source operands. 
736/// The horizontal differences between the values are returned in the 737/// elements with index 0, 1, 4, 5 of a vector of [8 x float]. 738/// \param __b 739/// A 256-bit vector of [8 x float] containing one of the source operands. 740/// The horizontal differences between the values are returned in the 741/// elements with index 2, 3, 6, 7 of a vector of [8 x float]. 742/// \returns A 256-bit vector of [8 x float] containing the horizontal 743/// differences of both operands. 744static __inline __m256 __DEFAULT_FN_ATTRS 745_mm256_hsub_ps(__m256 __a, __m256 __b) 746{ 747 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); 748} 749 750/* Vector permutations */ 751/// Copies the values in a 128-bit vector of [2 x double] as specified 752/// by the 128-bit integer vector operand. 753/// 754/// \headerfile <x86intrin.h> 755/// 756/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 757/// 758/// \param __a 759/// A 128-bit vector of [2 x double]. 760/// \param __c 761/// A 128-bit integer vector operand specifying how the values are to be 762/// copied. \n 763/// Bit [1]: \n 764/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 765/// vector. \n 766/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 767/// returned vector. \n 768/// Bit [65]: \n 769/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 770/// returned vector. \n 771/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 772/// returned vector. 773/// \returns A 128-bit vector of [2 x double] containing the copied values. 774static __inline __m128d __DEFAULT_FN_ATTRS128 775_mm_permutevar_pd(__m128d __a, __m128i __c) 776{ 777 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c); 778} 779 780/// Copies the values in a 256-bit vector of [4 x double] as specified 781/// by the 256-bit integer vector operand. 
782/// 783/// \headerfile <x86intrin.h> 784/// 785/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 786/// 787/// \param __a 788/// A 256-bit vector of [4 x double]. 789/// \param __c 790/// A 256-bit integer vector operand specifying how the values are to be 791/// copied. \n 792/// Bit [1]: \n 793/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 794/// vector. \n 795/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 796/// returned vector. \n 797/// Bit [65]: \n 798/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 799/// returned vector. \n 800/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 801/// returned vector. \n 802/// Bit [129]: \n 803/// 0: Bits [191:128] of the source are copied to bits [191:128] of the 804/// returned vector. \n 805/// 1: Bits [255:192] of the source are copied to bits [191:128] of the 806/// returned vector. \n 807/// Bit [193]: \n 808/// 0: Bits [191:128] of the source are copied to bits [255:192] of the 809/// returned vector. \n 810/// 1: Bits [255:192] of the source are copied to bits [255:192] of the 811/// returned vector. 812/// \returns A 256-bit vector of [4 x double] containing the copied values. 813static __inline __m256d __DEFAULT_FN_ATTRS 814_mm256_permutevar_pd(__m256d __a, __m256i __c) 815{ 816 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c); 817} 818 819/// Copies the values stored in a 128-bit vector of [4 x float] as 820/// specified by the 128-bit integer vector operand. 821/// \headerfile <x86intrin.h> 822/// 823/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 824/// 825/// \param __a 826/// A 128-bit vector of [4 x float]. 827/// \param __c 828/// A 128-bit integer vector operand specifying how the values are to be 829/// copied. \n 830/// Bits [1:0]: \n 831/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 832/// returned vector. 
\n 833/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 834/// returned vector. \n 835/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 836/// returned vector. \n 837/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 838/// returned vector. \n 839/// Bits [33:32]: \n 840/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 841/// returned vector. \n 842/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 843/// returned vector. \n 844/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 845/// returned vector. \n 846/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 847/// returned vector. \n 848/// Bits [65:64]: \n 849/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 850/// returned vector. \n 851/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 852/// returned vector. \n 853/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 854/// returned vector. \n 855/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 856/// returned vector. \n 857/// Bits [97:96]: \n 858/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 859/// returned vector. \n 860/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 861/// returned vector. \n 862/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 863/// returned vector. \n 864/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 865/// returned vector. 866/// \returns A 128-bit vector of [4 x float] containing the copied values. 867static __inline __m128 __DEFAULT_FN_ATTRS128 868_mm_permutevar_ps(__m128 __a, __m128i __c) 869{ 870 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c); 871} 872 873/// Copies the values stored in a 256-bit vector of [8 x float] as 874/// specified by the 256-bit integer vector operand. 
875/// 876/// \headerfile <x86intrin.h> 877/// 878/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 879/// 880/// \param __a 881/// A 256-bit vector of [8 x float]. 882/// \param __c 883/// A 256-bit integer vector operand specifying how the values are to be 884/// copied. \n 885/// Bits [1:0]: \n 886/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 887/// returned vector. \n 888/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 889/// returned vector. \n 890/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 891/// returned vector. \n 892/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 893/// returned vector. \n 894/// Bits [33:32]: \n 895/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 896/// returned vector. \n 897/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 898/// returned vector. \n 899/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 900/// returned vector. \n 901/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 902/// returned vector. \n 903/// Bits [65:64]: \n 904/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 905/// returned vector. \n 906/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 907/// returned vector. \n 908/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 909/// returned vector. \n 910/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 911/// returned vector. \n 912/// Bits [97:96]: \n 913/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 914/// returned vector. \n 915/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 916/// returned vector. \n 917/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 918/// returned vector. \n 919/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 920/// returned vector. 
\n 921/// Bits [129:128]: \n 922/// 00: Bits [159:128] of the source are copied to bits [159:128] of the 923/// returned vector. \n 924/// 01: Bits [191:160] of the source are copied to bits [159:128] of the 925/// returned vector. \n 926/// 10: Bits [223:192] of the source are copied to bits [159:128] of the 927/// returned vector. \n 928/// 11: Bits [255:224] of the source are copied to bits [159:128] of the 929/// returned vector. \n 930/// Bits [161:160]: \n 931/// 00: Bits [159:128] of the source are copied to bits [191:160] of the 932/// returned vector. \n 933/// 01: Bits [191:160] of the source are copied to bits [191:160] of the 934/// returned vector. \n 935/// 10: Bits [223:192] of the source are copied to bits [191:160] of the 936/// returned vector. \n 937/// 11: Bits [255:224] of the source are copied to bits [191:160] of the 938/// returned vector. \n 939/// Bits [193:192]: \n 940/// 00: Bits [159:128] of the source are copied to bits [223:192] of the 941/// returned vector. \n 942/// 01: Bits [191:160] of the source are copied to bits [223:192] of the 943/// returned vector. \n 944/// 10: Bits [223:192] of the source are copied to bits [223:192] of the 945/// returned vector. \n 946/// 11: Bits [255:224] of the source are copied to bits [223:192] of the 947/// returned vector. \n 948/// Bits [225:224]: \n 949/// 00: Bits [159:128] of the source are copied to bits [255:224] of the 950/// returned vector. \n 951/// 01: Bits [191:160] of the source are copied to bits [255:224] of the 952/// returned vector. \n 953/// 10: Bits [223:192] of the source are copied to bits [255:224] of the 954/// returned vector. \n 955/// 11: Bits [255:224] of the source are copied to bits [255:224] of the 956/// returned vector. 957/// \returns A 256-bit vector of [8 x float] containing the copied values. 
958static __inline __m256 __DEFAULT_FN_ATTRS 959_mm256_permutevar_ps(__m256 __a, __m256i __c) 960{ 961 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c); 962} 963 964/// Copies the values in a 128-bit vector of [2 x double] as specified 965/// by the immediate integer operand. 966/// 967/// \headerfile <x86intrin.h> 968/// 969/// \code 970/// __m128d _mm_permute_pd(__m128d A, const int C); 971/// \endcode 972/// 973/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 974/// 975/// \param A 976/// A 128-bit vector of [2 x double]. 977/// \param C 978/// An immediate integer operand specifying how the values are to be 979/// copied. \n 980/// Bit [0]: \n 981/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 982/// vector. \n 983/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 984/// returned vector. \n 985/// Bit [1]: \n 986/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 987/// returned vector. \n 988/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 989/// returned vector. 990/// \returns A 128-bit vector of [2 x double] containing the copied values. 991#define _mm_permute_pd(A, C) \ 992 (__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)) 993 994/// Copies the values in a 256-bit vector of [4 x double] as specified by 995/// the immediate integer operand. 996/// 997/// \headerfile <x86intrin.h> 998/// 999/// \code 1000/// __m256d _mm256_permute_pd(__m256d A, const int C); 1001/// \endcode 1002/// 1003/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 1004/// 1005/// \param A 1006/// A 256-bit vector of [4 x double]. 1007/// \param C 1008/// An immediate integer operand specifying how the values are to be 1009/// copied. \n 1010/// Bit [0]: \n 1011/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 1012/// vector. 
///         \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector. \n
///    Bit [2]: \n
///      0: Bits [191:128] of the source are copied to bits [191:128] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [191:128] of the
///         returned vector. \n
///    Bit [3]: \n
///      0: Bits [191:128] of the source are copied to bits [255:192] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [255:192] of the
///         returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) \
  ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))

/// Copies the values in a 128-bit vector of [4 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 128-bit vector of [4 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector.
///          \n
///    Bits [3:2]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) \
  ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))

/// Copies the values in a 256-bit vector of [8 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 256-bit vector of [8 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied.
///    \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector. \n
///    Bits [1:0]: \n
///      00: Bits [159:128] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [159:128] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [159:128] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [191:160] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [159:128] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [223:192] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [159:128] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [255:224] of the
///          returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) \
  ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))

/// Permutes 128-bit data values stored in two 256-bit vectors of
///    [4 x double], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                            (__v4df)(__m256d)(V2), (int)(M)))

/// Permutes 128-bit data values stored in two 256-bit vectors of
///    [8 x float], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                           (__v8sf)(__m256)(V2), (int)(M)))

/// Permutes 128-bit data values stored in two 256-bit integer vectors,
///    as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector.
/// \param V2
///    A 256-bit integer vector.
/// \param M
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                            (__v8si)(__m256i)(V2), (int)(M)))

/* Vector Blend */
/// Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value.
///    When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 64-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
                                      (__v4df)(__m256d)(V2), (int)(M)))

/// Merges 32-bit single-precision data values stored in either of the
///    two 256-bit vectors of [8 x float], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 32-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
                                     (__v8sf)(__m256)(V2), (int)(M)))

/// Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
///    operand.
1364/// 1365/// \headerfile <x86intrin.h> 1366/// 1367/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction. 1368/// 1369/// \param __a 1370/// A 256-bit vector of [4 x double]. 1371/// \param __b 1372/// A 256-bit vector of [4 x double]. 1373/// \param __c 1374/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying 1375/// how the values are to be copied. The position of the mask bit corresponds 1376/// to the most significant bit of a copied value. When a mask bit is 0, the 1377/// corresponding 64-bit element in operand \a __a is copied to the same 1378/// position in the destination. When a mask bit is 1, the corresponding 1379/// 64-bit element in operand \a __b is copied to the same position in the 1380/// destination. 1381/// \returns A 256-bit vector of [4 x double] containing the copied values. 1382static __inline __m256d __DEFAULT_FN_ATTRS 1383_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) 1384{ 1385 return (__m256d)__builtin_ia32_blendvpd256( 1386 (__v4df)__a, (__v4df)__b, (__v4df)__c); 1387} 1388 1389/// Merges 32-bit single-precision data values stored in either of the 1390/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector 1391/// operand. 1392/// 1393/// \headerfile <x86intrin.h> 1394/// 1395/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction. 1396/// 1397/// \param __a 1398/// A 256-bit vector of [8 x float]. 1399/// \param __b 1400/// A 256-bit vector of [8 x float]. 1401/// \param __c 1402/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63, 1403/// and 31 specifying how the values are to be copied. The position of the 1404/// mask bit corresponds to the most significant bit of a copied value. When 1405/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is 1406/// copied to the same position in the destination. 
When a mask bit is 1, the 1407/// corresponding 32-bit element in operand \a __b is copied to the same 1408/// position in the destination. 1409/// \returns A 256-bit vector of [8 x float] containing the copied values. 1410static __inline __m256 __DEFAULT_FN_ATTRS 1411_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) 1412{ 1413 return (__m256)__builtin_ia32_blendvps256( 1414 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c); 1415} 1416 1417/* Vector Dot Product */ 1418/// Computes two dot products in parallel, using the lower and upper 1419/// halves of two [8 x float] vectors as input to the two computations, and 1420/// returning the two dot products in the lower and upper halves of the 1421/// [8 x float] result. 1422/// 1423/// The immediate integer operand controls which input elements will 1424/// contribute to the dot product, and where the final results are returned. 1425/// In general, for each dot product, the four corresponding elements of the 1426/// input vectors are multiplied; the first two and second two products are 1427/// summed, then the two sums are added to form the final result. 1428/// 1429/// \headerfile <x86intrin.h> 1430/// 1431/// \code 1432/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M); 1433/// \endcode 1434/// 1435/// This intrinsic corresponds to the <c> VDPPS </c> instruction. 1436/// 1437/// \param V1 1438/// A vector of [8 x float] values, treated as two [4 x float] vectors. 1439/// \param V2 1440/// A vector of [8 x float] values, treated as two [4 x float] vectors. 1441/// \param M 1442/// An immediate integer argument. Bits [7:4] determine which elements of 1443/// the input vectors are used, with bit [4] corresponding to the lowest 1444/// element and bit [7] corresponding to the highest element of each [4 x 1445/// float] subvector. If a bit is set, the corresponding elements from the 1446/// two input vectors are used as an input for dot product; otherwise that 1447/// input is treated as zero. 
///    Bits [3:0] determine which elements of the
///    result will receive a copy of the final dot product, with bit [0]
///    corresponding to the lowest element and bit [3] corresponding to the
///    highest element of each [4 x float] subvector. If a bit is set, the dot
///    product is returned in the corresponding element; otherwise that element
///    is set to zero. The bitmask is applied in the same way to each of the
///    two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), (M)))

/* Vector shuffle */
/// Selects 8 float values from the 256-bit operands of [8 x float], as
///    specified by the immediate value operand.
///
///    The four selected elements in each operand are copied to the destination
///    according to the bits specified in the immediate operand. The selected
///    elements from the first 256-bit operand are copied to bits [63:0] and
///    bits [191:128] of the destination, and the selected elements from the
///    second 256-bit operand are copied to bits [127:64] and bits [255:192] of
///    the destination. For example, if bits [7:0] of the immediate operand
///    contain a value of 0xFF, the 256-bit destination vector would contain the
///    following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [63:0] and bits [191:128] in the destination,
///    according to the bits specified in the immediate operand.
/// \param b
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [127:64] and bits [255:192] in the
///    destination, according to the bits specified in the immediate operand.
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 256-bit destination are assigned values as
///    follows, according to the bit value assignments described below: \n
///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
///    the destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
///    11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
  ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
                                    (__v8sf)(__m256)(b), (int)(mask)))

/// Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
                                     (__v4df)(__m256d)(b), (int)(mask)))

/* Compare */
/* Comparison predicates: each value is the immediate operand that selects the
 * comparison operation named in its trailing comment. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

/// Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [2 x double] vector consisting of two doubles corresponding to
///    the two comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
/* NOTE: the expansions of _mm_cmp_pd and _mm_cmp_ps are parenthesized as a
 * whole so that the result cast still applies to the complete builtin call
 * when the macro is followed by a postfix operator (for example a
 * subscript). */

/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [4 x float] vector consisting of four floats corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))

/// Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [4 x double] vector consisting of four doubles corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
/* NOTE: the expansions of _mm256_cmp_pd and _mm256_cmp_ps are parenthesized
 * as a whole so that the result cast still applies to the complete builtin
 * call when the macro is followed by a postfix operator (for example a
 * subscript). */

/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [8 x float] vector consisting of eight floats corresponding to
///    the eight comparison results: zero if the comparison is false, and all
///    1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))

/// Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    If the result is true, all 64 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
/* NOTE: the expansions of _mm_cmp_sd, _mm_cmp_ss and _mm256_extract_epi32
 * are parenthesized as a whole so that the result cast still applies to the
 * complete builtin call regardless of the operators surrounding the macro
 * invocation. */

/// Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    If the result is true, all 32 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))

/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))

/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
1982/// \returns A 32-bit integer containing the extracted 16 bits of zero extended 1983/// packed data. 1984#define _mm256_extract_epi16(X, N) \ 1985 (int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \ 1986 (int)(N)) 1987 1988/// Takes a [32 x i8] vector and returns the vector element value 1989/// indexed by the immediate constant operand. 1990/// 1991/// \headerfile <x86intrin.h> 1992/// 1993/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 1994/// instruction. 1995/// 1996/// \param __a 1997/// A 256-bit integer vector of [32 x i8]. 1998/// \param __imm 1999/// An immediate integer operand with bits [4:0] determining which vector 2000/// element is extracted and returned. 2001/// \returns A 32-bit integer containing the extracted 8 bits of zero extended 2002/// packed data. 2003#define _mm256_extract_epi8(X, N) \ 2004 (int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \ 2005 (int)(N)) 2006 2007#ifdef __x86_64__ 2008/// Takes a [4 x i64] vector and returns the vector element value 2009/// indexed by the immediate constant operand. 2010/// 2011/// \headerfile <x86intrin.h> 2012/// 2013/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2014/// instruction. 2015/// 2016/// \param __a 2017/// A 256-bit integer vector of [4 x i64]. 2018/// \param __imm 2019/// An immediate integer operand with bits [1:0] determining which vector 2020/// element is extracted and returned. 2021/// \returns A 64-bit integer containing the extracted 64 bits of extended 2022/// packed data. 2023#define _mm256_extract_epi64(X, N) \ 2024 (long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)) 2025#endif 2026 2027/// Takes a [8 x i32] vector and replaces the vector element value 2028/// indexed by the immediate constant operand by a new value. Returns the 2029/// modified vector. 
2030/// 2031/// \headerfile <x86intrin.h> 2032/// 2033/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2034/// instruction. 2035/// 2036/// \param __a 2037/// A vector of [8 x i32] to be used by the insert operation. 2038/// \param __b 2039/// An integer value. The replacement value for the insert operation. 2040/// \param __imm 2041/// An immediate integer specifying the index of the vector element to be 2042/// replaced. 2043/// \returns A copy of vector \a __a, after replacing its element indexed by 2044/// \a __imm with \a __b. 2045#define _mm256_insert_epi32(X, I, N) \ 2046 (__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \ 2047 (int)(I), (int)(N)) 2048 2049 2050/// Takes a [16 x i16] vector and replaces the vector element value 2051/// indexed by the immediate constant operand with a new value. Returns the 2052/// modified vector. 2053/// 2054/// \headerfile <x86intrin.h> 2055/// 2056/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2057/// instruction. 2058/// 2059/// \param __a 2060/// A vector of [16 x i16] to be used by the insert operation. 2061/// \param __b 2062/// An i16 integer value. The replacement value for the insert operation. 2063/// \param __imm 2064/// An immediate integer specifying the index of the vector element to be 2065/// replaced. 2066/// \returns A copy of vector \a __a, after replacing its element indexed by 2067/// \a __imm with \a __b. 2068#define _mm256_insert_epi16(X, I, N) \ 2069 (__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \ 2070 (int)(I), (int)(N)) 2071 2072/// Takes a [32 x i8] vector and replaces the vector element value 2073/// indexed by the immediate constant operand with a new value. Returns the 2074/// modified vector. 2075/// 2076/// \headerfile <x86intrin.h> 2077/// 2078/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2079/// instruction. 2080/// 2081/// \param __a 2082/// A vector of [32 x i8] to be used by the insert operation. 
2083/// \param __b 2084/// An i8 integer value. The replacement value for the insert operation. 2085/// \param __imm 2086/// An immediate integer specifying the index of the vector element to be 2087/// replaced. 2088/// \returns A copy of vector \a __a, after replacing its element indexed by 2089/// \a __imm with \a __b. 2090#define _mm256_insert_epi8(X, I, N) \ 2091 (__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \ 2092 (int)(I), (int)(N)) 2093 2094#ifdef __x86_64__ 2095/// Takes a [4 x i64] vector and replaces the vector element value 2096/// indexed by the immediate constant operand with a new value. Returns the 2097/// modified vector. 2098/// 2099/// \headerfile <x86intrin.h> 2100/// 2101/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2102/// instruction. 2103/// 2104/// \param __a 2105/// A vector of [4 x i64] to be used by the insert operation. 2106/// \param __b 2107/// A 64-bit integer value. The replacement value for the insert operation. 2108/// \param __imm 2109/// An immediate integer specifying the index of the vector element to be 2110/// replaced. 2111/// \returns A copy of vector \a __a, after replacing its element indexed by 2112/// \a __imm with \a __b. 2113#define _mm256_insert_epi64(X, I, N) \ 2114 (__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \ 2115 (long long)(I), (int)(N)) 2116#endif 2117 2118/* Conversion */ 2119/// Converts a vector of [4 x i32] into a vector of [4 x double]. 2120/// 2121/// \headerfile <x86intrin.h> 2122/// 2123/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction. 2124/// 2125/// \param __a 2126/// A 128-bit integer vector of [4 x i32]. 2127/// \returns A 256-bit vector of [4 x double] containing the converted values. 2128static __inline __m256d __DEFAULT_FN_ATTRS 2129_mm256_cvtepi32_pd(__m128i __a) 2130{ 2131 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df); 2132} 2133 2134/// Converts a vector of [8 x i32] into a vector of [8 x float]. 
2135/// 2136/// \headerfile <x86intrin.h> 2137/// 2138/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction. 2139/// 2140/// \param __a 2141/// A 256-bit integer vector. 2142/// \returns A 256-bit vector of [8 x float] containing the converted values. 2143static __inline __m256 __DEFAULT_FN_ATTRS 2144_mm256_cvtepi32_ps(__m256i __a) 2145{ 2146 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf); 2147} 2148 2149/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of 2150/// [4 x float]. 2151/// 2152/// \headerfile <x86intrin.h> 2153/// 2154/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction. 2155/// 2156/// \param __a 2157/// A 256-bit vector of [4 x double]. 2158/// \returns A 128-bit vector of [4 x float] containing the converted values. 2159static __inline __m128 __DEFAULT_FN_ATTRS 2160_mm256_cvtpd_ps(__m256d __a) 2161{ 2162 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a); 2163} 2164 2165/// Converts a vector of [8 x float] into a vector of [8 x i32]. 2166/// 2167/// \headerfile <x86intrin.h> 2168/// 2169/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction. 2170/// 2171/// \param __a 2172/// A 256-bit vector of [8 x float]. 2173/// \returns A 256-bit integer vector containing the converted values. 2174static __inline __m256i __DEFAULT_FN_ATTRS 2175_mm256_cvtps_epi32(__m256 __a) 2176{ 2177 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a); 2178} 2179 2180/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 2181/// x double]. 2182/// 2183/// \headerfile <x86intrin.h> 2184/// 2185/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction. 2186/// 2187/// \param __a 2188/// A 128-bit vector of [4 x float]. 2189/// \returns A 256-bit vector of [4 x double] containing the converted values. 
2190static __inline __m256d __DEFAULT_FN_ATTRS 2191_mm256_cvtps_pd(__m128 __a) 2192{ 2193 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df); 2194} 2195 2196/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 2197/// x i32], truncating the result by rounding towards zero when it is 2198/// inexact. 2199/// 2200/// \headerfile <x86intrin.h> 2201/// 2202/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction. 2203/// 2204/// \param __a 2205/// A 256-bit vector of [4 x double]. 2206/// \returns A 128-bit integer vector containing the converted values. 2207static __inline __m128i __DEFAULT_FN_ATTRS 2208_mm256_cvttpd_epi32(__m256d __a) 2209{ 2210 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a); 2211} 2212 2213/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 2214/// x i32]. When a conversion is inexact, the value returned is rounded 2215/// according to the rounding control bits in the MXCSR register. 2216/// 2217/// \headerfile <x86intrin.h> 2218/// 2219/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction. 2220/// 2221/// \param __a 2222/// A 256-bit vector of [4 x double]. 2223/// \returns A 128-bit integer vector containing the converted values. 2224static __inline __m128i __DEFAULT_FN_ATTRS 2225_mm256_cvtpd_epi32(__m256d __a) 2226{ 2227 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a); 2228} 2229 2230/// Converts a vector of [8 x float] into a vector of [8 x i32], 2231/// truncating the result by rounding towards zero when it is inexact. 2232/// 2233/// \headerfile <x86intrin.h> 2234/// 2235/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction. 2236/// 2237/// \param __a 2238/// A 256-bit vector of [8 x float]. 2239/// \returns A 256-bit integer vector containing the converted values. 
2240static __inline __m256i __DEFAULT_FN_ATTRS 2241_mm256_cvttps_epi32(__m256 __a) 2242{ 2243 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a); 2244} 2245 2246/// Returns the first element of the input vector of [4 x double]. 2247/// 2248/// \headerfile <avxintrin.h> 2249/// 2250/// This intrinsic is a utility function and does not correspond to a specific 2251/// instruction. 2252/// 2253/// \param __a 2254/// A 256-bit vector of [4 x double]. 2255/// \returns A 64 bit double containing the first element of the input vector. 2256static __inline double __DEFAULT_FN_ATTRS 2257_mm256_cvtsd_f64(__m256d __a) 2258{ 2259 return __a[0]; 2260} 2261 2262/// Returns the first element of the input vector of [8 x i32]. 2263/// 2264/// \headerfile <avxintrin.h> 2265/// 2266/// This intrinsic is a utility function and does not correspond to a specific 2267/// instruction. 2268/// 2269/// \param __a 2270/// A 256-bit vector of [8 x i32]. 2271/// \returns A 32 bit integer containing the first element of the input vector. 2272static __inline int __DEFAULT_FN_ATTRS 2273_mm256_cvtsi256_si32(__m256i __a) 2274{ 2275 __v8si __b = (__v8si)__a; 2276 return __b[0]; 2277} 2278 2279/// Returns the first element of the input vector of [8 x float]. 2280/// 2281/// \headerfile <avxintrin.h> 2282/// 2283/// This intrinsic is a utility function and does not correspond to a specific 2284/// instruction. 2285/// 2286/// \param __a 2287/// A 256-bit vector of [8 x float]. 2288/// \returns A 32 bit float containing the first element of the input vector. 2289static __inline float __DEFAULT_FN_ATTRS 2290_mm256_cvtss_f32(__m256 __a) 2291{ 2292 return __a[0]; 2293} 2294 2295/* Vector replicate */ 2296/// Moves and duplicates odd-indexed values from a 256-bit vector of 2297/// [8 x float] to float values in a 256-bit vector of [8 x float]. 2298/// 2299/// \headerfile <x86intrin.h> 2300/// 2301/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction. 
2302/// 2303/// \param __a 2304/// A 256-bit vector of [8 x float]. \n 2305/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of 2306/// the return value. \n 2307/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of 2308/// the return value. \n 2309/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the 2310/// return value. \n 2311/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the 2312/// return value. 2313/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2314/// values. 2315static __inline __m256 __DEFAULT_FN_ATTRS 2316_mm256_movehdup_ps(__m256 __a) 2317{ 2318 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7); 2319} 2320 2321/// Moves and duplicates even-indexed values from a 256-bit vector of 2322/// [8 x float] to float values in a 256-bit vector of [8 x float]. 2323/// 2324/// \headerfile <x86intrin.h> 2325/// 2326/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction. 2327/// 2328/// \param __a 2329/// A 256-bit vector of [8 x float]. \n 2330/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of 2331/// the return value. \n 2332/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of 2333/// the return value. \n 2334/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the 2335/// return value. \n 2336/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the 2337/// return value. 2338/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2339/// values. 
2340static __inline __m256 __DEFAULT_FN_ATTRS 2341_mm256_moveldup_ps(__m256 __a) 2342{ 2343 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6); 2344} 2345 2346/// Moves and duplicates double-precision floating point values from a 2347/// 256-bit vector of [4 x double] to double-precision values in a 256-bit 2348/// vector of [4 x double]. 2349/// 2350/// \headerfile <x86intrin.h> 2351/// 2352/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. 2353/// 2354/// \param __a 2355/// A 256-bit vector of [4 x double]. \n 2356/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the 2357/// return value. \n 2358/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of 2359/// the return value. 2360/// \returns A 256-bit vector of [4 x double] containing the moved and 2361/// duplicated values. 2362static __inline __m256d __DEFAULT_FN_ATTRS 2363_mm256_movedup_pd(__m256d __a) 2364{ 2365 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2); 2366} 2367 2368/* Unpack and Interleave */ 2369/// Unpacks the odd-indexed vector elements from two 256-bit vectors of 2370/// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2371/// 2372/// \headerfile <x86intrin.h> 2373/// 2374/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction. 2375/// 2376/// \param __a 2377/// A 256-bit floating-point vector of [4 x double]. \n 2378/// Bits [127:64] are written to bits [63:0] of the return value. \n 2379/// Bits [255:192] are written to bits [191:128] of the return value. \n 2380/// \param __b 2381/// A 256-bit floating-point vector of [4 x double]. \n 2382/// Bits [127:64] are written to bits [127:64] of the return value. \n 2383/// Bits [255:192] are written to bits [255:192] of the return value. \n 2384/// \returns A 256-bit vector of [4 x double] containing the interleaved values. 
2385static __inline __m256d __DEFAULT_FN_ATTRS 2386_mm256_unpackhi_pd(__m256d __a, __m256d __b) 2387{ 2388 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2); 2389} 2390 2391/// Unpacks the even-indexed vector elements from two 256-bit vectors of 2392/// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2393/// 2394/// \headerfile <x86intrin.h> 2395/// 2396/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction. 2397/// 2398/// \param __a 2399/// A 256-bit floating-point vector of [4 x double]. \n 2400/// Bits [63:0] are written to bits [63:0] of the return value. \n 2401/// Bits [191:128] are written to bits [191:128] of the return value. 2402/// \param __b 2403/// A 256-bit floating-point vector of [4 x double]. \n 2404/// Bits [63:0] are written to bits [127:64] of the return value. \n 2405/// Bits [191:128] are written to bits [255:192] of the return value. \n 2406/// \returns A 256-bit vector of [4 x double] containing the interleaved values. 2407static __inline __m256d __DEFAULT_FN_ATTRS 2408_mm256_unpacklo_pd(__m256d __a, __m256d __b) 2409{ 2410 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2); 2411} 2412 2413/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the 2414/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2415/// vector of [8 x float]. 2416/// 2417/// \headerfile <x86intrin.h> 2418/// 2419/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction. 2420/// 2421/// \param __a 2422/// A 256-bit vector of [8 x float]. \n 2423/// Bits [95:64] are written to bits [31:0] of the return value. \n 2424/// Bits [127:96] are written to bits [95:64] of the return value. \n 2425/// Bits [223:192] are written to bits [159:128] of the return value. \n 2426/// Bits [255:224] are written to bits [223:192] of the return value. 2427/// \param __b 2428/// A 256-bit vector of [8 x float]. 
\n 2429/// Bits [95:64] are written to bits [63:32] of the return value. \n 2430/// Bits [127:96] are written to bits [127:96] of the return value. \n 2431/// Bits [223:192] are written to bits [191:160] of the return value. \n 2432/// Bits [255:224] are written to bits [255:224] of the return value. 2433/// \returns A 256-bit vector of [8 x float] containing the interleaved values. 2434static __inline __m256 __DEFAULT_FN_ATTRS 2435_mm256_unpackhi_ps(__m256 __a, __m256 __b) 2436{ 2437 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); 2438} 2439 2440/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the 2441/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2442/// vector of [8 x float]. 2443/// 2444/// \headerfile <x86intrin.h> 2445/// 2446/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction. 2447/// 2448/// \param __a 2449/// A 256-bit vector of [8 x float]. \n 2450/// Bits [31:0] are written to bits [31:0] of the return value. \n 2451/// Bits [63:32] are written to bits [95:64] of the return value. \n 2452/// Bits [159:128] are written to bits [159:128] of the return value. \n 2453/// Bits [191:160] are written to bits [223:192] of the return value. 2454/// \param __b 2455/// A 256-bit vector of [8 x float]. \n 2456/// Bits [31:0] are written to bits [63:32] of the return value. \n 2457/// Bits [63:32] are written to bits [127:96] of the return value. \n 2458/// Bits [159:128] are written to bits [191:160] of the return value. \n 2459/// Bits [191:160] are written to bits [255:224] of the return value. 2460/// \returns A 256-bit vector of [8 x float] containing the interleaved values. 
2461static __inline __m256 __DEFAULT_FN_ATTRS 2462_mm256_unpacklo_ps(__m256 __a, __m256 __b) 2463{ 2464 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); 2465} 2466 2467/* Bit Test */ 2468/// Given two 128-bit floating-point vectors of [2 x double], perform an 2469/// element-by-element comparison of the double-precision element in the 2470/// first source vector and the corresponding element in the second source 2471/// vector. 2472/// 2473/// The EFLAGS register is updated as follows: \n 2474/// If there is at least one pair of double-precision elements where the 2475/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2476/// ZF flag is set to 1. \n 2477/// If there is at least one pair of double-precision elements where the 2478/// sign-bit of the first element is 0 and the sign-bit of the second element 2479/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2480/// This intrinsic returns the value of the ZF flag. 2481/// 2482/// \headerfile <x86intrin.h> 2483/// 2484/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2485/// 2486/// \param __a 2487/// A 128-bit vector of [2 x double]. 2488/// \param __b 2489/// A 128-bit vector of [2 x double]. 2490/// \returns the ZF flag in the EFLAGS register. 2491static __inline int __DEFAULT_FN_ATTRS128 2492_mm_testz_pd(__m128d __a, __m128d __b) 2493{ 2494 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b); 2495} 2496 2497/// Given two 128-bit floating-point vectors of [2 x double], perform an 2498/// element-by-element comparison of the double-precision element in the 2499/// first source vector and the corresponding element in the second source 2500/// vector. 2501/// 2502/// The EFLAGS register is updated as follows: \n 2503/// If there is at least one pair of double-precision elements where the 2504/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2505/// ZF flag is set to 1. 
\n 2506/// If there is at least one pair of double-precision elements where the 2507/// sign-bit of the first element is 0 and the sign-bit of the second element 2508/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2509/// This intrinsic returns the value of the CF flag. 2510/// 2511/// \headerfile <x86intrin.h> 2512/// 2513/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2514/// 2515/// \param __a 2516/// A 128-bit vector of [2 x double]. 2517/// \param __b 2518/// A 128-bit vector of [2 x double]. 2519/// \returns the CF flag in the EFLAGS register. 2520static __inline int __DEFAULT_FN_ATTRS128 2521_mm_testc_pd(__m128d __a, __m128d __b) 2522{ 2523 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b); 2524} 2525 2526/// Given two 128-bit floating-point vectors of [2 x double], perform an 2527/// element-by-element comparison of the double-precision element in the 2528/// first source vector and the corresponding element in the second source 2529/// vector. 2530/// 2531/// The EFLAGS register is updated as follows: \n 2532/// If there is at least one pair of double-precision elements where the 2533/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2534/// ZF flag is set to 1. \n 2535/// If there is at least one pair of double-precision elements where the 2536/// sign-bit of the first element is 0 and the sign-bit of the second element 2537/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2538/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2539/// otherwise it returns 0. 2540/// 2541/// \headerfile <x86intrin.h> 2542/// 2543/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2544/// 2545/// \param __a 2546/// A 128-bit vector of [2 x double]. 2547/// \param __b 2548/// A 128-bit vector of [2 x double]. 2549/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 
2550static __inline int __DEFAULT_FN_ATTRS128 2551_mm_testnzc_pd(__m128d __a, __m128d __b) 2552{ 2553 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b); 2554} 2555 2556/// Given two 128-bit floating-point vectors of [4 x float], perform an 2557/// element-by-element comparison of the single-precision element in the 2558/// first source vector and the corresponding element in the second source 2559/// vector. 2560/// 2561/// The EFLAGS register is updated as follows: \n 2562/// If there is at least one pair of single-precision elements where the 2563/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2564/// ZF flag is set to 1. \n 2565/// If there is at least one pair of single-precision elements where the 2566/// sign-bit of the first element is 0 and the sign-bit of the second element 2567/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2568/// This intrinsic returns the value of the ZF flag. 2569/// 2570/// \headerfile <x86intrin.h> 2571/// 2572/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2573/// 2574/// \param __a 2575/// A 128-bit vector of [4 x float]. 2576/// \param __b 2577/// A 128-bit vector of [4 x float]. 2578/// \returns the ZF flag. 2579static __inline int __DEFAULT_FN_ATTRS128 2580_mm_testz_ps(__m128 __a, __m128 __b) 2581{ 2582 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b); 2583} 2584 2585/// Given two 128-bit floating-point vectors of [4 x float], perform an 2586/// element-by-element comparison of the single-precision element in the 2587/// first source vector and the corresponding element in the second source 2588/// vector. 2589/// 2590/// The EFLAGS register is updated as follows: \n 2591/// If there is at least one pair of single-precision elements where the 2592/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2593/// ZF flag is set to 1. 
\n 2594/// If there is at least one pair of single-precision elements where the 2595/// sign-bit of the first element is 0 and the sign-bit of the second element 2596/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2597/// This intrinsic returns the value of the CF flag. 2598/// 2599/// \headerfile <x86intrin.h> 2600/// 2601/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2602/// 2603/// \param __a 2604/// A 128-bit vector of [4 x float]. 2605/// \param __b 2606/// A 128-bit vector of [4 x float]. 2607/// \returns the CF flag. 2608static __inline int __DEFAULT_FN_ATTRS128 2609_mm_testc_ps(__m128 __a, __m128 __b) 2610{ 2611 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b); 2612} 2613 2614/// Given two 128-bit floating-point vectors of [4 x float], perform an 2615/// element-by-element comparison of the single-precision element in the 2616/// first source vector and the corresponding element in the second source 2617/// vector. 2618/// 2619/// The EFLAGS register is updated as follows: \n 2620/// If there is at least one pair of single-precision elements where the 2621/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2622/// ZF flag is set to 1. \n 2623/// If there is at least one pair of single-precision elements where the 2624/// sign-bit of the first element is 0 and the sign-bit of the second element 2625/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2626/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2627/// otherwise it returns 0. 2628/// 2629/// \headerfile <x86intrin.h> 2630/// 2631/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2632/// 2633/// \param __a 2634/// A 128-bit vector of [4 x float]. 2635/// \param __b 2636/// A 128-bit vector of [4 x float]. 2637/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 
2638static __inline int __DEFAULT_FN_ATTRS128 2639_mm_testnzc_ps(__m128 __a, __m128 __b) 2640{ 2641 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b); 2642} 2643 2644/// Given two 256-bit floating-point vectors of [4 x double], perform an 2645/// element-by-element comparison of the double-precision elements in the 2646/// first source vector and the corresponding elements in the second source 2647/// vector. 2648/// 2649/// The EFLAGS register is updated as follows: \n 2650/// If there is at least one pair of double-precision elements where the 2651/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2652/// ZF flag is set to 1. \n 2653/// If there is at least one pair of double-precision elements where the 2654/// sign-bit of the first element is 0 and the sign-bit of the second element 2655/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2656/// This intrinsic returns the value of the ZF flag. 2657/// 2658/// \headerfile <x86intrin.h> 2659/// 2660/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2661/// 2662/// \param __a 2663/// A 256-bit vector of [4 x double]. 2664/// \param __b 2665/// A 256-bit vector of [4 x double]. 2666/// \returns the ZF flag. 2667static __inline int __DEFAULT_FN_ATTRS 2668_mm256_testz_pd(__m256d __a, __m256d __b) 2669{ 2670 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b); 2671} 2672 2673/// Given two 256-bit floating-point vectors of [4 x double], perform an 2674/// element-by-element comparison of the double-precision elements in the 2675/// first source vector and the corresponding elements in the second source 2676/// vector. 2677/// 2678/// The EFLAGS register is updated as follows: \n 2679/// If there is at least one pair of double-precision elements where the 2680/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2681/// ZF flag is set to 1. 
\n 2682/// If there is at least one pair of double-precision elements where the 2683/// sign-bit of the first element is 0 and the sign-bit of the second element 2684/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2685/// This intrinsic returns the value of the CF flag. 2686/// 2687/// \headerfile <x86intrin.h> 2688/// 2689/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2690/// 2691/// \param __a 2692/// A 256-bit vector of [4 x double]. 2693/// \param __b 2694/// A 256-bit vector of [4 x double]. 2695/// \returns the CF flag. 2696static __inline int __DEFAULT_FN_ATTRS 2697_mm256_testc_pd(__m256d __a, __m256d __b) 2698{ 2699 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b); 2700} 2701 2702/// Given two 256-bit floating-point vectors of [4 x double], perform an 2703/// element-by-element comparison of the double-precision elements in the 2704/// first source vector and the corresponding elements in the second source 2705/// vector. 2706/// 2707/// The EFLAGS register is updated as follows: \n 2708/// If there is at least one pair of double-precision elements where the 2709/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2710/// ZF flag is set to 1. \n 2711/// If there is at least one pair of double-precision elements where the 2712/// sign-bit of the first element is 0 and the sign-bit of the second element 2713/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2714/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2715/// otherwise it returns 0. 2716/// 2717/// \headerfile <x86intrin.h> 2718/// 2719/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2720/// 2721/// \param __a 2722/// A 256-bit vector of [4 x double]. 2723/// \param __b 2724/// A 256-bit vector of [4 x double]. 2725/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 
2726static __inline int __DEFAULT_FN_ATTRS 2727_mm256_testnzc_pd(__m256d __a, __m256d __b) 2728{ 2729 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b); 2730} 2731 2732/// Given two 256-bit floating-point vectors of [8 x float], perform an 2733/// element-by-element comparison of the single-precision element in the 2734/// first source vector and the corresponding element in the second source 2735/// vector. 2736/// 2737/// The EFLAGS register is updated as follows: \n 2738/// If there is at least one pair of single-precision elements where the 2739/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2740/// ZF flag is set to 1. \n 2741/// If there is at least one pair of single-precision elements where the 2742/// sign-bit of the first element is 0 and the sign-bit of the second element 2743/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2744/// This intrinsic returns the value of the ZF flag. 2745/// 2746/// \headerfile <x86intrin.h> 2747/// 2748/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2749/// 2750/// \param __a 2751/// A 256-bit vector of [8 x float]. 2752/// \param __b 2753/// A 256-bit vector of [8 x float]. 2754/// \returns the ZF flag. 2755static __inline int __DEFAULT_FN_ATTRS 2756_mm256_testz_ps(__m256 __a, __m256 __b) 2757{ 2758 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b); 2759} 2760 2761/// Given two 256-bit floating-point vectors of [8 x float], perform an 2762/// element-by-element comparison of the single-precision element in the 2763/// first source vector and the corresponding element in the second source 2764/// vector. 2765/// 2766/// The EFLAGS register is updated as follows: \n 2767/// If there is at least one pair of single-precision elements where the 2768/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2769/// ZF flag is set to 1. 
\n 2770/// If there is at least one pair of single-precision elements where the 2771/// sign-bit of the first element is 0 and the sign-bit of the second element 2772/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2773/// This intrinsic returns the value of the CF flag. 2774/// 2775/// \headerfile <x86intrin.h> 2776/// 2777/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2778/// 2779/// \param __a 2780/// A 256-bit vector of [8 x float]. 2781/// \param __b 2782/// A 256-bit vector of [8 x float]. 2783/// \returns the CF flag. 2784static __inline int __DEFAULT_FN_ATTRS 2785_mm256_testc_ps(__m256 __a, __m256 __b) 2786{ 2787 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b); 2788} 2789 2790/// Given two 256-bit floating-point vectors of [8 x float], perform an 2791/// element-by-element comparison of the single-precision elements in the 2792/// first source vector and the corresponding elements in the second source 2793/// vector. 2794/// 2795/// The EFLAGS register is updated as follows: \n 2796/// If there is at least one pair of single-precision elements where the 2797/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2798/// ZF flag is set to 1. \n 2799/// If there is at least one pair of single-precision elements where the 2800/// sign-bit of the first element is 0 and the sign-bit of the second element 2801/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2802/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2803/// otherwise it returns 0. 2804/// 2805/// \headerfile <x86intrin.h> 2806/// 2807/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2808/// 2809/// \param __a 2810/// A 256-bit vector of [8 x float]. 2811/// \param __b 2812/// A 256-bit vector of [8 x float]. 2813/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 
2814static __inline int __DEFAULT_FN_ATTRS 2815_mm256_testnzc_ps(__m256 __a, __m256 __b) 2816{ 2817 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b); 2818} 2819 2820/// Given two 256-bit integer vectors, perform a bit-by-bit comparison 2821/// of the two source vectors. 2822/// 2823/// The EFLAGS register is updated as follows: \n 2824/// If there is at least one pair of bits where both bits are 1, the ZF flag 2825/// is set to 0. Otherwise the ZF flag is set to 1. \n 2826/// If there is at least one pair of bits where the bit from the first source 2827/// vector is 0 and the bit from the second source vector is 1, the CF flag 2828/// is set to 0. Otherwise the CF flag is set to 1. \n 2829/// This intrinsic returns the value of the ZF flag. 2830/// 2831/// \headerfile <x86intrin.h> 2832/// 2833/// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2834/// 2835/// \param __a 2836/// A 256-bit integer vector. 2837/// \param __b 2838/// A 256-bit integer vector. 2839/// \returns the ZF flag. 2840static __inline int __DEFAULT_FN_ATTRS 2841_mm256_testz_si256(__m256i __a, __m256i __b) 2842{ 2843 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b); 2844} 2845 2846/// Given two 256-bit integer vectors, perform a bit-by-bit comparison 2847/// of the two source vectors. 2848/// 2849/// The EFLAGS register is updated as follows: \n 2850/// If there is at least one pair of bits where both bits are 1, the ZF flag 2851/// is set to 0. Otherwise the ZF flag is set to 1. \n 2852/// If there is at least one pair of bits where the bit from the first source 2853/// vector is 0 and the bit from the second source vector is 1, the CF flag 2854/// is set to 0. Otherwise the CF flag is set to 1. \n 2855/// This intrinsic returns the value of the CF flag. 2856/// 2857/// \headerfile <x86intrin.h> 2858/// 2859/// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2860/// 2861/// \param __a 2862/// A 256-bit integer vector. 
2863/// \param __b 2864/// A 256-bit integer vector. 2865/// \returns the CF flag. 2866static __inline int __DEFAULT_FN_ATTRS 2867_mm256_testc_si256(__m256i __a, __m256i __b) 2868{ 2869 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b); 2870} 2871 2872/// Given two 256-bit integer vectors, perform a bit-by-bit comparison 2873/// of the two source vectors. 2874/// 2875/// The EFLAGS register is updated as follows: \n 2876/// If there is at least one pair of bits where both bits are 1, the ZF flag 2877/// is set to 0. Otherwise the ZF flag is set to 1. \n 2878/// If there is at least one pair of bits where the bit from the first source 2879/// vector is 0 and the bit from the second source vector is 1, the CF flag 2880/// is set to 0. Otherwise the CF flag is set to 1. \n 2881/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2882/// otherwise it returns 0. 2883/// 2884/// \headerfile <x86intrin.h> 2885/// 2886/// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2887/// 2888/// \param __a 2889/// A 256-bit integer vector. 2890/// \param __b 2891/// A 256-bit integer vector. 2892/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2893static __inline int __DEFAULT_FN_ATTRS 2894_mm256_testnzc_si256(__m256i __a, __m256i __b) 2895{ 2896 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b); 2897} 2898 2899/* Vector extract sign mask */ 2900/// Extracts the sign bits of double-precision floating point elements 2901/// in a 256-bit vector of [4 x double] and writes them to the lower order 2902/// bits of the return value. 2903/// 2904/// \headerfile <x86intrin.h> 2905/// 2906/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction. 2907/// 2908/// \param __a 2909/// A 256-bit vector of [4 x double] containing the double-precision 2910/// floating point values with sign bits to be extracted. 2911/// \returns The sign bits from the operand, written to bits [3:0]. 
2912static __inline int __DEFAULT_FN_ATTRS 2913_mm256_movemask_pd(__m256d __a) 2914{ 2915 return __builtin_ia32_movmskpd256((__v4df)__a); 2916} 2917 2918/// Extracts the sign bits of single-precision floating point elements 2919/// in a 256-bit vector of [8 x float] and writes them to the lower order 2920/// bits of the return value. 2921/// 2922/// \headerfile <x86intrin.h> 2923/// 2924/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction. 2925/// 2926/// \param __a 2927/// A 256-bit vector of [8 x float] containing the single-precision floating 2928/// point values with sign bits to be extracted. 2929/// \returns The sign bits from the operand, written to bits [7:0]. 2930static __inline int __DEFAULT_FN_ATTRS 2931_mm256_movemask_ps(__m256 __a) 2932{ 2933 return __builtin_ia32_movmskps256((__v8sf)__a); 2934} 2935 2936/* Vector __zero */ 2937/// Zeroes the contents of all XMM or YMM registers. 2938/// 2939/// \headerfile <x86intrin.h> 2940/// 2941/// This intrinsic corresponds to the <c> VZEROALL </c> instruction. 2942static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx"))) 2943_mm256_zeroall(void) 2944{ 2945 __builtin_ia32_vzeroall(); 2946} 2947 2948/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers. 2949/// 2950/// \headerfile <x86intrin.h> 2951/// 2952/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction. 2953static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx"))) 2954_mm256_zeroupper(void) 2955{ 2956 __builtin_ia32_vzeroupper(); 2957} 2958 2959/* Vector load with broadcast */ 2960/// Loads a scalar single-precision floating point value from the 2961/// specified address pointed to by \a __a and broadcasts it to the elements 2962/// of a [4 x float] vector. 2963/// 2964/// \headerfile <x86intrin.h> 2965/// 2966/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 
2967/// 2968/// \param __a 2969/// The single-precision floating point value to be broadcast. 2970/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set 2971/// equal to the broadcast value. 2972static __inline __m128 __DEFAULT_FN_ATTRS128 2973_mm_broadcast_ss(float const *__a) 2974{ 2975 float __f = *__a; 2976 return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f }; 2977} 2978 2979/// Loads a scalar double-precision floating point value from the 2980/// specified address pointed to by \a __a and broadcasts it to the elements 2981/// of a [4 x double] vector. 2982/// 2983/// \headerfile <x86intrin.h> 2984/// 2985/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction. 2986/// 2987/// \param __a 2988/// The double-precision floating point value to be broadcast. 2989/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set 2990/// equal to the broadcast value. 2991static __inline __m256d __DEFAULT_FN_ATTRS 2992_mm256_broadcast_sd(double const *__a) 2993{ 2994 double __d = *__a; 2995 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d }; 2996} 2997 2998/// Loads a scalar single-precision floating point value from the 2999/// specified address pointed to by \a __a and broadcasts it to the elements 3000/// of a [8 x float] vector. 3001/// 3002/// \headerfile <x86intrin.h> 3003/// 3004/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 3005/// 3006/// \param __a 3007/// The single-precision floating point value to be broadcast. 3008/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set 3009/// equal to the broadcast value. 
3010static __inline __m256 __DEFAULT_FN_ATTRS 3011_mm256_broadcast_ss(float const *__a) 3012{ 3013 float __f = *__a; 3014 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f }; 3015} 3016 3017/// Loads the data from a 128-bit vector of [2 x double] from the 3018/// specified address pointed to by \a __a and broadcasts it to 128-bit 3019/// elements in a 256-bit vector of [4 x double]. 3020/// 3021/// \headerfile <x86intrin.h> 3022/// 3023/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 3024/// 3025/// \param __a 3026/// The 128-bit vector of [2 x double] to be broadcast. 3027/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set 3028/// equal to the broadcast value. 3029static __inline __m256d __DEFAULT_FN_ATTRS 3030_mm256_broadcast_pd(__m128d const *__a) 3031{ 3032 __m128d __b = _mm_loadu_pd((const double *)__a); 3033 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b, 3034 0, 1, 0, 1); 3035} 3036 3037/// Loads the data from a 128-bit vector of [4 x float] from the 3038/// specified address pointed to by \a __a and broadcasts it to 128-bit 3039/// elements in a 256-bit vector of [8 x float]. 3040/// 3041/// \headerfile <x86intrin.h> 3042/// 3043/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 3044/// 3045/// \param __a 3046/// The 128-bit vector of [4 x float] to be broadcast. 3047/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set 3048/// equal to the broadcast value. 3049static __inline __m256 __DEFAULT_FN_ATTRS 3050_mm256_broadcast_ps(__m128 const *__a) 3051{ 3052 __m128 __b = _mm_loadu_ps((const float *)__a); 3053 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b, 3054 0, 1, 2, 3, 0, 1, 2, 3); 3055} 3056 3057/* SIMD load ops */ 3058/// Loads 4 double-precision floating point values from a 32-byte aligned 3059/// memory location pointed to by \a __p into a vector of [4 x double]. 
3060/// 3061/// \headerfile <x86intrin.h> 3062/// 3063/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 3064/// 3065/// \param __p 3066/// A 32-byte aligned pointer to a memory location containing 3067/// double-precision floating point values. 3068/// \returns A 256-bit vector of [4 x double] containing the moved values. 3069static __inline __m256d __DEFAULT_FN_ATTRS 3070_mm256_load_pd(double const *__p) 3071{ 3072 return *(const __m256d *)__p; 3073} 3074 3075/// Loads 8 single-precision floating point values from a 32-byte aligned 3076/// memory location pointed to by \a __p into a vector of [8 x float]. 3077/// 3078/// \headerfile <x86intrin.h> 3079/// 3080/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 3081/// 3082/// \param __p 3083/// A 32-byte aligned pointer to a memory location containing float values. 3084/// \returns A 256-bit vector of [8 x float] containing the moved values. 3085static __inline __m256 __DEFAULT_FN_ATTRS 3086_mm256_load_ps(float const *__p) 3087{ 3088 return *(const __m256 *)__p; 3089} 3090 3091/// Loads 4 double-precision floating point values from an unaligned 3092/// memory location pointed to by \a __p into a vector of [4 x double]. 3093/// 3094/// \headerfile <x86intrin.h> 3095/// 3096/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 3097/// 3098/// \param __p 3099/// A pointer to a memory location containing double-precision floating 3100/// point values. 3101/// \returns A 256-bit vector of [4 x double] containing the moved values. 3102static __inline __m256d __DEFAULT_FN_ATTRS 3103_mm256_loadu_pd(double const *__p) 3104{ 3105 struct __loadu_pd { 3106 __m256d_u __v; 3107 } __attribute__((__packed__, __may_alias__)); 3108 return ((const struct __loadu_pd*)__p)->__v; 3109} 3110 3111/// Loads 8 single-precision floating point values from an unaligned 3112/// memory location pointed to by \a __p into a vector of [8 x float]. 
3113/// 3114/// \headerfile <x86intrin.h> 3115/// 3116/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 3117/// 3118/// \param __p 3119/// A pointer to a memory location containing single-precision floating 3120/// point values. 3121/// \returns A 256-bit vector of [8 x float] containing the moved values. 3122static __inline __m256 __DEFAULT_FN_ATTRS 3123_mm256_loadu_ps(float const *__p) 3124{ 3125 struct __loadu_ps { 3126 __m256_u __v; 3127 } __attribute__((__packed__, __may_alias__)); 3128 return ((const struct __loadu_ps*)__p)->__v; 3129} 3130 3131/// Loads 256 bits of integer data from a 32-byte aligned memory 3132/// location pointed to by \a __p into elements of a 256-bit integer vector. 3133/// 3134/// \headerfile <x86intrin.h> 3135/// 3136/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3137/// 3138/// \param __p 3139/// A 32-byte aligned pointer to a 256-bit integer vector containing integer 3140/// values. 3141/// \returns A 256-bit integer vector containing the moved values. 3142static __inline __m256i __DEFAULT_FN_ATTRS 3143_mm256_load_si256(__m256i const *__p) 3144{ 3145 return *__p; 3146} 3147 3148/// Loads 256 bits of integer data from an unaligned memory location 3149/// pointed to by \a __p into a 256-bit integer vector. 3150/// 3151/// \headerfile <x86intrin.h> 3152/// 3153/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 3154/// 3155/// \param __p 3156/// A pointer to a 256-bit integer vector containing integer values. 3157/// \returns A 256-bit integer vector containing the moved values. 3158static __inline __m256i __DEFAULT_FN_ATTRS 3159_mm256_loadu_si256(__m256i_u const *__p) 3160{ 3161 struct __loadu_si256 { 3162 __m256i_u __v; 3163 } __attribute__((__packed__, __may_alias__)); 3164 return ((const struct __loadu_si256*)__p)->__v; 3165} 3166 3167/// Loads 256 bits of integer data from an unaligned memory location 3168/// pointed to by \a __p into a 256-bit integer vector. 
This intrinsic may 3169/// perform better than \c _mm256_loadu_si256 when the data crosses a cache 3170/// line boundary. 3171/// 3172/// \headerfile <x86intrin.h> 3173/// 3174/// This intrinsic corresponds to the <c> VLDDQU </c> instruction. 3175/// 3176/// \param __p 3177/// A pointer to a 256-bit integer vector containing integer values. 3178/// \returns A 256-bit integer vector containing the moved values. 3179static __inline __m256i __DEFAULT_FN_ATTRS 3180_mm256_lddqu_si256(__m256i const *__p) 3181{ 3182 return (__m256i)__builtin_ia32_lddqu256((char const *)__p); 3183} 3184 3185/* SIMD store ops */ 3186/// Stores double-precision floating point values from a 256-bit vector 3187/// of [4 x double] to a 32-byte aligned memory location pointed to by 3188/// \a __p. 3189/// 3190/// \headerfile <x86intrin.h> 3191/// 3192/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 3193/// 3194/// \param __p 3195/// A 32-byte aligned pointer to a memory location that will receive the 3196/// double-precision floaing point values. 3197/// \param __a 3198/// A 256-bit vector of [4 x double] containing the values to be moved. 3199static __inline void __DEFAULT_FN_ATTRS 3200_mm256_store_pd(double *__p, __m256d __a) 3201{ 3202 *(__m256d *)__p = __a; 3203} 3204 3205/// Stores single-precision floating point values from a 256-bit vector 3206/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p. 3207/// 3208/// \headerfile <x86intrin.h> 3209/// 3210/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 3211/// 3212/// \param __p 3213/// A 32-byte aligned pointer to a memory location that will receive the 3214/// float values. 3215/// \param __a 3216/// A 256-bit vector of [8 x float] containing the values to be moved. 
3217static __inline void __DEFAULT_FN_ATTRS 3218_mm256_store_ps(float *__p, __m256 __a) 3219{ 3220 *(__m256 *)__p = __a; 3221} 3222 3223/// Stores double-precision floating point values from a 256-bit vector 3224/// of [4 x double] to an unaligned memory location pointed to by \a __p. 3225/// 3226/// \headerfile <x86intrin.h> 3227/// 3228/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 3229/// 3230/// \param __p 3231/// A pointer to a memory location that will receive the double-precision 3232/// floating point values. 3233/// \param __a 3234/// A 256-bit vector of [4 x double] containing the values to be moved. 3235static __inline void __DEFAULT_FN_ATTRS 3236_mm256_storeu_pd(double *__p, __m256d __a) 3237{ 3238 struct __storeu_pd { 3239 __m256d_u __v; 3240 } __attribute__((__packed__, __may_alias__)); 3241 ((struct __storeu_pd*)__p)->__v = __a; 3242} 3243 3244/// Stores single-precision floating point values from a 256-bit vector 3245/// of [8 x float] to an unaligned memory location pointed to by \a __p. 3246/// 3247/// \headerfile <x86intrin.h> 3248/// 3249/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 3250/// 3251/// \param __p 3252/// A pointer to a memory location that will receive the float values. 3253/// \param __a 3254/// A 256-bit vector of [8 x float] containing the values to be moved. 3255static __inline void __DEFAULT_FN_ATTRS 3256_mm256_storeu_ps(float *__p, __m256 __a) 3257{ 3258 struct __storeu_ps { 3259 __m256_u __v; 3260 } __attribute__((__packed__, __may_alias__)); 3261 ((struct __storeu_ps*)__p)->__v = __a; 3262} 3263 3264/// Stores integer values from a 256-bit integer vector to a 32-byte 3265/// aligned memory location pointed to by \a __p. 3266/// 3267/// \headerfile <x86intrin.h> 3268/// 3269/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3270/// 3271/// \param __p 3272/// A 32-byte aligned pointer to a memory location that will receive the 3273/// integer values. 
3274/// \param __a 3275/// A 256-bit integer vector containing the values to be moved. 3276static __inline void __DEFAULT_FN_ATTRS 3277_mm256_store_si256(__m256i *__p, __m256i __a) 3278{ 3279 *__p = __a; 3280} 3281 3282/// Stores integer values from a 256-bit integer vector to an unaligned 3283/// memory location pointed to by \a __p. 3284/// 3285/// \headerfile <x86intrin.h> 3286/// 3287/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 3288/// 3289/// \param __p 3290/// A pointer to a memory location that will receive the integer values. 3291/// \param __a 3292/// A 256-bit integer vector containing the values to be moved. 3293static __inline void __DEFAULT_FN_ATTRS 3294_mm256_storeu_si256(__m256i_u *__p, __m256i __a) 3295{ 3296 struct __storeu_si256 { 3297 __m256i_u __v; 3298 } __attribute__((__packed__, __may_alias__)); 3299 ((struct __storeu_si256*)__p)->__v = __a; 3300} 3301 3302/* Conditional load ops */ 3303/// Conditionally loads double-precision floating point elements from a 3304/// memory location pointed to by \a __p into a 128-bit vector of 3305/// [2 x double], depending on the mask bits associated with each data 3306/// element. 3307/// 3308/// \headerfile <x86intrin.h> 3309/// 3310/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3311/// 3312/// \param __p 3313/// A pointer to a memory location that contains the double-precision 3314/// floating point values. 3315/// \param __m 3316/// A 128-bit integer vector containing the mask. The most significant bit of 3317/// each data element represents the mask bits. If a mask bit is zero, the 3318/// corresponding value in the memory location is not loaded and the 3319/// corresponding field in the return value is set to zero. 3320/// \returns A 128-bit vector of [2 x double] containing the loaded values. 
3321static __inline __m128d __DEFAULT_FN_ATTRS128 3322_mm_maskload_pd(double const *__p, __m128i __m) 3323{ 3324 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m); 3325} 3326 3327/// Conditionally loads double-precision floating point elements from a 3328/// memory location pointed to by \a __p into a 256-bit vector of 3329/// [4 x double], depending on the mask bits associated with each data 3330/// element. 3331/// 3332/// \headerfile <x86intrin.h> 3333/// 3334/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3335/// 3336/// \param __p 3337/// A pointer to a memory location that contains the double-precision 3338/// floating point values. 3339/// \param __m 3340/// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3341/// significant bit of each quadword element represents the mask bits. If a 3342/// mask bit is zero, the corresponding value in the memory location is not 3343/// loaded and the corresponding field in the return value is set to zero. 3344/// \returns A 256-bit vector of [4 x double] containing the loaded values. 3345static __inline __m256d __DEFAULT_FN_ATTRS 3346_mm256_maskload_pd(double const *__p, __m256i __m) 3347{ 3348 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p, 3349 (__v4di)__m); 3350} 3351 3352/// Conditionally loads single-precision floating point elements from a 3353/// memory location pointed to by \a __p into a 128-bit vector of 3354/// [4 x float], depending on the mask bits associated with each data 3355/// element. 3356/// 3357/// \headerfile <x86intrin.h> 3358/// 3359/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3360/// 3361/// \param __p 3362/// A pointer to a memory location that contains the single-precision 3363/// floating point values. 3364/// \param __m 3365/// A 128-bit integer vector containing the mask. The most significant bit of 3366/// each data element represents the mask bits. 
If a mask bit is zero, the 3367/// corresponding value in the memory location is not loaded and the 3368/// corresponding field in the return value is set to zero. 3369/// \returns A 128-bit vector of [4 x float] containing the loaded values. 3370static __inline __m128 __DEFAULT_FN_ATTRS128 3371_mm_maskload_ps(float const *__p, __m128i __m) 3372{ 3373 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m); 3374} 3375 3376/// Conditionally loads single-precision floating point elements from a 3377/// memory location pointed to by \a __p into a 256-bit vector of 3378/// [8 x float], depending on the mask bits associated with each data 3379/// element. 3380/// 3381/// \headerfile <x86intrin.h> 3382/// 3383/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3384/// 3385/// \param __p 3386/// A pointer to a memory location that contains the single-precision 3387/// floating point values. 3388/// \param __m 3389/// A 256-bit integer vector of [8 x dword] containing the mask. The most 3390/// significant bit of each dword element represents the mask bits. If a mask 3391/// bit is zero, the corresponding value in the memory location is not loaded 3392/// and the corresponding field in the return value is set to zero. 3393/// \returns A 256-bit vector of [8 x float] containing the loaded values. 3394static __inline __m256 __DEFAULT_FN_ATTRS 3395_mm256_maskload_ps(float const *__p, __m256i __m) 3396{ 3397 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m); 3398} 3399 3400/* Conditional store ops */ 3401/// Moves single-precision floating point values from a 256-bit vector 3402/// of [8 x float] to a memory location pointed to by \a __p, according to 3403/// the specified mask. 3404/// 3405/// \headerfile <x86intrin.h> 3406/// 3407/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3408/// 3409/// \param __p 3410/// A pointer to a memory location that will receive the float values. 
3411/// \param __m 3412/// A 256-bit integer vector of [8 x dword] containing the mask. The most 3413/// significant bit of each dword element in the mask vector represents the 3414/// mask bits. If a mask bit is zero, the corresponding value from vector 3415/// \a __a is not stored and the corresponding field in the memory location 3416/// pointed to by \a __p is not changed. 3417/// \param __a 3418/// A 256-bit vector of [8 x float] containing the values to be stored. 3419static __inline void __DEFAULT_FN_ATTRS 3420_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a) 3421{ 3422 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a); 3423} 3424 3425/// Moves double-precision values from a 128-bit vector of [2 x double] 3426/// to a memory location pointed to by \a __p, according to the specified 3427/// mask. 3428/// 3429/// \headerfile <x86intrin.h> 3430/// 3431/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3432/// 3433/// \param __p 3434/// A pointer to a memory location that will receive the float values. 3435/// \param __m 3436/// A 128-bit integer vector containing the mask. The most significant bit of 3437/// each field in the mask vector represents the mask bits. If a mask bit is 3438/// zero, the corresponding value from vector \a __a is not stored and the 3439/// corresponding field in the memory location pointed to by \a __p is not 3440/// changed. 3441/// \param __a 3442/// A 128-bit vector of [2 x double] containing the values to be stored. 3443static __inline void __DEFAULT_FN_ATTRS128 3444_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a) 3445{ 3446 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a); 3447} 3448 3449/// Moves double-precision values from a 256-bit vector of [4 x double] 3450/// to a memory location pointed to by \a __p, according to the specified 3451/// mask. 
3452/// 3453/// \headerfile <x86intrin.h> 3454/// 3455/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3456/// 3457/// \param __p 3458/// A pointer to a memory location that will receive the float values. 3459/// \param __m 3460/// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3461/// significant bit of each quadword element in the mask vector represents 3462/// the mask bits. If a mask bit is zero, the corresponding value from vector 3463/// __a is not stored and the corresponding field in the memory location 3464/// pointed to by \a __p is not changed. 3465/// \param __a 3466/// A 256-bit vector of [4 x double] containing the values to be stored. 3467static __inline void __DEFAULT_FN_ATTRS 3468_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a) 3469{ 3470 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a); 3471} 3472 3473/// Moves single-precision floating point values from a 128-bit vector 3474/// of [4 x float] to a memory location pointed to by \a __p, according to 3475/// the specified mask. 3476/// 3477/// \headerfile <x86intrin.h> 3478/// 3479/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3480/// 3481/// \param __p 3482/// A pointer to a memory location that will receive the float values. 3483/// \param __m 3484/// A 128-bit integer vector containing the mask. The most significant bit of 3485/// each field in the mask vector represents the mask bits. If a mask bit is 3486/// zero, the corresponding value from vector __a is not stored and the 3487/// corresponding field in the memory location pointed to by \a __p is not 3488/// changed. 3489/// \param __a 3490/// A 128-bit vector of [4 x float] containing the values to be stored. 
3491static __inline void __DEFAULT_FN_ATTRS128 3492_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) 3493{ 3494 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a); 3495} 3496 3497/* Cacheability support ops */ 3498/// Moves integer data from a 256-bit integer vector to a 32-byte 3499/// aligned memory location. To minimize caching, the data is flagged as 3500/// non-temporal (unlikely to be used again soon). 3501/// 3502/// \headerfile <x86intrin.h> 3503/// 3504/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction. 3505/// 3506/// \param __a 3507/// A pointer to a 32-byte aligned memory location that will receive the 3508/// integer values. 3509/// \param __b 3510/// A 256-bit integer vector containing the values to be moved. 3511static __inline void __DEFAULT_FN_ATTRS 3512_mm256_stream_si256(__m256i *__a, __m256i __b) 3513{ 3514 typedef __v4di __v4di_aligned __attribute__((aligned(32))); 3515 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a); 3516} 3517 3518/// Moves double-precision values from a 256-bit vector of [4 x double] 3519/// to a 32-byte aligned memory location. To minimize caching, the data is 3520/// flagged as non-temporal (unlikely to be used again soon). 3521/// 3522/// \headerfile <x86intrin.h> 3523/// 3524/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction. 3525/// 3526/// \param __a 3527/// A pointer to a 32-byte aligned memory location that will receive the 3528/// double-precision floating-point values. 3529/// \param __b 3530/// A 256-bit vector of [4 x double] containing the values to be moved. 
3531static __inline void __DEFAULT_FN_ATTRS 3532_mm256_stream_pd(double *__a, __m256d __b) 3533{ 3534 typedef __v4df __v4df_aligned __attribute__((aligned(32))); 3535 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a); 3536} 3537 3538/// Moves single-precision floating point values from a 256-bit vector 3539/// of [8 x float] to a 32-byte aligned memory location. To minimize 3540/// caching, the data is flagged as non-temporal (unlikely to be used again 3541/// soon). 3542/// 3543/// \headerfile <x86intrin.h> 3544/// 3545/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction. 3546/// 3547/// \param __p 3548/// A pointer to a 32-byte aligned memory location that will receive the 3549/// single-precision floating point values. 3550/// \param __a 3551/// A 256-bit vector of [8 x float] containing the values to be moved. 3552static __inline void __DEFAULT_FN_ATTRS 3553_mm256_stream_ps(float *__p, __m256 __a) 3554{ 3555 typedef __v8sf __v8sf_aligned __attribute__((aligned(32))); 3556 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p); 3557} 3558 3559/* Create vectors */ 3560/// Create a 256-bit vector of [4 x double] with undefined values. 3561/// 3562/// \headerfile <x86intrin.h> 3563/// 3564/// This intrinsic has no corresponding instruction. 3565/// 3566/// \returns A 256-bit vector of [4 x double] containing undefined values. 3567static __inline__ __m256d __DEFAULT_FN_ATTRS 3568_mm256_undefined_pd(void) 3569{ 3570 return (__m256d)__builtin_ia32_undef256(); 3571} 3572 3573/// Create a 256-bit vector of [8 x float] with undefined values. 3574/// 3575/// \headerfile <x86intrin.h> 3576/// 3577/// This intrinsic has no corresponding instruction. 3578/// 3579/// \returns A 256-bit vector of [8 x float] containing undefined values. 
3580static __inline__ __m256 __DEFAULT_FN_ATTRS 3581_mm256_undefined_ps(void) 3582{ 3583 return (__m256)__builtin_ia32_undef256(); 3584} 3585 3586/// Create a 256-bit integer vector with undefined values. 3587/// 3588/// \headerfile <x86intrin.h> 3589/// 3590/// This intrinsic has no corresponding instruction. 3591/// 3592/// \returns A 256-bit integer vector containing undefined values. 3593static __inline__ __m256i __DEFAULT_FN_ATTRS 3594_mm256_undefined_si256(void) 3595{ 3596 return (__m256i)__builtin_ia32_undef256(); 3597} 3598 3599/// Constructs a 256-bit floating-point vector of [4 x double] 3600/// initialized with the specified double-precision floating-point values. 3601/// 3602/// \headerfile <x86intrin.h> 3603/// 3604/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3605/// instruction. 3606/// 3607/// \param __a 3608/// A double-precision floating-point value used to initialize bits [255:192] 3609/// of the result. 3610/// \param __b 3611/// A double-precision floating-point value used to initialize bits [191:128] 3612/// of the result. 3613/// \param __c 3614/// A double-precision floating-point value used to initialize bits [127:64] 3615/// of the result. 3616/// \param __d 3617/// A double-precision floating-point value used to initialize bits [63:0] 3618/// of the result. 3619/// \returns An initialized 256-bit floating-point vector of [4 x double]. 3620static __inline __m256d __DEFAULT_FN_ATTRS 3621_mm256_set_pd(double __a, double __b, double __c, double __d) 3622{ 3623 return __extension__ (__m256d){ __d, __c, __b, __a }; 3624} 3625 3626/// Constructs a 256-bit floating-point vector of [8 x float] initialized 3627/// with the specified single-precision floating-point values. 3628/// 3629/// \headerfile <x86intrin.h> 3630/// 3631/// This intrinsic is a utility function and does not correspond to a specific 3632/// instruction. 
3633/// 3634/// \param __a 3635/// A single-precision floating-point value used to initialize bits [255:224] 3636/// of the result. 3637/// \param __b 3638/// A single-precision floating-point value used to initialize bits [223:192] 3639/// of the result. 3640/// \param __c 3641/// A single-precision floating-point value used to initialize bits [191:160] 3642/// of the result. 3643/// \param __d 3644/// A single-precision floating-point value used to initialize bits [159:128] 3645/// of the result. 3646/// \param __e 3647/// A single-precision floating-point value used to initialize bits [127:96] 3648/// of the result. 3649/// \param __f 3650/// A single-precision floating-point value used to initialize bits [95:64] 3651/// of the result. 3652/// \param __g 3653/// A single-precision floating-point value used to initialize bits [63:32] 3654/// of the result. 3655/// \param __h 3656/// A single-precision floating-point value used to initialize bits [31:0] 3657/// of the result. 3658/// \returns An initialized 256-bit floating-point vector of [8 x float]. 3659static __inline __m256 __DEFAULT_FN_ATTRS 3660_mm256_set_ps(float __a, float __b, float __c, float __d, 3661 float __e, float __f, float __g, float __h) 3662{ 3663 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; 3664} 3665 3666/// Constructs a 256-bit integer vector initialized with the specified 3667/// 32-bit integral values. 3668/// 3669/// \headerfile <x86intrin.h> 3670/// 3671/// This intrinsic is a utility function and does not correspond to a specific 3672/// instruction. 3673/// 3674/// \param __i0 3675/// A 32-bit integral value used to initialize bits [255:224] of the result. 3676/// \param __i1 3677/// A 32-bit integral value used to initialize bits [223:192] of the result. 3678/// \param __i2 3679/// A 32-bit integral value used to initialize bits [191:160] of the result. 3680/// \param __i3 3681/// A 32-bit integral value used to initialize bits [159:128] of the result. 
3682/// \param __i4 3683/// A 32-bit integral value used to initialize bits [127:96] of the result. 3684/// \param __i5 3685/// A 32-bit integral value used to initialize bits [95:64] of the result. 3686/// \param __i6 3687/// A 32-bit integral value used to initialize bits [63:32] of the result. 3688/// \param __i7 3689/// A 32-bit integral value used to initialize bits [31:0] of the result. 3690/// \returns An initialized 256-bit integer vector. 3691static __inline __m256i __DEFAULT_FN_ATTRS 3692_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, 3693 int __i4, int __i5, int __i6, int __i7) 3694{ 3695 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; 3696} 3697 3698/// Constructs a 256-bit integer vector initialized with the specified 3699/// 16-bit integral values. 3700/// 3701/// \headerfile <x86intrin.h> 3702/// 3703/// This intrinsic is a utility function and does not correspond to a specific 3704/// instruction. 3705/// 3706/// \param __w15 3707/// A 16-bit integral value used to initialize bits [255:240] of the result. 3708/// \param __w14 3709/// A 16-bit integral value used to initialize bits [239:224] of the result. 3710/// \param __w13 3711/// A 16-bit integral value used to initialize bits [223:208] of the result. 3712/// \param __w12 3713/// A 16-bit integral value used to initialize bits [207:192] of the result. 3714/// \param __w11 3715/// A 16-bit integral value used to initialize bits [191:176] of the result. 3716/// \param __w10 3717/// A 16-bit integral value used to initialize bits [175:160] of the result. 3718/// \param __w09 3719/// A 16-bit integral value used to initialize bits [159:144] of the result. 3720/// \param __w08 3721/// A 16-bit integral value used to initialize bits [143:128] of the result. 3722/// \param __w07 3723/// A 16-bit integral value used to initialize bits [127:112] of the result. 
3724/// \param __w06 3725/// A 16-bit integral value used to initialize bits [111:96] of the result. 3726/// \param __w05 3727/// A 16-bit integral value used to initialize bits [95:80] of the result. 3728/// \param __w04 3729/// A 16-bit integral value used to initialize bits [79:64] of the result. 3730/// \param __w03 3731/// A 16-bit integral value used to initialize bits [63:48] of the result. 3732/// \param __w02 3733/// A 16-bit integral value used to initialize bits [47:32] of the result. 3734/// \param __w01 3735/// A 16-bit integral value used to initialize bits [31:16] of the result. 3736/// \param __w00 3737/// A 16-bit integral value used to initialize bits [15:0] of the result. 3738/// \returns An initialized 256-bit integer vector. 3739static __inline __m256i __DEFAULT_FN_ATTRS 3740_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, 3741 short __w11, short __w10, short __w09, short __w08, 3742 short __w07, short __w06, short __w05, short __w04, 3743 short __w03, short __w02, short __w01, short __w00) 3744{ 3745 return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06, 3746 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 }; 3747} 3748 3749/// Constructs a 256-bit integer vector initialized with the specified 3750/// 8-bit integral values. 3751/// 3752/// \headerfile <x86intrin.h> 3753/// 3754/// This intrinsic is a utility function and does not correspond to a specific 3755/// instruction. 3756/// 3757/// \param __b31 3758/// An 8-bit integral value used to initialize bits [255:248] of the result. 3759/// \param __b30 3760/// An 8-bit integral value used to initialize bits [247:240] of the result. 3761/// \param __b29 3762/// An 8-bit integral value used to initialize bits [239:232] of the result. 3763/// \param __b28 3764/// An 8-bit integral value used to initialize bits [231:224] of the result. 
/// \param __b27
///    An 8-bit integral value used to initialize bits [223:216] of the result.
/// \param __b26
///    An 8-bit integral value used to initialize bits [215:208] of the result.
/// \param __b25
///    An 8-bit integral value used to initialize bits [207:200] of the result.
/// \param __b24
///    An 8-bit integral value used to initialize bits [199:192] of the result.
/// \param __b23
///    An 8-bit integral value used to initialize bits [191:184] of the result.
/// \param __b22
///    An 8-bit integral value used to initialize bits [183:176] of the result.
/// \param __b21
///    An 8-bit integral value used to initialize bits [175:168] of the result.
/// \param __b20
///    An 8-bit integral value used to initialize bits [167:160] of the result.
/// \param __b19
///    An 8-bit integral value used to initialize bits [159:152] of the result.
/// \param __b18
///    An 8-bit integral value used to initialize bits [151:144] of the result.
/// \param __b17
///    An 8-bit integral value used to initialize bits [143:136] of the result.
/// \param __b16
///    An 8-bit integral value used to initialize bits [135:128] of the result.
/// \param __b15
///    An 8-bit integral value used to initialize bits [127:120] of the result.
/// \param __b14
///    An 8-bit integral value used to initialize bits [119:112] of the result.
/// \param __b13
///    An 8-bit integral value used to initialize bits [111:104] of the result.
/// \param __b12
///    An 8-bit integral value used to initialize bits [103:96] of the result.
/// \param __b11
///    An 8-bit integral value used to initialize bits [95:88] of the result.
/// \param __b10
///    An 8-bit integral value used to initialize bits [87:80] of the result.
/// \param __b09
///    An 8-bit integral value used to initialize bits [79:72] of the result.
3803/// \param __b08 3804/// An 8-bit integral value used to initialize bits [71:64] of the result. 3805/// \param __b07 3806/// An 8-bit integral value used to initialize bits [63:56] of the result. 3807/// \param __b06 3808/// An 8-bit integral value used to initialize bits [55:48] of the result. 3809/// \param __b05 3810/// An 8-bit integral value used to initialize bits [47:40] of the result. 3811/// \param __b04 3812/// An 8-bit integral value used to initialize bits [39:32] of the result. 3813/// \param __b03 3814/// An 8-bit integral value used to initialize bits [31:24] of the result. 3815/// \param __b02 3816/// An 8-bit integral value used to initialize bits [23:16] of the result. 3817/// \param __b01 3818/// An 8-bit integral value used to initialize bits [15:8] of the result. 3819/// \param __b00 3820/// An 8-bit integral value used to initialize bits [7:0] of the result. 3821/// \returns An initialized 256-bit integer vector. 3822static __inline __m256i __DEFAULT_FN_ATTRS 3823_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, 3824 char __b27, char __b26, char __b25, char __b24, 3825 char __b23, char __b22, char __b21, char __b20, 3826 char __b19, char __b18, char __b17, char __b16, 3827 char __b15, char __b14, char __b13, char __b12, 3828 char __b11, char __b10, char __b09, char __b08, 3829 char __b07, char __b06, char __b05, char __b04, 3830 char __b03, char __b02, char __b01, char __b00) 3831{ 3832 return __extension__ (__m256i)(__v32qi){ 3833 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, 3834 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, 3835 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, 3836 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31 3837 }; 3838} 3839 3840/// Constructs a 256-bit integer vector initialized with the specified 3841/// 64-bit integral values. 
3842/// 3843/// \headerfile <x86intrin.h> 3844/// 3845/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> 3846/// instruction. 3847/// 3848/// \param __a 3849/// A 64-bit integral value used to initialize bits [255:192] of the result. 3850/// \param __b 3851/// A 64-bit integral value used to initialize bits [191:128] of the result. 3852/// \param __c 3853/// A 64-bit integral value used to initialize bits [127:64] of the result. 3854/// \param __d 3855/// A 64-bit integral value used to initialize bits [63:0] of the result. 3856/// \returns An initialized 256-bit integer vector. 3857static __inline __m256i __DEFAULT_FN_ATTRS 3858_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) 3859{ 3860 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a }; 3861} 3862 3863/* Create vectors with elements in reverse order */ 3864/// Constructs a 256-bit floating-point vector of [4 x double], 3865/// initialized in reverse order with the specified double-precision 3866/// floating-point values. 3867/// 3868/// \headerfile <x86intrin.h> 3869/// 3870/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3871/// instruction. 3872/// 3873/// \param __a 3874/// A double-precision floating-point value used to initialize bits [63:0] 3875/// of the result. 3876/// \param __b 3877/// A double-precision floating-point value used to initialize bits [127:64] 3878/// of the result. 3879/// \param __c 3880/// A double-precision floating-point value used to initialize bits [191:128] 3881/// of the result. 3882/// \param __d 3883/// A double-precision floating-point value used to initialize bits [255:192] 3884/// of the result. 3885/// \returns An initialized 256-bit floating-point vector of [4 x double]. 
3886static __inline __m256d __DEFAULT_FN_ATTRS 3887_mm256_setr_pd(double __a, double __b, double __c, double __d) 3888{ 3889 return _mm256_set_pd(__d, __c, __b, __a); 3890} 3891 3892/// Constructs a 256-bit floating-point vector of [8 x float], 3893/// initialized in reverse order with the specified single-precision 3894/// float-point values. 3895/// 3896/// \headerfile <x86intrin.h> 3897/// 3898/// This intrinsic is a utility function and does not correspond to a specific 3899/// instruction. 3900/// 3901/// \param __a 3902/// A single-precision floating-point value used to initialize bits [31:0] 3903/// of the result. 3904/// \param __b 3905/// A single-precision floating-point value used to initialize bits [63:32] 3906/// of the result. 3907/// \param __c 3908/// A single-precision floating-point value used to initialize bits [95:64] 3909/// of the result. 3910/// \param __d 3911/// A single-precision floating-point value used to initialize bits [127:96] 3912/// of the result. 3913/// \param __e 3914/// A single-precision floating-point value used to initialize bits [159:128] 3915/// of the result. 3916/// \param __f 3917/// A single-precision floating-point value used to initialize bits [191:160] 3918/// of the result. 3919/// \param __g 3920/// A single-precision floating-point value used to initialize bits [223:192] 3921/// of the result. 3922/// \param __h 3923/// A single-precision floating-point value used to initialize bits [255:224] 3924/// of the result. 3925/// \returns An initialized 256-bit floating-point vector of [8 x float]. 3926static __inline __m256 __DEFAULT_FN_ATTRS 3927_mm256_setr_ps(float __a, float __b, float __c, float __d, 3928 float __e, float __f, float __g, float __h) 3929{ 3930 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a); 3931} 3932 3933/// Constructs a 256-bit integer vector, initialized in reverse order 3934/// with the specified 32-bit integral values. 
3935/// 3936/// \headerfile <x86intrin.h> 3937/// 3938/// This intrinsic is a utility function and does not correspond to a specific 3939/// instruction. 3940/// 3941/// \param __i0 3942/// A 32-bit integral value used to initialize bits [31:0] of the result. 3943/// \param __i1 3944/// A 32-bit integral value used to initialize bits [63:32] of the result. 3945/// \param __i2 3946/// A 32-bit integral value used to initialize bits [95:64] of the result. 3947/// \param __i3 3948/// A 32-bit integral value used to initialize bits [127:96] of the result. 3949/// \param __i4 3950/// A 32-bit integral value used to initialize bits [159:128] of the result. 3951/// \param __i5 3952/// A 32-bit integral value used to initialize bits [191:160] of the result. 3953/// \param __i6 3954/// A 32-bit integral value used to initialize bits [223:192] of the result. 3955/// \param __i7 3956/// A 32-bit integral value used to initialize bits [255:224] of the result. 3957/// \returns An initialized 256-bit integer vector. 3958static __inline __m256i __DEFAULT_FN_ATTRS 3959_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, 3960 int __i4, int __i5, int __i6, int __i7) 3961{ 3962 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0); 3963} 3964 3965/// Constructs a 256-bit integer vector, initialized in reverse order 3966/// with the specified 16-bit integral values. 3967/// 3968/// \headerfile <x86intrin.h> 3969/// 3970/// This intrinsic is a utility function and does not correspond to a specific 3971/// instruction. 3972/// 3973/// \param __w15 3974/// A 16-bit integral value used to initialize bits [15:0] of the result. 3975/// \param __w14 3976/// A 16-bit integral value used to initialize bits [31:16] of the result. 3977/// \param __w13 3978/// A 16-bit integral value used to initialize bits [47:32] of the result. 3979/// \param __w12 3980/// A 16-bit integral value used to initialize bits [63:48] of the result. 
3981/// \param __w11 3982/// A 16-bit integral value used to initialize bits [79:64] of the result. 3983/// \param __w10 3984/// A 16-bit integral value used to initialize bits [95:80] of the result. 3985/// \param __w09 3986/// A 16-bit integral value used to initialize bits [111:96] of the result. 3987/// \param __w08 3988/// A 16-bit integral value used to initialize bits [127:112] of the result. 3989/// \param __w07 3990/// A 16-bit integral value used to initialize bits [143:128] of the result. 3991/// \param __w06 3992/// A 16-bit integral value used to initialize bits [159:144] of the result. 3993/// \param __w05 3994/// A 16-bit integral value used to initialize bits [175:160] of the result. 3995/// \param __w04 3996/// A 16-bit integral value used to initialize bits [191:176] of the result. 3997/// \param __w03 3998/// A 16-bit integral value used to initialize bits [207:192] of the result. 3999/// \param __w02 4000/// A 16-bit integral value used to initialize bits [223:208] of the result. 4001/// \param __w01 4002/// A 16-bit integral value used to initialize bits [239:224] of the result. 4003/// \param __w00 4004/// A 16-bit integral value used to initialize bits [255:240] of the result. 4005/// \returns An initialized 256-bit integer vector. 4006static __inline __m256i __DEFAULT_FN_ATTRS 4007_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, 4008 short __w11, short __w10, short __w09, short __w08, 4009 short __w07, short __w06, short __w05, short __w04, 4010 short __w03, short __w02, short __w01, short __w00) 4011{ 4012 return _mm256_set_epi16(__w00, __w01, __w02, __w03, 4013 __w04, __w05, __w06, __w07, 4014 __w08, __w09, __w10, __w11, 4015 __w12, __w13, __w14, __w15); 4016} 4017 4018/// Constructs a 256-bit integer vector, initialized in reverse order 4019/// with the specified 8-bit integral values. 
4020/// 4021/// \headerfile <x86intrin.h> 4022/// 4023/// This intrinsic is a utility function and does not correspond to a specific 4024/// instruction. 4025/// 4026/// \param __b31 4027/// An 8-bit integral value used to initialize bits [7:0] of the result. 4028/// \param __b30 4029/// An 8-bit integral value used to initialize bits [15:8] of the result. 4030/// \param __b29 4031/// An 8-bit integral value used to initialize bits [23:16] of the result. 4032/// \param __b28 4033/// An 8-bit integral value used to initialize bits [31:24] of the result. 4034/// \param __b27 4035/// An 8-bit integral value used to initialize bits [39:32] of the result. 4036/// \param __b26 4037/// An 8-bit integral value used to initialize bits [47:40] of the result. 4038/// \param __b25 4039/// An 8-bit integral value used to initialize bits [55:48] of the result. 4040/// \param __b24 4041/// An 8-bit integral value used to initialize bits [63:56] of the result. 4042/// \param __b23 4043/// An 8-bit integral value used to initialize bits [71:64] of the result. 4044/// \param __b22 4045/// An 8-bit integral value used to initialize bits [79:72] of the result. 4046/// \param __b21 4047/// An 8-bit integral value used to initialize bits [87:80] of the result. 4048/// \param __b20 4049/// An 8-bit integral value used to initialize bits [95:88] of the result. 4050/// \param __b19 4051/// An 8-bit integral value used to initialize bits [103:96] of the result. 4052/// \param __b18 4053/// An 8-bit integral value used to initialize bits [111:104] of the result. 4054/// \param __b17 4055/// An 8-bit integral value used to initialize bits [119:112] of the result. 4056/// \param __b16 4057/// An 8-bit integral value used to initialize bits [127:120] of the result. 4058/// \param __b15 4059/// An 8-bit integral value used to initialize bits [135:128] of the result. 4060/// \param __b14 4061/// An 8-bit integral value used to initialize bits [143:136] of the result. 
4062/// \param __b13 4063/// An 8-bit integral value used to initialize bits [151:144] of the result. 4064/// \param __b12 4065/// An 8-bit integral value used to initialize bits [159:152] of the result. 4066/// \param __b11 4067/// An 8-bit integral value used to initialize bits [167:160] of the result. 4068/// \param __b10 4069/// An 8-bit integral value used to initialize bits [175:168] of the result. 4070/// \param __b09 4071/// An 8-bit integral value used to initialize bits [183:176] of the result. 4072/// \param __b08 4073/// An 8-bit integral value used to initialize bits [191:184] of the result. 4074/// \param __b07 4075/// An 8-bit integral value used to initialize bits [199:192] of the result. 4076/// \param __b06 4077/// An 8-bit integral value used to initialize bits [207:200] of the result. 4078/// \param __b05 4079/// An 8-bit integral value used to initialize bits [215:208] of the result. 4080/// \param __b04 4081/// An 8-bit integral value used to initialize bits [223:216] of the result. 4082/// \param __b03 4083/// An 8-bit integral value used to initialize bits [231:224] of the result. 4084/// \param __b02 4085/// An 8-bit integral value used to initialize bits [239:232] of the result. 4086/// \param __b01 4087/// An 8-bit integral value used to initialize bits [247:240] of the result. 4088/// \param __b00 4089/// An 8-bit integral value used to initialize bits [255:248] of the result. 4090/// \returns An initialized 256-bit integer vector. 
4091static __inline __m256i __DEFAULT_FN_ATTRS 4092_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, 4093 char __b27, char __b26, char __b25, char __b24, 4094 char __b23, char __b22, char __b21, char __b20, 4095 char __b19, char __b18, char __b17, char __b16, 4096 char __b15, char __b14, char __b13, char __b12, 4097 char __b11, char __b10, char __b09, char __b08, 4098 char __b07, char __b06, char __b05, char __b04, 4099 char __b03, char __b02, char __b01, char __b00) 4100{ 4101 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, 4102 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, 4103 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, 4104 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31); 4105} 4106 4107/// Constructs a 256-bit integer vector, initialized in reverse order 4108/// with the specified 64-bit integral values. 4109/// 4110/// \headerfile <x86intrin.h> 4111/// 4112/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> 4113/// instruction. 4114/// 4115/// \param __a 4116/// A 64-bit integral value used to initialize bits [63:0] of the result. 4117/// \param __b 4118/// A 64-bit integral value used to initialize bits [127:64] of the result. 4119/// \param __c 4120/// A 64-bit integral value used to initialize bits [191:128] of the result. 4121/// \param __d 4122/// A 64-bit integral value used to initialize bits [255:192] of the result. 4123/// \returns An initialized 256-bit integer vector. 4124static __inline __m256i __DEFAULT_FN_ATTRS 4125_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) 4126{ 4127 return _mm256_set_epi64x(__d, __c, __b, __a); 4128} 4129 4130/* Create vectors with repeated elements */ 4131/// Constructs a 256-bit floating-point vector of [4 x double], with each 4132/// of the four double-precision floating-point vector elements set to the 4133/// specified double-precision floating-point value. 
4134/// 4135/// \headerfile <x86intrin.h> 4136/// 4137/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 4138/// 4139/// \param __w 4140/// A double-precision floating-point value used to initialize each vector 4141/// element of the result. 4142/// \returns An initialized 256-bit floating-point vector of [4 x double]. 4143static __inline __m256d __DEFAULT_FN_ATTRS 4144_mm256_set1_pd(double __w) 4145{ 4146 return _mm256_set_pd(__w, __w, __w, __w); 4147} 4148 4149/// Constructs a 256-bit floating-point vector of [8 x float], with each 4150/// of the eight single-precision floating-point vector elements set to the 4151/// specified single-precision floating-point value. 4152/// 4153/// \headerfile <x86intrin.h> 4154/// 4155/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4156/// instruction. 4157/// 4158/// \param __w 4159/// A single-precision floating-point value used to initialize each vector 4160/// element of the result. 4161/// \returns An initialized 256-bit floating-point vector of [8 x float]. 4162static __inline __m256 __DEFAULT_FN_ATTRS 4163_mm256_set1_ps(float __w) 4164{ 4165 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w); 4166} 4167 4168/// Constructs a 256-bit integer vector of [8 x i32], with each of the 4169/// 32-bit integral vector elements set to the specified 32-bit integral 4170/// value. 4171/// 4172/// \headerfile <x86intrin.h> 4173/// 4174/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4175/// instruction. 4176/// 4177/// \param __i 4178/// A 32-bit integral value used to initialize each vector element of the 4179/// result. 4180/// \returns An initialized 256-bit integer vector of [8 x i32]. 
4181static __inline __m256i __DEFAULT_FN_ATTRS 4182_mm256_set1_epi32(int __i) 4183{ 4184 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i); 4185} 4186 4187/// Constructs a 256-bit integer vector of [16 x i16], with each of the 4188/// 16-bit integral vector elements set to the specified 16-bit integral 4189/// value. 4190/// 4191/// \headerfile <x86intrin.h> 4192/// 4193/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4194/// 4195/// \param __w 4196/// A 16-bit integral value used to initialize each vector element of the 4197/// result. 4198/// \returns An initialized 256-bit integer vector of [16 x i16]. 4199static __inline __m256i __DEFAULT_FN_ATTRS 4200_mm256_set1_epi16(short __w) 4201{ 4202 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w, 4203 __w, __w, __w, __w, __w, __w, __w, __w); 4204} 4205 4206/// Constructs a 256-bit integer vector of [32 x i8], with each of the 4207/// 8-bit integral vector elements set to the specified 8-bit integral value. 4208/// 4209/// \headerfile <x86intrin.h> 4210/// 4211/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4212/// 4213/// \param __b 4214/// An 8-bit integral value used to initialize each vector element of the 4215/// result. 4216/// \returns An initialized 256-bit integer vector of [32 x i8]. 4217static __inline __m256i __DEFAULT_FN_ATTRS 4218_mm256_set1_epi8(char __b) 4219{ 4220 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, 4221 __b, __b, __b, __b, __b, __b, __b, __b, 4222 __b, __b, __b, __b, __b, __b, __b, __b, 4223 __b, __b, __b, __b, __b, __b, __b, __b); 4224} 4225 4226/// Constructs a 256-bit integer vector of [4 x i64], with each of the 4227/// 64-bit integral vector elements set to the specified 64-bit integral 4228/// value. 4229/// 4230/// \headerfile <x86intrin.h> 4231/// 4232/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 
4233/// 4234/// \param __q 4235/// A 64-bit integral value used to initialize each vector element of the 4236/// result. 4237/// \returns An initialized 256-bit integer vector of [4 x i64]. 4238static __inline __m256i __DEFAULT_FN_ATTRS 4239_mm256_set1_epi64x(long long __q) 4240{ 4241 return _mm256_set_epi64x(__q, __q, __q, __q); 4242} 4243 4244/* Create __zeroed vectors */ 4245/// Constructs a 256-bit floating-point vector of [4 x double] with all 4246/// vector elements initialized to zero. 4247/// 4248/// \headerfile <x86intrin.h> 4249/// 4250/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4251/// 4252/// \returns A 256-bit vector of [4 x double] with all elements set to zero. 4253static __inline __m256d __DEFAULT_FN_ATTRS 4254_mm256_setzero_pd(void) 4255{ 4256 return __extension__ (__m256d){ 0, 0, 0, 0 }; 4257} 4258 4259/// Constructs a 256-bit floating-point vector of [8 x float] with all 4260/// vector elements initialized to zero. 4261/// 4262/// \headerfile <x86intrin.h> 4263/// 4264/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4265/// 4266/// \returns A 256-bit vector of [8 x float] with all elements set to zero. 4267static __inline __m256 __DEFAULT_FN_ATTRS 4268_mm256_setzero_ps(void) 4269{ 4270 return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; 4271} 4272 4273/// Constructs a 256-bit integer vector initialized to zero. 4274/// 4275/// \headerfile <x86intrin.h> 4276/// 4277/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4278/// 4279/// \returns A 256-bit integer vector initialized to zero. 4280static __inline __m256i __DEFAULT_FN_ATTRS 4281_mm256_setzero_si256(void) 4282{ 4283 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 }; 4284} 4285 4286/* Cast between vector types */ 4287/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit 4288/// floating-point vector of [8 x float]. 
4289/// 4290/// \headerfile <x86intrin.h> 4291/// 4292/// This intrinsic has no corresponding instruction. 4293/// 4294/// \param __a 4295/// A 256-bit floating-point vector of [4 x double]. 4296/// \returns A 256-bit floating-point vector of [8 x float] containing the same 4297/// bitwise pattern as the parameter. 4298static __inline __m256 __DEFAULT_FN_ATTRS 4299_mm256_castpd_ps(__m256d __a) 4300{ 4301 return (__m256)__a; 4302} 4303 4304/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit 4305/// integer vector. 4306/// 4307/// \headerfile <x86intrin.h> 4308/// 4309/// This intrinsic has no corresponding instruction. 4310/// 4311/// \param __a 4312/// A 256-bit floating-point vector of [4 x double]. 4313/// \returns A 256-bit integer vector containing the same bitwise pattern as the 4314/// parameter. 4315static __inline __m256i __DEFAULT_FN_ATTRS 4316_mm256_castpd_si256(__m256d __a) 4317{ 4318 return (__m256i)__a; 4319} 4320 4321/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit 4322/// floating-point vector of [4 x double]. 4323/// 4324/// \headerfile <x86intrin.h> 4325/// 4326/// This intrinsic has no corresponding instruction. 4327/// 4328/// \param __a 4329/// A 256-bit floating-point vector of [8 x float]. 4330/// \returns A 256-bit floating-point vector of [4 x double] containing the same 4331/// bitwise pattern as the parameter. 4332static __inline __m256d __DEFAULT_FN_ATTRS 4333_mm256_castps_pd(__m256 __a) 4334{ 4335 return (__m256d)__a; 4336} 4337 4338/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit 4339/// integer vector. 4340/// 4341/// \headerfile <x86intrin.h> 4342/// 4343/// This intrinsic has no corresponding instruction. 4344/// 4345/// \param __a 4346/// A 256-bit floating-point vector of [8 x float]. 4347/// \returns A 256-bit integer vector containing the same bitwise pattern as the 4348/// parameter. 
4349static __inline __m256i __DEFAULT_FN_ATTRS 4350_mm256_castps_si256(__m256 __a) 4351{ 4352 return (__m256i)__a; 4353} 4354 4355/// Casts a 256-bit integer vector into a 256-bit floating-point vector 4356/// of [8 x float]. 4357/// 4358/// \headerfile <x86intrin.h> 4359/// 4360/// This intrinsic has no corresponding instruction. 4361/// 4362/// \param __a 4363/// A 256-bit integer vector. 4364/// \returns A 256-bit floating-point vector of [8 x float] containing the same 4365/// bitwise pattern as the parameter. 4366static __inline __m256 __DEFAULT_FN_ATTRS 4367_mm256_castsi256_ps(__m256i __a) 4368{ 4369 return (__m256)__a; 4370} 4371 4372/// Casts a 256-bit integer vector into a 256-bit floating-point vector 4373/// of [4 x double]. 4374/// 4375/// \headerfile <x86intrin.h> 4376/// 4377/// This intrinsic has no corresponding instruction. 4378/// 4379/// \param __a 4380/// A 256-bit integer vector. 4381/// \returns A 256-bit floating-point vector of [4 x double] containing the same 4382/// bitwise pattern as the parameter. 4383static __inline __m256d __DEFAULT_FN_ATTRS 4384_mm256_castsi256_pd(__m256i __a) 4385{ 4386 return (__m256d)__a; 4387} 4388 4389/// Returns the lower 128 bits of a 256-bit floating-point vector of 4390/// [4 x double] as a 128-bit floating-point vector of [2 x double]. 4391/// 4392/// \headerfile <x86intrin.h> 4393/// 4394/// This intrinsic has no corresponding instruction. 4395/// 4396/// \param __a 4397/// A 256-bit floating-point vector of [4 x double]. 4398/// \returns A 128-bit floating-point vector of [2 x double] containing the 4399/// lower 128 bits of the parameter. 4400static __inline __m128d __DEFAULT_FN_ATTRS 4401_mm256_castpd256_pd128(__m256d __a) 4402{ 4403 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1); 4404} 4405 4406/// Returns the lower 128 bits of a 256-bit floating-point vector of 4407/// [8 x float] as a 128-bit floating-point vector of [4 x float]. 
4408/// 4409/// \headerfile <x86intrin.h> 4410/// 4411/// This intrinsic has no corresponding instruction. 4412/// 4413/// \param __a 4414/// A 256-bit floating-point vector of [8 x float]. 4415/// \returns A 128-bit floating-point vector of [4 x float] containing the 4416/// lower 128 bits of the parameter. 4417static __inline __m128 __DEFAULT_FN_ATTRS 4418_mm256_castps256_ps128(__m256 __a) 4419{ 4420 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3); 4421} 4422 4423/// Truncates a 256-bit integer vector into a 128-bit integer vector. 4424/// 4425/// \headerfile <x86intrin.h> 4426/// 4427/// This intrinsic has no corresponding instruction. 4428/// 4429/// \param __a 4430/// A 256-bit integer vector. 4431/// \returns A 128-bit integer vector containing the lower 128 bits of the 4432/// parameter. 4433static __inline __m128i __DEFAULT_FN_ATTRS 4434_mm256_castsi256_si128(__m256i __a) 4435{ 4436 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1); 4437} 4438 4439/// Constructs a 256-bit floating-point vector of [4 x double] from a 4440/// 128-bit floating-point vector of [2 x double]. 4441/// 4442/// The lower 128 bits contain the value of the source vector. The contents 4443/// of the upper 128 bits are undefined. 4444/// 4445/// \headerfile <x86intrin.h> 4446/// 4447/// This intrinsic has no corresponding instruction. 4448/// 4449/// \param __a 4450/// A 128-bit vector of [2 x double]. 4451/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits 4452/// contain the value of the parameter. The contents of the upper 128 bits 4453/// are undefined. 4454static __inline __m256d __DEFAULT_FN_ATTRS 4455_mm256_castpd128_pd256(__m128d __a) 4456{ 4457 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1); 4458} 4459 4460/// Constructs a 256-bit floating-point vector of [8 x float] from a 4461/// 128-bit floating-point vector of [4 x float]. 
4462/// 4463/// The lower 128 bits contain the value of the source vector. The contents 4464/// of the upper 128 bits are undefined. 4465/// 4466/// \headerfile <x86intrin.h> 4467/// 4468/// This intrinsic has no corresponding instruction. 4469/// 4470/// \param __a 4471/// A 128-bit vector of [4 x float]. 4472/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits 4473/// contain the value of the parameter. The contents of the upper 128 bits 4474/// are undefined. 4475static __inline __m256 __DEFAULT_FN_ATTRS 4476_mm256_castps128_ps256(__m128 __a) 4477{ 4478 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1); 4479} 4480 4481/// Constructs a 256-bit integer vector from a 128-bit integer vector. 4482/// 4483/// The lower 128 bits contain the value of the source vector. The contents 4484/// of the upper 128 bits are undefined. 4485/// 4486/// \headerfile <x86intrin.h> 4487/// 4488/// This intrinsic has no corresponding instruction. 4489/// 4490/// \param __a 4491/// A 128-bit integer vector. 4492/// \returns A 256-bit integer vector. The lower 128 bits contain the value of 4493/// the parameter. The contents of the upper 128 bits are undefined. 4494static __inline __m256i __DEFAULT_FN_ATTRS 4495_mm256_castsi128_si256(__m128i __a) 4496{ 4497 return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1); 4498} 4499 4500/// Constructs a 256-bit floating-point vector of [4 x double] from a 4501/// 128-bit floating-point vector of [2 x double]. The lower 128 bits 4502/// contain the value of the source vector. The upper 128 bits are set 4503/// to zero. 4504/// 4505/// \headerfile <x86intrin.h> 4506/// 4507/// This intrinsic has no corresponding instruction. 4508/// 4509/// \param __a 4510/// A 128-bit vector of [2 x double]. 4511/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits 4512/// contain the value of the parameter. The upper 128 bits are set to zero. 
4513static __inline __m256d __DEFAULT_FN_ATTRS 4514_mm256_zextpd128_pd256(__m128d __a) 4515{ 4516 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3); 4517} 4518 4519/// Constructs a 256-bit floating-point vector of [8 x float] from a 4520/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain 4521/// the value of the source vector. The upper 128 bits are set to zero. 4522/// 4523/// \headerfile <x86intrin.h> 4524/// 4525/// This intrinsic has no corresponding instruction. 4526/// 4527/// \param __a 4528/// A 128-bit vector of [4 x float]. 4529/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits 4530/// contain the value of the parameter. The upper 128 bits are set to zero. 4531static __inline __m256 __DEFAULT_FN_ATTRS 4532_mm256_zextps128_ps256(__m128 __a) 4533{ 4534 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7); 4535} 4536 4537/// Constructs a 256-bit integer vector from a 128-bit integer vector. 4538/// The lower 128 bits contain the value of the source vector. The upper 4539/// 128 bits are set to zero. 4540/// 4541/// \headerfile <x86intrin.h> 4542/// 4543/// This intrinsic has no corresponding instruction. 4544/// 4545/// \param __a 4546/// A 128-bit integer vector. 4547/// \returns A 256-bit integer vector. The lower 128 bits contain the value of 4548/// the parameter. The upper 128 bits are set to zero. 4549static __inline __m256i __DEFAULT_FN_ATTRS 4550_mm256_zextsi128_si256(__m128i __a) 4551{ 4552 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3); 4553} 4554 4555/* 4556 Vector insert. 4557 We use macros rather than inlines because we only want to accept 4558 invocations where the immediate M is a constant expression. 
4559*/ 4560/// Constructs a new 256-bit vector of [8 x float] by first duplicating 4561/// a 256-bit vector of [8 x float] given in the first parameter, and then 4562/// replacing either the upper or the lower 128 bits with the contents of a 4563/// 128-bit vector of [4 x float] in the second parameter. 4564/// 4565/// The immediate integer parameter determines between the upper or the lower 4566/// 128 bits. 4567/// 4568/// \headerfile <x86intrin.h> 4569/// 4570/// \code 4571/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M); 4572/// \endcode 4573/// 4574/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4575/// 4576/// \param V1 4577/// A 256-bit vector of [8 x float]. This vector is copied to the result 4578/// first, and then either the upper or the lower 128 bits of the result will 4579/// be replaced by the contents of \a V2. 4580/// \param V2 4581/// A 128-bit vector of [4 x float]. The contents of this parameter are 4582/// written to either the upper or the lower 128 bits of the result depending 4583/// on the value of parameter \a M. 4584/// \param M 4585/// An immediate integer. The least significant bit determines how the values 4586/// from the two parameters are interleaved: \n 4587/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, 4588/// and bits [255:128] of \a V1 are copied to bits [255:128] of the 4589/// result. \n 4590/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the 4591/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the 4592/// result. 4593/// \returns A 256-bit vector of [8 x float] containing the interleaved values. 
/* The whole expansion is parenthesized so that the leading cast cannot be
   separated from the builtin call by a postfix operator (such as a subscript)
   written after the macro invocation. */
#define _mm256_insertf128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
                                            (__v4sf)(__m128)(V2), (int)(M)))

/// Constructs a new 256-bit vector of [4 x double] by first duplicating
///    a 256-bit vector of [4 x double] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [2 x double] in the second parameter.
///
///    The immediate integer parameter determines between the upper or the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. This vector is copied to the result
///    first, and then either the upper or the lower 128 bits of the result will
///    be replaced by the contents of \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
4632#define _mm256_insertf128_pd(V1, V2, M) \ 4633 (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \ 4634 (__v2df)(__m128d)(V2), (int)(M)) 4635 4636/// Constructs a new 256-bit integer vector by first duplicating a 4637/// 256-bit integer vector given in the first parameter, and then replacing 4638/// either the upper or the lower 128 bits with the contents of a 128-bit 4639/// integer vector in the second parameter. 4640/// 4641/// The immediate integer parameter determines between the upper or the lower 4642/// 128 bits. 4643/// 4644/// \headerfile <x86intrin.h> 4645/// 4646/// \code 4647/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M); 4648/// \endcode 4649/// 4650/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4651/// 4652/// \param V1 4653/// A 256-bit integer vector. This vector is copied to the result first, and 4654/// then either the upper or the lower 128 bits of the result will be 4655/// replaced by the contents of \a V2. 4656/// \param V2 4657/// A 128-bit integer vector. The contents of this parameter are written to 4658/// either the upper or the lower 128 bits of the result depending on the 4659/// value of parameter \a M. 4660/// \param M 4661/// An immediate integer. The least significant bit determines how the values 4662/// from the two parameters are interleaved: \n 4663/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, 4664/// and bits [255:128] of \a V1 are copied to bits [255:128] of the 4665/// result. \n 4666/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the 4667/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the 4668/// result. 4669/// \returns A 256-bit integer vector containing the interleaved values. 4670#define _mm256_insertf128_si256(V1, V2, M) \ 4671 (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \ 4672 (__v4si)(__m128i)(V2), (int)(M)) 4673 4674/* 4675 Vector extract. 
4676 We use macros rather than inlines because we only want to accept 4677 invocations where the immediate M is a constant expression. 4678*/ 4679/// Extracts either the upper or the lower 128 bits from a 256-bit vector 4680/// of [8 x float], as determined by the immediate integer parameter, and 4681/// returns the extracted bits as a 128-bit vector of [4 x float]. 4682/// 4683/// \headerfile <x86intrin.h> 4684/// 4685/// \code 4686/// __m128 _mm256_extractf128_ps(__m256 V, const int M); 4687/// \endcode 4688/// 4689/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction. 4690/// 4691/// \param V 4692/// A 256-bit vector of [8 x float]. 4693/// \param M 4694/// An immediate integer. The least significant bit determines which bits are 4695/// extracted from the first parameter: \n 4696/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the 4697/// result. \n 4698/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. 4699/// \returns A 128-bit vector of [4 x float] containing the extracted bits. 4700#define _mm256_extractf128_ps(V, M) \ 4701 (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)) 4702 4703/// Extracts either the upper or the lower 128 bits from a 256-bit vector 4704/// of [4 x double], as determined by the immediate integer parameter, and 4705/// returns the extracted bits as a 128-bit vector of [2 x double]. 4706/// 4707/// \headerfile <x86intrin.h> 4708/// 4709/// \code 4710/// __m128d _mm256_extractf128_pd(__m256d V, const int M); 4711/// \endcode 4712/// 4713/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction. 4714/// 4715/// \param V 4716/// A 256-bit vector of [4 x double]. 4717/// \param M 4718/// An immediate integer. The least significant bit determines which bits are 4719/// extracted from the first parameter: \n 4720/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the 4721/// result. 
\n 4722/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. 4723/// \returns A 128-bit vector of [2 x double] containing the extracted bits. 4724#define _mm256_extractf128_pd(V, M) \ 4725 (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)) 4726 4727/// Extracts either the upper or the lower 128 bits from a 256-bit 4728/// integer vector, as determined by the immediate integer parameter, and 4729/// returns the extracted bits as a 128-bit integer vector. 4730/// 4731/// \headerfile <x86intrin.h> 4732/// 4733/// \code 4734/// __m128i _mm256_extractf128_si256(__m256i V, const int M); 4735/// \endcode 4736/// 4737/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction. 4738/// 4739/// \param V 4740/// A 256-bit integer vector. 4741/// \param M 4742/// An immediate integer. The least significant bit determines which bits are 4743/// extracted from the first parameter: \n 4744/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the 4745/// result. \n 4746/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. 4747/// \returns A 128-bit integer vector containing the extracted bits. 4748#define _mm256_extractf128_si256(V, M) \ 4749 (__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)) 4750 4751/* SIMD load ops (unaligned) */ 4752/// Loads two 128-bit floating-point vectors of [4 x float] from 4753/// unaligned memory locations and constructs a 256-bit floating-point vector 4754/// of [8 x float] by concatenating the two 128-bit vectors. 4755/// 4756/// \headerfile <x86intrin.h> 4757/// 4758/// This intrinsic corresponds to load instructions followed by the 4759/// <c> VINSERTF128 </c> instruction. 4760/// 4761/// \param __addr_hi 4762/// A pointer to a 128-bit memory location containing 4 consecutive 4763/// single-precision floating-point values. These values are to be copied to 4764/// bits[255:128] of the result. 
The address of the memory location does not 4765/// have to be aligned. 4766/// \param __addr_lo 4767/// A pointer to a 128-bit memory location containing 4 consecutive 4768/// single-precision floating-point values. These values are to be copied to 4769/// bits[127:0] of the result. The address of the memory location does not 4770/// have to be aligned. 4771/// \returns A 256-bit floating-point vector of [8 x float] containing the 4772/// concatenated result. 4773static __inline __m256 __DEFAULT_FN_ATTRS 4774_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) 4775{ 4776 __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo)); 4777 return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1); 4778} 4779 4780/// Loads two 128-bit floating-point vectors of [2 x double] from 4781/// unaligned memory locations and constructs a 256-bit floating-point vector 4782/// of [4 x double] by concatenating the two 128-bit vectors. 4783/// 4784/// \headerfile <x86intrin.h> 4785/// 4786/// This intrinsic corresponds to load instructions followed by the 4787/// <c> VINSERTF128 </c> instruction. 4788/// 4789/// \param __addr_hi 4790/// A pointer to a 128-bit memory location containing two consecutive 4791/// double-precision floating-point values. These values are to be copied to 4792/// bits[255:128] of the result. The address of the memory location does not 4793/// have to be aligned. 4794/// \param __addr_lo 4795/// A pointer to a 128-bit memory location containing two consecutive 4796/// double-precision floating-point values. These values are to be copied to 4797/// bits[127:0] of the result. The address of the memory location does not 4798/// have to be aligned. 4799/// \returns A 256-bit floating-point vector of [4 x double] containing the 4800/// concatenated result. 
4801static __inline __m256d __DEFAULT_FN_ATTRS 4802_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) 4803{ 4804 __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo)); 4805 return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1); 4806} 4807 4808/// Loads two 128-bit integer vectors from unaligned memory locations and 4809/// constructs a 256-bit integer vector by concatenating the two 128-bit 4810/// vectors. 4811/// 4812/// \headerfile <x86intrin.h> 4813/// 4814/// This intrinsic corresponds to load instructions followed by the 4815/// <c> VINSERTF128 </c> instruction. 4816/// 4817/// \param __addr_hi 4818/// A pointer to a 128-bit memory location containing a 128-bit integer 4819/// vector. This vector is to be copied to bits[255:128] of the result. The 4820/// address of the memory location does not have to be aligned. 4821/// \param __addr_lo 4822/// A pointer to a 128-bit memory location containing a 128-bit integer 4823/// vector. This vector is to be copied to bits[127:0] of the result. The 4824/// address of the memory location does not have to be aligned. 4825/// \returns A 256-bit integer vector containing the concatenated result. 4826static __inline __m256i __DEFAULT_FN_ATTRS 4827_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo) 4828{ 4829 __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo)); 4830 return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1); 4831} 4832 4833/* SIMD store ops (unaligned) */ 4834/// Stores the upper and lower 128 bits of a 256-bit floating-point 4835/// vector of [8 x float] into two different unaligned memory locations. 4836/// 4837/// \headerfile <x86intrin.h> 4838/// 4839/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4840/// store instructions. 4841/// 4842/// \param __addr_hi 4843/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 4844/// copied to this memory location. 
The address of this memory location does 4845/// not have to be aligned. 4846/// \param __addr_lo 4847/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4848/// copied to this memory location. The address of this memory location does 4849/// not have to be aligned. 4850/// \param __a 4851/// A 256-bit floating-point vector of [8 x float]. 4852static __inline void __DEFAULT_FN_ATTRS 4853_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a) 4854{ 4855 __m128 __v128; 4856 4857 __v128 = _mm256_castps256_ps128(__a); 4858 _mm_storeu_ps(__addr_lo, __v128); 4859 __v128 = _mm256_extractf128_ps(__a, 1); 4860 _mm_storeu_ps(__addr_hi, __v128); 4861} 4862 4863/// Stores the upper and lower 128 bits of a 256-bit floating-point 4864/// vector of [4 x double] into two different unaligned memory locations. 4865/// 4866/// \headerfile <x86intrin.h> 4867/// 4868/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4869/// store instructions. 4870/// 4871/// \param __addr_hi 4872/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 4873/// copied to this memory location. The address of this memory location does 4874/// not have to be aligned. 4875/// \param __addr_lo 4876/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4877/// copied to this memory location. The address of this memory location does 4878/// not have to be aligned. 4879/// \param __a 4880/// A 256-bit floating-point vector of [4 x double]. 4881static __inline void __DEFAULT_FN_ATTRS 4882_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a) 4883{ 4884 __m128d __v128; 4885 4886 __v128 = _mm256_castpd256_pd128(__a); 4887 _mm_storeu_pd(__addr_lo, __v128); 4888 __v128 = _mm256_extractf128_pd(__a, 1); 4889 _mm_storeu_pd(__addr_hi, __v128); 4890} 4891 4892/// Stores the upper and lower 128 bits of a 256-bit integer vector into 4893/// two different unaligned memory locations. 
4894/// 4895/// \headerfile <x86intrin.h> 4896/// 4897/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4898/// store instructions. 4899/// 4900/// \param __addr_hi 4901/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 4902/// copied to this memory location. The address of this memory location does 4903/// not have to be aligned. 4904/// \param __addr_lo 4905/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4906/// copied to this memory location. The address of this memory location does 4907/// not have to be aligned. 4908/// \param __a 4909/// A 256-bit integer vector. 4910static __inline void __DEFAULT_FN_ATTRS 4911_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a) 4912{ 4913 __m128i __v128; 4914 4915 __v128 = _mm256_castsi256_si128(__a); 4916 _mm_storeu_si128(__addr_lo, __v128); 4917 __v128 = _mm256_extractf128_si256(__a, 1); 4918 _mm_storeu_si128(__addr_hi, __v128); 4919} 4920 4921/// Constructs a 256-bit floating-point vector of [8 x float] by 4922/// concatenating two 128-bit floating-point vectors of [4 x float]. 4923/// 4924/// \headerfile <x86intrin.h> 4925/// 4926/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4927/// 4928/// \param __hi 4929/// A 128-bit floating-point vector of [4 x float] to be copied to the upper 4930/// 128 bits of the result. 4931/// \param __lo 4932/// A 128-bit floating-point vector of [4 x float] to be copied to the lower 4933/// 128 bits of the result. 4934/// \returns A 256-bit floating-point vector of [8 x float] containing the 4935/// concatenated result. 
4936static __inline __m256 __DEFAULT_FN_ATTRS 4937_mm256_set_m128 (__m128 __hi, __m128 __lo) 4938{ 4939 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7); 4940} 4941 4942/// Constructs a 256-bit floating-point vector of [4 x double] by 4943/// concatenating two 128-bit floating-point vectors of [2 x double]. 4944/// 4945/// \headerfile <x86intrin.h> 4946/// 4947/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4948/// 4949/// \param __hi 4950/// A 128-bit floating-point vector of [2 x double] to be copied to the upper 4951/// 128 bits of the result. 4952/// \param __lo 4953/// A 128-bit floating-point vector of [2 x double] to be copied to the lower 4954/// 128 bits of the result. 4955/// \returns A 256-bit floating-point vector of [4 x double] containing the 4956/// concatenated result. 4957static __inline __m256d __DEFAULT_FN_ATTRS 4958_mm256_set_m128d (__m128d __hi, __m128d __lo) 4959{ 4960 return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3); 4961} 4962 4963/// Constructs a 256-bit integer vector by concatenating two 128-bit 4964/// integer vectors. 4965/// 4966/// \headerfile <x86intrin.h> 4967/// 4968/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4969/// 4970/// \param __hi 4971/// A 128-bit integer vector to be copied to the upper 128 bits of the 4972/// result. 4973/// \param __lo 4974/// A 128-bit integer vector to be copied to the lower 128 bits of the 4975/// result. 4976/// \returns A 256-bit integer vector containing the concatenated result. 4977static __inline __m256i __DEFAULT_FN_ATTRS 4978_mm256_set_m128i (__m128i __hi, __m128i __lo) 4979{ 4980 return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3); 4981} 4982 4983/// Constructs a 256-bit floating-point vector of [8 x float] by 4984/// concatenating two 128-bit floating-point vectors of [4 x float]. 
This is 4985/// similar to _mm256_set_m128, but the order of the input parameters is 4986/// swapped. 4987/// 4988/// \headerfile <x86intrin.h> 4989/// 4990/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4991/// 4992/// \param __lo 4993/// A 128-bit floating-point vector of [4 x float] to be copied to the lower 4994/// 128 bits of the result. 4995/// \param __hi 4996/// A 128-bit floating-point vector of [4 x float] to be copied to the upper 4997/// 128 bits of the result. 4998/// \returns A 256-bit floating-point vector of [8 x float] containing the 4999/// concatenated result. 5000static __inline __m256 __DEFAULT_FN_ATTRS 5001_mm256_setr_m128 (__m128 __lo, __m128 __hi) 5002{ 5003 return _mm256_set_m128(__hi, __lo); 5004} 5005 5006/// Constructs a 256-bit floating-point vector of [4 x double] by 5007/// concatenating two 128-bit floating-point vectors of [2 x double]. This is 5008/// similar to _mm256_set_m128d, but the order of the input parameters is 5009/// swapped. 5010/// 5011/// \headerfile <x86intrin.h> 5012/// 5013/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5014/// 5015/// \param __lo 5016/// A 128-bit floating-point vector of [2 x double] to be copied to the lower 5017/// 128 bits of the result. 5018/// \param __hi 5019/// A 128-bit floating-point vector of [2 x double] to be copied to the upper 5020/// 128 bits of the result. 5021/// \returns A 256-bit floating-point vector of [4 x double] containing the 5022/// concatenated result. 5023static __inline __m256d __DEFAULT_FN_ATTRS 5024_mm256_setr_m128d (__m128d __lo, __m128d __hi) 5025{ 5026 return (__m256d)_mm256_set_m128d(__hi, __lo); 5027} 5028 5029/// Constructs a 256-bit integer vector by concatenating two 128-bit 5030/// integer vectors. This is similar to _mm256_set_m128i, but the order of 5031/// the input parameters is swapped. 
5032/// 5033/// \headerfile <x86intrin.h> 5034/// 5035/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5036/// 5037/// \param __lo 5038/// A 128-bit integer vector to be copied to the lower 128 bits of the 5039/// result. 5040/// \param __hi 5041/// A 128-bit integer vector to be copied to the upper 128 bits of the 5042/// result. 5043/// \returns A 256-bit integer vector containing the concatenated result. 5044static __inline __m256i __DEFAULT_FN_ATTRS 5045_mm256_setr_m128i (__m128i __lo, __m128i __hi) 5046{ 5047 return (__m256i)_mm256_set_m128i(__hi, __lo); 5048} 5049 5050#undef __DEFAULT_FN_ATTRS 5051#undef __DEFAULT_FN_ATTRS128 5052 5053#endif /* __AVXINTRIN_H */ 5054