1/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10#ifndef __IMMINTRIN_H 11#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead." 12#endif 13 14#ifndef __AVX2INTRIN_H 15#define __AVX2INTRIN_H 16 17/* Define the default attributes for the functions in this file. */ 18#define __DEFAULT_FN_ATTRS256 \ 19 __attribute__((__always_inline__, __nodebug__, \ 20 __target__("avx2,no-evex512"), __min_vector_width__(256))) 21#define __DEFAULT_FN_ATTRS128 \ 22 __attribute__((__always_inline__, __nodebug__, \ 23 __target__("avx2,no-evex512"), __min_vector_width__(128))) 24 25/* SSE4 Multiple Packed Sums of Absolute Difference. */ 26/// Computes sixteen sum of absolute difference (SAD) operations on sets of 27/// four unsigned 8-bit integers from the 256-bit integer vectors \a X and 28/// \a Y. 29/// 30/// Eight SAD results are computed using the lower half of the input 31/// vectors, and another eight using the upper half. These 16-bit values 32/// are returned in the lower and upper halves of the 256-bit result, 33/// respectively. 34/// 35/// A single SAD operation selects four bytes from \a X and four bytes from 36/// \a Y as input. It computes the differences between each \a X byte and 37/// the corresponding \a Y byte, takes the absolute value of each 38/// difference, and sums these four values to form one 16-bit result. The 39/// intrinsic computes 16 of these results with different sets of input 40/// bytes. 41/// 42/// For each set of eight results, the SAD operations use the same four 43/// bytes from \a Y; the starting bit position for these four bytes is 44/// specified by \a M[1:0] times 32. 
The eight operations use successive 45/// sets of four bytes from \a X; the starting bit position for the first 46/// set of four bytes is specified by \a M[2] times 32. These bit positions 47/// are all relative to the 128-bit lane for each set of eight operations. 48/// 49/// \code{.operation} 50/// r := 0 51/// FOR i := 0 TO 1 52/// j := i*3 53/// Ybase := M[j+1:j]*32 + i*128 54/// Xbase := M[j+2]*32 + i*128 55/// FOR k := 0 TO 3 56/// temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase]) 57/// temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8]) 58/// temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16]) 59/// temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24]) 60/// result[r+15:r] := temp0 + temp1 + temp2 + temp3 61/// Xbase := Xbase + 8 62/// r := r + 16 63/// ENDFOR 64/// ENDFOR 65/// \endcode 66/// 67/// \headerfile <immintrin.h> 68/// 69/// \code 70/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M); 71/// \endcode 72/// 73/// This intrinsic corresponds to the \c VMPSADBW instruction. 74/// 75/// \param X 76/// A 256-bit integer vector containing one of the inputs. 77/// \param Y 78/// A 256-bit integer vector containing one of the inputs. 79/// \param M 80/// An unsigned immediate value specifying the starting positions of the 81/// bytes to operate on. 82/// \returns A 256-bit vector of [16 x i16] containing the result. 83#define _mm256_mpsadbw_epu8(X, Y, M) \ 84 ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \ 85 (__v32qi)(__m256i)(Y), (int)(M))) 86 87/// Computes the absolute value of each signed byte in the 256-bit integer 88/// vector \a __a and returns each value in the corresponding byte of 89/// the result. 90/// 91/// \headerfile <immintrin.h> 92/// 93/// This intrinsic corresponds to the \c VPABSB instruction. 94/// 95/// \param __a 96/// A 256-bit integer vector. 97/// \returns A 256-bit integer vector containing the result. 
98static __inline__ __m256i __DEFAULT_FN_ATTRS256 99_mm256_abs_epi8(__m256i __a) 100{ 101 return (__m256i)__builtin_elementwise_abs((__v32qs)__a); 102} 103 104/// Computes the absolute value of each signed 16-bit element in the 256-bit 105/// vector of [16 x i16] in \a __a and returns each value in the 106/// corresponding element of the result. 107/// 108/// \headerfile <immintrin.h> 109/// 110/// This intrinsic corresponds to the \c VPABSW instruction. 111/// 112/// \param __a 113/// A 256-bit vector of [16 x i16]. 114/// \returns A 256-bit vector of [16 x i16] containing the result. 115static __inline__ __m256i __DEFAULT_FN_ATTRS256 116_mm256_abs_epi16(__m256i __a) 117{ 118 return (__m256i)__builtin_elementwise_abs((__v16hi)__a); 119} 120 121/// Computes the absolute value of each signed 32-bit element in the 256-bit 122/// vector of [8 x i32] in \a __a and returns each value in the 123/// corresponding element of the result. 124/// 125/// \headerfile <immintrin.h> 126/// 127/// This intrinsic corresponds to the \c VPABSD instruction. 128/// 129/// \param __a 130/// A 256-bit vector of [8 x i32]. 131/// \returns A 256-bit vector of [8 x i32] containing the result. 132static __inline__ __m256i __DEFAULT_FN_ATTRS256 133_mm256_abs_epi32(__m256i __a) 134{ 135 return (__m256i)__builtin_elementwise_abs((__v8si)__a); 136} 137 138/// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit 139/// integers using signed saturation, and returns the 256-bit result. 140/// 141/// \code{.operation} 142/// FOR i := 0 TO 7 143/// j := i*16 144/// k := i*8 145/// result[7+k:k] := SATURATE8(__a[15+j:j]) 146/// result[71+k:64+k] := SATURATE8(__b[15+j:j]) 147/// result[135+k:128+k] := SATURATE8(__a[143+j:128+j]) 148/// result[199+k:192+k] := SATURATE8(__b[143+j:128+j]) 149/// ENDFOR 150/// \endcode 151/// 152/// \headerfile <immintrin.h> 153/// 154/// This intrinsic corresponds to the \c VPACKSSWB instruction. 
155/// 156/// \param __a 157/// A 256-bit vector of [16 x i16] used to generate result[63:0] and 158/// result[191:128]. 159/// \param __b 160/// A 256-bit vector of [16 x i16] used to generate result[127:64] and 161/// result[255:192]. 162/// \returns A 256-bit integer vector containing the result. 163static __inline__ __m256i __DEFAULT_FN_ATTRS256 164_mm256_packs_epi16(__m256i __a, __m256i __b) 165{ 166 return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b); 167} 168 169/// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit 170/// integers using signed saturation, and returns the resulting 256-bit 171/// vector of [16 x i16]. 172/// 173/// \code{.operation} 174/// FOR i := 0 TO 3 175/// j := i*32 176/// k := i*16 177/// result[15+k:k] := SATURATE16(__a[31+j:j]) 178/// result[79+k:64+k] := SATURATE16(__b[31+j:j]) 179/// result[143+k:128+k] := SATURATE16(__a[159+j:128+j]) 180/// result[207+k:192+k] := SATURATE16(__b[159+j:128+j]) 181/// ENDFOR 182/// \endcode 183/// 184/// \headerfile <immintrin.h> 185/// 186/// This intrinsic corresponds to the \c VPACKSSDW instruction. 187/// 188/// \param __a 189/// A 256-bit vector of [8 x i32] used to generate result[63:0] and 190/// result[191:128]. 191/// \param __b 192/// A 256-bit vector of [8 x i32] used to generate result[127:64] and 193/// result[255:192]. 194/// \returns A 256-bit vector of [16 x i16] containing the result. 195static __inline__ __m256i __DEFAULT_FN_ATTRS256 196_mm256_packs_epi32(__m256i __a, __m256i __b) 197{ 198 return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b); 199} 200 201/// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers 202/// using unsigned saturation, and returns the 256-bit result. 
203/// 204/// \code{.operation} 205/// FOR i := 0 TO 7 206/// j := i*16 207/// k := i*8 208/// result[7+k:k] := SATURATE8U(__a[15+j:j]) 209/// result[71+k:64+k] := SATURATE8U(__b[15+j:j]) 210/// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j]) 211/// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j]) 212/// ENDFOR 213/// \endcode 214/// 215/// \headerfile <immintrin.h> 216/// 217/// This intrinsic corresponds to the \c VPACKUSWB instruction. 218/// 219/// \param __a 220/// A 256-bit vector of [16 x i16] used to generate result[63:0] and 221/// result[191:128]. 222/// \param __b 223/// A 256-bit vector of [16 x i16] used to generate result[127:64] and 224/// result[255:192]. 225/// \returns A 256-bit integer vector containing the result. 226static __inline__ __m256i __DEFAULT_FN_ATTRS256 227_mm256_packus_epi16(__m256i __a, __m256i __b) 228{ 229 return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b); 230} 231 232/// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers 233/// using unsigned saturation, and returns the resulting 256-bit vector of 234/// [16 x i16]. 235/// 236/// \code{.operation} 237/// FOR i := 0 TO 3 238/// j := i*32 239/// k := i*16 240/// result[15+k:k] := SATURATE16U(__V1[31+j:j]) 241/// result[79+k:64+k] := SATURATE16U(__V2[31+j:j]) 242/// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j]) 243/// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j]) 244/// ENDFOR 245/// \endcode 246/// 247/// \headerfile <immintrin.h> 248/// 249/// This intrinsic corresponds to the \c VPACKUSDW instruction. 250/// 251/// \param __V1 252/// A 256-bit vector of [8 x i32] used to generate result[63:0] and 253/// result[191:128]. 254/// \param __V2 255/// A 256-bit vector of [8 x i32] used to generate result[127:64] and 256/// result[255:192]. 257/// \returns A 256-bit vector of [16 x i16] containing the result. 
258static __inline__ __m256i __DEFAULT_FN_ATTRS256 259_mm256_packus_epi32(__m256i __V1, __m256i __V2) 260{ 261 return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2); 262} 263 264/// Adds 8-bit integers from corresponding bytes of two 256-bit integer 265/// vectors and returns the lower 8 bits of each sum in the corresponding 266/// byte of the 256-bit integer vector result (overflow is ignored). 267/// 268/// \headerfile <immintrin.h> 269/// 270/// This intrinsic corresponds to the \c VPADDB instruction. 271/// 272/// \param __a 273/// A 256-bit integer vector containing one of the source operands. 274/// \param __b 275/// A 256-bit integer vector containing one of the source operands. 276/// \returns A 256-bit integer vector containing the sums. 277static __inline__ __m256i __DEFAULT_FN_ATTRS256 278_mm256_add_epi8(__m256i __a, __m256i __b) 279{ 280 return (__m256i)((__v32qu)__a + (__v32qu)__b); 281} 282 283/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of 284/// [16 x i16] and returns the lower 16 bits of each sum in the 285/// corresponding element of the [16 x i16] result (overflow is ignored). 286/// 287/// \headerfile <immintrin.h> 288/// 289/// This intrinsic corresponds to the \c VPADDW instruction. 290/// 291/// \param __a 292/// A 256-bit vector of [16 x i16] containing one of the source operands. 293/// \param __b 294/// A 256-bit vector of [16 x i16] containing one of the source operands. 295/// \returns A 256-bit vector of [16 x i16] containing the sums. 296static __inline__ __m256i __DEFAULT_FN_ATTRS256 297_mm256_add_epi16(__m256i __a, __m256i __b) 298{ 299 return (__m256i)((__v16hu)__a + (__v16hu)__b); 300} 301 302/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of 303/// [8 x i32] and returns the lower 32 bits of each sum in the corresponding 304/// element of the [8 x i32] result (overflow is ignored). 
305/// 306/// \headerfile <immintrin.h> 307/// 308/// This intrinsic corresponds to the \c VPADDD instruction. 309/// 310/// \param __a 311/// A 256-bit vector of [8 x i32] containing one of the source operands. 312/// \param __b 313/// A 256-bit vector of [8 x i32] containing one of the source operands. 314/// \returns A 256-bit vector of [8 x i32] containing the sums. 315static __inline__ __m256i __DEFAULT_FN_ATTRS256 316_mm256_add_epi32(__m256i __a, __m256i __b) 317{ 318 return (__m256i)((__v8su)__a + (__v8su)__b); 319} 320 321/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of 322/// [4 x i64] and returns the lower 64 bits of each sum in the corresponding 323/// element of the [4 x i64] result (overflow is ignored). 324/// 325/// \headerfile <immintrin.h> 326/// 327/// This intrinsic corresponds to the \c VPADDQ instruction. 328/// 329/// \param __a 330/// A 256-bit vector of [4 x i64] containing one of the source operands. 331/// \param __b 332/// A 256-bit vector of [4 x i64] containing one of the source operands. 333/// \returns A 256-bit vector of [4 x i64] containing the sums. 334static __inline__ __m256i __DEFAULT_FN_ATTRS256 335_mm256_add_epi64(__m256i __a, __m256i __b) 336{ 337 return (__m256i)((__v4du)__a + (__v4du)__b); 338} 339 340/// Adds 8-bit integers from corresponding bytes of two 256-bit integer 341/// vectors using signed saturation, and returns each sum in the 342/// corresponding byte of the 256-bit integer vector result. 343/// 344/// \headerfile <immintrin.h> 345/// 346/// This intrinsic corresponds to the \c VPADDSB instruction. 347/// 348/// \param __a 349/// A 256-bit integer vector containing one of the source operands. 350/// \param __b 351/// A 256-bit integer vector containing one of the source operands. 352/// \returns A 256-bit integer vector containing the sums. 
353static __inline__ __m256i __DEFAULT_FN_ATTRS256 354_mm256_adds_epi8(__m256i __a, __m256i __b) 355{ 356 return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b); 357} 358 359/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of 360/// [16 x i16] using signed saturation, and returns the [16 x i16] result. 361/// 362/// \headerfile <immintrin.h> 363/// 364/// This intrinsic corresponds to the \c VPADDSW instruction. 365/// 366/// \param __a 367/// A 256-bit vector of [16 x i16] containing one of the source operands. 368/// \param __b 369/// A 256-bit vector of [16 x i16] containing one of the source operands. 370/// \returns A 256-bit vector of [16 x i16] containing the sums. 371static __inline__ __m256i __DEFAULT_FN_ATTRS256 372_mm256_adds_epi16(__m256i __a, __m256i __b) 373{ 374 return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b); 375} 376 377/// Adds 8-bit integers from corresponding bytes of two 256-bit integer 378/// vectors using unsigned saturation, and returns each sum in the 379/// corresponding byte of the 256-bit integer vector result. 380/// 381/// \headerfile <immintrin.h> 382/// 383/// This intrinsic corresponds to the \c VPADDUSB instruction. 384/// 385/// \param __a 386/// A 256-bit integer vector containing one of the source operands. 387/// \param __b 388/// A 256-bit integer vector containing one of the source operands. 389/// \returns A 256-bit integer vector containing the sums. 390static __inline__ __m256i __DEFAULT_FN_ATTRS256 391_mm256_adds_epu8(__m256i __a, __m256i __b) 392{ 393 return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b); 394} 395 396/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of 397/// [16 x i16] using unsigned saturation, and returns the [16 x i16] result. 398/// 399/// \headerfile <immintrin.h> 400/// 401/// This intrinsic corresponds to the \c VPADDUSW instruction. 
402/// 403/// \param __a 404/// A 256-bit vector of [16 x i16] containing one of the source operands. 405/// \param __b 406/// A 256-bit vector of [16 x i16] containing one of the source operands. 407/// \returns A 256-bit vector of [16 x i16] containing the sums. 408static __inline__ __m256i __DEFAULT_FN_ATTRS256 409_mm256_adds_epu16(__m256i __a, __m256i __b) 410{ 411 return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b); 412} 413 414/// Uses the lower half of the 256-bit vector \a a as the upper half of a 415/// temporary 256-bit value, and the lower half of the 256-bit vector \a b 416/// as the lower half of the temporary value. Right-shifts the temporary 417/// value by \a n bytes, and uses the lower 16 bytes of the shifted value 418/// as the lower 16 bytes of the result. Uses the upper halves of \a a and 419/// \a b to make another temporary value, right shifts by \a n, and uses 420/// the lower 16 bytes of the shifted value as the upper 16 bytes of the 421/// result. 422/// 423/// \headerfile <immintrin.h> 424/// 425/// \code 426/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n); 427/// \endcode 428/// 429/// This intrinsic corresponds to the \c VPALIGNR instruction. 430/// 431/// \param a 432/// A 256-bit integer vector containing source values. 433/// \param b 434/// A 256-bit integer vector containing source values. 435/// \param n 436/// An immediate value specifying the number of bytes to shift. 437/// \returns A 256-bit integer vector containing the result. 438#define _mm256_alignr_epi8(a, b, n) \ 439 ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \ 440 (__v32qi)(__m256i)(b), (n))) 441 442/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and 443/// \a __b. 444/// 445/// \headerfile <immintrin.h> 446/// 447/// This intrinsic corresponds to the \c VPAND instruction. 448/// 449/// \param __a 450/// A 256-bit integer vector. 451/// \param __b 452/// A 256-bit integer vector. 
453/// \returns A 256-bit integer vector containing the result. 454static __inline__ __m256i __DEFAULT_FN_ATTRS256 455_mm256_and_si256(__m256i __a, __m256i __b) 456{ 457 return (__m256i)((__v4du)__a & (__v4du)__b); 458} 459 460/// Computes the bitwise AND of the 256-bit integer vector in \a __b with 461/// the bitwise NOT of the 256-bit integer vector in \a __a. 462/// 463/// \headerfile <immintrin.h> 464/// 465/// This intrinsic corresponds to the \c VPANDN instruction. 466/// 467/// \param __a 468/// A 256-bit integer vector. 469/// \param __b 470/// A 256-bit integer vector. 471/// \returns A 256-bit integer vector containing the result. 472static __inline__ __m256i __DEFAULT_FN_ATTRS256 473_mm256_andnot_si256(__m256i __a, __m256i __b) 474{ 475 return (__m256i)(~(__v4du)__a & (__v4du)__b); 476} 477 478/// Computes the averages of the corresponding unsigned bytes in the two 479/// 256-bit integer vectors in \a __a and \a __b and returns each 480/// average in the corresponding byte of the 256-bit result. 481/// 482/// \code{.operation} 483/// FOR i := 0 TO 31 484/// j := i*8 485/// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1 486/// ENDFOR 487/// \endcode 488/// 489/// \headerfile <immintrin.h> 490/// 491/// This intrinsic corresponds to the \c VPAVGB instruction. 492/// 493/// \param __a 494/// A 256-bit integer vector. 495/// \param __b 496/// A 256-bit integer vector. 497/// \returns A 256-bit integer vector containing the result. 498static __inline__ __m256i __DEFAULT_FN_ATTRS256 499_mm256_avg_epu8(__m256i __a, __m256i __b) 500{ 501 return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b); 502} 503 504/// Computes the averages of the corresponding unsigned 16-bit integers in 505/// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns 506/// each average in the corresponding element of the 256-bit result. 
507/// 508/// \code{.operation} 509/// FOR i := 0 TO 15 510/// j := i*16 511/// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1 512/// ENDFOR 513/// \endcode 514/// 515/// \headerfile <immintrin.h> 516/// 517/// This intrinsic corresponds to the \c VPAVGW instruction. 518/// 519/// \param __a 520/// A 256-bit vector of [16 x i16]. 521/// \param __b 522/// A 256-bit vector of [16 x i16]. 523/// \returns A 256-bit vector of [16 x i16] containing the result. 524static __inline__ __m256i __DEFAULT_FN_ATTRS256 525_mm256_avg_epu16(__m256i __a, __m256i __b) 526{ 527 return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b); 528} 529 530/// Merges 8-bit integer values from either of the two 256-bit vectors 531/// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns 532/// the resulting 256-bit integer vector. 533/// 534/// \code{.operation} 535/// FOR i := 0 TO 31 536/// j := i*8 537/// IF __M[7+i] == 0 538/// result[7+j:j] := __V1[7+j:j] 539/// ELSE 540/// result[7+j:j] := __V2[7+j:j] 541/// FI 542/// ENDFOR 543/// \endcode 544/// 545/// \headerfile <immintrin.h> 546/// 547/// This intrinsic corresponds to the \c VPBLENDVB instruction. 548/// 549/// \param __V1 550/// A 256-bit integer vector containing source values. 551/// \param __V2 552/// A 256-bit integer vector containing source values. 553/// \param __M 554/// A 256-bit integer vector, with bit [7] of each byte specifying the 555/// source for each corresponding byte of the result. When the mask bit 556/// is 0, the byte is copied from \a __V1; otherwise, it is copied from 557/// \a __V2. 558/// \returns A 256-bit integer vector containing the result. 
559static __inline__ __m256i __DEFAULT_FN_ATTRS256 560_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) 561{ 562 return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2, 563 (__v32qi)__M); 564} 565 566/// Merges 16-bit integer values from either of the two 256-bit vectors 567/// \a V1 or \a V2, as specified by the immediate integer operand \a M, 568/// and returns the resulting 256-bit vector of [16 x i16]. 569/// 570/// \code{.operation} 571/// FOR i := 0 TO 7 572/// j := i*16 573/// IF M[i] == 0 574/// result[7+j:j] := V1[7+j:j] 575/// result[135+j:128+j] := V1[135+j:128+j] 576/// ELSE 577/// result[7+j:j] := V2[7+j:j] 578/// result[135+j:128+j] := V2[135+j:128+j] 579/// FI 580/// ENDFOR 581/// \endcode 582/// 583/// \headerfile <immintrin.h> 584/// 585/// \code 586/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M); 587/// \endcode 588/// 589/// This intrinsic corresponds to the \c VPBLENDW instruction. 590/// 591/// \param V1 592/// A 256-bit vector of [16 x i16] containing source values. 593/// \param V2 594/// A 256-bit vector of [16 x i16] containing source values. 595/// \param M 596/// An immediate 8-bit integer operand, with bits [7:0] specifying the 597/// source for each element of the result. The position of the mask bit 598/// corresponds to the index of a copied value. When a mask bit is 0, the 599/// element is copied from \a V1; otherwise, it is copied from \a V2. 600/// \a M[0] determines the source for elements 0 and 8, \a M[1] for 601/// elements 1 and 9, and so forth. 602/// \returns A 256-bit vector of [16 x i16] containing the result. 603#define _mm256_blend_epi16(V1, V2, M) \ 604 ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \ 605 (__v16hi)(__m256i)(V2), (int)(M))) 606 607/// Compares corresponding bytes in the 256-bit integer vectors in \a __a and 608/// \a __b for equality and returns the outcomes in the corresponding 609/// bytes of the 256-bit result. 
610/// 611/// \code{.operation} 612/// FOR i := 0 TO 31 613/// j := i*8 614/// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0 615/// ENDFOR 616/// \endcode 617/// 618/// \headerfile <immintrin.h> 619/// 620/// This intrinsic corresponds to the \c VPCMPEQB instruction. 621/// 622/// \param __a 623/// A 256-bit integer vector containing one of the inputs. 624/// \param __b 625/// A 256-bit integer vector containing one of the inputs. 626/// \returns A 256-bit integer vector containing the result. 627static __inline__ __m256i __DEFAULT_FN_ATTRS256 628_mm256_cmpeq_epi8(__m256i __a, __m256i __b) 629{ 630 return (__m256i)((__v32qi)__a == (__v32qi)__b); 631} 632 633/// Compares corresponding elements in the 256-bit vectors of [16 x i16] in 634/// \a __a and \a __b for equality and returns the outcomes in the 635/// corresponding elements of the 256-bit result. 636/// 637/// \code{.operation} 638/// FOR i := 0 TO 15 639/// j := i*16 640/// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0 641/// ENDFOR 642/// \endcode 643/// 644/// \headerfile <immintrin.h> 645/// 646/// This intrinsic corresponds to the \c VPCMPEQW instruction. 647/// 648/// \param __a 649/// A 256-bit vector of [16 x i16] containing one of the inputs. 650/// \param __b 651/// A 256-bit vector of [16 x i16] containing one of the inputs. 652/// \returns A 256-bit vector of [16 x i16] containing the result. 653static __inline__ __m256i __DEFAULT_FN_ATTRS256 654_mm256_cmpeq_epi16(__m256i __a, __m256i __b) 655{ 656 return (__m256i)((__v16hi)__a == (__v16hi)__b); 657} 658 659/// Compares corresponding elements in the 256-bit vectors of [8 x i32] in 660/// \a __a and \a __b for equality and returns the outcomes in the 661/// corresponding elements of the 256-bit result. 662/// 663/// \code{.operation} 664/// FOR i := 0 TO 7 665/// j := i*32 666/// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 
0xFFFFFFFF : 0 667/// ENDFOR 668/// \endcode 669/// 670/// \headerfile <immintrin.h> 671/// 672/// This intrinsic corresponds to the \c VPCMPEQD instruction. 673/// 674/// \param __a 675/// A 256-bit vector of [8 x i32] containing one of the inputs. 676/// \param __b 677/// A 256-bit vector of [8 x i32] containing one of the inputs. 678/// \returns A 256-bit vector of [8 x i32] containing the result. 679static __inline__ __m256i __DEFAULT_FN_ATTRS256 680_mm256_cmpeq_epi32(__m256i __a, __m256i __b) 681{ 682 return (__m256i)((__v8si)__a == (__v8si)__b); 683} 684 685/// Compares corresponding elements in the 256-bit vectors of [4 x i64] in 686/// \a __a and \a __b for equality and returns the outcomes in the 687/// corresponding elements of the 256-bit result. 688/// 689/// \code{.operation} 690/// FOR i := 0 TO 3 691/// j := i*64 692/// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0 693/// ENDFOR 694/// \endcode 695/// 696/// \headerfile <immintrin.h> 697/// 698/// This intrinsic corresponds to the \c VPCMPEQQ instruction. 699/// 700/// \param __a 701/// A 256-bit vector of [4 x i64] containing one of the inputs. 702/// \param __b 703/// A 256-bit vector of [4 x i64] containing one of the inputs. 704/// \returns A 256-bit vector of [4 x i64] containing the result. 705static __inline__ __m256i __DEFAULT_FN_ATTRS256 706_mm256_cmpeq_epi64(__m256i __a, __m256i __b) 707{ 708 return (__m256i)((__v4di)__a == (__v4di)__b); 709} 710 711/// Compares corresponding signed bytes in the 256-bit integer vectors in 712/// \a __a and \a __b for greater-than and returns the outcomes in the 713/// corresponding bytes of the 256-bit result. 714/// 715/// \code{.operation} 716/// FOR i := 0 TO 31 717/// j := i*8 718/// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0 719/// ENDFOR 720/// \endcode 721/// 722/// \headerfile <immintrin.h> 723/// 724/// This intrinsic corresponds to the \c VPCMPGTB instruction. 
725/// 726/// \param __a 727/// A 256-bit integer vector containing one of the inputs. 728/// \param __b 729/// A 256-bit integer vector containing one of the inputs. 730/// \returns A 256-bit integer vector containing the result. 731static __inline__ __m256i __DEFAULT_FN_ATTRS256 732_mm256_cmpgt_epi8(__m256i __a, __m256i __b) 733{ 734 /* This function always performs a signed comparison, but __v32qi is a char 735 which may be signed or unsigned, so use __v32qs. */ 736 return (__m256i)((__v32qs)__a > (__v32qs)__b); 737} 738 739/// Compares corresponding signed elements in the 256-bit vectors of 740/// [16 x i16] in \a __a and \a __b for greater-than and returns the 741/// outcomes in the corresponding elements of the 256-bit result. 742/// 743/// \code{.operation} 744/// FOR i := 0 TO 15 745/// j := i*16 746/// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0 747/// ENDFOR 748/// \endcode 749/// 750/// \headerfile <immintrin.h> 751/// 752/// This intrinsic corresponds to the \c VPCMPGTW instruction. 753/// 754/// \param __a 755/// A 256-bit vector of [16 x i16] containing one of the inputs. 756/// \param __b 757/// A 256-bit vector of [16 x i16] containing one of the inputs. 758/// \returns A 256-bit vector of [16 x i16] containing the result. 759static __inline__ __m256i __DEFAULT_FN_ATTRS256 760_mm256_cmpgt_epi16(__m256i __a, __m256i __b) 761{ 762 return (__m256i)((__v16hi)__a > (__v16hi)__b); 763} 764 765/// Compares corresponding signed elements in the 256-bit vectors of 766/// [8 x i32] in \a __a and \a __b for greater-than and returns the 767/// outcomes in the corresponding elements of the 256-bit result. 768/// 769/// \code{.operation} 770/// FOR i := 0 TO 7 771/// j := i*32 772/// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0 773/// ENDFOR 774/// \endcode 775/// 776/// \headerfile <immintrin.h> 777/// 778/// This intrinsic corresponds to the \c VPCMPGTD instruction. 
779/// 780/// \param __a 781/// A 256-bit vector of [8 x i32] containing one of the inputs. 782/// \param __b 783/// A 256-bit vector of [8 x i32] containing one of the inputs. 784/// \returns A 256-bit vector of [8 x i32] containing the result. 785static __inline__ __m256i __DEFAULT_FN_ATTRS256 786_mm256_cmpgt_epi32(__m256i __a, __m256i __b) 787{ 788 return (__m256i)((__v8si)__a > (__v8si)__b); 789} 790 791/// Compares corresponding signed elements in the 256-bit vectors of 792/// [4 x i64] in \a __a and \a __b for greater-than and returns the 793/// outcomes in the corresponding elements of the 256-bit result. 794/// 795/// \code{.operation} 796/// FOR i := 0 TO 3 797/// j := i*64 798/// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0 799/// ENDFOR 800/// \endcode 801/// 802/// \headerfile <immintrin.h> 803/// 804/// This intrinsic corresponds to the \c VPCMPGTQ instruction. 805/// 806/// \param __a 807/// A 256-bit vector of [4 x i64] containing one of the inputs. 808/// \param __b 809/// A 256-bit vector of [4 x i64] containing one of the inputs. 810/// \returns A 256-bit vector of [4 x i64] containing the result. 811static __inline__ __m256i __DEFAULT_FN_ATTRS256 812_mm256_cmpgt_epi64(__m256i __a, __m256i __b) 813{ 814 return (__m256i)((__v4di)__a > (__v4di)__b); 815} 816 817/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit 818/// vectors of [16 x i16] and returns the lower 16 bits of each sum in an 819/// element of the [16 x i16] result (overflow is ignored). Sums from 820/// \a __a are returned in the lower 64 bits of each 128-bit half of the 821/// result; sums from \a __b are returned in the upper 64 bits of each 822/// 128-bit half of the result. 
823/// 824/// \code{.operation} 825/// FOR i := 0 TO 1 826/// j := i*128 827/// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16] 828/// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48] 829/// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80] 830/// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112] 831/// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16] 832/// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48] 833/// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80] 834/// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112] 835/// ENDFOR 836/// \endcode 837/// 838/// \headerfile <immintrin.h> 839/// 840/// This intrinsic corresponds to the \c VPHADDW instruction. 841/// 842/// \param __a 843/// A 256-bit vector of [16 x i16] containing one of the source operands. 844/// \param __b 845/// A 256-bit vector of [16 x i16] containing one of the source operands. 846/// \returns A 256-bit vector of [16 x i16] containing the sums. 847static __inline__ __m256i __DEFAULT_FN_ATTRS256 848_mm256_hadd_epi16(__m256i __a, __m256i __b) 849{ 850 return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b); 851} 852 853/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit 854/// vectors of [8 x i32] and returns the lower 32 bits of each sum in an 855/// element of the [8 x i32] result (overflow is ignored). Sums from \a __a 856/// are returned in the lower 64 bits of each 128-bit half of the result; 857/// sums from \a __b are returned in the upper 64 bits of each 128-bit half 858/// of the result. 
859/// 860/// \code{.operation} 861/// FOR i := 0 TO 1 862/// j := i*128 863/// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32] 864/// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96] 865/// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32] 866/// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96] 867/// ENDFOR 868/// \endcode 869/// 870/// \headerfile <immintrin.h> 871/// 872/// This intrinsic corresponds to the \c VPHADDD instruction. 873/// 874/// \param __a 875/// A 256-bit vector of [8 x i32] containing one of the source operands. 876/// \param __b 877/// A 256-bit vector of [8 x i32] containing one of the source operands. 878/// \returns A 256-bit vector of [8 x i32] containing the sums. 879static __inline__ __m256i __DEFAULT_FN_ATTRS256 880_mm256_hadd_epi32(__m256i __a, __m256i __b) 881{ 882 return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b); 883} 884 885/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit 886/// vectors of [16 x i16] using signed saturation and returns each sum in 887/// an element of the [16 x i16] result. Sums from \a __a are returned in 888/// the lower 64 bits of each 128-bit half of the result; sums from \a __b 889/// are returned in the upper 64 bits of each 128-bit half of the result. 
890/// 891/// \code{.operation} 892/// FOR i := 0 TO 1 893/// j := i*128 894/// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16]) 895/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48]) 896/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80]) 897/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112]) 898/// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16]) 899/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48]) 900/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80]) 901/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112]) 902/// ENDFOR 903/// \endcode 904/// 905/// \headerfile <immintrin.h> 906/// 907/// This intrinsic corresponds to the \c VPHADDSW instruction. 908/// 909/// \param __a 910/// A 256-bit vector of [16 x i16] containing one of the source operands. 911/// \param __b 912/// A 256-bit vector of [16 x i16] containing one of the source operands. 913/// \returns A 256-bit vector of [16 x i16] containing the sums. 914static __inline__ __m256i __DEFAULT_FN_ATTRS256 915_mm256_hadds_epi16(__m256i __a, __m256i __b) 916{ 917 return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b); 918} 919 920/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit 921/// vectors of [16 x i16] and returns the lower 16 bits of each difference 922/// in an element of the [16 x i16] result (overflow is ignored). 923/// Differences from \a __a are returned in the lower 64 bits of each 924/// 128-bit half of the result; differences from \a __b are returned in the 925/// upper 64 bits of each 128-bit half of the result. 
926/// 927/// \code{.operation} 928/// FOR i := 0 TO 1 929/// j := i*128 930/// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16] 931/// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48] 932/// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80] 933/// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112] 934/// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16] 935/// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48] 936/// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80] 937/// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112] 938/// ENDFOR 939/// \endcode 940/// 941/// \headerfile <immintrin.h> 942/// 943/// This intrinsic corresponds to the \c VPHSUBW instruction. 944/// 945/// \param __a 946/// A 256-bit vector of [16 x i16] containing one of the source operands. 947/// \param __b 948/// A 256-bit vector of [16 x i16] containing one of the source operands. 949/// \returns A 256-bit vector of [16 x i16] containing the differences. 950static __inline__ __m256i __DEFAULT_FN_ATTRS256 951_mm256_hsub_epi16(__m256i __a, __m256i __b) 952{ 953 return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b); 954} 955 956/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit 957/// vectors of [8 x i32] and returns the lower 32 bits of each difference in 958/// an element of the [8 x i32] result (overflow is ignored). Differences 959/// from \a __a are returned in the lower 64 bits of each 128-bit half of 960/// the result; differences from \a __b are returned in the upper 64 bits 961/// of each 128-bit half of the result. 
962/// 963/// \code{.operation} 964/// FOR i := 0 TO 1 965/// j := i*128 966/// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32] 967/// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96] 968/// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32] 969/// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96] 970/// ENDFOR 971/// \endcode 972/// 973/// \headerfile <immintrin.h> 974/// 975/// This intrinsic corresponds to the \c VPHSUBD instruction. 976/// 977/// \param __a 978/// A 256-bit vector of [8 x i32] containing one of the source operands. 979/// \param __b 980/// A 256-bit vector of [8 x i32] containing one of the source operands. 981/// \returns A 256-bit vector of [8 x i32] containing the differences. 982static __inline__ __m256i __DEFAULT_FN_ATTRS256 983_mm256_hsub_epi32(__m256i __a, __m256i __b) 984{ 985 return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); 986} 987 988/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit 989/// vectors of [16 x i16] using signed saturation and returns each sum in 990/// an element of the [16 x i16] result. Differences from \a __a are 991/// returned in the lower 64 bits of each 128-bit half of the result; 992/// differences from \a __b are returned in the upper 64 bits of each 993/// 128-bit half of the result. 
994/// 995/// \code{.operation} 996/// FOR i := 0 TO 1 997/// j := i*128 998/// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16]) 999/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48]) 1000/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80]) 1001/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112]) 1002/// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16]) 1003/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48]) 1004/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80]) 1005/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112]) 1006/// ENDFOR 1007/// \endcode 1008/// 1009/// \headerfile <immintrin.h> 1010/// 1011/// This intrinsic corresponds to the \c VPHSUBSW instruction. 1012/// 1013/// \param __a 1014/// A 256-bit vector of [16 x i16] containing one of the source operands. 1015/// \param __b 1016/// A 256-bit vector of [16 x i16] containing one of the source operands. 1017/// \returns A 256-bit vector of [16 x i16] containing the differences. 1018static __inline__ __m256i __DEFAULT_FN_ATTRS256 1019_mm256_hsubs_epi16(__m256i __a, __m256i __b) 1020{ 1021 return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); 1022} 1023 1024/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a 1025/// with the corresponding signed byte from the 256-bit integer vector in 1026/// \a __b, forming signed 16-bit intermediate products. Adds adjacent 1027/// pairs of those products using signed saturation to form 16-bit sums 1028/// returned as elements of the [16 x i16] result. 
1029/// 1030/// \code{.operation} 1031/// FOR i := 0 TO 15 1032/// j := i*16 1033/// temp1 := __a[j+7:j] * __b[j+7:j] 1034/// temp2 := __a[j+15:j+8] * __b[j+15:j+8] 1035/// result[j+15:j] := SATURATE16(temp1 + temp2) 1036/// ENDFOR 1037/// \endcode 1038/// 1039/// \headerfile <immintrin.h> 1040/// 1041/// This intrinsic corresponds to the \c VPMADDUBSW instruction. 1042/// 1043/// \param __a 1044/// A 256-bit vector containing one of the source operands. 1045/// \param __b 1046/// A 256-bit vector containing one of the source operands. 1047/// \returns A 256-bit vector of [16 x i16] containing the result. 1048static __inline__ __m256i __DEFAULT_FN_ATTRS256 1049_mm256_maddubs_epi16(__m256i __a, __m256i __b) 1050{ 1051 return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b); 1052} 1053 1054/// Multiplies corresponding 16-bit elements of two 256-bit vectors of 1055/// [16 x i16], forming 32-bit intermediate products, and adds pairs of 1056/// those products to form 32-bit sums returned as elements of the 1057/// [8 x i32] result. 1058/// 1059/// There is only one wraparound case: when all four of the 16-bit sources 1060/// are \c 0x8000, the result will be \c 0x80000000. 1061/// 1062/// \code{.operation} 1063/// FOR i := 0 TO 7 1064/// j := i*32 1065/// temp1 := __a[j+15:j] * __b[j+15:j] 1066/// temp2 := __a[j+31:j+16] * __b[j+31:j+16] 1067/// result[j+31:j] := temp1 + temp2 1068/// ENDFOR 1069/// \endcode 1070/// 1071/// \headerfile <immintrin.h> 1072/// 1073/// This intrinsic corresponds to the \c VPMADDWD instruction. 1074/// 1075/// \param __a 1076/// A 256-bit vector of [16 x i16] containing one of the source operands. 1077/// \param __b 1078/// A 256-bit vector of [16 x i16] containing one of the source operands. 1079/// \returns A 256-bit vector of [8 x i32] containing the result. 
1080static __inline__ __m256i __DEFAULT_FN_ATTRS256 1081_mm256_madd_epi16(__m256i __a, __m256i __b) 1082{ 1083 return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b); 1084} 1085 1086/// Compares the corresponding signed bytes in the two 256-bit integer vectors 1087/// in \a __a and \a __b and returns the larger of each pair in the 1088/// corresponding byte of the 256-bit result. 1089/// 1090/// \headerfile <immintrin.h> 1091/// 1092/// This intrinsic corresponds to the \c VPMAXSB instruction. 1093/// 1094/// \param __a 1095/// A 256-bit integer vector. 1096/// \param __b 1097/// A 256-bit integer vector. 1098/// \returns A 256-bit integer vector containing the result. 1099static __inline__ __m256i __DEFAULT_FN_ATTRS256 1100_mm256_max_epi8(__m256i __a, __m256i __b) 1101{ 1102 return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b); 1103} 1104 1105/// Compares the corresponding signed 16-bit integers in the two 256-bit 1106/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of 1107/// each pair in the corresponding element of the 256-bit result. 1108/// 1109/// \headerfile <immintrin.h> 1110/// 1111/// This intrinsic corresponds to the \c VPMAXSW instruction. 1112/// 1113/// \param __a 1114/// A 256-bit vector of [16 x i16]. 1115/// \param __b 1116/// A 256-bit vector of [16 x i16]. 1117/// \returns A 256-bit vector of [16 x i16] containing the result. 1118static __inline__ __m256i __DEFAULT_FN_ATTRS256 1119_mm256_max_epi16(__m256i __a, __m256i __b) 1120{ 1121 return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b); 1122} 1123 1124/// Compares the corresponding signed 32-bit integers in the two 256-bit 1125/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of 1126/// each pair in the corresponding element of the 256-bit result. 1127/// 1128/// \headerfile <immintrin.h> 1129/// 1130/// This intrinsic corresponds to the \c VPMAXSD instruction. 
1131/// 1132/// \param __a 1133/// A 256-bit vector of [8 x i32]. 1134/// \param __b 1135/// A 256-bit vector of [8 x i32]. 1136/// \returns A 256-bit vector of [8 x i32] containing the result. 1137static __inline__ __m256i __DEFAULT_FN_ATTRS256 1138_mm256_max_epi32(__m256i __a, __m256i __b) 1139{ 1140 return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b); 1141} 1142 1143/// Compares the corresponding unsigned bytes in the two 256-bit integer 1144/// vectors in \a __a and \a __b and returns the larger of each pair in 1145/// the corresponding byte of the 256-bit result. 1146/// 1147/// \headerfile <immintrin.h> 1148/// 1149/// This intrinsic corresponds to the \c VPMAXUB instruction. 1150/// 1151/// \param __a 1152/// A 256-bit integer vector. 1153/// \param __b 1154/// A 256-bit integer vector. 1155/// \returns A 256-bit integer vector containing the result. 1156static __inline__ __m256i __DEFAULT_FN_ATTRS256 1157_mm256_max_epu8(__m256i __a, __m256i __b) 1158{ 1159 return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b); 1160} 1161 1162/// Compares the corresponding unsigned 16-bit integers in the two 256-bit 1163/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of 1164/// each pair in the corresponding element of the 256-bit result. 1165/// 1166/// \headerfile <immintrin.h> 1167/// 1168/// This intrinsic corresponds to the \c VPMAXUW instruction. 1169/// 1170/// \param __a 1171/// A 256-bit vector of [16 x i16]. 1172/// \param __b 1173/// A 256-bit vector of [16 x i16]. 1174/// \returns A 256-bit vector of [16 x i16] containing the result. 
1175static __inline__ __m256i __DEFAULT_FN_ATTRS256 1176_mm256_max_epu16(__m256i __a, __m256i __b) 1177{ 1178 return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b); 1179} 1180 1181/// Compares the corresponding unsigned 32-bit integers in the two 256-bit 1182/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of 1183/// each pair in the corresponding element of the 256-bit result. 1184/// 1185/// \headerfile <immintrin.h> 1186/// 1187/// This intrinsic corresponds to the \c VPMAXUD instruction. 1188/// 1189/// \param __a 1190/// A 256-bit vector of [8 x i32]. 1191/// \param __b 1192/// A 256-bit vector of [8 x i32]. 1193/// \returns A 256-bit vector of [8 x i32] containing the result. 1194static __inline__ __m256i __DEFAULT_FN_ATTRS256 1195_mm256_max_epu32(__m256i __a, __m256i __b) 1196{ 1197 return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b); 1198} 1199 1200/// Compares the corresponding signed bytes in the two 256-bit integer vectors 1201/// in \a __a and \a __b and returns the smaller of each pair in the 1202/// corresponding byte of the 256-bit result. 1203/// 1204/// \headerfile <immintrin.h> 1205/// 1206/// This intrinsic corresponds to the \c VPMINSB instruction. 1207/// 1208/// \param __a 1209/// A 256-bit integer vector. 1210/// \param __b 1211/// A 256-bit integer vector. 1212/// \returns A 256-bit integer vector containing the result. 1213static __inline__ __m256i __DEFAULT_FN_ATTRS256 1214_mm256_min_epi8(__m256i __a, __m256i __b) 1215{ 1216 return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b); 1217} 1218 1219/// Compares the corresponding signed 16-bit integers in the two 256-bit 1220/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of 1221/// each pair in the corresponding element of the 256-bit result. 1222/// 1223/// \headerfile <immintrin.h> 1224/// 1225/// This intrinsic corresponds to the \c VPMINSW instruction. 
1226/// 1227/// \param __a 1228/// A 256-bit vector of [16 x i16]. 1229/// \param __b 1230/// A 256-bit vector of [16 x i16]. 1231/// \returns A 256-bit vector of [16 x i16] containing the result. 1232static __inline__ __m256i __DEFAULT_FN_ATTRS256 1233_mm256_min_epi16(__m256i __a, __m256i __b) 1234{ 1235 return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b); 1236} 1237 1238/// Compares the corresponding signed 32-bit integers in the two 256-bit 1239/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of 1240/// each pair in the corresponding element of the 256-bit result. 1241/// 1242/// \headerfile <immintrin.h> 1243/// 1244/// This intrinsic corresponds to the \c VPMINSD instruction. 1245/// 1246/// \param __a 1247/// A 256-bit vector of [8 x i32]. 1248/// \param __b 1249/// A 256-bit vector of [8 x i32]. 1250/// \returns A 256-bit vector of [8 x i32] containing the result. 1251static __inline__ __m256i __DEFAULT_FN_ATTRS256 1252_mm256_min_epi32(__m256i __a, __m256i __b) 1253{ 1254 return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b); 1255} 1256 1257/// Compares the corresponding unsigned bytes in the two 256-bit integer 1258/// vectors in \a __a and \a __b and returns the smaller of each pair in 1259/// the corresponding byte of the 256-bit result. 1260/// 1261/// \headerfile <immintrin.h> 1262/// 1263/// This intrinsic corresponds to the \c VPMINUB instruction. 1264/// 1265/// \param __a 1266/// A 256-bit integer vector. 1267/// \param __b 1268/// A 256-bit integer vector. 1269/// \returns A 256-bit integer vector containing the result. 
1270static __inline__ __m256i __DEFAULT_FN_ATTRS256 1271_mm256_min_epu8(__m256i __a, __m256i __b) 1272{ 1273 return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b); 1274} 1275 1276/// Compares the corresponding unsigned 16-bit integers in the two 256-bit 1277/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of 1278/// each pair in the corresponding element of the 256-bit result. 1279/// 1280/// \headerfile <immintrin.h> 1281/// 1282/// This intrinsic corresponds to the \c VPMINUW instruction. 1283/// 1284/// \param __a 1285/// A 256-bit vector of [16 x i16]. 1286/// \param __b 1287/// A 256-bit vector of [16 x i16]. 1288/// \returns A 256-bit vector of [16 x i16] containing the result. 1289static __inline__ __m256i __DEFAULT_FN_ATTRS256 1290_mm256_min_epu16(__m256i __a, __m256i __b) 1291{ 1292 return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b); 1293} 1294 1295/// Compares the corresponding unsigned 32-bit integers in the two 256-bit 1296/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of 1297/// each pair in the corresponding element of the 256-bit result. 1298/// 1299/// \headerfile <immintrin.h> 1300/// 1301/// This intrinsic corresponds to the \c VPMINUD instruction. 1302/// 1303/// \param __a 1304/// A 256-bit vector of [8 x i32]. 1305/// \param __b 1306/// A 256-bit vector of [8 x i32]. 1307/// \returns A 256-bit vector of [8 x i32] containing the result. 1308static __inline__ __m256i __DEFAULT_FN_ATTRS256 1309_mm256_min_epu32(__m256i __a, __m256i __b) 1310{ 1311 return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b); 1312} 1313 1314/// Creates a 32-bit integer mask from the most significant bit of each byte 1315/// in the 256-bit integer vector in \a __a and returns the result. 
1316/// 1317/// \code{.operation} 1318/// FOR i := 0 TO 31 1319/// j := i*8 1320/// result[i] := __a[j+7] 1321/// ENDFOR 1322/// \endcode 1323/// 1324/// \headerfile <immintrin.h> 1325/// 1326/// This intrinsic corresponds to the \c VPMOVMSKB instruction. 1327/// 1328/// \param __a 1329/// A 256-bit integer vector containing the source bytes. 1330/// \returns The 32-bit integer mask. 1331static __inline__ int __DEFAULT_FN_ATTRS256 1332_mm256_movemask_epi8(__m256i __a) 1333{ 1334 return __builtin_ia32_pmovmskb256((__v32qi)__a); 1335} 1336 1337/// Sign-extends bytes from the 128-bit integer vector in \a __V and returns 1338/// the 16-bit values in the corresponding elements of a 256-bit vector 1339/// of [16 x i16]. 1340/// 1341/// \code{.operation} 1342/// FOR i := 0 TO 15 1343/// j := i*8 1344/// k := i*16 1345/// result[k+15:k] := SignExtend(__V[j+7:j]) 1346/// ENDFOR 1347/// \endcode 1348/// 1349/// \headerfile <immintrin.h> 1350/// 1351/// This intrinsic corresponds to the \c VPMOVSXBW instruction. 1352/// 1353/// \param __V 1354/// A 128-bit integer vector containing the source bytes. 1355/// \returns A 256-bit vector of [16 x i16] containing the sign-extended 1356/// values. 1357static __inline__ __m256i __DEFAULT_FN_ATTRS256 1358_mm256_cvtepi8_epi16(__m128i __V) 1359{ 1360 /* This function always performs a signed extension, but __v16qi is a char 1361 which may be signed or unsigned, so use __v16qs. */ 1362 return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi); 1363} 1364 1365/// Sign-extends bytes from the lower half of the 128-bit integer vector in 1366/// \a __V and returns the 32-bit values in the corresponding elements of a 1367/// 256-bit vector of [8 x i32]. 
1368/// 1369/// \code{.operation} 1370/// FOR i := 0 TO 7 1371/// j := i*8 1372/// k := i*32 1373/// result[k+31:k] := SignExtend(__V[j+7:j]) 1374/// ENDFOR 1375/// \endcode 1376/// 1377/// \headerfile <immintrin.h> 1378/// 1379/// This intrinsic corresponds to the \c VPMOVSXBD instruction. 1380/// 1381/// \param __V 1382/// A 128-bit integer vector containing the source bytes. 1383/// \returns A 256-bit vector of [8 x i32] containing the sign-extended 1384/// values. 1385static __inline__ __m256i __DEFAULT_FN_ATTRS256 1386_mm256_cvtepi8_epi32(__m128i __V) 1387{ 1388 /* This function always performs a signed extension, but __v16qi is a char 1389 which may be signed or unsigned, so use __v16qs. */ 1390 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); 1391} 1392 1393/// Sign-extends the first four bytes from the 128-bit integer vector in 1394/// \a __V and returns the 64-bit values in the corresponding elements of a 1395/// 256-bit vector of [4 x i64]. 1396/// 1397/// \code{.operation} 1398/// result[63:0] := SignExtend(__V[7:0]) 1399/// result[127:64] := SignExtend(__V[15:8]) 1400/// result[191:128] := SignExtend(__V[23:16]) 1401/// result[255:192] := SignExtend(__V[31:24]) 1402/// \endcode 1403/// 1404/// \headerfile <immintrin.h> 1405/// 1406/// This intrinsic corresponds to the \c VPMOVSXBQ instruction. 1407/// 1408/// \param __V 1409/// A 128-bit integer vector containing the source bytes. 1410/// \returns A 256-bit vector of [4 x i64] containing the sign-extended 1411/// values. 1412static __inline__ __m256i __DEFAULT_FN_ATTRS256 1413_mm256_cvtepi8_epi64(__m128i __V) 1414{ 1415 /* This function always performs a signed extension, but __v16qi is a char 1416 which may be signed or unsigned, so use __v16qs. 
*/ 1417 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di); 1418} 1419 1420/// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in 1421/// \a __V and returns the 32-bit values in the corresponding elements of a 1422/// 256-bit vector of [8 x i32]. 1423/// 1424/// \code{.operation} 1425/// FOR i := 0 TO 7 1426/// j := i*16 1427/// k := i*32 1428/// result[k+31:k] := SignExtend(__V[j+15:j]) 1429/// ENDFOR 1430/// \endcode 1431/// 1432/// \headerfile <immintrin.h> 1433/// 1434/// This intrinsic corresponds to the \c VPMOVSXWD instruction. 1435/// 1436/// \param __V 1437/// A 128-bit vector of [8 x i16] containing the source values. 1438/// \returns A 256-bit vector of [8 x i32] containing the sign-extended 1439/// values. 1440static __inline__ __m256i __DEFAULT_FN_ATTRS256 1441_mm256_cvtepi16_epi32(__m128i __V) 1442{ 1443 return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si); 1444} 1445 1446/// Sign-extends 16-bit elements from the lower half of the 128-bit vector of 1447/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding 1448/// elements of a 256-bit vector of [4 x i64]. 1449/// 1450/// \code{.operation} 1451/// result[63:0] := SignExtend(__V[15:0]) 1452/// result[127:64] := SignExtend(__V[31:16]) 1453/// result[191:128] := SignExtend(__V[47:32]) 1454/// result[255:192] := SignExtend(__V[64:48]) 1455/// \endcode 1456/// 1457/// \headerfile <immintrin.h> 1458/// 1459/// This intrinsic corresponds to the \c VPMOVSXWQ instruction. 1460/// 1461/// \param __V 1462/// A 128-bit vector of [8 x i16] containing the source values. 1463/// \returns A 256-bit vector of [4 x i64] containing the sign-extended 1464/// values. 
1465static __inline__ __m256i __DEFAULT_FN_ATTRS256 1466_mm256_cvtepi16_epi64(__m128i __V) 1467{ 1468 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di); 1469} 1470 1471/// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in 1472/// \a __V and returns the 64-bit values in the corresponding elements of a 1473/// 256-bit vector of [4 x i64]. 1474/// 1475/// \code{.operation} 1476/// result[63:0] := SignExtend(__V[31:0]) 1477/// result[127:64] := SignExtend(__V[63:32]) 1478/// result[191:128] := SignExtend(__V[95:64]) 1479/// result[255:192] := SignExtend(__V[127:96]) 1480/// \endcode 1481/// 1482/// \headerfile <immintrin.h> 1483/// 1484/// This intrinsic corresponds to the \c VPMOVSXDQ instruction. 1485/// 1486/// \param __V 1487/// A 128-bit vector of [4 x i32] containing the source values. 1488/// \returns A 256-bit vector of [4 x i64] containing the sign-extended 1489/// values. 1490static __inline__ __m256i __DEFAULT_FN_ATTRS256 1491_mm256_cvtepi32_epi64(__m128i __V) 1492{ 1493 return (__m256i)__builtin_convertvector((__v4si)__V, __v4di); 1494} 1495 1496/// Zero-extends bytes from the 128-bit integer vector in \a __V and returns 1497/// the 16-bit values in the corresponding elements of a 256-bit vector 1498/// of [16 x i16]. 1499/// 1500/// \code{.operation} 1501/// FOR i := 0 TO 15 1502/// j := i*8 1503/// k := i*16 1504/// result[k+15:k] := ZeroExtend(__V[j+7:j]) 1505/// ENDFOR 1506/// \endcode 1507/// 1508/// \headerfile <immintrin.h> 1509/// 1510/// This intrinsic corresponds to the \c VPMOVZXBW instruction. 1511/// 1512/// \param __V 1513/// A 128-bit integer vector containing the source bytes. 1514/// \returns A 256-bit vector of [16 x i16] containing the zero-extended 1515/// values. 
1516static __inline__ __m256i __DEFAULT_FN_ATTRS256 1517_mm256_cvtepu8_epi16(__m128i __V) 1518{ 1519 return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi); 1520} 1521 1522/// Zero-extends bytes from the lower half of the 128-bit integer vector in 1523/// \a __V and returns the 32-bit values in the corresponding elements of a 1524/// 256-bit vector of [8 x i32]. 1525/// 1526/// \code{.operation} 1527/// FOR i := 0 TO 7 1528/// j := i*8 1529/// k := i*32 1530/// result[k+31:k] := ZeroExtend(__V[j+7:j]) 1531/// ENDFOR 1532/// \endcode 1533/// 1534/// \headerfile <immintrin.h> 1535/// 1536/// This intrinsic corresponds to the \c VPMOVZXBD instruction. 1537/// 1538/// \param __V 1539/// A 128-bit integer vector containing the source bytes. 1540/// \returns A 256-bit vector of [8 x i32] containing the zero-extended 1541/// values. 1542static __inline__ __m256i __DEFAULT_FN_ATTRS256 1543_mm256_cvtepu8_epi32(__m128i __V) 1544{ 1545 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); 1546} 1547 1548/// Zero-extends the first four bytes from the 128-bit integer vector in 1549/// \a __V and returns the 64-bit values in the corresponding elements of a 1550/// 256-bit vector of [4 x i64]. 1551/// 1552/// \code{.operation} 1553/// result[63:0] := ZeroExtend(__V[7:0]) 1554/// result[127:64] := ZeroExtend(__V[15:8]) 1555/// result[191:128] := ZeroExtend(__V[23:16]) 1556/// result[255:192] := ZeroExtend(__V[31:24]) 1557/// \endcode 1558/// 1559/// \headerfile <immintrin.h> 1560/// 1561/// This intrinsic corresponds to the \c VPMOVZXBQ instruction. 1562/// 1563/// \param __V 1564/// A 128-bit integer vector containing the source bytes. 1565/// \returns A 256-bit vector of [4 x i64] containing the zero-extended 1566/// values. 
1567static __inline__ __m256i __DEFAULT_FN_ATTRS256 1568_mm256_cvtepu8_epi64(__m128i __V) 1569{ 1570 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di); 1571} 1572 1573/// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in 1574/// \a __V and returns the 32-bit values in the corresponding elements of a 1575/// 256-bit vector of [8 x i32]. 1576/// 1577/// \code{.operation} 1578/// FOR i := 0 TO 7 1579/// j := i*16 1580/// k := i*32 1581/// result[k+31:k] := ZeroExtend(__V[j+15:j]) 1582/// ENDFOR 1583/// \endcode 1584/// 1585/// \headerfile <immintrin.h> 1586/// 1587/// This intrinsic corresponds to the \c VPMOVZXWD instruction. 1588/// 1589/// \param __V 1590/// A 128-bit vector of [8 x i16] containing the source values. 1591/// \returns A 256-bit vector of [8 x i32] containing the zero-extended 1592/// values. 1593static __inline__ __m256i __DEFAULT_FN_ATTRS256 1594_mm256_cvtepu16_epi32(__m128i __V) 1595{ 1596 return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si); 1597} 1598 1599/// Zero-extends 16-bit elements from the lower half of the 128-bit vector of 1600/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding 1601/// elements of a 256-bit vector of [4 x i64]. 1602/// 1603/// \code{.operation} 1604/// result[63:0] := ZeroExtend(__V[15:0]) 1605/// result[127:64] := ZeroExtend(__V[31:16]) 1606/// result[191:128] := ZeroExtend(__V[47:32]) 1607/// result[255:192] := ZeroExtend(__V[64:48]) 1608/// \endcode 1609/// 1610/// \headerfile <immintrin.h> 1611/// 1612/// This intrinsic corresponds to the \c VPMOVSXWQ instruction. 1613/// 1614/// \param __V 1615/// A 128-bit vector of [8 x i16] containing the source values. 1616/// \returns A 256-bit vector of [4 x i64] containing the zero-extended 1617/// values. 
1618static __inline__ __m256i __DEFAULT_FN_ATTRS256 1619_mm256_cvtepu16_epi64(__m128i __V) 1620{ 1621 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di); 1622} 1623 1624/// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in 1625/// \a __V and returns the 64-bit values in the corresponding elements of a 1626/// 256-bit vector of [4 x i64]. 1627/// 1628/// \code{.operation} 1629/// result[63:0] := ZeroExtend(__V[31:0]) 1630/// result[127:64] := ZeroExtend(__V[63:32]) 1631/// result[191:128] := ZeroExtend(__V[95:64]) 1632/// result[255:192] := ZeroExtend(__V[127:96]) 1633/// \endcode 1634/// 1635/// \headerfile <immintrin.h> 1636/// 1637/// This intrinsic corresponds to the \c VPMOVZXDQ instruction. 1638/// 1639/// \param __V 1640/// A 128-bit vector of [4 x i32] containing the source values. 1641/// \returns A 256-bit vector of [4 x i64] containing the zero-extended 1642/// values. 1643static __inline__ __m256i __DEFAULT_FN_ATTRS256 1644_mm256_cvtepu32_epi64(__m128i __V) 1645{ 1646 return (__m256i)__builtin_convertvector((__v4su)__V, __v4di); 1647} 1648 1649/// Multiplies signed 32-bit integers from even-numbered elements of two 1650/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the 1651/// [4 x i64] result. 1652/// 1653/// \code{.operation} 1654/// result[63:0] := __a[31:0] * __b[31:0] 1655/// result[127:64] := __a[95:64] * __b[95:64] 1656/// result[191:128] := __a[159:128] * __b[159:128] 1657/// result[255:192] := __a[223:192] * __b[223:192] 1658/// \endcode 1659/// 1660/// \headerfile <immintrin.h> 1661/// 1662/// This intrinsic corresponds to the \c VPMULDQ instruction. 1663/// 1664/// \param __a 1665/// A 256-bit vector of [8 x i32] containing one of the source operands. 1666/// \param __b 1667/// A 256-bit vector of [8 x i32] containing one of the source operands. 1668/// \returns A 256-bit vector of [4 x i64] containing the products. 
1669static __inline__ __m256i __DEFAULT_FN_ATTRS256 1670_mm256_mul_epi32(__m256i __a, __m256i __b) 1671{ 1672 return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b); 1673} 1674 1675/// Multiplies signed 16-bit integer elements of two 256-bit vectors of 1676/// [16 x i16], truncates the 32-bit results to the most significant 18 1677/// bits, rounds by adding 1, and returns bits [16:1] of each rounded 1678/// product in the [16 x i16] result. 1679/// 1680/// \code{.operation} 1681/// FOR i := 0 TO 15 1682/// j := i*16 1683/// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1 1684/// result[j+15:j] := temp[16:1] 1685/// \endcode 1686/// 1687/// \headerfile <immintrin.h> 1688/// 1689/// This intrinsic corresponds to the \c VPMULHRSW instruction. 1690/// 1691/// \param __a 1692/// A 256-bit vector of [16 x i16] containing one of the source operands. 1693/// \param __b 1694/// A 256-bit vector of [16 x i16] containing one of the source operands. 1695/// \returns A 256-bit vector of [16 x i16] containing the rounded products. 1696static __inline__ __m256i __DEFAULT_FN_ATTRS256 1697_mm256_mulhrs_epi16(__m256i __a, __m256i __b) 1698{ 1699 return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b); 1700} 1701 1702/// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of 1703/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the 1704/// [16 x i16] result. 1705/// 1706/// \headerfile <immintrin.h> 1707/// 1708/// This intrinsic corresponds to the \c VPMULHUW instruction. 1709/// 1710/// \param __a 1711/// A 256-bit vector of [16 x i16] containing one of the source operands. 1712/// \param __b 1713/// A 256-bit vector of [16 x i16] containing one of the source operands. 1714/// \returns A 256-bit vector of [16 x i16] containing the products. 
1715static __inline__ __m256i __DEFAULT_FN_ATTRS256 1716_mm256_mulhi_epu16(__m256i __a, __m256i __b) 1717{ 1718 return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b); 1719} 1720 1721/// Multiplies signed 16-bit integer elements of two 256-bit vectors of 1722/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the 1723/// [16 x i16] result. 1724/// 1725/// \headerfile <immintrin.h> 1726/// 1727/// This intrinsic corresponds to the \c VPMULHW instruction. 1728/// 1729/// \param __a 1730/// A 256-bit vector of [16 x i16] containing one of the source operands. 1731/// \param __b 1732/// A 256-bit vector of [16 x i16] containing one of the source operands. 1733/// \returns A 256-bit vector of [16 x i16] containing the products. 1734static __inline__ __m256i __DEFAULT_FN_ATTRS256 1735_mm256_mulhi_epi16(__m256i __a, __m256i __b) 1736{ 1737 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b); 1738} 1739 1740/// Multiplies signed 16-bit integer elements of two 256-bit vectors of 1741/// [16 x i16], and returns the lower 16 bits of each 32-bit product in the 1742/// [16 x i16] result. 1743/// 1744/// \headerfile <immintrin.h> 1745/// 1746/// This intrinsic corresponds to the \c VPMULLW instruction. 1747/// 1748/// \param __a 1749/// A 256-bit vector of [16 x i16] containing one of the source operands. 1750/// \param __b 1751/// A 256-bit vector of [16 x i16] containing one of the source operands. 1752/// \returns A 256-bit vector of [16 x i16] containing the products. 1753static __inline__ __m256i __DEFAULT_FN_ATTRS256 1754_mm256_mullo_epi16(__m256i __a, __m256i __b) 1755{ 1756 return (__m256i)((__v16hu)__a * (__v16hu)__b); 1757} 1758 1759/// Multiplies signed 32-bit integer elements of two 256-bit vectors of 1760/// [8 x i32], and returns the lower 32 bits of each 64-bit product in the 1761/// [8 x i32] result. 
1762/// 1763/// \headerfile <immintrin.h> 1764/// 1765/// This intrinsic corresponds to the \c VPMULLD instruction. 1766/// 1767/// \param __a 1768/// A 256-bit vector of [8 x i32] containing one of the source operands. 1769/// \param __b 1770/// A 256-bit vector of [8 x i32] containing one of the source operands. 1771/// \returns A 256-bit vector of [8 x i32] containing the products. 1772static __inline__ __m256i __DEFAULT_FN_ATTRS256 1773_mm256_mullo_epi32 (__m256i __a, __m256i __b) 1774{ 1775 return (__m256i)((__v8su)__a * (__v8su)__b); 1776} 1777 1778/// Multiplies unsigned 32-bit integers from even-numered elements of two 1779/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the 1780/// [4 x i64] result. 1781/// 1782/// \code{.operation} 1783/// result[63:0] := __a[31:0] * __b[31:0] 1784/// result[127:64] := __a[95:64] * __b[95:64] 1785/// result[191:128] := __a[159:128] * __b[159:128] 1786/// result[255:192] := __a[223:192] * __b[223:192] 1787/// \endcode 1788/// 1789/// \headerfile <immintrin.h> 1790/// 1791/// This intrinsic corresponds to the \c VPMULUDQ instruction. 1792/// 1793/// \param __a 1794/// A 256-bit vector of [8 x i32] containing one of the source operands. 1795/// \param __b 1796/// A 256-bit vector of [8 x i32] containing one of the source operands. 1797/// \returns A 256-bit vector of [4 x i64] containing the products. 1798static __inline__ __m256i __DEFAULT_FN_ATTRS256 1799_mm256_mul_epu32(__m256i __a, __m256i __b) 1800{ 1801 return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b); 1802} 1803 1804/// Computes the bitwise OR of the 256-bit integer vectors in \a __a and 1805/// \a __b. 1806/// 1807/// \headerfile <immintrin.h> 1808/// 1809/// This intrinsic corresponds to the \c VPOR instruction. 1810/// 1811/// \param __a 1812/// A 256-bit integer vector. 1813/// \param __b 1814/// A 256-bit integer vector. 1815/// \returns A 256-bit integer vector containing the result. 
1816static __inline__ __m256i __DEFAULT_FN_ATTRS256 1817_mm256_or_si256(__m256i __a, __m256i __b) 1818{ 1819 return (__m256i)((__v4du)__a | (__v4du)__b); 1820} 1821 1822/// Computes four sum of absolute difference (SAD) operations on sets of eight 1823/// unsigned 8-bit integers from the 256-bit integer vectors \a __a and 1824/// \a __b. 1825/// 1826/// One SAD result is computed for each set of eight bytes from \a __a and 1827/// eight bytes from \a __b. The zero-extended SAD value is returned in the 1828/// corresponding 64-bit element of the result. 1829/// 1830/// A single SAD operation takes the differences between the corresponding 1831/// bytes of \a __a and \a __b, takes the absolute value of each difference, 1832/// and sums these eight values to form one 16-bit result. This operation 1833/// is repeated four times with successive sets of eight bytes. 1834/// 1835/// \code{.operation} 1836/// FOR i := 0 TO 3 1837/// j := i*64 1838/// temp0 := ABS(__a[j+7:j] - __b[j+7:j]) 1839/// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8]) 1840/// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16]) 1841/// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24]) 1842/// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32]) 1843/// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40]) 1844/// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48]) 1845/// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56]) 1846/// result[j+15:j] := temp0 + temp1 + temp2 + temp3 + 1847/// temp4 + temp5 + temp6 + temp7 1848/// result[j+63:j+16] := 0 1849/// ENDFOR 1850/// \endcode 1851/// 1852/// \headerfile <immintrin.h> 1853/// 1854/// This intrinsic corresponds to the \c VPSADBW instruction. 1855/// 1856/// \param __a 1857/// A 256-bit integer vector. 1858/// \param __b 1859/// A 256-bit integer vector. 1860/// \returns A 256-bit integer vector containing the result. 
1861static __inline__ __m256i __DEFAULT_FN_ATTRS256 1862_mm256_sad_epu8(__m256i __a, __m256i __b) 1863{ 1864 return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b); 1865} 1866 1867/// Shuffles 8-bit integers in the 256-bit integer vector \a __a according 1868/// to control information in the 256-bit integer vector \a __b, and 1869/// returns the 256-bit result. In effect there are two separate 128-bit 1870/// shuffles in the lower and upper halves. 1871/// 1872/// \code{.operation} 1873/// FOR i := 0 TO 31 1874/// j := i*8 1875/// IF __b[j+7] == 1 1876/// result[j+7:j] := 0 1877/// ELSE 1878/// k := __b[j+3:j] * 8 1879/// IF i > 15 1880/// k := k + 128 1881/// FI 1882/// result[j+7:j] := __a[k+7:k] 1883/// FI 1884/// ENDFOR 1885/// \endcode 1886/// 1887/// \headerfile <immintrin.h> 1888/// 1889/// This intrinsic corresponds to the \c VPSHUFB instruction. 1890/// 1891/// \param __a 1892/// A 256-bit integer vector containing source values. 1893/// \param __b 1894/// A 256-bit integer vector containing control information to determine 1895/// what goes into the corresponding byte of the result. If bit 7 of the 1896/// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the 1897/// control byte specify the index (within the same 128-bit half) of \a __a 1898/// to copy to the result byte. 1899/// \returns A 256-bit integer vector containing the result. 1900static __inline__ __m256i __DEFAULT_FN_ATTRS256 1901_mm256_shuffle_epi8(__m256i __a, __m256i __b) 1902{ 1903 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b); 1904} 1905 1906/// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a 1907/// according to control information in the integer literal \a imm, and 1908/// returns the 256-bit result. In effect there are two parallel 128-bit 1909/// shuffles in the lower and upper halves. 
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   k := (imm >> i*2)[1:0] * 32
///   result[j+31:j] := a[k+31:k]
///   result[128+j+31:128+j] := a[128+k+31:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
///    forth.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_shuffle_epi32(a, imm) \
  ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))

/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit result. The upper 64 bits of each 128-bit half
///    are shuffled in parallel; the lower 64 bits of each 128-bit half are
///    copied from \a a unchanged.
///
/// \code{.operation}
/// result[63:0] := a[63:0]
/// result[191:128] := a[191:128]
/// FOR i := 0 TO 3
///   j := i * 16 + 64
///   k := (imm >> i*2)[1:0] * 16 + 64
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFHW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
///    result, \a imm[3:2] specifies the index for elements 5 and 13, and so
///    forth. Indexes are offset by 4 (so 0 means index 4, and so forth) and
///    select within the same 128-bit half as the result element.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflehi_epi16(a, imm) \
  ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))

/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit [16 x i16] result. The lower 64 bits of each
///    128-bit half are shuffled; the upper 64 bits of each 128-bit half are
///    copied from \a a unchanged.
///
/// \code{.operation}
/// result[127:64] := a[127:64]
/// result[255:192] := a[255:192]
/// FOR i := 0 TO 3
///   j := i * 16
///   k := (imm >> i*2)[1:0] * 16
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFLW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] to use as a source of data for the
///    result.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
///    result, \a imm[3:2] specifies the index for elements 1 and 9, and so
///    forth.
/// \returns A 256-bit vector of [16 x i16] containing the result.
2009#define _mm256_shufflelo_epi16(a, imm) \ 2010 ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))) 2011 2012/// Sets each byte of the result to the corresponding byte of the 256-bit 2013/// integer vector in \a __a, the negative of that byte, or zero, depending 2014/// on whether the corresponding byte of the 256-bit integer vector in 2015/// \a __b is greater than zero, less than zero, or equal to zero, 2016/// respectively. 2017/// 2018/// \headerfile <immintrin.h> 2019/// 2020/// This intrinsic corresponds to the \c VPSIGNB instruction. 2021/// 2022/// \param __a 2023/// A 256-bit integer vector. 2024/// \param __b 2025/// A 256-bit integer vector]. 2026/// \returns A 256-bit integer vector containing the result. 2027static __inline__ __m256i __DEFAULT_FN_ATTRS256 2028_mm256_sign_epi8(__m256i __a, __m256i __b) 2029{ 2030 return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b); 2031} 2032 2033/// Sets each element of the result to the corresponding element of the 2034/// 256-bit vector of [16 x i16] in \a __a, the negative of that element, 2035/// or zero, depending on whether the corresponding element of the 256-bit 2036/// vector of [16 x i16] in \a __b is greater than zero, less than zero, or 2037/// equal to zero, respectively. 2038/// 2039/// \headerfile <immintrin.h> 2040/// 2041/// This intrinsic corresponds to the \c VPSIGNW instruction. 2042/// 2043/// \param __a 2044/// A 256-bit vector of [16 x i16]. 2045/// \param __b 2046/// A 256-bit vector of [16 x i16]. 2047/// \returns A 256-bit vector of [16 x i16] containing the result. 
2048static __inline__ __m256i __DEFAULT_FN_ATTRS256 2049_mm256_sign_epi16(__m256i __a, __m256i __b) 2050{ 2051 return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b); 2052} 2053 2054/// Sets each element of the result to the corresponding element of the 2055/// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or 2056/// zero, depending on whether the corresponding element of the 256-bit 2057/// vector of [8 x i32] in \a __b is greater than zero, less than zero, or 2058/// equal to zero, respectively. 2059/// 2060/// \headerfile <immintrin.h> 2061/// 2062/// This intrinsic corresponds to the \c VPSIGND instruction. 2063/// 2064/// \param __a 2065/// A 256-bit vector of [8 x i32]. 2066/// \param __b 2067/// A 256-bit vector of [8 x i32]. 2068/// \returns A 256-bit vector of [8 x i32] containing the result. 2069static __inline__ __m256i __DEFAULT_FN_ATTRS256 2070_mm256_sign_epi32(__m256i __a, __m256i __b) 2071{ 2072 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b); 2073} 2074 2075/// Shifts each 128-bit half of the 256-bit integer vector \a a left by 2076/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm 2077/// is greater than 15, the returned result is all zeroes. 2078/// 2079/// \headerfile <immintrin.h> 2080/// 2081/// \code 2082/// __m256i _mm256_slli_si256(__m256i a, const int imm); 2083/// \endcode 2084/// 2085/// This intrinsic corresponds to the \c VPSLLDQ instruction. 2086/// 2087/// \param a 2088/// A 256-bit integer vector to be shifted. 2089/// \param imm 2090/// An unsigned immediate value specifying the shift count (in bytes). 2091/// \returns A 256-bit integer vector containing the result. 2092#define _mm256_slli_si256(a, imm) \ 2093 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) 2094 2095/// Shifts each 128-bit half of the 256-bit integer vector \a a left by 2096/// \a imm bytes, shifting in zero bytes, and returns the result. 
If \a imm 2097/// is greater than 15, the returned result is all zeroes. 2098/// 2099/// \headerfile <immintrin.h> 2100/// 2101/// \code 2102/// __m256i _mm256_bslli_epi128(__m256i a, const int imm); 2103/// \endcode 2104/// 2105/// This intrinsic corresponds to the \c VPSLLDQ instruction. 2106/// 2107/// \param a 2108/// A 256-bit integer vector to be shifted. 2109/// \param imm 2110/// An unsigned immediate value specifying the shift count (in bytes). 2111/// \returns A 256-bit integer vector containing the result. 2112#define _mm256_bslli_epi128(a, imm) \ 2113 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) 2114 2115/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2116/// left by \a __count bits, shifting in zero bits, and returns the result. 2117/// If \a __count is greater than 15, the returned result is all zeroes. 2118/// 2119/// \headerfile <immintrin.h> 2120/// 2121/// This intrinsic corresponds to the \c VPSLLW instruction. 2122/// 2123/// \param __a 2124/// A 256-bit vector of [16 x i16] to be shifted. 2125/// \param __count 2126/// An unsigned integer value specifying the shift count (in bits). 2127/// \returns A 256-bit vector of [16 x i16] containing the result. 2128static __inline__ __m256i __DEFAULT_FN_ATTRS256 2129_mm256_slli_epi16(__m256i __a, int __count) 2130{ 2131 return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count); 2132} 2133 2134/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2135/// left by the number of bits specified by the lower 64 bits of \a __count, 2136/// shifting in zero bits, and returns the result. If \a __count is greater 2137/// than 15, the returned result is all zeroes. 2138/// 2139/// \headerfile <immintrin.h> 2140/// 2141/// This intrinsic corresponds to the \c VPSLLW instruction. 2142/// 2143/// \param __a 2144/// A 256-bit vector of [16 x i16] to be shifted. 
2145/// \param __count 2146/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2147/// shift count (in bits). The upper element is ignored. 2148/// \returns A 256-bit vector of [16 x i16] containing the result. 2149static __inline__ __m256i __DEFAULT_FN_ATTRS256 2150_mm256_sll_epi16(__m256i __a, __m128i __count) 2151{ 2152 return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count); 2153} 2154 2155/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2156/// left by \a __count bits, shifting in zero bits, and returns the result. 2157/// If \a __count is greater than 31, the returned result is all zeroes. 2158/// 2159/// \headerfile <immintrin.h> 2160/// 2161/// This intrinsic corresponds to the \c VPSLLD instruction. 2162/// 2163/// \param __a 2164/// A 256-bit vector of [8 x i32] to be shifted. 2165/// \param __count 2166/// An unsigned integer value specifying the shift count (in bits). 2167/// \returns A 256-bit vector of [8 x i32] containing the result. 2168static __inline__ __m256i __DEFAULT_FN_ATTRS256 2169_mm256_slli_epi32(__m256i __a, int __count) 2170{ 2171 return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count); 2172} 2173 2174/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2175/// left by the number of bits given in the lower 64 bits of \a __count, 2176/// shifting in zero bits, and returns the result. If \a __count is greater 2177/// than 31, the returned result is all zeroes. 2178/// 2179/// \headerfile <immintrin.h> 2180/// 2181/// This intrinsic corresponds to the \c VPSLLD instruction. 2182/// 2183/// \param __a 2184/// A 256-bit vector of [8 x i32] to be shifted. 2185/// \param __count 2186/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2187/// shift count (in bits). The upper element is ignored. 2188/// \returns A 256-bit vector of [8 x i32] containing the result. 
2189static __inline__ __m256i __DEFAULT_FN_ATTRS256 2190_mm256_sll_epi32(__m256i __a, __m128i __count) 2191{ 2192 return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count); 2193} 2194 2195/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 2196/// left by \a __count bits, shifting in zero bits, and returns the result. 2197/// If \a __count is greater than 63, the returned result is all zeroes. 2198/// 2199/// \headerfile <immintrin.h> 2200/// 2201/// This intrinsic corresponds to the \c VPSLLQ instruction. 2202/// 2203/// \param __a 2204/// A 256-bit vector of [4 x i64] to be shifted. 2205/// \param __count 2206/// An unsigned integer value specifying the shift count (in bits). 2207/// \returns A 256-bit vector of [4 x i64] containing the result. 2208static __inline__ __m256i __DEFAULT_FN_ATTRS256 2209_mm256_slli_epi64(__m256i __a, int __count) 2210{ 2211 return __builtin_ia32_psllqi256((__v4di)__a, __count); 2212} 2213 2214/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 2215/// left by the number of bits given in the lower 64 bits of \a __count, 2216/// shifting in zero bits, and returns the result. If \a __count is greater 2217/// than 63, the returned result is all zeroes. 2218/// 2219/// \headerfile <immintrin.h> 2220/// 2221/// This intrinsic corresponds to the \c VPSLLQ instruction. 2222/// 2223/// \param __a 2224/// A 256-bit vector of [4 x i64] to be shifted. 2225/// \param __count 2226/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2227/// shift count (in bits). The upper element is ignored. 2228/// \returns A 256-bit vector of [4 x i64] containing the result. 
2229static __inline__ __m256i __DEFAULT_FN_ATTRS256 2230_mm256_sll_epi64(__m256i __a, __m128i __count) 2231{ 2232 return __builtin_ia32_psllq256((__v4di)__a, __count); 2233} 2234 2235/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2236/// right by \a __count bits, shifting in sign bits, and returns the result. 2237/// If \a __count is greater than 15, each element of the result is either 2238/// 0 or -1 according to the corresponding input sign bit. 2239/// 2240/// \headerfile <immintrin.h> 2241/// 2242/// This intrinsic corresponds to the \c VPSRAW instruction. 2243/// 2244/// \param __a 2245/// A 256-bit vector of [16 x i16] to be shifted. 2246/// \param __count 2247/// An unsigned integer value specifying the shift count (in bits). 2248/// \returns A 256-bit vector of [16 x i16] containing the result. 2249static __inline__ __m256i __DEFAULT_FN_ATTRS256 2250_mm256_srai_epi16(__m256i __a, int __count) 2251{ 2252 return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count); 2253} 2254 2255/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2256/// right by the number of bits given in the lower 64 bits of \a __count, 2257/// shifting in sign bits, and returns the result. If \a __count is greater 2258/// than 15, each element of the result is either 0 or -1 according to the 2259/// corresponding input sign bit. 2260/// 2261/// \headerfile <immintrin.h> 2262/// 2263/// This intrinsic corresponds to the \c VPSRAW instruction. 2264/// 2265/// \param __a 2266/// A 256-bit vector of [16 x i16] to be shifted. 2267/// \param __count 2268/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2269/// shift count (in bits). The upper element is ignored. 2270/// \returns A 256-bit vector of [16 x i16] containing the result. 
2271static __inline__ __m256i __DEFAULT_FN_ATTRS256 2272_mm256_sra_epi16(__m256i __a, __m128i __count) 2273{ 2274 return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count); 2275} 2276 2277/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2278/// right by \a __count bits, shifting in sign bits, and returns the result. 2279/// If \a __count is greater than 31, each element of the result is either 2280/// 0 or -1 according to the corresponding input sign bit. 2281/// 2282/// \headerfile <immintrin.h> 2283/// 2284/// This intrinsic corresponds to the \c VPSRAD instruction. 2285/// 2286/// \param __a 2287/// A 256-bit vector of [8 x i32] to be shifted. 2288/// \param __count 2289/// An unsigned integer value specifying the shift count (in bits). 2290/// \returns A 256-bit vector of [8 x i32] containing the result. 2291static __inline__ __m256i __DEFAULT_FN_ATTRS256 2292_mm256_srai_epi32(__m256i __a, int __count) 2293{ 2294 return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count); 2295} 2296 2297/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2298/// right by the number of bits given in the lower 64 bits of \a __count, 2299/// shifting in sign bits, and returns the result. If \a __count is greater 2300/// than 31, each element of the result is either 0 or -1 according to the 2301/// corresponding input sign bit. 2302/// 2303/// \headerfile <immintrin.h> 2304/// 2305/// This intrinsic corresponds to the \c VPSRAD instruction. 2306/// 2307/// \param __a 2308/// A 256-bit vector of [8 x i32] to be shifted. 2309/// \param __count 2310/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2311/// shift count (in bits). The upper element is ignored. 2312/// \returns A 256-bit vector of [8 x i32] containing the result. 
2313static __inline__ __m256i __DEFAULT_FN_ATTRS256 2314_mm256_sra_epi32(__m256i __a, __m128i __count) 2315{ 2316 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count); 2317} 2318 2319/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by 2320/// \a imm bytes, shifting in zero bytes, and returns the result. If 2321/// \a imm is greater than 15, the returned result is all zeroes. 2322/// 2323/// \headerfile <immintrin.h> 2324/// 2325/// \code 2326/// __m256i _mm256_srli_si256(__m256i a, const int imm); 2327/// \endcode 2328/// 2329/// This intrinsic corresponds to the \c VPSRLDQ instruction. 2330/// 2331/// \param a 2332/// A 256-bit integer vector to be shifted. 2333/// \param imm 2334/// An unsigned immediate value specifying the shift count (in bytes). 2335/// \returns A 256-bit integer vector containing the result. 2336#define _mm256_srli_si256(a, imm) \ 2337 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) 2338 2339/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by 2340/// \a imm bytes, shifting in zero bytes, and returns the result. If 2341/// \a imm is greater than 15, the returned result is all zeroes. 2342/// 2343/// \headerfile <immintrin.h> 2344/// 2345/// \code 2346/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm); 2347/// \endcode 2348/// 2349/// This intrinsic corresponds to the \c VPSRLDQ instruction. 2350/// 2351/// \param a 2352/// A 256-bit integer vector to be shifted. 2353/// \param imm 2354/// An unsigned immediate value specifying the shift count (in bytes). 2355/// \returns A 256-bit integer vector containing the result. 2356#define _mm256_bsrli_epi128(a, imm) \ 2357 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) 2358 2359/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2360/// right by \a __count bits, shifting in zero bits, and returns the result. 
2361/// If \a __count is greater than 15, the returned result is all zeroes. 2362/// 2363/// \headerfile <immintrin.h> 2364/// 2365/// This intrinsic corresponds to the \c VPSRLW instruction. 2366/// 2367/// \param __a 2368/// A 256-bit vector of [16 x i16] to be shifted. 2369/// \param __count 2370/// An unsigned integer value specifying the shift count (in bits). 2371/// \returns A 256-bit vector of [16 x i16] containing the result. 2372static __inline__ __m256i __DEFAULT_FN_ATTRS256 2373_mm256_srli_epi16(__m256i __a, int __count) 2374{ 2375 return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count); 2376} 2377 2378/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2379/// right by the number of bits given in the lower 64 bits of \a __count, 2380/// shifting in zero bits, and returns the result. If \a __count is greater 2381/// than 15, the returned result is all zeroes. 2382/// 2383/// \headerfile <immintrin.h> 2384/// 2385/// This intrinsic corresponds to the \c VPSRLW instruction. 2386/// 2387/// \param __a 2388/// A 256-bit vector of [16 x i16] to be shifted. 2389/// \param __count 2390/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2391/// shift count (in bits). The upper element is ignored. 2392/// \returns A 256-bit vector of [16 x i16] containing the result. 2393static __inline__ __m256i __DEFAULT_FN_ATTRS256 2394_mm256_srl_epi16(__m256i __a, __m128i __count) 2395{ 2396 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count); 2397} 2398 2399/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2400/// right by \a __count bits, shifting in zero bits, and returns the result. 2401/// If \a __count is greater than 31, the returned result is all zeroes. 2402/// 2403/// \headerfile <immintrin.h> 2404/// 2405/// This intrinsic corresponds to the \c VPSRLD instruction. 2406/// 2407/// \param __a 2408/// A 256-bit vector of [8 x i32] to be shifted. 
2409/// \param __count 2410/// An unsigned integer value specifying the shift count (in bits). 2411/// \returns A 256-bit vector of [8 x i32] containing the result. 2412static __inline__ __m256i __DEFAULT_FN_ATTRS256 2413_mm256_srli_epi32(__m256i __a, int __count) 2414{ 2415 return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count); 2416} 2417 2418/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2419/// right by the number of bits given in the lower 64 bits of \a __count, 2420/// shifting in zero bits, and returns the result. If \a __count is greater 2421/// than 31, the returned result is all zeroes. 2422/// 2423/// \headerfile <immintrin.h> 2424/// 2425/// This intrinsic corresponds to the \c VPSRLD instruction. 2426/// 2427/// \param __a 2428/// A 256-bit vector of [8 x i32] to be shifted. 2429/// \param __count 2430/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2431/// shift count (in bits). The upper element is ignored. 2432/// \returns A 256-bit vector of [8 x i32] containing the result. 2433static __inline__ __m256i __DEFAULT_FN_ATTRS256 2434_mm256_srl_epi32(__m256i __a, __m128i __count) 2435{ 2436 return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count); 2437} 2438 2439/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 2440/// right by \a __count bits, shifting in zero bits, and returns the result. 2441/// If \a __count is greater than 63, the returned result is all zeroes. 2442/// 2443/// \headerfile <immintrin.h> 2444/// 2445/// This intrinsic corresponds to the \c VPSRLQ instruction. 2446/// 2447/// \param __a 2448/// A 256-bit vector of [4 x i64] to be shifted. 2449/// \param __count 2450/// An unsigned integer value specifying the shift count (in bits). 2451/// \returns A 256-bit vector of [4 x i64] containing the result. 
2452static __inline__ __m256i __DEFAULT_FN_ATTRS256 2453_mm256_srli_epi64(__m256i __a, int __count) 2454{ 2455 return __builtin_ia32_psrlqi256((__v4di)__a, __count); 2456} 2457 2458/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 2459/// right by the number of bits given in the lower 64 bits of \a __count, 2460/// shifting in zero bits, and returns the result. If \a __count is greater 2461/// than 63, the returned result is all zeroes. 2462/// 2463/// \headerfile <immintrin.h> 2464/// 2465/// This intrinsic corresponds to the \c VPSRLQ instruction. 2466/// 2467/// \param __a 2468/// A 256-bit vector of [4 x i64] to be shifted. 2469/// \param __count 2470/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2471/// shift count (in bits). The upper element is ignored. 2472/// \returns A 256-bit vector of [4 x i64] containing the result. 2473static __inline__ __m256i __DEFAULT_FN_ATTRS256 2474_mm256_srl_epi64(__m256i __a, __m128i __count) 2475{ 2476 return __builtin_ia32_psrlq256((__v4di)__a, __count); 2477} 2478 2479/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer 2480/// vectors. Returns the lower 8 bits of each difference in the 2481/// corresponding byte of the 256-bit integer vector result (overflow is 2482/// ignored). 2483/// 2484/// \code{.operation} 2485/// FOR i := 0 TO 31 2486/// j := i*8 2487/// result[j+7:j] := __a[j+7:j] - __b[j+7:j] 2488/// ENDFOR 2489/// \endcode 2490/// 2491/// \headerfile <immintrin.h> 2492/// 2493/// This intrinsic corresponds to the \c VPSUBB instruction. 2494/// 2495/// \param __a 2496/// A 256-bit integer vector containing the minuends. 2497/// \param __b 2498/// A 256-bit integer vector containing the subtrahends. 2499/// \returns A 256-bit integer vector containing the differences. 
2500static __inline__ __m256i __DEFAULT_FN_ATTRS256 2501_mm256_sub_epi8(__m256i __a, __m256i __b) 2502{ 2503 return (__m256i)((__v32qu)__a - (__v32qu)__b); 2504} 2505 2506/// Subtracts 16-bit integers from corresponding elements of two 256-bit 2507/// vectors of [16 x i16]. Returns the lower 16 bits of each difference in 2508/// the corresponding element of the [16 x i16] result (overflow is 2509/// ignored). 2510/// 2511/// \code{.operation} 2512/// FOR i := 0 TO 15 2513/// j := i*16 2514/// result[j+15:j] := __a[j+15:j] - __b[j+15:j] 2515/// ENDFOR 2516/// \endcode 2517/// 2518/// \headerfile <immintrin.h> 2519/// 2520/// This intrinsic corresponds to the \c VPSUBW instruction. 2521/// 2522/// \param __a 2523/// A 256-bit vector of [16 x i16] containing the minuends. 2524/// \param __b 2525/// A 256-bit vector of [16 x i16] containing the subtrahends. 2526/// \returns A 256-bit vector of [16 x i16] containing the differences. 2527static __inline__ __m256i __DEFAULT_FN_ATTRS256 2528_mm256_sub_epi16(__m256i __a, __m256i __b) 2529{ 2530 return (__m256i)((__v16hu)__a - (__v16hu)__b); 2531} 2532 2533/// Subtracts 32-bit integers from corresponding elements of two 256-bit 2534/// vectors of [8 x i32]. Returns the lower 32 bits of each difference in 2535/// the corresponding element of the [8 x i32] result (overflow is ignored). 2536/// 2537/// \code{.operation} 2538/// FOR i := 0 TO 7 2539/// j := i*32 2540/// result[j+31:j] := __a[j+31:j] - __b[j+31:j] 2541/// ENDFOR 2542/// \endcode 2543/// 2544/// \headerfile <immintrin.h> 2545/// 2546/// This intrinsic corresponds to the \c VPSUBD instruction. 2547/// 2548/// \param __a 2549/// A 256-bit vector of [8 x i32] containing the minuends. 2550/// \param __b 2551/// A 256-bit vector of [8 x i32] containing the subtrahends. 2552/// \returns A 256-bit vector of [8 x i32] containing the differences. 
2553static __inline__ __m256i __DEFAULT_FN_ATTRS256 2554_mm256_sub_epi32(__m256i __a, __m256i __b) 2555{ 2556 return (__m256i)((__v8su)__a - (__v8su)__b); 2557} 2558 2559/// Subtracts 64-bit integers from corresponding elements of two 256-bit 2560/// vectors of [4 x i64]. Returns the lower 64 bits of each difference in 2561/// the corresponding element of the [4 x i64] result (overflow is ignored). 2562/// 2563/// \code{.operation} 2564/// FOR i := 0 TO 3 2565/// j := i*64 2566/// result[j+63:j] := __a[j+63:j] - __b[j+63:j] 2567/// ENDFOR 2568/// \endcode 2569/// 2570/// \headerfile <immintrin.h> 2571/// 2572/// This intrinsic corresponds to the \c VPSUBQ instruction. 2573/// 2574/// \param __a 2575/// A 256-bit vector of [4 x i64] containing the minuends. 2576/// \param __b 2577/// A 256-bit vector of [4 x i64] containing the subtrahends. 2578/// \returns A 256-bit vector of [4 x i64] containing the differences. 2579static __inline__ __m256i __DEFAULT_FN_ATTRS256 2580_mm256_sub_epi64(__m256i __a, __m256i __b) 2581{ 2582 return (__m256i)((__v4du)__a - (__v4du)__b); 2583} 2584 2585/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer 2586/// vectors using signed saturation, and returns each differences in the 2587/// corresponding byte of the 256-bit integer vector result. 2588/// 2589/// \code{.operation} 2590/// FOR i := 0 TO 31 2591/// j := i*8 2592/// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j]) 2593/// ENDFOR 2594/// \endcode 2595/// 2596/// \headerfile <immintrin.h> 2597/// 2598/// This intrinsic corresponds to the \c VPSUBSB instruction. 2599/// 2600/// \param __a 2601/// A 256-bit integer vector containing the minuends. 2602/// \param __b 2603/// A 256-bit integer vector containing the subtrahends. 2604/// \returns A 256-bit integer vector containing the differences. 
2605static __inline__ __m256i __DEFAULT_FN_ATTRS256 2606_mm256_subs_epi8(__m256i __a, __m256i __b) 2607{ 2608 return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b); 2609} 2610 2611/// Subtracts 16-bit integers from corresponding elements of two 256-bit 2612/// vectors of [16 x i16] using signed saturation, and returns each 2613/// difference in the corresponding element of the [16 x i16] result. 2614/// 2615/// \code{.operation} 2616/// FOR i := 0 TO 15 2617/// j := i*16 2618/// result[j+7:j] := SATURATE16(__a[j+7:j] - __b[j+7:j]) 2619/// ENDFOR 2620/// \endcode 2621/// 2622/// \headerfile <immintrin.h> 2623/// 2624/// This intrinsic corresponds to the \c VPSUBSW instruction. 2625/// 2626/// \param __a 2627/// A 256-bit vector of [16 x i16] containing the minuends. 2628/// \param __b 2629/// A 256-bit vector of [16 x i16] containing the subtrahends. 2630/// \returns A 256-bit vector of [16 x i16] containing the differences. 2631static __inline__ __m256i __DEFAULT_FN_ATTRS256 2632_mm256_subs_epi16(__m256i __a, __m256i __b) 2633{ 2634 return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b); 2635} 2636 2637/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer 2638/// vectors using unsigned saturation, and returns each difference in the 2639/// corresponding byte of the 256-bit integer vector result. For each byte, 2640/// computes <c> result = __a - __b </c>. 2641/// 2642/// \code{.operation} 2643/// FOR i := 0 TO 31 2644/// j := i*8 2645/// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j]) 2646/// ENDFOR 2647/// \endcode 2648/// 2649/// \headerfile <immintrin.h> 2650/// 2651/// This intrinsic corresponds to the \c VPSUBUSB instruction. 2652/// 2653/// \param __a 2654/// A 256-bit integer vector containing the minuends. 2655/// \param __b 2656/// A 256-bit integer vector containing the subtrahends. 2657/// \returns A 256-bit integer vector containing the differences. 
2658static __inline__ __m256i __DEFAULT_FN_ATTRS256 2659_mm256_subs_epu8(__m256i __a, __m256i __b) 2660{ 2661 return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b); 2662} 2663 2664/// Subtracts 16-bit integers from corresponding elements of two 256-bit 2665/// vectors of [16 x i16] using unsigned saturation, and returns each 2666/// difference in the corresponding element of the [16 x i16] result. 2667/// 2668/// \code{.operation} 2669/// FOR i := 0 TO 15 2670/// j := i*16 2671/// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j]) 2672/// ENDFOR 2673/// \endcode 2674/// 2675/// \headerfile <immintrin.h> 2676/// 2677/// This intrinsic corresponds to the \c VPSUBUSW instruction. 2678/// 2679/// \param __a 2680/// A 256-bit vector of [16 x i16] containing the minuends. 2681/// \param __b 2682/// A 256-bit vector of [16 x i16] containing the subtrahends. 2683/// \returns A 256-bit vector of [16 x i16] containing the differences. 2684static __inline__ __m256i __DEFAULT_FN_ATTRS256 2685_mm256_subs_epu16(__m256i __a, __m256i __b) 2686{ 2687 return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b); 2688} 2689 2690/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer 2691/// vectors in \a __a and \a __b to form the 256-bit result. Specifically, 2692/// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as 2693/// input; other bits in these parameters are ignored. 2694/// 2695/// \code{.operation} 2696/// result[7:0] := __a[71:64] 2697/// result[15:8] := __b[71:64] 2698/// result[23:16] := __a[79:72] 2699/// result[31:24] := __b[79:72] 2700/// . . . 2701/// result[127:120] := __b[127:120] 2702/// result[135:128] := __a[199:192] 2703/// . . . 2704/// result[255:248] := __b[255:248] 2705/// \endcode 2706/// 2707/// \headerfile <immintrin.h> 2708/// 2709/// This intrinsic corresponds to the \c VPUNPCKHBW instruction. 
2710/// 2711/// \param __a 2712/// A 256-bit integer vector used as the source for the even-numbered bytes 2713/// of the result. 2714/// \param __b 2715/// A 256-bit integer vector used as the source for the odd-numbered bytes 2716/// of the result. 2717/// \returns A 256-bit integer vector containing the result. 2718static __inline__ __m256i __DEFAULT_FN_ATTRS256 2719_mm256_unpackhi_epi8(__m256i __a, __m256i __b) 2720{ 2721 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31); 2722} 2723 2724/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors 2725/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit 2726/// vector of [16 x i16]. Specifically, uses the upper 64 bits of each 2727/// 128-bit half of \a __a and \a __b as input; other bits in these 2728/// parameters are ignored. 2729/// 2730/// \code{.operation} 2731/// result[15:0] := __a[79:64] 2732/// result[31:16] := __b[79:64] 2733/// result[47:32] := __a[95:80] 2734/// result[63:48] := __b[95:80] 2735/// . . . 2736/// result[127:112] := __b[127:112] 2737/// result[143:128] := __a[211:196] 2738/// . . . 2739/// result[255:240] := __b[255:240] 2740/// \endcode 2741/// 2742/// \headerfile <immintrin.h> 2743/// 2744/// This intrinsic corresponds to the \c VPUNPCKHWD instruction. 2745/// 2746/// \param __a 2747/// A 256-bit vector of [16 x i16] used as the source for the even-numbered 2748/// elements of the result. 2749/// \param __b 2750/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered 2751/// elements of the result. 2752/// \returns A 256-bit vector of [16 x i16] containing the result. 
2753static __inline__ __m256i __DEFAULT_FN_ATTRS256 2754_mm256_unpackhi_epi16(__m256i __a, __m256i __b) 2755{ 2756 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 2757} 2758 2759/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors 2760/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector 2761/// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half 2762/// of \a __a and \a __b as input; other bits in these parameters are 2763/// ignored. 2764/// 2765/// \code{.operation} 2766/// result[31:0] := __a[95:64] 2767/// result[63:32] := __b[95:64] 2768/// result[95:64] := __a[127:96] 2769/// result[127:96] := __b[127:96] 2770/// result[159:128] := __a[223:192] 2771/// result[191:160] := __b[223:192] 2772/// result[223:192] := __a[255:224] 2773/// result[255:224] := __b[255:224] 2774/// \endcode 2775/// 2776/// \headerfile <immintrin.h> 2777/// 2778/// This intrinsic corresponds to the \c VPUNPCKHDQ instruction. 2779/// 2780/// \param __a 2781/// A 256-bit vector of [8 x i32] used as the source for the even-numbered 2782/// elements of the result. 2783/// \param __b 2784/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered 2785/// elements of the result. 2786/// \returns A 256-bit vector of [8 x i32] containing the result. 2787static __inline__ __m256i __DEFAULT_FN_ATTRS256 2788_mm256_unpackhi_epi32(__m256i __a, __m256i __b) 2789{ 2790 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7); 2791} 2792 2793/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors 2794/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector 2795/// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half 2796/// of \a __a and \a __b as input; other bits in these parameters are 2797/// ignored. 
2798/// 2799/// \code{.operation} 2800/// result[63:0] := __a[127:64] 2801/// result[127:64] := __b[127:64] 2802/// result[191:128] := __a[255:192] 2803/// result[255:192] := __b[255:192] 2804/// \endcode 2805/// 2806/// \headerfile <immintrin.h> 2807/// 2808/// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction. 2809/// 2810/// \param __a 2811/// A 256-bit vector of [4 x i64] used as the source for the even-numbered 2812/// elements of the result. 2813/// \param __b 2814/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered 2815/// elements of the result. 2816/// \returns A 256-bit vector of [4 x i64] containing the result. 2817static __inline__ __m256i __DEFAULT_FN_ATTRS256 2818_mm256_unpackhi_epi64(__m256i __a, __m256i __b) 2819{ 2820 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3); 2821} 2822 2823/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer 2824/// vectors in \a __a and \a __b to form the 256-bit result. Specifically, 2825/// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as 2826/// input; other bits in these parameters are ignored. 2827/// 2828/// \code{.operation} 2829/// result[7:0] := __a[7:0] 2830/// result[15:8] := __b[7:0] 2831/// result[23:16] := __a[15:8] 2832/// result[31:24] := __b[15:8] 2833/// . . . 2834/// result[127:120] := __b[63:56] 2835/// result[135:128] := __a[135:128] 2836/// . . . 2837/// result[255:248] := __b[191:184] 2838/// \endcode 2839/// 2840/// \headerfile <immintrin.h> 2841/// 2842/// This intrinsic corresponds to the \c VPUNPCKLBW instruction. 2843/// 2844/// \param __a 2845/// A 256-bit integer vector used as the source for the even-numbered bytes 2846/// of the result. 2847/// \param __b 2848/// A 256-bit integer vector used as the source for the odd-numbered bytes 2849/// of the result. 2850/// \returns A 256-bit integer vector containing the result. 
2851static __inline__ __m256i __DEFAULT_FN_ATTRS256 2852_mm256_unpacklo_epi8(__m256i __a, __m256i __b) 2853{ 2854 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23); 2855} 2856 2857/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors 2858/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit 2859/// vector of [16 x i16]. Specifically, uses the lower 64 bits of each 2860/// 128-bit half of \a __a and \a __b as input; other bits in these 2861/// parameters are ignored. 2862/// 2863/// \code{.operation} 2864/// result[15:0] := __a[15:0] 2865/// result[31:16] := __b[15:0] 2866/// result[47:32] := __a[31:16] 2867/// result[63:48] := __b[31:16] 2868/// . . . 2869/// result[127:112] := __b[63:48] 2870/// result[143:128] := __a[143:128] 2871/// . . . 2872/// result[255:239] := __b[191:176] 2873/// \endcode 2874/// 2875/// \headerfile <immintrin.h> 2876/// 2877/// This intrinsic corresponds to the \c VPUNPCKLWD instruction. 2878/// 2879/// \param __a 2880/// A 256-bit vector of [16 x i16] used as the source for the even-numbered 2881/// elements of the result. 2882/// \param __b 2883/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered 2884/// elements of the result. 2885/// \returns A 256-bit vector of [16 x i16] containing the result. 2886static __inline__ __m256i __DEFAULT_FN_ATTRS256 2887_mm256_unpacklo_epi16(__m256i __a, __m256i __b) 2888{ 2889 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11); 2890} 2891 2892/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors 2893/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector 2894/// of [8 x i32]. 
Specifically, uses the lower 64 bits of each 128-bit half 2895/// of \a __a and \a __b as input; other bits in these parameters are 2896/// ignored. 2897/// 2898/// \code{.operation} 2899/// result[31:0] := __a[31:0] 2900/// result[63:32] := __b[31:0] 2901/// result[95:64] := __a[63:32] 2902/// result[127:96] := __b[63:32] 2903/// result[159:128] := __a[159:128] 2904/// result[191:160] := __b[159:128] 2905/// result[223:192] := __a[191:160] 2906/// result[255:224] := __b[191:190] 2907/// \endcode 2908/// 2909/// \headerfile <immintrin.h> 2910/// 2911/// This intrinsic corresponds to the \c VPUNPCKLDQ instruction. 2912/// 2913/// \param __a 2914/// A 256-bit vector of [8 x i32] used as the source for the even-numbered 2915/// elements of the result. 2916/// \param __b 2917/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered 2918/// elements of the result. 2919/// \returns A 256-bit vector of [8 x i32] containing the result. 2920static __inline__ __m256i __DEFAULT_FN_ATTRS256 2921_mm256_unpacklo_epi32(__m256i __a, __m256i __b) 2922{ 2923 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5); 2924} 2925 2926/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors 2927/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector 2928/// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half 2929/// of \a __a and \a __b as input; other bits in these parameters are 2930/// ignored. 2931/// 2932/// \code{.operation} 2933/// result[63:0] := __a[63:0] 2934/// result[127:64] := __b[63:0] 2935/// result[191:128] := __a[191:128] 2936/// result[255:192] := __b[191:128] 2937/// \endcode 2938/// 2939/// \headerfile <immintrin.h> 2940/// 2941/// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction. 2942/// 2943/// \param __a 2944/// A 256-bit vector of [4 x i64] used as the source for the even-numbered 2945/// elements of the result. 
2946/// \param __b 2947/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered 2948/// elements of the result. 2949/// \returns A 256-bit vector of [4 x i64] containing the result. 2950static __inline__ __m256i __DEFAULT_FN_ATTRS256 2951_mm256_unpacklo_epi64(__m256i __a, __m256i __b) 2952{ 2953 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2); 2954} 2955 2956/// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and 2957/// \a __b. 2958/// 2959/// \headerfile <immintrin.h> 2960/// 2961/// This intrinsic corresponds to the \c VPXOR instruction. 2962/// 2963/// \param __a 2964/// A 256-bit integer vector. 2965/// \param __b 2966/// A 256-bit integer vector. 2967/// \returns A 256-bit integer vector containing the result. 2968static __inline__ __m256i __DEFAULT_FN_ATTRS256 2969_mm256_xor_si256(__m256i __a, __m256i __b) 2970{ 2971 return (__m256i)((__v4du)__a ^ (__v4du)__b); 2972} 2973 2974/// Loads the 256-bit integer vector from memory \a __V using a non-temporal 2975/// memory hint and returns the vector. \a __V must be aligned on a 32-byte 2976/// boundary. 2977/// 2978/// \headerfile <immintrin.h> 2979/// 2980/// This intrinsic corresponds to the \c VMOVNTDQA instruction. 2981/// 2982/// \param __V 2983/// A pointer to the 32-byte aligned memory containing the vector to load. 2984/// \returns A 256-bit integer vector loaded from memory. 2985static __inline__ __m256i __DEFAULT_FN_ATTRS256 2986_mm256_stream_load_si256(const void *__V) 2987{ 2988 typedef __v4di __v4di_aligned __attribute__((aligned(32))); 2989 return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V); 2990} 2991 2992/// Broadcasts the 32-bit floating-point value from the low element of the 2993/// 128-bit vector of [4 x float] in \a __X to all elements of the result's 2994/// 128-bit vector of [4 x float]. 
2995/// 2996/// \headerfile <immintrin.h> 2997/// 2998/// This intrinsic corresponds to the \c VBROADCASTSS instruction. 2999/// 3000/// \param __X 3001/// A 128-bit vector of [4 x float] whose low element will be broadcast. 3002/// \returns A 128-bit vector of [4 x float] containing the result. 3003static __inline__ __m128 __DEFAULT_FN_ATTRS128 3004_mm_broadcastss_ps(__m128 __X) 3005{ 3006 return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0); 3007} 3008 3009/// Broadcasts the 64-bit floating-point value from the low element of the 3010/// 128-bit vector of [2 x double] in \a __a to both elements of the 3011/// result's 128-bit vector of [2 x double]. 3012/// 3013/// \headerfile <immintrin.h> 3014/// 3015/// This intrinsic corresponds to the \c MOVDDUP instruction. 3016/// 3017/// \param __a 3018/// A 128-bit vector of [2 x double] whose low element will be broadcast. 3019/// \returns A 128-bit vector of [2 x double] containing the result. 3020static __inline__ __m128d __DEFAULT_FN_ATTRS128 3021_mm_broadcastsd_pd(__m128d __a) 3022{ 3023 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 3024} 3025 3026/// Broadcasts the 32-bit floating-point value from the low element of the 3027/// 128-bit vector of [4 x float] in \a __X to all elements of the 3028/// result's 256-bit vector of [8 x float]. 3029/// 3030/// \headerfile <immintrin.h> 3031/// 3032/// This intrinsic corresponds to the \c VBROADCASTSS instruction. 3033/// 3034/// \param __X 3035/// A 128-bit vector of [4 x float] whose low element will be broadcast. 3036/// \returns A 256-bit vector of [8 x float] containing the result. 
3037static __inline__ __m256 __DEFAULT_FN_ATTRS256 3038_mm256_broadcastss_ps(__m128 __X) 3039{ 3040 return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0); 3041} 3042 3043/// Broadcasts the 64-bit floating-point value from the low element of the 3044/// 128-bit vector of [2 x double] in \a __X to all elements of the 3045/// result's 256-bit vector of [4 x double]. 3046/// 3047/// \headerfile <immintrin.h> 3048/// 3049/// This intrinsic corresponds to the \c VBROADCASTSD instruction. 3050/// 3051/// \param __X 3052/// A 128-bit vector of [2 x double] whose low element will be broadcast. 3053/// \returns A 256-bit vector of [4 x double] containing the result. 3054static __inline__ __m256d __DEFAULT_FN_ATTRS256 3055_mm256_broadcastsd_pd(__m128d __X) 3056{ 3057 return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0); 3058} 3059 3060/// Broadcasts the 128-bit integer data from \a __X to both the lower and 3061/// upper halves of the 256-bit result. 3062/// 3063/// \headerfile <immintrin.h> 3064/// 3065/// This intrinsic corresponds to the \c VBROADCASTI128 instruction. 3066/// 3067/// \param __X 3068/// A 128-bit integer vector to be broadcast. 3069/// \returns A 256-bit integer vector containing the result. 3070static __inline__ __m256i __DEFAULT_FN_ATTRS256 3071_mm256_broadcastsi128_si256(__m128i __X) 3072{ 3073 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1); 3074} 3075 3076#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X) 3077 3078/// Merges 32-bit integer elements from either of the two 128-bit vectors of 3079/// [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32], 3080/// as specified by the immediate integer operand \a M. 
3081/// 3082/// \code{.operation} 3083/// FOR i := 0 TO 3 3084/// j := i*32 3085/// IF M[i] == 0 3086/// result[31+j:j] := V1[31+j:j] 3087/// ELSE 3088/// result[31+j:j] := V2[32+j:j] 3089/// FI 3090/// ENDFOR 3091/// \endcode 3092/// 3093/// \headerfile <immintrin.h> 3094/// 3095/// \code 3096/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M); 3097/// \endcode 3098/// 3099/// This intrinsic corresponds to the \c VPBLENDDD instruction. 3100/// 3101/// \param V1 3102/// A 128-bit vector of [4 x i32] containing source values. 3103/// \param V2 3104/// A 128-bit vector of [4 x i32] containing source values. 3105/// \param M 3106/// An immediate 8-bit integer operand, with bits [3:0] specifying the 3107/// source for each element of the result. The position of the mask bit 3108/// corresponds to the index of a copied value. When a mask bit is 0, the 3109/// element is copied from \a V1; otherwise, it is copied from \a V2. 3110/// \returns A 128-bit vector of [4 x i32] containing the result. 3111#define _mm_blend_epi32(V1, V2, M) \ 3112 ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \ 3113 (__v4si)(__m128i)(V2), (int)(M))) 3114 3115/// Merges 32-bit integer elements from either of the two 256-bit vectors of 3116/// [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32], 3117/// as specified by the immediate integer operand \a M. 3118/// 3119/// \code{.operation} 3120/// FOR i := 0 TO 7 3121/// j := i*32 3122/// IF M[i] == 0 3123/// result[31+j:j] := V1[31+j:j] 3124/// ELSE 3125/// result[31+j:j] := V2[32+j:j] 3126/// FI 3127/// ENDFOR 3128/// \endcode 3129/// 3130/// \headerfile <immintrin.h> 3131/// 3132/// \code 3133/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M); 3134/// \endcode 3135/// 3136/// This intrinsic corresponds to the \c VPBLENDDD instruction. 3137/// 3138/// \param V1 3139/// A 256-bit vector of [8 x i32] containing source values. 
3140/// \param V2 3141/// A 256-bit vector of [8 x i32] containing source values. 3142/// \param M 3143/// An immediate 8-bit integer operand, with bits [7:0] specifying the 3144/// source for each element of the result. The position of the mask bit 3145/// corresponds to the index of a copied value. When a mask bit is 0, the 3146/// element is copied from \a V1; otherwise, it is is copied from \a V2. 3147/// \returns A 256-bit vector of [8 x i32] containing the result. 3148#define _mm256_blend_epi32(V1, V2, M) \ 3149 ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \ 3150 (__v8si)(__m256i)(V2), (int)(M))) 3151 3152/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all 3153/// bytes of the 256-bit result. 3154/// 3155/// \headerfile <immintrin.h> 3156/// 3157/// This intrinsic corresponds to the \c VPBROADCASTB instruction. 3158/// 3159/// \param __X 3160/// A 128-bit integer vector whose low byte will be broadcast. 3161/// \returns A 256-bit integer vector containing the result. 3162static __inline__ __m256i __DEFAULT_FN_ATTRS256 3163_mm256_broadcastb_epi8(__m128i __X) 3164{ 3165 return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 3166} 3167 3168/// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X 3169/// to all elements of the result's 256-bit vector of [16 x i16]. 3170/// 3171/// \headerfile <immintrin.h> 3172/// 3173/// This intrinsic corresponds to the \c VPBROADCASTW instruction. 3174/// 3175/// \param __X 3176/// A 128-bit vector of [8 x i16] whose low element will be broadcast. 3177/// \returns A 256-bit vector of [16 x i16] containing the result. 
3178static __inline__ __m256i __DEFAULT_FN_ATTRS256 3179_mm256_broadcastw_epi16(__m128i __X) 3180{ 3181 return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 3182} 3183 3184/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X 3185/// to all elements of the result's 256-bit vector of [8 x i32]. 3186/// 3187/// \headerfile <immintrin.h> 3188/// 3189/// This intrinsic corresponds to the \c VPBROADCASTD instruction. 3190/// 3191/// \param __X 3192/// A 128-bit vector of [4 x i32] whose low element will be broadcast. 3193/// \returns A 256-bit vector of [8 x i32] containing the result. 3194static __inline__ __m256i __DEFAULT_FN_ATTRS256 3195_mm256_broadcastd_epi32(__m128i __X) 3196{ 3197 return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0); 3198} 3199 3200/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X 3201/// to all elements of the result's 256-bit vector of [4 x i64]. 3202/// 3203/// \headerfile <immintrin.h> 3204/// 3205/// This intrinsic corresponds to the \c VPBROADCASTQ instruction. 3206/// 3207/// \param __X 3208/// A 128-bit vector of [2 x i64] whose low element will be broadcast. 3209/// \returns A 256-bit vector of [4 x i64] containing the result. 3210static __inline__ __m256i __DEFAULT_FN_ATTRS256 3211_mm256_broadcastq_epi64(__m128i __X) 3212{ 3213 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0); 3214} 3215 3216/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all 3217/// bytes of the 128-bit result. 3218/// 3219/// \headerfile <immintrin.h> 3220/// 3221/// This intrinsic corresponds to the \c VPBROADCASTB instruction. 3222/// 3223/// \param __X 3224/// A 128-bit integer vector whose low byte will be broadcast. 3225/// \returns A 128-bit integer vector containing the result. 
3226static __inline__ __m128i __DEFAULT_FN_ATTRS128 3227_mm_broadcastb_epi8(__m128i __X) 3228{ 3229 return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 3230} 3231 3232/// Broadcasts the low element from the 128-bit vector of [8 x i16] in 3233/// \a __X to all elements of the result's 128-bit vector of [8 x i16]. 3234/// 3235/// \headerfile <immintrin.h> 3236/// 3237/// This intrinsic corresponds to the \c VPBROADCASTW instruction. 3238/// 3239/// \param __X 3240/// A 128-bit vector of [8 x i16] whose low element will be broadcast. 3241/// \returns A 128-bit vector of [8 x i16] containing the result. 3242static __inline__ __m128i __DEFAULT_FN_ATTRS128 3243_mm_broadcastw_epi16(__m128i __X) 3244{ 3245 return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0); 3246} 3247 3248/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X 3249/// to all elements of the result's vector of [4 x i32]. 3250/// 3251/// \headerfile <immintrin.h> 3252/// 3253/// This intrinsic corresponds to the \c VPBROADCASTD instruction. 3254/// 3255/// \param __X 3256/// A 128-bit vector of [4 x i32] whose low element will be broadcast. 3257/// \returns A 128-bit vector of [4 x i32] containing the result. 3258static __inline__ __m128i __DEFAULT_FN_ATTRS128 3259_mm_broadcastd_epi32(__m128i __X) 3260{ 3261 return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0); 3262} 3263 3264/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X 3265/// to both elements of the result's 128-bit vector of [2 x i64]. 3266/// 3267/// \headerfile <immintrin.h> 3268/// 3269/// This intrinsic corresponds to the \c VPBROADCASTQ instruction. 3270/// 3271/// \param __X 3272/// A 128-bit vector of [2 x i64] whose low element will be broadcast. 3273/// \returns A 128-bit vector of [2 x i64] containing the result. 
3274static __inline__ __m128i __DEFAULT_FN_ATTRS128 3275_mm_broadcastq_epi64(__m128i __X) 3276{ 3277 return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0); 3278} 3279 3280/// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the 3281/// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the 3282/// elements of the 256-bit vector of [8 x i32] in \a __b. 3283/// 3284/// \code{.operation} 3285/// FOR i := 0 TO 7 3286/// j := i*32 3287/// k := __b[j+2:j] * 32 3288/// result[j+31:j] := __a[k+31:k] 3289/// ENDFOR 3290/// \endcode 3291/// 3292/// \headerfile <immintrin.h> 3293/// 3294/// This intrinsic corresponds to the \c VPERMD instruction. 3295/// 3296/// \param __a 3297/// A 256-bit vector of [8 x i32] containing the source values. 3298/// \param __b 3299/// A 256-bit vector of [8 x i32] containing indexes of values to use from 3300/// \a __a. 3301/// \returns A 256-bit vector of [8 x i32] containing the result. 3302static __inline__ __m256i __DEFAULT_FN_ATTRS256 3303_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) 3304{ 3305 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b); 3306} 3307 3308/// Sets the result's 256-bit vector of [4 x double] to copies of elements of 3309/// the 256-bit vector of [4 x double] in \a V as specified by the 3310/// immediate value \a M. 3311/// 3312/// \code{.operation} 3313/// FOR i := 0 TO 3 3314/// j := i*64 3315/// k := (M >> i*2)[1:0] * 64 3316/// result[j+63:j] := V[k+63:k] 3317/// ENDFOR 3318/// \endcode 3319/// 3320/// \headerfile <immintrin.h> 3321/// 3322/// \code 3323/// __m256d _mm256_permute4x64_pd(__m256d V, const int M); 3324/// \endcode 3325/// 3326/// This intrinsic corresponds to the \c VPERMPD instruction. 3327/// 3328/// \param V 3329/// A 256-bit vector of [4 x double] containing the source values. 3330/// \param M 3331/// An immediate 8-bit value specifying which elements to copy from \a V. 
3332/// \a M[1:0] specifies the index in \a a for element 0 of the result, 3333/// \a M[3:2] specifies the index for element 1, and so forth. 3334/// \returns A 256-bit vector of [4 x double] containing the result. 3335#define _mm256_permute4x64_pd(V, M) \ 3336 ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))) 3337 3338/// Sets the result's 256-bit vector of [8 x float] to copies of elements of 3339/// the 256-bit vector of [8 x float] in \a __a as specified by indexes in 3340/// the elements of the 256-bit vector of [8 x i32] in \a __b. 3341/// 3342/// \code{.operation} 3343/// FOR i := 0 TO 7 3344/// j := i*32 3345/// k := __b[j+2:j] * 32 3346/// result[j+31:j] := __a[k+31:k] 3347/// ENDFOR 3348/// \endcode 3349/// 3350/// \headerfile <immintrin.h> 3351/// 3352/// This intrinsic corresponds to the \c VPERMPS instruction. 3353/// 3354/// \param __a 3355/// A 256-bit vector of [8 x float] containing the source values. 3356/// \param __b 3357/// A 256-bit vector of [8 x i32] containing indexes of values to use from 3358/// \a __a. 3359/// \returns A 256-bit vector of [8 x float] containing the result. 3360static __inline__ __m256 __DEFAULT_FN_ATTRS256 3361_mm256_permutevar8x32_ps(__m256 __a, __m256i __b) 3362{ 3363 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b); 3364} 3365 3366/// Sets the result's 256-bit vector of [4 x i64] result to copies of elements 3367/// of the 256-bit vector of [4 x i64] in \a V as specified by the 3368/// immediate value \a M. 3369/// 3370/// \code{.operation} 3371/// FOR i := 0 TO 3 3372/// j := i*64 3373/// k := (M >> i*2)[1:0] * 64 3374/// result[j+63:j] := V[k+63:k] 3375/// ENDFOR 3376/// \endcode 3377/// 3378/// \headerfile <immintrin.h> 3379/// 3380/// \code 3381/// __m256i _mm256_permute4x64_epi64(__m256i V, const int M); 3382/// \endcode 3383/// 3384/// This intrinsic corresponds to the \c VPERMQ instruction. 
3385/// 3386/// \param V 3387/// A 256-bit vector of [4 x i64] containing the source values. 3388/// \param M 3389/// An immediate 8-bit value specifying which elements to copy from \a V. 3390/// \a M[1:0] specifies the index in \a a for element 0 of the result, 3391/// \a M[3:2] specifies the index for element 1, and so forth. 3392/// \returns A 256-bit vector of [4 x i64] containing the result. 3393#define _mm256_permute4x64_epi64(V, M) \ 3394 ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))) 3395 3396/// Sets each half of the 256-bit result either to zero or to one of the 3397/// four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2, 3398/// as specified by the immediate value \a M. 3399/// 3400/// \code{.operation} 3401/// FOR i := 0 TO 1 3402/// j := i*128 3403/// k := M >> (i*4) 3404/// IF k[3] == 0 3405/// CASE (k[1:0]) OF 3406/// 0: result[127+j:j] := V1[127:0] 3407/// 1: result[127+j:j] := V1[255:128] 3408/// 2: result[127+j:j] := V2[127:0] 3409/// 3: result[127+j:j] := V2[255:128] 3410/// ESAC 3411/// ELSE 3412/// result[127+j:j] := 0 3413/// FI 3414/// ENDFOR 3415/// \endcode 3416/// 3417/// \headerfile <immintrin.h> 3418/// 3419/// \code 3420/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M); 3421/// \endcode 3422/// 3423/// This intrinsic corresponds to the \c VPERM2I128 instruction. 3424/// 3425/// \param V1 3426/// A 256-bit integer vector containing source values. 3427/// \param V2 3428/// A 256-bit integer vector containing source values. 3429/// \param M 3430/// An immediate value specifying how to form the result. Bits [3:0] 3431/// control the lower half of the result, bits [7:4] control the upper half. 3432/// Within each 4-bit control value, if bit 3 is 1, the result is zero, 3433/// otherwise bits [1:0] determine the source as follows. 
///    \n
///    0: the lower half of \a V1 \n
///    1: the upper half of \a V1 \n
///    2: the lower half of \a V2 \n
///    3: the upper half of \a V2
/// \returns A 256-bit integer vector containing the result.
#define _mm256_permute2x128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))

/// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
///    of the immediate \a M is zero, extracts the lower half of \a V;
///    otherwise, extracts the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
///
/// \param V
///    A 256-bit integer vector containing the source values.
/// \param M
///    An immediate value specifying which half of \a V to extract.
/// \returns A 128-bit integer vector containing the result.
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))

/// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
///    result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
///    is zero, overwrites the lower half of the result; otherwise,
///    overwrites the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTI128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing a source value.
/// \param V2
///    A 128-bit integer vector containing a source value.
/// \param M
///    An immediate value specifying where to put \a V2 in the result.
/// \returns A 256-bit integer vector containing the result.
3482#define _mm256_inserti128_si256(V1, V2, M) \ 3483 ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \ 3484 (__v2di)(__m128i)(V2), (int)(M))) 3485 3486/// Conditionally loads eight 32-bit integer elements from memory \a __X, if 3487/// the most significant bit of the corresponding element in the mask 3488/// \a __M is set; otherwise, sets that element of the result to zero. 3489/// Returns the 256-bit [8 x i32] result. 3490/// 3491/// \code{.operation} 3492/// FOR i := 0 TO 7 3493/// j := i*32 3494/// IF __M[j+31] == 1 3495/// result[j+31:j] := Load32(__X+(i*4)) 3496/// ELSE 3497/// result[j+31:j] := 0 3498/// FI 3499/// ENDFOR 3500/// \endcode 3501/// 3502/// \headerfile <immintrin.h> 3503/// 3504/// This intrinsic corresponds to the \c VPMASKMOVD instruction. 3505/// 3506/// \param __X 3507/// A pointer to the memory used for loading values. 3508/// \param __M 3509/// A 256-bit vector of [8 x i32] containing the mask bits. 3510/// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed 3511/// elements. 3512static __inline__ __m256i __DEFAULT_FN_ATTRS256 3513_mm256_maskload_epi32(int const *__X, __m256i __M) 3514{ 3515 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M); 3516} 3517 3518/// Conditionally loads four 64-bit integer elements from memory \a __X, if 3519/// the most significant bit of the corresponding element in the mask 3520/// \a __M is set; otherwise, sets that element of the result to zero. 3521/// Returns the 256-bit [4 x i64] result. 3522/// 3523/// \code{.operation} 3524/// FOR i := 0 TO 3 3525/// j := i*64 3526/// IF __M[j+63] == 1 3527/// result[j+63:j] := Load64(__X+(i*8)) 3528/// ELSE 3529/// result[j+63:j] := 0 3530/// FI 3531/// ENDFOR 3532/// \endcode 3533/// 3534/// \headerfile <immintrin.h> 3535/// 3536/// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 3537/// 3538/// \param __X 3539/// A pointer to the memory used for loading values. 
3540/// \param __M 3541/// A 256-bit vector of [4 x i64] containing the mask bits. 3542/// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed 3543/// elements. 3544static __inline__ __m256i __DEFAULT_FN_ATTRS256 3545_mm256_maskload_epi64(long long const *__X, __m256i __M) 3546{ 3547 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M); 3548} 3549 3550/// Conditionally loads four 32-bit integer elements from memory \a __X, if 3551/// the most significant bit of the corresponding element in the mask 3552/// \a __M is set; otherwise, sets that element of the result to zero. 3553/// Returns the 128-bit [4 x i32] result. 3554/// 3555/// \code{.operation} 3556/// FOR i := 0 TO 3 3557/// j := i*32 3558/// IF __M[j+31] == 1 3559/// result[j+31:j] := Load32(__X+(i*4)) 3560/// ELSE 3561/// result[j+31:j] := 0 3562/// FI 3563/// ENDFOR 3564/// \endcode 3565/// 3566/// \headerfile <immintrin.h> 3567/// 3568/// This intrinsic corresponds to the \c VPMASKMOVD instruction. 3569/// 3570/// \param __X 3571/// A pointer to the memory used for loading values. 3572/// \param __M 3573/// A 128-bit vector of [4 x i32] containing the mask bits. 3574/// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed 3575/// elements. 3576static __inline__ __m128i __DEFAULT_FN_ATTRS128 3577_mm_maskload_epi32(int const *__X, __m128i __M) 3578{ 3579 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M); 3580} 3581 3582/// Conditionally loads two 64-bit integer elements from memory \a __X, if 3583/// the most significant bit of the corresponding element in the mask 3584/// \a __M is set; otherwise, sets that element of the result to zero. 3585/// Returns the 128-bit [2 x i64] result. 
3586/// 3587/// \code{.operation} 3588/// FOR i := 0 TO 1 3589/// j := i*64 3590/// IF __M[j+63] == 1 3591/// result[j+63:j] := Load64(__X+(i*8)) 3592/// ELSE 3593/// result[j+63:j] := 0 3594/// FI 3595/// ENDFOR 3596/// \endcode 3597/// 3598/// \headerfile <immintrin.h> 3599/// 3600/// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 3601/// 3602/// \param __X 3603/// A pointer to the memory used for loading values. 3604/// \param __M 3605/// A 128-bit vector of [2 x i64] containing the mask bits. 3606/// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed 3607/// elements. 3608static __inline__ __m128i __DEFAULT_FN_ATTRS128 3609_mm_maskload_epi64(long long const *__X, __m128i __M) 3610{ 3611 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M); 3612} 3613 3614/// Conditionally stores eight 32-bit integer elements from the 256-bit vector 3615/// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of 3616/// the corresponding element in the mask \a __M is set; otherwise, the 3617/// memory element is unchanged. 3618/// 3619/// \code{.operation} 3620/// FOR i := 0 TO 7 3621/// j := i*32 3622/// IF __M[j+31] == 1 3623/// Store32(__X+(i*4), __Y[j+31:j]) 3624/// FI 3625/// ENDFOR 3626/// \endcode 3627/// 3628/// \headerfile <immintrin.h> 3629/// 3630/// This intrinsic corresponds to the \c VPMASKMOVD instruction. 3631/// 3632/// \param __X 3633/// A pointer to the memory used for storing values. 3634/// \param __M 3635/// A 256-bit vector of [8 x i32] containing the mask bits. 3636/// \param __Y 3637/// A 256-bit vector of [8 x i32] containing the values to store. 
3638static __inline__ void __DEFAULT_FN_ATTRS256 3639_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) 3640{ 3641 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y); 3642} 3643 3644/// Conditionally stores four 64-bit integer elements from the 256-bit vector 3645/// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of 3646/// the corresponding element in the mask \a __M is set; otherwise, the 3647/// memory element is unchanged. 3648/// 3649/// \code{.operation} 3650/// FOR i := 0 TO 3 3651/// j := i*64 3652/// IF __M[j+63] == 1 3653/// Store64(__X+(i*8), __Y[j+63:j]) 3654/// FI 3655/// ENDFOR 3656/// \endcode 3657/// 3658/// \headerfile <immintrin.h> 3659/// 3660/// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 3661/// 3662/// \param __X 3663/// A pointer to the memory used for storing values. 3664/// \param __M 3665/// A 256-bit vector of [4 x i64] containing the mask bits. 3666/// \param __Y 3667/// A 256-bit vector of [4 x i64] containing the values to store. 3668static __inline__ void __DEFAULT_FN_ATTRS256 3669_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) 3670{ 3671 __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y); 3672} 3673 3674/// Conditionally stores four 32-bit integer elements from the 128-bit vector 3675/// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of 3676/// the corresponding element in the mask \a __M is set; otherwise, the 3677/// memory element is unchanged. 3678/// 3679/// \code{.operation} 3680/// FOR i := 0 TO 3 3681/// j := i*32 3682/// IF __M[j+31] == 1 3683/// Store32(__X+(i*4), __Y[j+31:j]) 3684/// FI 3685/// ENDFOR 3686/// \endcode 3687/// 3688/// \headerfile <immintrin.h> 3689/// 3690/// This intrinsic corresponds to the \c VPMASKMOVD instruction. 3691/// 3692/// \param __X 3693/// A pointer to the memory used for storing values. 3694/// \param __M 3695/// A 128-bit vector of [4 x i32] containing the mask bits. 
3696/// \param __Y 3697/// A 128-bit vector of [4 x i32] containing the values to store. 3698static __inline__ void __DEFAULT_FN_ATTRS128 3699_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y) 3700{ 3701 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y); 3702} 3703 3704/// Conditionally stores two 64-bit integer elements from the 128-bit vector 3705/// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of 3706/// the corresponding element in the mask \a __M is set; otherwise, the 3707/// memory element is unchanged. 3708/// 3709/// \code{.operation} 3710/// FOR i := 0 TO 1 3711/// j := i*64 3712/// IF __M[j+63] == 1 3713/// Store64(__X+(i*8), __Y[j+63:j]) 3714/// FI 3715/// ENDFOR 3716/// \endcode 3717/// 3718/// \headerfile <immintrin.h> 3719/// 3720/// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 3721/// 3722/// \param __X 3723/// A pointer to the memory used for storing values. 3724/// \param __M 3725/// A 128-bit vector of [2 x i64] containing the mask bits. 3726/// \param __Y 3727/// A 128-bit vector of [2 x i64] containing the values to store. 3728static __inline__ void __DEFAULT_FN_ATTRS128 3729_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) 3730{ 3731 __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y); 3732} 3733 3734/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X 3735/// left by the number of bits given in the corresponding element of the 3736/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and 3737/// returns the result. If the shift count for any element is greater than 3738/// 31, the result for that element is zero. 3739/// 3740/// \headerfile <immintrin.h> 3741/// 3742/// This intrinsic corresponds to the \c VPSLLVD instruction. 3743/// 3744/// \param __X 3745/// A 256-bit vector of [8 x i32] to be shifted. 3746/// \param __Y 3747/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in 3748/// bits). 
3749/// \returns A 256-bit vector of [8 x i32] containing the result. 3750static __inline__ __m256i __DEFAULT_FN_ATTRS256 3751_mm256_sllv_epi32(__m256i __X, __m256i __Y) 3752{ 3753 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y); 3754} 3755 3756/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X 3757/// left by the number of bits given in the corresponding element of the 3758/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and 3759/// returns the result. If the shift count for any element is greater than 3760/// 31, the result for that element is zero. 3761/// 3762/// \headerfile <immintrin.h> 3763/// 3764/// This intrinsic corresponds to the \c VPSLLVD instruction. 3765/// 3766/// \param __X 3767/// A 128-bit vector of [4 x i32] to be shifted. 3768/// \param __Y 3769/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in 3770/// bits). 3771/// \returns A 128-bit vector of [4 x i32] containing the result. 3772static __inline__ __m128i __DEFAULT_FN_ATTRS128 3773_mm_sllv_epi32(__m128i __X, __m128i __Y) 3774{ 3775 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y); 3776} 3777 3778/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X 3779/// left by the number of bits given in the corresponding element of the 3780/// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and 3781/// returns the result. If the shift count for any element is greater than 3782/// 63, the result for that element is zero. 3783/// 3784/// \headerfile <immintrin.h> 3785/// 3786/// This intrinsic corresponds to the \c VPSLLVQ instruction. 3787/// 3788/// \param __X 3789/// A 256-bit vector of [4 x i64] to be shifted. 3790/// \param __Y 3791/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in 3792/// bits). 3793/// \returns A 256-bit vector of [4 x i64] containing the result. 
3794static __inline__ __m256i __DEFAULT_FN_ATTRS256 3795_mm256_sllv_epi64(__m256i __X, __m256i __Y) 3796{ 3797 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y); 3798} 3799 3800/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X 3801/// left by the number of bits given in the corresponding element of the 3802/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and 3803/// returns the result. If the shift count for any element is greater than 3804/// 63, the result for that element is zero. 3805/// 3806/// \headerfile <immintrin.h> 3807/// 3808/// This intrinsic corresponds to the \c VPSLLVQ instruction. 3809/// 3810/// \param __X 3811/// A 128-bit vector of [2 x i64] to be shifted. 3812/// \param __Y 3813/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in 3814/// bits). 3815/// \returns A 128-bit vector of [2 x i64] containing the result. 3816static __inline__ __m128i __DEFAULT_FN_ATTRS128 3817_mm_sllv_epi64(__m128i __X, __m128i __Y) 3818{ 3819 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y); 3820} 3821 3822/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X 3823/// right by the number of bits given in the corresponding element of the 3824/// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and 3825/// returns the result. If the shift count for any element is greater than 3826/// 31, the result for that element is 0 or -1 according to the sign bit 3827/// for that element. 3828/// 3829/// \headerfile <immintrin.h> 3830/// 3831/// This intrinsic corresponds to the \c VPSRAVD instruction. 3832/// 3833/// \param __X 3834/// A 256-bit vector of [8 x i32] to be shifted. 3835/// \param __Y 3836/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in 3837/// bits). 3838/// \returns A 256-bit vector of [8 x i32] containing the result. 
3839static __inline__ __m256i __DEFAULT_FN_ATTRS256 3840_mm256_srav_epi32(__m256i __X, __m256i __Y) 3841{ 3842 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y); 3843} 3844 3845/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X 3846/// right by the number of bits given in the corresponding element of the 3847/// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and 3848/// returns the result. If the shift count for any element is greater than 3849/// 31, the result for that element is 0 or -1 according to the sign bit 3850/// for that element. 3851/// 3852/// \headerfile <immintrin.h> 3853/// 3854/// This intrinsic corresponds to the \c VPSRAVD instruction. 3855/// 3856/// \param __X 3857/// A 128-bit vector of [4 x i32] to be shifted. 3858/// \param __Y 3859/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in 3860/// bits). 3861/// \returns A 128-bit vector of [4 x i32] containing the result. 3862static __inline__ __m128i __DEFAULT_FN_ATTRS128 3863_mm_srav_epi32(__m128i __X, __m128i __Y) 3864{ 3865 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y); 3866} 3867 3868/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X 3869/// right by the number of bits given in the corresponding element of the 3870/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and 3871/// returns the result. If the shift count for any element is greater than 3872/// 31, the result for that element is zero. 3873/// 3874/// \headerfile <immintrin.h> 3875/// 3876/// This intrinsic corresponds to the \c VPSRLVD instruction. 3877/// 3878/// \param __X 3879/// A 256-bit vector of [8 x i32] to be shifted. 3880/// \param __Y 3881/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in 3882/// bits). 3883/// \returns A 256-bit vector of [8 x i32] containing the result. 
3884static __inline__ __m256i __DEFAULT_FN_ATTRS256 3885_mm256_srlv_epi32(__m256i __X, __m256i __Y) 3886{ 3887 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y); 3888} 3889 3890/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X 3891/// right by the number of bits given in the corresponding element of the 3892/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and 3893/// returns the result. If the shift count for any element is greater than 3894/// 31, the result for that element is zero. 3895/// 3896/// \headerfile <immintrin.h> 3897/// 3898/// This intrinsic corresponds to the \c VPSRLVD instruction. 3899/// 3900/// \param __X 3901/// A 128-bit vector of [4 x i32] to be shifted. 3902/// \param __Y 3903/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in 3904/// bits). 3905/// \returns A 128-bit vector of [4 x i32] containing the result. 3906static __inline__ __m128i __DEFAULT_FN_ATTRS128 3907_mm_srlv_epi32(__m128i __X, __m128i __Y) 3908{ 3909 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y); 3910} 3911 3912/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X 3913/// right by the number of bits given in the corresponding element of the 3914/// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and 3915/// returns the result. If the shift count for any element is greater than 3916/// 63, the result for that element is zero. 3917/// 3918/// \headerfile <immintrin.h> 3919/// 3920/// This intrinsic corresponds to the \c VPSRLVQ instruction. 3921/// 3922/// \param __X 3923/// A 256-bit vector of [4 x i64] to be shifted. 3924/// \param __Y 3925/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in 3926/// bits). 3927/// \returns A 256-bit vector of [4 x i64] containing the result. 
3928static __inline__ __m256i __DEFAULT_FN_ATTRS256 3929_mm256_srlv_epi64(__m256i __X, __m256i __Y) 3930{ 3931 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y); 3932} 3933 3934/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X 3935/// right by the number of bits given in the corresponding element of the 3936/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and 3937/// returns the result. If the shift count for any element is greater than 3938/// 63, the result for that element is zero. 3939/// 3940/// \headerfile <immintrin.h> 3941/// 3942/// This intrinsic corresponds to the \c VPSRLVQ instruction. 3943/// 3944/// \param __X 3945/// A 128-bit vector of [2 x i64] to be shifted. 3946/// \param __Y 3947/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in 3948/// bits). 3949/// \returns A 128-bit vector of [2 x i64] containing the result. 3950static __inline__ __m128i __DEFAULT_FN_ATTRS128 3951_mm_srlv_epi64(__m128i __X, __m128i __Y) 3952{ 3953 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y); 3954} 3955 3956/// Conditionally gathers two 64-bit floating-point values, either from the 3957/// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled 3958/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector 3959/// of [2 x double] in \a mask determines the source for each element. 3960/// 3961/// \code{.operation} 3962/// FOR element := 0 to 1 3963/// j := element*64 3964/// k := element*32 3965/// IF mask[j+63] == 0 3966/// result[j+63:j] := a[j+63:j] 3967/// ELSE 3968/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s) 3969/// FI 3970/// ENDFOR 3971/// \endcode 3972/// 3973/// \headerfile <immintrin.h> 3974/// 3975/// \code 3976/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i, 3977/// __m128d mask, const int s); 3978/// \endcode 3979/// 3980/// This intrinsic corresponds to the \c VGATHERDPD instruction. 
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector is the mask bit for
///    that element. If a mask bit is zero, the corresponding value from vector
///    \a a is copied to the result; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))

/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector is the mask bit for
///    that element. If a mask bit is zero, the corresponding value from vector
///    \a a is copied to the result; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))

/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector is the mask bit for
///    that element. If a mask bit is zero, the corresponding value from vector
///    \a a is copied to the result; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))

/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector is the mask bit for
///    that element. If a mask bit is zero, the corresponding value from vector
///    \a a is copied to the result; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))

/// Conditionally gathers four 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector is the mask bit for
///    that element. If a mask bit is zero, the corresponding value from vector
///    \a a is copied to the result; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
                                     (float const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4sf)(__m128)(mask), (s)))

/// Conditionally gathers eight 32-bit floating-point values, either from the
///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
///    of [8 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
///                                 __m256 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x float] containing the mask. The most
///    significant bit of each element in the mask vector is the mask bit for
///    that element. If a mask bit is zero, the corresponding value from vector
///    \a a is copied to the result; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
                                        (float const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8sf)(__m256)(mask), (s)))

/// Conditionally gathers two 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for the lower two
///    elements. The upper two elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
///    Only the first two elements are used.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory. Only the first
///    two elements are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
                                     (float const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4sf)(__m128)(mask), (s)))

/// Conditionally gathers four 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
///                                 __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
                                        (float const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4sf)(__m128)(mask), (s)))

/// Conditionally gathers four 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [4 x i32] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
                                     (int const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4si)(__m128i)(mask), (s)))

/// Conditionally gathers eight 32-bit integer values, either from the
///    256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
///    of [8 x i32] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
///                                     __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
                                        (int const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8si)(__m256i)(mask), (s)))

/// Conditionally gathers two 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [4 x i32] in \a mask determines the source for the lower two
///    elements. The upper two elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory. Only the first two elements
///    are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
                                     (int const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4si)(__m128i)(mask), (s)))

/// Conditionally gathers four 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
///    of [4 x i32] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
///                                     __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask.
///    The most significant bit of each element in the mask vector represents
///    the mask bits. If a mask bit is zero, the corresponding value from
///    vector \a a is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
                                        (int const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4si)(__m128i)(mask), (s)))

/// Conditionally gathers two 64-bit integer values, either from the
///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [2 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x i64] containing the mask.
///    The most significant bit of each element in the mask vector represents
///    the mask bits. If a mask bit is zero, the corresponding value from
///    vector \a a is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
                                     (long long const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v2di)(__m128i)(mask), (s)))

/// Conditionally gathers four 64-bit integer values, either from the
///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
///    of [4 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
///                                     __m128i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits.
///    If a mask bit is zero, the corresponding value from vector \a a is
///    gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
                                        (long long const *)(m), \
                                        (__v4si)(__m128i)(i), \
                                        (__v4di)(__m256i)(mask), (s)))

/// Conditionally gathers two 64-bit integer values, either from the
///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [2 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits.
///    If a mask bit is zero, the corresponding value from vector \a a is
///    gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
                                     (long long const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v2di)(__m128i)(mask), (s)))

/// Conditionally gathers four 64-bit integer values, either from the
///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
///    of [4 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
///                                     __m256i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits.
///    If a mask bit is zero, the corresponding value from vector \a a is
///    gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
                                        (long long const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4di)(__m256i)(mask), (s)))

/// Gathers two 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* The mask compares zero with itself, so every mask bit is set: each gathered
   element is loaded from memory and the undefined source is never selected. */
#define _mm_i32gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
                                                           _mm_setzero_pd()), \
                                      (s)))

/// Gathers four 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* The mask compares zero with itself, so every mask bit is set: each gathered
   element is loaded from memory and the undefined source is never selected. */
#define _mm256_i32gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
                                         (double const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
                                                               _mm256_setzero_pd(), \
                                                               _CMP_EQ_OQ), \
                                         (s)))

/// Gathers two 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* The mask compares zero with itself, so every mask bit is set: each gathered
   element is loaded from memory and the undefined source is never selected. */
#define _mm_i64gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
                                      (double const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
                                                           _mm_setzero_pd()), \
                                      (s)))

/// Gathers four 64-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* The mask compares zero with itself, so every mask bit is set: each gathered
   element is loaded from memory and the undefined source is never selected. */
#define _mm256_i64gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
                                         (double const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
                                                               _mm256_setzero_pd(), \
                                                               _CMP_EQ_OQ), \
                                         (s)))

/// Gathers four 32-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* The mask compares zero with itself, so every mask bit is set: each gathered
   element is loaded from memory and the undefined source is never selected. */
#define _mm_i32gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
                                     (float const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                          _mm_setzero_ps()), \
                                     (s)))

/// Gathers eight 32-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
/* The mask compares zero with itself, so every mask bit is set: each gathered
   element is loaded from memory and the undefined source is never selected. */
#define _mm256_i32gather_ps(m, i, s) \
  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
                                        (float const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
                                                              _mm256_setzero_ps(), \
                                                              _CMP_EQ_OQ), \
                                        (s)))

/// Gathers two 32-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
///    elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* The mask compares zero with itself, so every mask bit is set: each gathered
   element is loaded from memory and the undefined source is never selected. */
#define _mm_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
                                     (float const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                          _mm_setzero_ps()), \
                                     (s)))

/// Gathers four 32-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* The mask compares zero with itself, so every mask bit is set: each gathered
   element is loaded from memory and the undefined source is never selected. */
#define _mm256_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
                                        (float const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                             _mm_setzero_ps()), \
                                        (s)))

/// Gathers four 32-bit integer values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* The mask is all ones (-1 in every element), so each gathered element is
   loaded from memory and the undefined source is never selected. */
#define _mm_i32gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
                                     (int const *)(m), (__v4si)(__m128i)(i), \
                                     (__v4si)_mm_set1_epi32(-1), (s)))

/// Gathers eight 32-bit integer values from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
/* The mask is all ones (-1 in every element), so each gathered element is
   loaded from memory and the undefined source is never selected. */
#define _mm256_i32gather_epi32(m, i, s) \
  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
                                        (int const *)(m), (__v8si)(__m256i)(i), \
                                        (__v8si)_mm256_set1_epi32(-1), (s)))

/// Gathers two 32-bit integer values from memory \a m using scaled indexes
///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
///    of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* The mask is all ones (-1 in every element), so each gathered element is
   loaded from memory and the undefined source is never selected. */
#define _mm_i64gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
                                     (int const *)(m), (__v2di)(__m128i)(i), \
                                     (__v4si)_mm_set1_epi32(-1), (s)))

/// Gathers four 32-bit integer values from memory \a m using scaled indexes
///    from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm256_i64gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      (__v4si)_mm_undefined_si128(), \
      (const int *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4si)_mm_set1_epi32(-1), \
      (s)))

/// Loads two 64-bit integer values from memory \a m, each located by a
///    scaled index taken from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param m
///    The base address from which the values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding the signed indexes into \a m.
///    Only the first two elements are used.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] holding the gathered values.
#define _mm_i32gather_epi64(m, i, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      (__v2di)_mm_undefined_si128(), \
      (const long long *)(m), \
      (__v4si)(__m128i)(i), \
      (__v2di)_mm_set1_epi64x(-1), \
      (s)))

/// Loads four 64-bit integer values from memory \a m, each located by a
///    scaled index taken from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param m
///    The base address from which the values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding the signed indexes into \a m.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] holding the gathered values.
#define _mm256_i32gather_epi64(m, i, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      (__v4di)_mm256_undefined_si256(), \
      (const long long *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4di)_mm256_set1_epi64x(-1), \
      (s)))

/// Loads two 64-bit integer values from memory \a m, each located by a
///    scaled index taken from the 128-bit vector of [2 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param m
///    The base address from which the values are loaded.
/// \param i
///    A 128-bit vector of [2 x i64] holding the signed indexes into \a m.
/// \param s
///    The scale factor applied to each index in \a i. Must be a literal
///    constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] holding the gathered values.
5242#define _mm_i64gather_epi64(m, i, s) \ 5243 ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \ 5244 (long long const *)(m), \ 5245 (__v2di)(__m128i)(i), \ 5246 (__v2di)_mm_set1_epi64x(-1), (s))) 5247 5248/// Gathers four 64-bit integer values from memory \a m using scaled indexes 5249/// from the 256-bit vector of [4 x i64] in \a i. 5250/// 5251/// \code{.operation} 5252/// FOR element := 0 to 3 5253/// j := element*64 5254/// k := element*64 5255/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s) 5256/// ENDFOR 5257/// \endcode 5258/// 5259/// \headerfile <immintrin.h> 5260/// 5261/// \code 5262/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s); 5263/// \endcode 5264/// 5265/// This intrinsic corresponds to the \c VPGATHERQQ instruction. 5266/// 5267/// \param m 5268/// A pointer to the memory used for loading values. 5269/// \param i 5270/// A 256-bit vector of [4 x i64] containing signed indexes into \a m. 5271/// \param s 5272/// A literal constant scale factor for the indexes in \a i. Must be 5273/// 1, 2, 4, or 8. 5274/// \returns A 256-bit vector of [4 x i64] containing the gathered values. 5275#define _mm256_i64gather_epi64(m, i, s) \ 5276 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \ 5277 (long long const *)(m), \ 5278 (__v4di)(__m256i)(i), \ 5279 (__v4di)_mm256_set1_epi64x(-1), (s))) 5280 5281#undef __DEFAULT_FN_ATTRS256 5282#undef __DEFAULT_FN_ATTRS128 5283 5284#endif /* __AVX2INTRIN_H */ 5285