avx2intrin.h revision 341825
1/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __IMMINTRIN_H 25#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead." 26#endif 27 28#ifndef __AVX2INTRIN_H 29#define __AVX2INTRIN_H 30 31/* Define the default attributes for the functions in this file. */ 32#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(256))) 33#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(128))) 34 35/* SSE4 Multiple Packed Sums of Absolute Difference. */ 36#define _mm256_mpsadbw_epu8(X, Y, M) \ 37 (__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \ 38 (__v32qi)(__m256i)(Y), (int)(M)) 39 40static __inline__ __m256i __DEFAULT_FN_ATTRS256 41_mm256_abs_epi8(__m256i __a) 42{ 43 return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a); 44} 45 46static __inline__ __m256i __DEFAULT_FN_ATTRS256 47_mm256_abs_epi16(__m256i __a) 48{ 49 return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a); 50} 51 52static __inline__ __m256i __DEFAULT_FN_ATTRS256 53_mm256_abs_epi32(__m256i __a) 54{ 55 return (__m256i)__builtin_ia32_pabsd256((__v8si)__a); 56} 57 58static __inline__ __m256i __DEFAULT_FN_ATTRS256 59_mm256_packs_epi16(__m256i __a, __m256i __b) 60{ 61 return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b); 62} 63 64static __inline__ __m256i __DEFAULT_FN_ATTRS256 65_mm256_packs_epi32(__m256i __a, __m256i __b) 66{ 67 return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b); 68} 69 70static __inline__ __m256i __DEFAULT_FN_ATTRS256 71_mm256_packus_epi16(__m256i __a, __m256i __b) 72{ 73 return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b); 74} 75 76static __inline__ __m256i __DEFAULT_FN_ATTRS256 77_mm256_packus_epi32(__m256i __V1, __m256i __V2) 78{ 79 return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2); 80} 81 82static __inline__ __m256i __DEFAULT_FN_ATTRS256 83_mm256_add_epi8(__m256i __a, __m256i __b) 84{ 85 return (__m256i)((__v32qu)__a + (__v32qu)__b); 86} 87 88static __inline__ __m256i __DEFAULT_FN_ATTRS256 89_mm256_add_epi16(__m256i __a, __m256i __b) 90{ 91 return (__m256i)((__v16hu)__a + (__v16hu)__b); 92} 93 94static __inline__ __m256i __DEFAULT_FN_ATTRS256 95_mm256_add_epi32(__m256i __a, __m256i __b) 96{ 97 return (__m256i)((__v8su)__a + (__v8su)__b); 98} 99 100static __inline__ __m256i __DEFAULT_FN_ATTRS256 101_mm256_add_epi64(__m256i __a, __m256i __b) 102{ 103 return (__m256i)((__v4du)__a + (__v4du)__b); 104} 105 106static __inline__ __m256i __DEFAULT_FN_ATTRS256 107_mm256_adds_epi8(__m256i __a, __m256i __b) 108{ 109 return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b); 110} 111 112static __inline__ __m256i __DEFAULT_FN_ATTRS256 113_mm256_adds_epi16(__m256i __a, __m256i __b) 114{ 115 return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b); 116} 117 118static __inline__ __m256i __DEFAULT_FN_ATTRS256 119_mm256_adds_epu8(__m256i __a, __m256i __b) 120{ 121 return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b); 122} 123 124static __inline__ __m256i __DEFAULT_FN_ATTRS256 125_mm256_adds_epu16(__m256i __a, __m256i __b) 126{ 127 return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b); 128} 129 130#define _mm256_alignr_epi8(a, b, n) \ 131 (__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \ 132 (__v32qi)(__m256i)(b), (n)) 133 134static __inline__ __m256i __DEFAULT_FN_ATTRS256 135_mm256_and_si256(__m256i __a, __m256i __b) 136{ 137 return (__m256i)((__v4du)__a & (__v4du)__b); 138} 139 140static __inline__ __m256i __DEFAULT_FN_ATTRS256 141_mm256_andnot_si256(__m256i __a, __m256i __b) 142{ 143 return (__m256i)(~(__v4du)__a & (__v4du)__b); 144} 145 146static __inline__ __m256i __DEFAULT_FN_ATTRS256 147_mm256_avg_epu8(__m256i __a, __m256i __b) 148{ 149 typedef unsigned short __v32hu __attribute__((__vector_size__(64))); 150 return (__m256i)__builtin_convertvector( 151 ((__builtin_convertvector((__v32qu)__a, __v32hu) + 152 __builtin_convertvector((__v32qu)__b, __v32hu)) + 1) 153 >> 1, __v32qu); 154} 155 156static __inline__ __m256i __DEFAULT_FN_ATTRS256 157_mm256_avg_epu16(__m256i __a, __m256i __b) 158{ 159 typedef unsigned int __v16su __attribute__((__vector_size__(64))); 160 return (__m256i)__builtin_convertvector( 161 ((__builtin_convertvector((__v16hu)__a, __v16su) + 162 __builtin_convertvector((__v16hu)__b, __v16su)) + 1) 163 >> 1, __v16hu); 164} 165 166static __inline__ __m256i __DEFAULT_FN_ATTRS256 167_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) 168{ 169 return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2, 170 (__v32qi)__M); 171} 172 173#define _mm256_blend_epi16(V1, V2, M) \ 174 (__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \ 175 (__v16hi)(__m256i)(V2), (int)(M)) 176 177static __inline__ __m256i __DEFAULT_FN_ATTRS256 178_mm256_cmpeq_epi8(__m256i __a, __m256i __b) 179{ 180 return (__m256i)((__v32qi)__a == (__v32qi)__b); 181} 182 183static __inline__ __m256i __DEFAULT_FN_ATTRS256 184_mm256_cmpeq_epi16(__m256i __a, __m256i __b) 185{ 186 return (__m256i)((__v16hi)__a == (__v16hi)__b); 187} 188 189static __inline__ __m256i __DEFAULT_FN_ATTRS256 190_mm256_cmpeq_epi32(__m256i __a, __m256i __b) 191{ 192 return (__m256i)((__v8si)__a == (__v8si)__b); 193} 194 195static __inline__ __m256i __DEFAULT_FN_ATTRS256 196_mm256_cmpeq_epi64(__m256i __a, __m256i __b) 197{ 198 return (__m256i)((__v4di)__a == (__v4di)__b); 199} 200 201static __inline__ __m256i __DEFAULT_FN_ATTRS256 202_mm256_cmpgt_epi8(__m256i __a, __m256i __b) 203{ 204 /* This function always performs a signed comparison, but __v32qi is a char 205 which may be signed or unsigned, so use __v32qs. */ 206 return (__m256i)((__v32qs)__a > (__v32qs)__b); 207} 208 209static __inline__ __m256i __DEFAULT_FN_ATTRS256 210_mm256_cmpgt_epi16(__m256i __a, __m256i __b) 211{ 212 return (__m256i)((__v16hi)__a > (__v16hi)__b); 213} 214 215static __inline__ __m256i __DEFAULT_FN_ATTRS256 216_mm256_cmpgt_epi32(__m256i __a, __m256i __b) 217{ 218 return (__m256i)((__v8si)__a > (__v8si)__b); 219} 220 221static __inline__ __m256i __DEFAULT_FN_ATTRS256 222_mm256_cmpgt_epi64(__m256i __a, __m256i __b) 223{ 224 return (__m256i)((__v4di)__a > (__v4di)__b); 225} 226 227static __inline__ __m256i __DEFAULT_FN_ATTRS256 228_mm256_hadd_epi16(__m256i __a, __m256i __b) 229{ 230 return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b); 231} 232 233static __inline__ __m256i __DEFAULT_FN_ATTRS256 234_mm256_hadd_epi32(__m256i __a, __m256i __b) 235{ 236 return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b); 237} 238 239static __inline__ __m256i __DEFAULT_FN_ATTRS256 240_mm256_hadds_epi16(__m256i __a, __m256i __b) 241{ 242 return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b); 243} 244 245static __inline__ __m256i __DEFAULT_FN_ATTRS256 246_mm256_hsub_epi16(__m256i __a, __m256i __b) 247{ 248 return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b); 249} 250 251static __inline__ __m256i __DEFAULT_FN_ATTRS256 252_mm256_hsub_epi32(__m256i __a, __m256i __b) 253{ 254 return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); 255} 256 257static __inline__ __m256i __DEFAULT_FN_ATTRS256 258_mm256_hsubs_epi16(__m256i __a, __m256i __b) 259{ 260 return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); 261} 262 263static __inline__ __m256i __DEFAULT_FN_ATTRS256 264_mm256_maddubs_epi16(__m256i __a, __m256i __b) 265{ 266 return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b); 267} 268 269static __inline__ __m256i __DEFAULT_FN_ATTRS256 270_mm256_madd_epi16(__m256i __a, __m256i __b) 271{ 272 return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b); 273} 274 275static __inline__ __m256i __DEFAULT_FN_ATTRS256 276_mm256_max_epi8(__m256i __a, __m256i __b) 277{ 278 return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b); 279} 280 281static __inline__ __m256i __DEFAULT_FN_ATTRS256 282_mm256_max_epi16(__m256i __a, __m256i __b) 283{ 284 return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b); 285} 286 287static __inline__ __m256i __DEFAULT_FN_ATTRS256 288_mm256_max_epi32(__m256i __a, __m256i __b) 289{ 290 return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b); 291} 292 293static __inline__ __m256i __DEFAULT_FN_ATTRS256 294_mm256_max_epu8(__m256i __a, __m256i __b) 295{ 296 return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b); 297} 298 299static __inline__ __m256i __DEFAULT_FN_ATTRS256 300_mm256_max_epu16(__m256i __a, __m256i __b) 301{ 302 return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b); 303} 304 305static __inline__ __m256i __DEFAULT_FN_ATTRS256 306_mm256_max_epu32(__m256i __a, __m256i __b) 307{ 308 return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b); 309} 310 311static __inline__ __m256i __DEFAULT_FN_ATTRS256 312_mm256_min_epi8(__m256i __a, __m256i __b) 313{ 314 return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b); 315} 316 317static __inline__ __m256i __DEFAULT_FN_ATTRS256 318_mm256_min_epi16(__m256i __a, __m256i __b) 319{ 320 return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b); 321} 322 323static __inline__ __m256i __DEFAULT_FN_ATTRS256 324_mm256_min_epi32(__m256i __a, __m256i __b) 325{ 326 return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b); 327} 328 329static __inline__ __m256i __DEFAULT_FN_ATTRS256 330_mm256_min_epu8(__m256i __a, __m256i __b) 331{ 332 return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b); 333} 334 335static __inline__ __m256i __DEFAULT_FN_ATTRS256 336_mm256_min_epu16(__m256i __a, __m256i __b) 337{ 338 return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b); 339} 340 341static __inline__ __m256i __DEFAULT_FN_ATTRS256 342_mm256_min_epu32(__m256i __a, __m256i __b) 343{ 344 return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b); 345} 346 347static __inline__ int __DEFAULT_FN_ATTRS256 348_mm256_movemask_epi8(__m256i __a) 349{ 350 return __builtin_ia32_pmovmskb256((__v32qi)__a); 351} 352 353static __inline__ __m256i __DEFAULT_FN_ATTRS256 354_mm256_cvtepi8_epi16(__m128i __V) 355{ 356 /* This function always performs a signed extension, but __v16qi is a char 357 which may be signed or unsigned, so use __v16qs. */ 358 return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi); 359} 360 361static __inline__ __m256i __DEFAULT_FN_ATTRS256 362_mm256_cvtepi8_epi32(__m128i __V) 363{ 364 /* This function always performs a signed extension, but __v16qi is a char 365 which may be signed or unsigned, so use __v16qs. */ 366 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); 367} 368 369static __inline__ __m256i __DEFAULT_FN_ATTRS256 370_mm256_cvtepi8_epi64(__m128i __V) 371{ 372 /* This function always performs a signed extension, but __v16qi is a char 373 which may be signed or unsigned, so use __v16qs. */ 374 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di); 375} 376 377static __inline__ __m256i __DEFAULT_FN_ATTRS256 378_mm256_cvtepi16_epi32(__m128i __V) 379{ 380 return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si); 381} 382 383static __inline__ __m256i __DEFAULT_FN_ATTRS256 384_mm256_cvtepi16_epi64(__m128i __V) 385{ 386 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di); 387} 388 389static __inline__ __m256i __DEFAULT_FN_ATTRS256 390_mm256_cvtepi32_epi64(__m128i __V) 391{ 392 return (__m256i)__builtin_convertvector((__v4si)__V, __v4di); 393} 394 395static __inline__ __m256i __DEFAULT_FN_ATTRS256 396_mm256_cvtepu8_epi16(__m128i __V) 397{ 398 return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi); 399} 400 401static __inline__ __m256i __DEFAULT_FN_ATTRS256 402_mm256_cvtepu8_epi32(__m128i __V) 403{ 404 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); 405} 406 407static __inline__ __m256i __DEFAULT_FN_ATTRS256 408_mm256_cvtepu8_epi64(__m128i __V) 409{ 410 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di); 411} 412 413static __inline__ __m256i __DEFAULT_FN_ATTRS256 414_mm256_cvtepu16_epi32(__m128i __V) 415{ 416 return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si); 417} 418 419static __inline__ __m256i __DEFAULT_FN_ATTRS256 420_mm256_cvtepu16_epi64(__m128i __V) 421{ 422 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di); 423} 424 425static __inline__ __m256i __DEFAULT_FN_ATTRS256 426_mm256_cvtepu32_epi64(__m128i __V) 427{ 428 return (__m256i)__builtin_convertvector((__v4su)__V, __v4di); 429} 430 431static __inline__ __m256i __DEFAULT_FN_ATTRS256 432_mm256_mul_epi32(__m256i __a, __m256i __b) 433{ 434 return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b); 435} 436 437static __inline__ __m256i __DEFAULT_FN_ATTRS256 438_mm256_mulhrs_epi16(__m256i __a, __m256i __b) 439{ 440 return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b); 441} 442 443static __inline__ __m256i __DEFAULT_FN_ATTRS256 444_mm256_mulhi_epu16(__m256i __a, __m256i __b) 445{ 446 return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b); 447} 448 449static __inline__ __m256i __DEFAULT_FN_ATTRS256 450_mm256_mulhi_epi16(__m256i __a, __m256i __b) 451{ 452 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b); 453} 454 455static __inline__ __m256i __DEFAULT_FN_ATTRS256 456_mm256_mullo_epi16(__m256i __a, __m256i __b) 457{ 458 return (__m256i)((__v16hu)__a * (__v16hu)__b); 459} 460 461static __inline__ __m256i __DEFAULT_FN_ATTRS256 462_mm256_mullo_epi32 (__m256i __a, __m256i __b) 463{ 464 return (__m256i)((__v8su)__a * (__v8su)__b); 465} 466 467static __inline__ __m256i __DEFAULT_FN_ATTRS256 468_mm256_mul_epu32(__m256i __a, __m256i __b) 469{ 470 return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b); 471} 472 473static __inline__ __m256i __DEFAULT_FN_ATTRS256 474_mm256_or_si256(__m256i __a, __m256i __b) 475{ 476 return (__m256i)((__v4du)__a | (__v4du)__b); 477} 478 479static __inline__ __m256i __DEFAULT_FN_ATTRS256 480_mm256_sad_epu8(__m256i __a, __m256i __b) 481{ 482 return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b); 483} 484 485static __inline__ __m256i __DEFAULT_FN_ATTRS256 486_mm256_shuffle_epi8(__m256i __a, __m256i __b) 487{ 488 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b); 489} 490 491#define _mm256_shuffle_epi32(a, imm) \ 492 (__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)) 493 494#define _mm256_shufflehi_epi16(a, imm) \ 495 (__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)) 496 497#define _mm256_shufflelo_epi16(a, imm) \ 498 (__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)) 499 500static __inline__ __m256i __DEFAULT_FN_ATTRS256 501_mm256_sign_epi8(__m256i __a, __m256i __b) 502{ 503 return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b); 504} 505 506static __inline__ __m256i __DEFAULT_FN_ATTRS256 507_mm256_sign_epi16(__m256i __a, __m256i __b) 508{ 509 return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b); 510} 511 512static __inline__ __m256i __DEFAULT_FN_ATTRS256 513_mm256_sign_epi32(__m256i __a, __m256i __b) 514{ 515 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b); 516} 517 518#define _mm256_slli_si256(a, imm) \ 519 (__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)) 520 521#define _mm256_bslli_epi128(a, imm) \ 522 (__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)) 523 524static __inline__ __m256i __DEFAULT_FN_ATTRS256 525_mm256_slli_epi16(__m256i __a, int __count) 526{ 527 return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count); 528} 529 530static __inline__ __m256i __DEFAULT_FN_ATTRS256 531_mm256_sll_epi16(__m256i __a, __m128i __count) 532{ 533 return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count); 534} 535 536static __inline__ __m256i __DEFAULT_FN_ATTRS256 537_mm256_slli_epi32(__m256i __a, int __count) 538{ 539 return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count); 540} 541 542static __inline__ __m256i __DEFAULT_FN_ATTRS256 543_mm256_sll_epi32(__m256i __a, __m128i __count) 544{ 545 return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count); 546} 547 548static __inline__ __m256i __DEFAULT_FN_ATTRS256 549_mm256_slli_epi64(__m256i __a, int __count) 550{ 551 return __builtin_ia32_psllqi256((__v4di)__a, __count); 552} 553 554static __inline__ __m256i __DEFAULT_FN_ATTRS256 555_mm256_sll_epi64(__m256i __a, __m128i __count) 556{ 557 return __builtin_ia32_psllq256((__v4di)__a, __count); 558} 559 560static __inline__ __m256i __DEFAULT_FN_ATTRS256 561_mm256_srai_epi16(__m256i __a, int __count) 562{ 563 return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count); 564} 565 566static __inline__ __m256i __DEFAULT_FN_ATTRS256 567_mm256_sra_epi16(__m256i __a, __m128i __count) 568{ 569 return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count); 570} 571 572static __inline__ __m256i __DEFAULT_FN_ATTRS256 573_mm256_srai_epi32(__m256i __a, int __count) 574{ 575 return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count); 576} 577 578static __inline__ __m256i __DEFAULT_FN_ATTRS256 579_mm256_sra_epi32(__m256i __a, __m128i __count) 580{ 581 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count); 582} 583 584#define _mm256_srli_si256(a, imm) \ 585 (__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)) 586 587#define _mm256_bsrli_epi128(a, imm) \ 588 (__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)) 589 590static __inline__ __m256i __DEFAULT_FN_ATTRS256 591_mm256_srli_epi16(__m256i __a, int __count) 592{ 593 return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count); 594} 595 596static __inline__ __m256i __DEFAULT_FN_ATTRS256 597_mm256_srl_epi16(__m256i __a, __m128i __count) 598{ 599 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count); 600} 601 602static __inline__ __m256i __DEFAULT_FN_ATTRS256 603_mm256_srli_epi32(__m256i __a, int __count) 604{ 605 return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count); 606} 607 608static __inline__ __m256i __DEFAULT_FN_ATTRS256 609_mm256_srl_epi32(__m256i __a, __m128i __count) 610{ 611 return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count); 612} 613 614static __inline__ __m256i __DEFAULT_FN_ATTRS256 615_mm256_srli_epi64(__m256i __a, int __count) 616{ 617 return __builtin_ia32_psrlqi256((__v4di)__a, __count); 618} 619 620static __inline__ __m256i __DEFAULT_FN_ATTRS256 621_mm256_srl_epi64(__m256i __a, __m128i __count) 622{ 623 return __builtin_ia32_psrlq256((__v4di)__a, __count); 624} 625 626static __inline__ __m256i __DEFAULT_FN_ATTRS256 627_mm256_sub_epi8(__m256i __a, __m256i __b) 628{ 629 return (__m256i)((__v32qu)__a - (__v32qu)__b); 630} 631 632static __inline__ __m256i __DEFAULT_FN_ATTRS256 633_mm256_sub_epi16(__m256i __a, __m256i __b) 634{ 635 return (__m256i)((__v16hu)__a - (__v16hu)__b); 636} 637 638static __inline__ __m256i __DEFAULT_FN_ATTRS256 639_mm256_sub_epi32(__m256i __a, __m256i __b) 640{ 641 return (__m256i)((__v8su)__a - (__v8su)__b); 642} 643 644static __inline__ __m256i __DEFAULT_FN_ATTRS256 645_mm256_sub_epi64(__m256i __a, __m256i __b) 646{ 647 return (__m256i)((__v4du)__a - (__v4du)__b); 648} 649 650static __inline__ __m256i __DEFAULT_FN_ATTRS256 651_mm256_subs_epi8(__m256i __a, __m256i __b) 652{ 653 return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b); 654} 655 656static __inline__ __m256i __DEFAULT_FN_ATTRS256 657_mm256_subs_epi16(__m256i __a, __m256i __b) 658{ 659 return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b); 660} 661 662static __inline__ __m256i __DEFAULT_FN_ATTRS256 663_mm256_subs_epu8(__m256i __a, __m256i __b) 664{ 665 return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b); 666} 667 668static __inline__ __m256i __DEFAULT_FN_ATTRS256 669_mm256_subs_epu16(__m256i __a, __m256i __b) 670{ 671 return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b); 672} 673 674static __inline__ __m256i __DEFAULT_FN_ATTRS256 675_mm256_unpackhi_epi8(__m256i __a, __m256i __b) 676{ 677 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31); 678} 679 680static __inline__ __m256i __DEFAULT_FN_ATTRS256 681_mm256_unpackhi_epi16(__m256i __a, __m256i __b) 682{ 683 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 684} 685 686static __inline__ __m256i __DEFAULT_FN_ATTRS256 687_mm256_unpackhi_epi32(__m256i __a, __m256i __b) 688{ 689 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7); 690} 691 692static __inline__ __m256i __DEFAULT_FN_ATTRS256 693_mm256_unpackhi_epi64(__m256i __a, __m256i __b) 694{ 695 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3); 696} 697 698static __inline__ __m256i __DEFAULT_FN_ATTRS256 699_mm256_unpacklo_epi8(__m256i __a, __m256i __b) 700{ 701 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23); 702} 703 704static __inline__ __m256i __DEFAULT_FN_ATTRS256 705_mm256_unpacklo_epi16(__m256i __a, __m256i __b) 706{ 707 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11); 708} 709 710static __inline__ __m256i __DEFAULT_FN_ATTRS256 711_mm256_unpacklo_epi32(__m256i __a, __m256i __b) 712{ 713 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5); 714} 715 716static __inline__ __m256i __DEFAULT_FN_ATTRS256 717_mm256_unpacklo_epi64(__m256i __a, __m256i __b) 718{ 719 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2); 720} 721 722static __inline__ __m256i __DEFAULT_FN_ATTRS256 723_mm256_xor_si256(__m256i __a, __m256i __b) 724{ 725 return (__m256i)((__v4du)__a ^ (__v4du)__b); 726} 727 728static __inline__ __m256i __DEFAULT_FN_ATTRS256 729_mm256_stream_load_si256(__m256i const *__V) 730{ 731 typedef __v4di __v4di_aligned __attribute__((aligned(32))); 732 return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V); 733} 734 735static __inline__ __m128 __DEFAULT_FN_ATTRS128 736_mm_broadcastss_ps(__m128 __X) 737{ 738 return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0); 739} 740 741static __inline__ __m128d __DEFAULT_FN_ATTRS128 742_mm_broadcastsd_pd(__m128d __a) 743{ 744 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 745} 746 747static __inline__ __m256 __DEFAULT_FN_ATTRS256 748_mm256_broadcastss_ps(__m128 __X) 749{ 750 return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0); 751} 752 753static __inline__ __m256d __DEFAULT_FN_ATTRS256 754_mm256_broadcastsd_pd(__m128d __X) 755{ 756 return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0); 757} 758 759static __inline__ __m256i __DEFAULT_FN_ATTRS256 760_mm256_broadcastsi128_si256(__m128i __X) 761{ 762 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1); 763} 764 765#define _mm_blend_epi32(V1, V2, M) \ 766 (__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \ 767 (__v4si)(__m128i)(V2), (int)(M)) 768 769#define _mm256_blend_epi32(V1, V2, M) \ 770 (__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \ 771 (__v8si)(__m256i)(V2), (int)(M)) 772 773static __inline__ __m256i __DEFAULT_FN_ATTRS256 774_mm256_broadcastb_epi8(__m128i __X) 775{ 776 return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 777} 778 779static __inline__ __m256i __DEFAULT_FN_ATTRS256 780_mm256_broadcastw_epi16(__m128i __X) 781{ 782 return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 783} 784 785static __inline__ __m256i __DEFAULT_FN_ATTRS256 786_mm256_broadcastd_epi32(__m128i __X) 787{ 788 return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0); 789} 790 791static __inline__ __m256i __DEFAULT_FN_ATTRS256 792_mm256_broadcastq_epi64(__m128i __X) 793{ 794 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0); 795} 796 797static __inline__ __m128i __DEFAULT_FN_ATTRS128 798_mm_broadcastb_epi8(__m128i __X) 799{ 800 return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 801} 802 803static __inline__ __m128i __DEFAULT_FN_ATTRS128 804_mm_broadcastw_epi16(__m128i __X) 805{ 806 return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0); 807} 808 809 810static __inline__ __m128i __DEFAULT_FN_ATTRS128 811_mm_broadcastd_epi32(__m128i __X) 812{ 813 return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0); 814} 815 816static __inline__ __m128i __DEFAULT_FN_ATTRS128 817_mm_broadcastq_epi64(__m128i __X) 818{ 819 return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0); 820} 821 822static __inline__ __m256i __DEFAULT_FN_ATTRS256 823_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) 824{ 825 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b); 826} 827 828#define _mm256_permute4x64_pd(V, M) \ 829 (__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)) 830 831static __inline__ __m256 __DEFAULT_FN_ATTRS256 832_mm256_permutevar8x32_ps(__m256 __a, __m256i __b) 833{ 834 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b); 835} 836 837#define _mm256_permute4x64_epi64(V, M) \ 838 (__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)) 839 840#define _mm256_permute2x128_si256(V1, V2, M) \ 841 (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)) 842 843#define _mm256_extracti128_si256(V, M) \ 844 (__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)) 845 846#define _mm256_inserti128_si256(V1, V2, M) \ 847 (__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \ 848 (__v2di)(__m128i)(V2), (int)(M)) 849 850static __inline__ __m256i __DEFAULT_FN_ATTRS256 851_mm256_maskload_epi32(int const *__X, __m256i __M) 852{ 853 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M); 854} 855 856static __inline__ __m256i __DEFAULT_FN_ATTRS256 857_mm256_maskload_epi64(long long const *__X, __m256i __M) 858{ 859 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M); 860} 861 862static __inline__ __m128i __DEFAULT_FN_ATTRS128 863_mm_maskload_epi32(int const *__X, __m128i __M) 864{ 865 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M); 866} 867 868static __inline__ __m128i __DEFAULT_FN_ATTRS128 869_mm_maskload_epi64(long long const *__X, __m128i __M) 870{ 871 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M); 872} 873 874static __inline__ void __DEFAULT_FN_ATTRS256 875_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) 876{ 877 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y); 878} 879 880static __inline__ void __DEFAULT_FN_ATTRS256 881_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) 882{ 883 __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y); 884} 885 886static __inline__ void __DEFAULT_FN_ATTRS128 887_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y) 888{ 889 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y); 890} 891 892static __inline__ void __DEFAULT_FN_ATTRS128 893_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) 894{ 895 __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y); 896} 897 898static __inline__ __m256i __DEFAULT_FN_ATTRS256 899_mm256_sllv_epi32(__m256i __X, __m256i __Y) 900{ 901 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y); 902} 903 904static __inline__ __m128i __DEFAULT_FN_ATTRS128 905_mm_sllv_epi32(__m128i __X, __m128i __Y) 906{ 907 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y); 908} 909 910static __inline__ __m256i __DEFAULT_FN_ATTRS256 911_mm256_sllv_epi64(__m256i __X, __m256i __Y) 912{ 913 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y); 914} 915 916static __inline__ __m128i __DEFAULT_FN_ATTRS128 917_mm_sllv_epi64(__m128i __X, __m128i __Y) 918{ 919 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y); 920} 921 922static __inline__ __m256i __DEFAULT_FN_ATTRS256 923_mm256_srav_epi32(__m256i __X, __m256i __Y) 924{ 925 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y); 926} 927 928static __inline__ __m128i __DEFAULT_FN_ATTRS128 929_mm_srav_epi32(__m128i __X, __m128i __Y) 930{ 931 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y); 932} 933 934static __inline__ __m256i __DEFAULT_FN_ATTRS256 935_mm256_srlv_epi32(__m256i __X, __m256i __Y) 936{ 937 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y); 938} 939 940static __inline__ __m128i __DEFAULT_FN_ATTRS128 941_mm_srlv_epi32(__m128i __X, __m128i __Y) 942{ 943 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y); 944} 945 946static __inline__ __m256i __DEFAULT_FN_ATTRS256 947_mm256_srlv_epi64(__m256i __X, __m256i __Y) 948{ 949 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y); 950} 951 952static __inline__ __m128i __DEFAULT_FN_ATTRS128 953_mm_srlv_epi64(__m128i __X, __m128i __Y) 954{ 955 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y); 956} 957 958#define _mm_mask_i32gather_pd(a, m, i, mask, s) \ 959 (__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \ 960 (double const *)(m), \ 961 (__v4si)(__m128i)(i), \ 962 (__v2df)(__m128d)(mask), (s)) 963 964#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \ 965 (__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \ 966 (double const *)(m), \ 967 (__v4si)(__m128i)(i), \ 968 (__v4df)(__m256d)(mask), (s)) 969 970#define _mm_mask_i64gather_pd(a, m, i, mask, s) \ 971 (__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \ 972 (double const *)(m), \ 973 (__v2di)(__m128i)(i), \ 974 (__v2df)(__m128d)(mask), (s)) 975 976#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \ 977 (__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \ 978 (double const *)(m), \ 979 (__v4di)(__m256i)(i), \ 980 (__v4df)(__m256d)(mask), (s)) 981 982#define _mm_mask_i32gather_ps(a, m, i, mask, s) \ 983 (__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \ 984 (float const *)(m), \ 985 (__v4si)(__m128i)(i), \ 986 (__v4sf)(__m128)(mask), (s)) 987 988#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \ 989 (__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \ 990 (float const *)(m), \ 991 (__v8si)(__m256i)(i), \ 992 (__v8sf)(__m256)(mask), (s)) 993 994#define _mm_mask_i64gather_ps(a, m, i, mask, s) \ 995 (__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \ 996 (float const *)(m), \ 997 (__v2di)(__m128i)(i), \ 998 (__v4sf)(__m128)(mask), (s)) 999 1000#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \ 1001 (__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \ 1002 (float const *)(m), \ 1003 (__v4di)(__m256i)(i), \ 1004 (__v4sf)(__m128)(mask), (s)) 1005 1006#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \ 1007 (__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \ 1008 (int const *)(m), \ 1009 (__v4si)(__m128i)(i), \ 1010 (__v4si)(__m128i)(mask), (s)) 1011 1012#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \ 1013 (__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \ 1014 (int const *)(m), \ 1015 (__v8si)(__m256i)(i), \ 1016 (__v8si)(__m256i)(mask), (s)) 1017 1018#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \ 1019 (__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \ 1020 (int const *)(m), \ 1021 (__v2di)(__m128i)(i), \ 1022 (__v4si)(__m128i)(mask), (s)) 1023 1024#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \ 1025 (__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \ 1026 (int const *)(m), \ 1027 (__v4di)(__m256i)(i), \ 1028 (__v4si)(__m128i)(mask), (s)) 1029 1030#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \ 1031 (__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \ 1032 (long long const *)(m), \ 1033 (__v4si)(__m128i)(i), \ 1034 (__v2di)(__m128i)(mask), (s)) 1035 1036#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \ 1037 (__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \ 1038 (long long const *)(m), \ 1039 (__v4si)(__m128i)(i), \ 1040 (__v4di)(__m256i)(mask), (s)) 1041 1042#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \ 1043 (__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \ 1044 (long long const *)(m), \ 1045 (__v2di)(__m128i)(i), \ 1046 (__v2di)(__m128i)(mask), (s)) 1047 1048#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \ 1049 (__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \ 1050 (long long const *)(m), \ 1051 (__v4di)(__m256i)(i), \ 1052 (__v4di)(__m256i)(mask), (s)) 1053 1054#define _mm_i32gather_pd(m, i, s) \ 1055 (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \ 1056 (double const *)(m), \ 1057 (__v4si)(__m128i)(i), \ 1058 (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \ 1059 _mm_setzero_pd()), \ 1060 (s)) 1061 1062#define _mm256_i32gather_pd(m, i, s) \ 1063 (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \ 1064 (double const *)(m), \ 1065 (__v4si)(__m128i)(i), \ 1066 (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \ 1067 _mm256_setzero_pd(), \ 1068 _CMP_EQ_OQ), \ 1069 (s)) 1070 1071#define _mm_i64gather_pd(m, i, s) \ 1072 (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \ 1073 (double const *)(m), \ 1074 (__v2di)(__m128i)(i), \ 1075 (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \ 1076 _mm_setzero_pd()), \ 1077 (s)) 1078 1079#define _mm256_i64gather_pd(m, i, s) \ 1080 (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \ 1081 (double const *)(m), \ 1082 (__v4di)(__m256i)(i), \ 1083 (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \ 1084 _mm256_setzero_pd(), \ 1085 _CMP_EQ_OQ), \ 1086 (s)) 1087 1088#define _mm_i32gather_ps(m, i, s) \ 1089 (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \ 1090 (float const *)(m), \ 1091 (__v4si)(__m128i)(i), \ 1092 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ 1093 _mm_setzero_ps()), \ 1094 (s)) 1095 1096#define _mm256_i32gather_ps(m, i, s) \ 1097 (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \ 1098 (float const *)(m), \ 1099 (__v8si)(__m256i)(i), \ 1100 (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \ 1101 _mm256_setzero_ps(), \ 1102 _CMP_EQ_OQ), \ 1103 (s)) 1104 1105#define _mm_i64gather_ps(m, i, s) \ 1106 (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \ 1107 (float const *)(m), \ 1108 (__v2di)(__m128i)(i), \ 1109 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ 1110 _mm_setzero_ps()), \ 1111 (s)) 1112 1113#define _mm256_i64gather_ps(m, i, s) \ 1114 (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \ 1115 (float const *)(m), \ 1116 (__v4di)(__m256i)(i), \ 1117 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ 1118 _mm_setzero_ps()), \ 1119 (s)) 1120 1121#define _mm_i32gather_epi32(m, i, s) \ 1122 (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \ 1123 (int const *)(m), (__v4si)(__m128i)(i), \ 1124 (__v4si)_mm_set1_epi32(-1), (s)) 1125 1126#define _mm256_i32gather_epi32(m, i, s) \ 1127 (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \ 1128 (int const *)(m), (__v8si)(__m256i)(i), \ 1129 (__v8si)_mm256_set1_epi32(-1), (s)) 1130 1131#define _mm_i64gather_epi32(m, i, s) \ 1132 (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \ 1133 (int const *)(m), (__v2di)(__m128i)(i), \ 1134 (__v4si)_mm_set1_epi32(-1), (s)) 1135 1136#define _mm256_i64gather_epi32(m, i, s) \ 1137 (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \ 1138 (int const *)(m), (__v4di)(__m256i)(i), \ 1139 (__v4si)_mm_set1_epi32(-1), (s)) 1140 1141#define _mm_i32gather_epi64(m, i, s) \ 1142 (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \ 1143 (long long const *)(m), \ 1144 (__v4si)(__m128i)(i), \ 1145 (__v2di)_mm_set1_epi64x(-1), (s)) 1146 1147#define _mm256_i32gather_epi64(m, i, s) \ 1148 (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \ 1149 (long long const *)(m), \ 1150 (__v4si)(__m128i)(i), \ 1151 (__v4di)_mm256_set1_epi64x(-1), (s)) 1152 1153#define _mm_i64gather_epi64(m, i, s) \ 1154 (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \ 1155 (long long const *)(m), \ 1156 (__v2di)(__m128i)(i), \ 1157 (__v2di)_mm_set1_epi64x(-1), (s)) 1158 1159#define _mm256_i64gather_epi64(m, i, s) \ 1160 (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \ 1161 (long long const *)(m), \ 1162 (__v4di)(__m256i)(i), \ 1163 (__v4di)_mm256_set1_epi64x(-1), (s)) 1164 1165#undef __DEFAULT_FN_ATTRS256 1166#undef __DEFAULT_FN_ATTRS128 1167 1168#endif /* __AVX2INTRIN_H */ 1169