/* Copyright (C) 2011-2015 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* AVX2 intrinsics: thin wrappers that map each _mm256_* intrinsic onto
   a GCC __builtin_ia32_* builtin or a generic vector operation.  Must
   be reached only through <immintrin.h>.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVX2INTRIN_H_INCLUDED
#define _AVX2INTRIN_H_INCLUDED

/* If the translation unit was not compiled with -mavx2, turn AVX2 code
   generation on for this header only; __DISABLE_AVX2__ marks that the
   saved target options must be popped again at the end of the file.  */
#ifndef __AVX2__
#pragma GCC push_options
#pragma GCC target("avx2")
#define __DISABLE_AVX2__
#endif /* __AVX2__ */

/* Sum absolute 8-bit integer difference of adjacent groups of 4
   byte integers in the first 2 operands.  Starting offsets within
   operands are determined by the 3rd mask operand.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
					      (__v32qi)__Y, __M);
}
#else
/* Without optimization the inline form cannot guarantee __M folds to an
   immediate operand, so use a macro instead.  */
#define _mm256_mpsadbw_epu8(X, Y, M)					\
  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X),		\
					(__v32qi)(__m256i)(Y), (int)(M)))
#endif

/* Packed absolute value of signed 8-, 16- and 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi8 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi16 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi32 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
}

/* Pack with signed/unsigned saturation to narrower elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
}

/* Packed addition.  Wrapping addition is expressed with generic vector
   arithmetic on the unsigned element types, so overflow wraps without
   undefined behavior.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qu)__A + (__v32qu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hu)__A + (__v16hu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8su)__A + (__v8su)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A + (__v4du)__B);
}

/* Saturating addition (signed and unsigned).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
}

/* Concatenate pairs of 128-bit lanes and shift right by __N bytes
   (the builtin takes the count in bits, hence the * 8).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
{
  return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
					      (__v4di)__B,
					      __N * 8);
}
#else
/* In that case (__N*8) will be in vreg, and insn will not be matched. */
/* Use define instead */
#define _mm256_alignr_epi8(A, B, N)				   \
  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A),	   \
					(__v4di)(__m256i)(B),	   \
					(int)(N) * 8))
#endif

/* Bitwise logic on the full 256-bit value.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A & (__v4du)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
}

/* Rounded unsigned average.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
}

/* Byte-wise blend selected by the sign bit of each byte of __M.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
{
  return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
					       (__v32qi)__Y,
					       (__v32qi)__M);
}

/* 16-bit blend with an immediate mask; macro form when not optimizing
   so the mask stays an immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
					      (__v16hi)__Y,
					      __M);
}
#else
#define _mm256_blend_epi16(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X),		\
					(__v16hi)(__m256i)(Y), (int)(M)))
#endif

/* Element-wise compares, via generic vector comparisons; each lane is
   all-ones when the predicate holds, zero otherwise.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qi)__A == (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hi)__A == (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8si)__A == (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4di)__A == (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qi)__A > (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hi)__A > (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8si)__A > (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4di)__A > (__v4di)__B);
}

/* Horizontal add/subtract of adjacent element pairs, with and without
   signed saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
					     (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadds_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
					      (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
					     (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
					      (__v16hi)__Y);
}

/* Multiply-accumulate of adjacent element pairs.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
						(__v32qi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_madd_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
					     (__v16hi)__B);
}

/* Element-wise maximum, signed then unsigned.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
}

/* Element-wise minimum, signed then unsigned.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
}

/* Collect the sign bit of each byte into a 32-bit mask.  */
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_epi8 (__m256i __A)
{
  return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
}

/* Sign-extending widening conversions from the low elements of a
   128-bit source.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
}

/* Zero-extending widening conversions.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
}

/* Packed multiplies: widening 32x32->64 (signed/unsigned), high-half,
   rounded-scaled, and low-half products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
					       (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hu)__A * (__v16hu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8su)__A * (__v8su)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A | (__v4du)__B);
}

/* Sum of absolute byte differences.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sad_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
}

/* Byte shuffle controlled by a vector mask.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
					     (__v32qi)__Y);
}

/* Immediate-controlled shuffles; macro forms when not optimizing so
   the control stays an immediate operand.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi32 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
}
#else
#define _mm256_shuffle_epi32(A, N) \
  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
#define _mm256_shufflehi_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
#define _mm256_shufflelo_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
#endif

/* Conditionally negate/zero/keep elements of __X by the sign of __Y.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
}

/* Byte shift left within each 128-bit lane; both names map to the same
   builtin, which takes the count in bits (hence * 8).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bslli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}
#else
#define _mm256_bslli_epi128(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_slli_si256(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#endif

/* Element shifts left.  The _slli_ forms take an immediate-style int
   count; the _sll_ forms take the count in the low quadword of a
   128-bit vector.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
}

/* Arithmetic (sign-propagating) shifts right; no 64-bit variant exists
   in AVX2.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
}

/* Byte shift right within each 128-bit lane; both names map to the
   same builtin, which takes the count in bits (hence * 8).  Macro
   forms keep the count an immediate when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bsrli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}
#else
#define _mm256_bsrli_epi128(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_srli_si256(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#endif

/* Logical (zero-filling) shifts right.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
}

/* Packed subtraction; wrapping forms use generic vector arithmetic on
   the unsigned element types so overflow wraps without undefined
   behavior.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qu)__A - (__v32qu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hu)__A - (__v16hu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8su)__A - (__v8su)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A - (__v4du)__B);
}

/* Saturating subtraction (signed and unsigned).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
}

/* Interleave high halves of each 128-bit lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
}

/* Interleave low halves of each 128-bit lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A ^ (__v4du)__B);
}

/* Non-temporal aligned load.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_load_si256 (__m256i const *__X)
{
  return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
}

/* Broadcast the low scalar/lane of the source across the result.  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastss_ps (__m128 __X)
{
  return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastss_ps (__m128 __X)
{
  return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsd_pd (__m128d __X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsi128_si256 (__m128i __X)
{
  return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
}

/* 32-bit blend with an immediate mask; macro forms keep the mask an
   immediate when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
					      (__v4si)__Y,
					      __M);
}
#else
#define _mm_blend_epi32(X, Y, M)					\
  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),		\
					(__v4si)(__m128i)(Y), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
					      (__v8si)__Y,
					      __M);
}
#else
#define _mm256_blend_epi32(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),		\
					(__v8si)(__m256i)(Y), (int)(M)))
#endif

/* Broadcast the lowest element across all elements (256-bit result).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastb_epi8 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastw_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastd_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastq_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
}

/* Broadcast the lowest element across all elements (128-bit result).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastb_epi8 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastw_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastd_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastq_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
}

/* Cross-lane permutes, variable- and immediate-controlled.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_pd (__m256d __X, const int __M)
{
  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
}
#else
#define _mm256_permute4x64_pd(X, M)					\
  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
#endif

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
{
  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_epi64 (__m256i __X, const int __M)
{
  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
}
#else
#define _mm256_permute4x64_epi64(X, M)					\
  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
#endif


#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
}
#else
#define _mm256_permute2x128_si256(X, Y, M)				\
  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
#endif

/* Extract/insert a 128-bit lane selected by an immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti128_si256 (__m256i __X, const int __M)
{
  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
}
#else
#define _mm256_extracti128_si256(X, M)					\
  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
}
#else
#define _mm256_inserti128_si256(X, Y, M)				\
  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X),	\
					   (__v2di)(__m128i)(Y),	\
					   (int)(M)))
#endif

/* Masked loads: elements whose mask sign bit is clear are zeroed and
   the corresponding memory is not touched.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi32 (int const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
						(__v8si)__M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi64 (long long const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
						(__v4di)__M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi32 (int const *__X, __m128i __M )
{
  return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
					     (__v4si)__M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi64 (long long const *__X, __m128i __M )
{
  return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
					     (__v2di)__M);
}

/* NOTE(review): the body of _mm256_maskstore_epi32 continues beyond
   this chunk; only the declaration and opening brace are visible here.  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
{
1152 __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y); 1153} 1154 1155extern __inline void 1156__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1157_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y ) 1158{ 1159 __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y); 1160} 1161 1162extern __inline void 1163__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1164_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y ) 1165{ 1166 __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y); 1167} 1168 1169extern __inline void 1170__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1171_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y ) 1172{ 1173 __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y); 1174} 1175 1176extern __inline __m256i 1177__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1178_mm256_sllv_epi32 (__m256i __X, __m256i __Y) 1179{ 1180 return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y); 1181} 1182 1183extern __inline __m128i 1184__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1185_mm_sllv_epi32 (__m128i __X, __m128i __Y) 1186{ 1187 return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y); 1188} 1189 1190extern __inline __m256i 1191__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1192_mm256_sllv_epi64 (__m256i __X, __m256i __Y) 1193{ 1194 return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y); 1195} 1196 1197extern __inline __m128i 1198__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1199_mm_sllv_epi64 (__m128i __X, __m128i __Y) 1200{ 1201 return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y); 1202} 1203 1204extern __inline __m256i 1205__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1206_mm256_srav_epi32 (__m256i __X, __m256i __Y) 1207{ 1208 return (__m256i) 
__builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y); 1209} 1210 1211extern __inline __m128i 1212__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1213_mm_srav_epi32 (__m128i __X, __m128i __Y) 1214{ 1215 return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y); 1216} 1217 1218extern __inline __m256i 1219__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1220_mm256_srlv_epi32 (__m256i __X, __m256i __Y) 1221{ 1222 return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y); 1223} 1224 1225extern __inline __m128i 1226__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1227_mm_srlv_epi32 (__m128i __X, __m128i __Y) 1228{ 1229 return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y); 1230} 1231 1232extern __inline __m256i 1233__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1234_mm256_srlv_epi64 (__m256i __X, __m256i __Y) 1235{ 1236 return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y); 1237} 1238 1239extern __inline __m128i 1240__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1241_mm_srlv_epi64 (__m128i __X, __m128i __Y) 1242{ 1243 return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y); 1244} 1245 1246#ifdef __OPTIMIZE__ 1247extern __inline __m128d 1248__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1249_mm_i32gather_pd (double const *base, __m128i index, const int scale) 1250{ 1251 __v2df zero = _mm_setzero_pd (); 1252 __v2df mask = _mm_cmpeq_pd (zero, zero); 1253 1254 return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (), 1255 base, 1256 (__v4si)index, 1257 mask, 1258 scale); 1259} 1260 1261extern __inline __m128d 1262__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1263_mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index, 1264 __m128d mask, const int scale) 1265{ 1266 return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src, 1267 base, 1268 (__v4si)index, 
1269 (__v2df)mask, 1270 scale); 1271} 1272 1273extern __inline __m256d 1274__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1275_mm256_i32gather_pd (double const *base, __m128i index, const int scale) 1276{ 1277 __v4df zero = _mm256_setzero_pd (); 1278 __v4df mask = _mm256_cmp_pd (zero, zero, _CMP_EQ_OQ); 1279 1280 return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (), 1281 base, 1282 (__v4si)index, 1283 mask, 1284 scale); 1285} 1286 1287extern __inline __m256d 1288__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1289_mm256_mask_i32gather_pd (__m256d src, double const *base, 1290 __m128i index, __m256d mask, const int scale) 1291{ 1292 return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src, 1293 base, 1294 (__v4si)index, 1295 (__v4df)mask, 1296 scale); 1297} 1298 1299extern __inline __m128d 1300__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1301_mm_i64gather_pd (double const *base, __m128i index, const int scale) 1302{ 1303 __v2df src = _mm_setzero_pd (); 1304 __v2df mask = _mm_cmpeq_pd (src, src); 1305 1306 return (__m128d) __builtin_ia32_gatherdiv2df (src, 1307 base, 1308 (__v2di)index, 1309 mask, 1310 scale); 1311} 1312 1313extern __inline __m128d 1314__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1315_mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index, 1316 __m128d mask, const int scale) 1317{ 1318 return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src, 1319 base, 1320 (__v2di)index, 1321 (__v2df)mask, 1322 scale); 1323} 1324 1325extern __inline __m256d 1326__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1327_mm256_i64gather_pd (double const *base, __m256i index, const int scale) 1328{ 1329 __v4df src = _mm256_setzero_pd (); 1330 __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ); 1331 1332 return (__m256d) __builtin_ia32_gatherdiv4df (src, 1333 base, 1334 (__v4di)index, 1335 mask, 1336 scale); 1337} 1338 1339extern __inline 
__m256d 1340__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1341_mm256_mask_i64gather_pd (__m256d src, double const *base, 1342 __m256i index, __m256d mask, const int scale) 1343{ 1344 return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src, 1345 base, 1346 (__v4di)index, 1347 (__v4df)mask, 1348 scale); 1349} 1350 1351extern __inline __m128 1352__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1353_mm_i32gather_ps (float const *base, __m128i index, const int scale) 1354{ 1355 __v4sf src = _mm_setzero_ps (); 1356 __v4sf mask = _mm_cmpeq_ps (src, src); 1357 1358 return (__m128) __builtin_ia32_gathersiv4sf (src, 1359 base, 1360 (__v4si)index, 1361 mask, 1362 scale); 1363} 1364 1365extern __inline __m128 1366__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1367_mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index, 1368 __m128 mask, const int scale) 1369{ 1370 return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src, 1371 base, 1372 (__v4si)index, 1373 (__v4sf)mask, 1374 scale); 1375} 1376 1377extern __inline __m256 1378__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1379_mm256_i32gather_ps (float const *base, __m256i index, const int scale) 1380{ 1381 __v8sf src = _mm256_setzero_ps (); 1382 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ); 1383 1384 return (__m256) __builtin_ia32_gathersiv8sf (src, 1385 base, 1386 (__v8si)index, 1387 mask, 1388 scale); 1389} 1390 1391extern __inline __m256 1392__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1393_mm256_mask_i32gather_ps (__m256 src, float const *base, 1394 __m256i index, __m256 mask, const int scale) 1395{ 1396 return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src, 1397 base, 1398 (__v8si)index, 1399 (__v8sf)mask, 1400 scale); 1401} 1402 1403extern __inline __m128 1404__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1405_mm_i64gather_ps (float const *base, __m128i index, const int 
scale) 1406{ 1407 __v4sf src = _mm_setzero_ps (); 1408 __v4sf mask = _mm_cmpeq_ps (src, src); 1409 1410 return (__m128) __builtin_ia32_gatherdiv4sf (src, 1411 base, 1412 (__v2di)index, 1413 mask, 1414 scale); 1415} 1416 1417extern __inline __m128 1418__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1419_mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index, 1420 __m128 mask, const int scale) 1421{ 1422 return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src, 1423 base, 1424 (__v2di)index, 1425 (__v4sf)mask, 1426 scale); 1427} 1428 1429extern __inline __m128 1430__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1431_mm256_i64gather_ps (float const *base, __m256i index, const int scale) 1432{ 1433 __v4sf src = _mm_setzero_ps (); 1434 __v4sf mask = _mm_cmpeq_ps (src, src); 1435 1436 return (__m128) __builtin_ia32_gatherdiv4sf256 (src, 1437 base, 1438 (__v4di)index, 1439 mask, 1440 scale); 1441} 1442 1443extern __inline __m128 1444__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1445_mm256_mask_i64gather_ps (__m128 src, float const *base, 1446 __m256i index, __m128 mask, const int scale) 1447{ 1448 return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src, 1449 base, 1450 (__v4di)index, 1451 (__v4sf)mask, 1452 scale); 1453} 1454 1455extern __inline __m128i 1456__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1457_mm_i32gather_epi64 (long long int const *base, 1458 __m128i index, const int scale) 1459{ 1460 __v2di src = __extension__ (__v2di){ 0, 0 }; 1461 __v2di mask = __extension__ (__v2di){ ~0, ~0 }; 1462 1463 return (__m128i) __builtin_ia32_gathersiv2di (src, 1464 base, 1465 (__v4si)index, 1466 mask, 1467 scale); 1468} 1469 1470extern __inline __m128i 1471__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1472_mm_mask_i32gather_epi64 (__m128i src, long long int const *base, 1473 __m128i index, __m128i mask, const int scale) 1474{ 1475 return (__m128i) 
__builtin_ia32_gathersiv2di ((__v2di)src, 1476 base, 1477 (__v4si)index, 1478 (__v2di)mask, 1479 scale); 1480} 1481 1482extern __inline __m256i 1483__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1484_mm256_i32gather_epi64 (long long int const *base, 1485 __m128i index, const int scale) 1486{ 1487 __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 }; 1488 __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 }; 1489 1490 return (__m256i) __builtin_ia32_gathersiv4di (src, 1491 base, 1492 (__v4si)index, 1493 mask, 1494 scale); 1495} 1496 1497extern __inline __m256i 1498__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1499_mm256_mask_i32gather_epi64 (__m256i src, long long int const *base, 1500 __m128i index, __m256i mask, const int scale) 1501{ 1502 return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src, 1503 base, 1504 (__v4si)index, 1505 (__v4di)mask, 1506 scale); 1507} 1508 1509extern __inline __m128i 1510__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1511_mm_i64gather_epi64 (long long int const *base, 1512 __m128i index, const int scale) 1513{ 1514 __v2di src = __extension__ (__v2di){ 0, 0 }; 1515 __v2di mask = __extension__ (__v2di){ ~0, ~0 }; 1516 1517 return (__m128i) __builtin_ia32_gatherdiv2di (src, 1518 base, 1519 (__v2di)index, 1520 mask, 1521 scale); 1522} 1523 1524extern __inline __m128i 1525__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1526_mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index, 1527 __m128i mask, const int scale) 1528{ 1529 return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src, 1530 base, 1531 (__v2di)index, 1532 (__v2di)mask, 1533 scale); 1534} 1535 1536extern __inline __m256i 1537__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1538_mm256_i64gather_epi64 (long long int const *base, 1539 __m256i index, const int scale) 1540{ 1541 __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 }; 1542 __v4di mask = 
__extension__ (__v4di){ ~0, ~0, ~0, ~0 }; 1543 1544 return (__m256i) __builtin_ia32_gatherdiv4di (src, 1545 base, 1546 (__v4di)index, 1547 mask, 1548 scale); 1549} 1550 1551extern __inline __m256i 1552__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1553_mm256_mask_i64gather_epi64 (__m256i src, long long int const *base, 1554 __m256i index, __m256i mask, const int scale) 1555{ 1556 return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src, 1557 base, 1558 (__v4di)index, 1559 (__v4di)mask, 1560 scale); 1561} 1562 1563extern __inline __m128i 1564__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1565_mm_i32gather_epi32 (int const *base, __m128i index, const int scale) 1566{ 1567 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 }; 1568 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; 1569 1570 return (__m128i) __builtin_ia32_gathersiv4si (src, 1571 base, 1572 (__v4si)index, 1573 mask, 1574 scale); 1575} 1576 1577extern __inline __m128i 1578__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1579_mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index, 1580 __m128i mask, const int scale) 1581{ 1582 return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src, 1583 base, 1584 (__v4si)index, 1585 (__v4si)mask, 1586 scale); 1587} 1588 1589extern __inline __m256i 1590__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1591_mm256_i32gather_epi32 (int const *base, __m256i index, const int scale) 1592{ 1593 __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 }; 1594 __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 }; 1595 1596 return (__m256i) __builtin_ia32_gathersiv8si (src, 1597 base, 1598 (__v8si)index, 1599 mask, 1600 scale); 1601} 1602 1603extern __inline __m256i 1604__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1605_mm256_mask_i32gather_epi32 (__m256i src, int const *base, 1606 __m256i index, __m256i mask, const int scale) 1607{ 1608 
return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src, 1609 base, 1610 (__v8si)index, 1611 (__v8si)mask, 1612 scale); 1613} 1614 1615extern __inline __m128i 1616__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1617_mm_i64gather_epi32 (int const *base, __m128i index, const int scale) 1618{ 1619 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 }; 1620 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; 1621 1622 return (__m128i) __builtin_ia32_gatherdiv4si (src, 1623 base, 1624 (__v2di)index, 1625 mask, 1626 scale); 1627} 1628 1629extern __inline __m128i 1630__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1631_mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index, 1632 __m128i mask, const int scale) 1633{ 1634 return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src, 1635 base, 1636 (__v2di)index, 1637 (__v4si)mask, 1638 scale); 1639} 1640 1641extern __inline __m128i 1642__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1643_mm256_i64gather_epi32 (int const *base, __m256i index, const int scale) 1644{ 1645 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 }; 1646 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; 1647 1648 return (__m128i) __builtin_ia32_gatherdiv4si256 (src, 1649 base, 1650 (__v4di)index, 1651 mask, 1652 scale); 1653} 1654 1655extern __inline __m128i 1656__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1657_mm256_mask_i64gather_epi32 (__m128i src, int const *base, 1658 __m256i index, __m128i mask, const int scale) 1659{ 1660 return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src, 1661 base, 1662 (__v4di)index, 1663 (__v4si)mask, 1664 scale); 1665} 1666#else /* __OPTIMIZE__ */ 1667#define _mm_i32gather_pd(BASE, INDEX, SCALE) \ 1668 (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \ 1669 (double const *)BASE, \ 1670 (__v4si)(__m128i)INDEX, \ 1671 (__v2df)_mm_set1_pd( \ 1672 (double)(long long int) -1), \ 1673 (int)SCALE) 1674 
1675#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1676 (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC, \ 1677 (double const *)BASE, \ 1678 (__v4si)(__m128i)INDEX, \ 1679 (__v2df)(__m128d)MASK, \ 1680 (int)SCALE) 1681 1682#define _mm256_i32gather_pd(BASE, INDEX, SCALE) \ 1683 (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \ 1684 (double const *)BASE, \ 1685 (__v4si)(__m128i)INDEX, \ 1686 (__v4df)_mm256_set1_pd( \ 1687 (double)(long long int) -1), \ 1688 (int)SCALE) 1689 1690#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1691 (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC, \ 1692 (double const *)BASE, \ 1693 (__v4si)(__m128i)INDEX, \ 1694 (__v4df)(__m256d)MASK, \ 1695 (int)SCALE) 1696 1697#define _mm_i64gather_pd(BASE, INDEX, SCALE) \ 1698 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \ 1699 (double const *)BASE, \ 1700 (__v2di)(__m128i)INDEX, \ 1701 (__v2df)_mm_set1_pd( \ 1702 (double)(long long int) -1), \ 1703 (int)SCALE) 1704 1705#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1706 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC, \ 1707 (double const *)BASE, \ 1708 (__v2di)(__m128i)INDEX, \ 1709 (__v2df)(__m128d)MASK, \ 1710 (int)SCALE) 1711 1712#define _mm256_i64gather_pd(BASE, INDEX, SCALE) \ 1713 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \ 1714 (double const *)BASE, \ 1715 (__v4di)(__m256i)INDEX, \ 1716 (__v4df)_mm256_set1_pd( \ 1717 (double)(long long int) -1), \ 1718 (int)SCALE) 1719 1720#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1721 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC, \ 1722 (double const *)BASE, \ 1723 (__v4di)(__m256i)INDEX, \ 1724 (__v4df)(__m256d)MASK, \ 1725 (int)SCALE) 1726 1727#define _mm_i32gather_ps(BASE, INDEX, SCALE) \ 1728 (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \ 1729 (float const *)BASE, \ 1730 
(__v4si)(__m128i)INDEX, \ 1731 _mm_set1_ps ((float)(int) -1), \ 1732 (int)SCALE) 1733 1734#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1735 (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128d)SRC, \ 1736 (float const *)BASE, \ 1737 (__v4si)(__m128i)INDEX, \ 1738 (__v4sf)(__m128d)MASK, \ 1739 (int)SCALE) 1740 1741#define _mm256_i32gather_ps(BASE, INDEX, SCALE) \ 1742 (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \ 1743 (float const *)BASE, \ 1744 (__v8si)(__m256i)INDEX, \ 1745 (__v8sf)_mm256_set1_ps ( \ 1746 (float)(int) -1), \ 1747 (int)SCALE) 1748 1749#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1750 (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC, \ 1751 (float const *)BASE, \ 1752 (__v8si)(__m256i)INDEX, \ 1753 (__v8sf)(__m256d)MASK, \ 1754 (int)SCALE) 1755 1756#define _mm_i64gather_ps(BASE, INDEX, SCALE) \ 1757 (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (), \ 1758 (float const *)BASE, \ 1759 (__v2di)(__m128i)INDEX, \ 1760 (__v4sf)_mm_set1_ps ( \ 1761 (float)(int) -1), \ 1762 (int)SCALE) 1763 1764#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1765 (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC, \ 1766 (float const *)BASE, \ 1767 (__v2di)(__m128i)INDEX, \ 1768 (__v4sf)(__m128d)MASK, \ 1769 (int)SCALE) 1770 1771#define _mm256_i64gather_ps(BASE, INDEX, SCALE) \ 1772 (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \ 1773 (float const *)BASE, \ 1774 (__v4di)(__m256i)INDEX, \ 1775 (__v4sf)_mm_set1_ps( \ 1776 (float)(int) -1), \ 1777 (int)SCALE) 1778 1779#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1780 (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC, \ 1781 (float const *)BASE, \ 1782 (__v4di)(__m256i)INDEX, \ 1783 (__v4sf)(__m128)MASK, \ 1784 (int)SCALE) 1785 1786#define _mm_i32gather_epi64(BASE, INDEX, SCALE) \ 1787 (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \ 1788 
(long long const *)BASE, \ 1789 (__v4si)(__m128i)INDEX, \ 1790 (__v2di)_mm_set1_epi64x (-1), \ 1791 (int)SCALE) 1792 1793#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1794 (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC, \ 1795 (long long const *)BASE, \ 1796 (__v4si)(__m128i)INDEX, \ 1797 (__v2di)(__m128i)MASK, \ 1798 (int)SCALE) 1799 1800#define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \ 1801 (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \ 1802 (long long const *)BASE, \ 1803 (__v4si)(__m128i)INDEX, \ 1804 (__v4di)_mm256_set1_epi64x (-1), \ 1805 (int)SCALE) 1806 1807#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1808 (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC, \ 1809 (long long const *)BASE, \ 1810 (__v4si)(__m128i)INDEX, \ 1811 (__v4di)(__m256i)MASK, \ 1812 (int)SCALE) 1813 1814#define _mm_i64gather_epi64(BASE, INDEX, SCALE) \ 1815 (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \ 1816 (long long const *)BASE, \ 1817 (__v2di)(__m128i)INDEX, \ 1818 (__v2di)_mm_set1_epi64x (-1), \ 1819 (int)SCALE) 1820 1821#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1822 (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC, \ 1823 (long long const *)BASE, \ 1824 (__v2di)(__m128i)INDEX, \ 1825 (__v2di)(__m128i)MASK, \ 1826 (int)SCALE) 1827 1828#define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \ 1829 (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \ 1830 (long long const *)BASE, \ 1831 (__v4di)(__m256i)INDEX, \ 1832 (__v4di)_mm256_set1_epi64x (-1), \ 1833 (int)SCALE) 1834 1835#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1836 (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC, \ 1837 (long long const *)BASE, \ 1838 (__v4di)(__m256i)INDEX, \ 1839 (__v4di)(__m256i)MASK, \ 1840 (int)SCALE) 1841 1842#define _mm_i32gather_epi32(BASE, INDEX, SCALE) \ 1843 (__m128i) 
__builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \ 1844 (int const *)BASE, \ 1845 (__v4si)(__m128i)INDEX, \ 1846 (__v4si)_mm_set1_epi32 (-1), \ 1847 (int)SCALE) 1848 1849#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1850 (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC, \ 1851 (int const *)BASE, \ 1852 (__v4si)(__m128i)INDEX, \ 1853 (__v4si)(__m128i)MASK, \ 1854 (int)SCALE) 1855 1856#define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \ 1857 (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \ 1858 (int const *)BASE, \ 1859 (__v8si)(__m256i)INDEX, \ 1860 (__v8si)_mm256_set1_epi32 (-1), \ 1861 (int)SCALE) 1862 1863#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1864 (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC, \ 1865 (int const *)BASE, \ 1866 (__v8si)(__m256i)INDEX, \ 1867 (__v8si)(__m256i)MASK, \ 1868 (int)SCALE) 1869 1870#define _mm_i64gather_epi32(BASE, INDEX, SCALE) \ 1871 (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \ 1872 (int const *)BASE, \ 1873 (__v2di)(__m128i)INDEX, \ 1874 (__v4si)_mm_set1_epi32 (-1), \ 1875 (int)SCALE) 1876 1877#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1878 (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC, \ 1879 (int const *)BASE, \ 1880 (__v2di)(__m128i)INDEX, \ 1881 (__v4si)(__m128i)MASK, \ 1882 (int)SCALE) 1883 1884#define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \ 1885 (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \ 1886 (int const *)BASE, \ 1887 (__v4di)(__m256i)INDEX, \ 1888 (__v4si)_mm_set1_epi32(-1), \ 1889 (int)SCALE) 1890 1891#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1892 (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC, \ 1893 (int const *)BASE, \ 1894 (__v4di)(__m256i)INDEX, \ 1895 (__v4si)(__m128i)MASK, \ 1896 (int)SCALE) 1897#endif /* __OPTIMIZE__ */ 1898 1899#ifdef __DISABLE_AVX2__ 1900#undef 
__DISABLE_AVX2__ 1901#pragma GCC pop_options 1902#endif /* __DISABLE_AVX2__ */ 1903 1904#endif /* _AVX2INTRIN_H_INCLUDED */ 1905