/* Copyright (C) 2011-2015 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVX2INTRIN_H_INCLUDED
#define _AVX2INTRIN_H_INCLUDED

#ifndef __AVX2__
#pragma GCC push_options
#pragma GCC target("avx2")
#define __DISABLE_AVX2__
#endif /* __AVX2__ */

/* Sum absolute differences of adjacent groups of four unsigned byte
   integers taken from the first two operands.  The starting offsets
   within the operands are selected by the third (mask) operand.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
                                              (__v32qi)__Y, __M);
}
#else
#define _mm256_mpsadbw_epu8(X, Y, M)					\
  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X),		\
					(__v32qi)(__m256i)(Y), (int)(M)))
#endif
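
/* Usage sketch (illustrative only, not part of the original header;
   assumes AVX2 is enabled and <immintrin.h> has been included):

     __m256i a = _mm256_loadu_si256 ((__m256i const *) src_a);
     __m256i b = _mm256_loadu_si256 ((__m256i const *) src_b);
     __m256i sad = _mm256_mpsadbw_epu8 (a, b, 0x00);

   Each 128-bit lane yields eight 16-bit sums of absolute differences;
   immediate bits 2:0 control the block offsets for the low lane and
   bits 5:3 the high lane (see the VMPSADBW description in the ISA
   reference for the exact encoding).  Here src_a and src_b are
   hypothetical buffers of at least 32 bytes each.  */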

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi8 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi16 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi32 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qu)__A + (__v32qu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hu)__A + (__v16hu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8su)__A + (__v8su)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A + (__v4du)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
{
  return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
                                              (__v4di)__B,
                                              __N * 8);
}
#else
/* Without __OPTIMIZE__, (__N * 8) would end up in a vector register and
   the insn pattern would not match.  Use a macro instead so the shift
   count stays a compile-time immediate.  */
#define _mm256_alignr_epi8(A, B, N)				   \
  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A),	   \
					(__v4di)(__m256i)(B),	   \
					(int)(N) * 8))
#endif
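
/* Usage sketch (illustrative only, not part of the original header):
   the byte count must be a compile-time constant so it can be encoded
   as an immediate, and the concatenation is performed independently
   within each 128-bit lane.  For example,

     __m256i r = _mm256_alignr_epi8 (hi, lo, 4);

   produces, in each lane, bytes 4..15 of LO followed by bytes 0..3 of
   HI, where HI and LO are hypothetical __m256i values.  */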

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A & (__v4du)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
{
  return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
                                               (__v32qi)__Y,
                                               (__v32qi)__M);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
                                              (__v16hi)__Y,
                                              __M);
}
#else
#define _mm256_blend_epi16(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X),		\
					(__v16hi)(__m256i)(Y), (int)(M)))
#endif

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qi)__A == (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hi)__A == (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8si)__A == (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4di)__A == (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qi)__A > (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hi)__A > (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8si)__A > (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4di)__A > (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
                                             (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadds_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
                                              (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
                                             (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
                                              (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
                                                (__v32qi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_madd_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
                                             (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_epi8 (__m256i __A)
{
  return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
                                               (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hu)__A * (__v16hu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8su)__A * (__v8su)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A | (__v4du)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sad_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
                                             (__v32qi)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi32 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
}
#else
#define _mm256_shuffle_epi32(A, N) \
  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
#define _mm256_shufflehi_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
#define _mm256_shufflelo_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
#endif

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bslli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}
#else
#define _mm256_bslli_epi128(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_slli_si256(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#endif

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bsrli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}
#else
#define _mm256_bsrli_epi128(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_srli_si256(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#endif

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v32qu)__A - (__v32qu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v16hu)__A - (__v16hu)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v8su)__A - (__v8su)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A - (__v4du)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du)__A ^ (__v4du)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_load_si256 (__m256i const *__X)
{
  return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastss_ps (__m128 __X)
{
  return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastss_ps (__m128 __X)
{
  return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsd_pd (__m128d __X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsi128_si256 (__m128i __X)
{
  return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
}

#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
                                              (__v4si)__Y,
                                              __M);
}
#else
#define _mm_blend_epi32(X, Y, M)					\
  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),		\
					(__v4si)(__m128i)(Y), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
                                              (__v8si)__Y,
                                              __M);
}
#else
#define _mm256_blend_epi32(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),		\
					(__v8si)(__m256i)(Y), (int)(M)))
#endif

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastb_epi8 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastw_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastd_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastq_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastb_epi8 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastw_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastd_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastq_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_pd (__m256d __X, const int __M)
{
  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
}
#else
#define _mm256_permute4x64_pd(X, M)			       \
  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
#endif

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
{
  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_epi64 (__m256i __X, const int __M)
{
  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
}
#else
#define _mm256_permute4x64_epi64(X, M)			       \
  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
#endif


#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
}
#else
#define _mm256_permute2x128_si256(X, Y, M)				\
  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti128_si256 (__m256i __X, const int __M)
{
  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
}
#else
#define _mm256_extracti128_si256(X, M)				\
  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
}
#else
#define _mm256_inserti128_si256(X, Y, M)			 \
  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
					   (__v2di)(__m128i)(Y), \
					   (int)(M)))
#endif

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi32 (int const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
                                                (__v8si)__M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi64 (long long const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
                                                (__v4di)__M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi32 (int const *__X, __m128i __M )
{
  return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
                                             (__v4si)__M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi64 (long long const *__X, __m128i __M )
{
  return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
                                             (__v2di)__M);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
{
  __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
{
  __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
{
  __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
{
  __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srav_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srav_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_pd (double const *base, __m128i index, const int scale)
{
  __v2df zero = _mm_setzero_pd ();
  __v2df mask = _mm_cmpeq_pd (zero, zero);

  return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (),
                                                base,
                                                (__v4si)index,
                                                mask,
                                                scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index,
		       __m128d mask, const int scale)
{
  return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src,
                                                base,
                                                (__v4si)index,
                                                (__v2df)mask,
                                                scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_pd (double const *base, __m128i index, const int scale)
{
  __v4df zero = _mm256_setzero_pd ();
  __v4df mask = _mm256_cmp_pd (zero, zero, _CMP_EQ_OQ);

  return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (),
                                                base,
                                                (__v4si)index,
                                                mask,
                                                scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_pd (__m256d src, double const *base,
			  __m128i index, __m256d mask, const int scale)
{
  return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src,
                                                base,
                                                (__v4si)index,
                                                (__v4df)mask,
                                                scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_pd (double const *base, __m128i index, const int scale)
{
  __v2df src = _mm_setzero_pd ();
  __v2df mask = _mm_cmpeq_pd (src, src);

  return (__m128d) __builtin_ia32_gatherdiv2df (src,
                                                base,
                                                (__v2di)index,
                                                mask,
                                                scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index,
		       __m128d mask, const int scale)
{
  return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src,
                                                base,
                                                (__v2di)index,
                                                (__v2df)mask,
                                                scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_pd (double const *base, __m256i index, const int scale)
{
  __v4df src = _mm256_setzero_pd ();
  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);

  return (__m256d) __builtin_ia32_gatherdiv4df (src,
                                                base,
                                                (__v4di)index,
                                                mask,
                                                scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_pd (__m256d src, double const *base,
			  __m256i index, __m256d mask, const int scale)
{
  return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src,
                                                base,
                                                (__v4di)index,
                                                (__v4df)mask,
                                                scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_ps (float const *base, __m128i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);

  return (__m128) __builtin_ia32_gathersiv4sf (src,
                                               base,
                                               (__v4si)index,
                                               mask,
                                               scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index,
		       __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src,
                                               base,
                                               (__v4si)index,
                                               (__v4sf)mask,
                                               scale);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_ps (float const *base, __m256i index, const int scale)
{
  __v8sf src = _mm256_setzero_ps ();
  __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);

  return (__m256) __builtin_ia32_gathersiv8sf (src,
                                               base,
                                               (__v8si)index,
                                               mask,
                                               scale);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_ps (__m256 src, float const *base,
			  __m256i index, __m256 mask, const int scale)
{
  return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src,
                                               base,
                                               (__v8si)index,
                                               (__v8sf)mask,
                                               scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_ps (float const *base, __m128i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);

  return (__m128) __builtin_ia32_gatherdiv4sf (src,
                                               base,
                                               (__v2di)index,
                                               mask,
                                               scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index,
		       __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src,
                                               base,
                                               (__v2di)index,
                                               (__v4sf)mask,
                                               scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_ps (float const *base, __m256i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);

  return (__m128) __builtin_ia32_gatherdiv4sf256 (src,
                                                  base,
                                                  (__v4di)index,
                                                  mask,
                                                  scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_ps (__m128 src, float const *base,
			  __m256i index, __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src,
                                                  base,
                                                  (__v4di)index,
                                                  (__v4sf)mask,
                                                  scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi64 (long long int const *base,
		     __m128i index, const int scale)
{
  __v2di src = __extension__ (__v2di){ 0, 0 };
  __v2di mask = __extension__ (__v2di){ ~0, ~0 };

  return (__m128i) __builtin_ia32_gathersiv2di (src,
                                                base,
                                                (__v4si)index,
                                                mask,
                                                scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi64 (__m128i src, long long int const *base,
			  __m128i index, __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src,
                                                base,
                                                (__v4si)index,
                                                (__v2di)mask,
                                                scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi64 (long long int const *base,
			__m128i index, const int scale)
{
  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gathersiv4di (src,
                                                base,
                                                (__v4si)index,
                                                mask,
                                                scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi64 (__m256i src, long long int const *base,
			     __m128i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src,
                                                base,
                                                (__v4si)index,
                                                (__v4di)mask,
                                                scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi64 (long long int const *base,
		     __m128i index, const int scale)
{
  __v2di src = __extension__ (__v2di){ 0, 0 };
  __v2di mask = __extension__ (__v2di){ ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv2di (src,
                                                base,
                                                (__v2di)index,
                                                mask,
                                                scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index,
			  __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src,
                                                base,
                                                (__v2di)index,
                                                (__v2di)mask,
                                                scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi64 (long long int const *base,
			__m256i index, const int scale)
{
  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gatherdiv4di (src,
                                                base,
                                                (__v4di)index,
                                                mask,
                                                scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi64 (__m256i src, long long int const *base,
			     __m256i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src,
                                                base,
                                                (__v4di)index,
                                                (__v4di)mask,
                                                scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi32 (int const *base, __m128i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gathersiv4si (src,
                                                base,
                                                (__v4si)index,
                                                mask,
                                                scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index,
			  __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src,
                                                base,
                                                (__v4si)index,
                                                (__v4si)mask,
                                                scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi32 (int const *base, __m256i index, const int scale)
{
  __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
  __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gathersiv8si (src,
                                                base,
                                                (__v8si)index,
                                                mask,
                                                scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi32 (__m256i src, int const *base,
			     __m256i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src,
                                                base,
                                                (__v8si)index,
                                                (__v8si)mask,
                                                scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi32 (int const *base, __m128i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv4si (src,
                                                base,
                                                (__v2di)index,
                                                mask,
                                                scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index,
			  __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src,
                                                base,
                                                (__v2di)index,
                                                (__v4si)mask,
                                                scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi32 (int const *base, __m256i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv4si256 (src,
                                                   base,
                                                   (__v4di)index,
                                                   mask,
                                                   scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi32 (__m128i src, int const *base,
			     __m256i index, __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src,
                                                   base,
                                                   (__v4di)index,
                                                   (__v4si)mask,
                                                   scale);
}
#else /* __OPTIMIZE__ */
#define _mm_i32gather_pd(BASE, INDEX, SCALE)				\
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v4si)(__m128i)INDEX,	\
					 (__v2df)_mm_set1_pd(		\
					   (double)(long long int) -1), \
					 (int)SCALE)

#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC,	 \
					 (double const *)BASE,	 \
					 (__v4si)(__m128i)INDEX, \
					 (__v2df)(__m128d)MASK,	 \
					 (int)SCALE)

#define _mm256_i32gather_pd(BASE, INDEX, SCALE)				\
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v4si)(__m128i)INDEX,	\
					 (__v4df)_mm256_set1_pd(	\
					   (double)(long long int) -1), \
					 (int)SCALE)

#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC,	 \
					 (double const *)BASE,	 \
					 (__v4si)(__m128i)INDEX, \
					 (__v4df)(__m256d)MASK,	 \
					 (int)SCALE)

#define _mm_i64gather_pd(BASE, INDEX, SCALE)				\
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v2di)(__m128i)INDEX,	\
					 (__v2df)_mm_set1_pd(		\
					   (double)(long long int) -1), \
					 (int)SCALE)

#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC,	 \
					 (double const *)BASE,	 \
					 (__v2di)(__m128i)INDEX, \
					 (__v2df)(__m128d)MASK,	 \
					 (int)SCALE)

#define _mm256_i64gather_pd(BASE, INDEX, SCALE)				\
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v4di)(__m256i)INDEX,	\
					 (__v4df)_mm256_set1_pd(	\
					   (double)(long long int) -1), \
					 (int)SCALE)

#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC,	 \
					 (double const *)BASE,	 \
					 (__v4di)(__m256i)INDEX, \
					 (__v4df)(__m256d)MASK,	 \
					 (int)SCALE)

#define _mm_i32gather_ps(BASE, INDEX, SCALE)				\
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (),	\
					(float const *)BASE,		\
					(__v4si)(__m128i)INDEX,		\
					_mm_set1_ps ((float)(int) -1),	\
					(int)SCALE)

#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128)SRC,	 \
					(float const *)BASE,	 \
					(__v4si)(__m128i)INDEX,	 \
					(__v4sf)(__m128)MASK,	 \
					(int)SCALE)

#define _mm256_i32gather_ps(BASE, INDEX, SCALE)			       \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
					(float const *)BASE,	       \
					(__v8si)(__m256i)INDEX,	       \
					(__v8sf)_mm256_set1_ps (       \
					  (float)(int) -1),	       \
					(int)SCALE)

#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC,	\
					(float const *)BASE,	\
					(__v8si)(__m256i)INDEX, \
					(__v8sf)(__m256)MASK,	\
					(int)SCALE)

#define _mm_i64gather_ps(BASE, INDEX, SCALE)				\
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_ps (),	\
					(float const *)BASE,		\
					(__v2di)(__m128i)INDEX,		\
					(__v4sf)_mm_set1_ps (		\
					  (float)(int) -1),		\
					(int)SCALE)

#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC,	 \
					(float const *)BASE,	 \
					(__v2di)(__m128i)INDEX,	 \
					(__v4sf)(__m128)MASK,	 \
					(int)SCALE)

#define _mm256_i64gather_ps(BASE, INDEX, SCALE)				\
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (),	\
					   (float const *)BASE,		\
					   (__v4di)(__m256i)INDEX,	\
					   (__v4sf)_mm_set1_ps(		\
					     (float)(int) -1),		\
					   (int)SCALE)

#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	   \
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC,	   \
					   (float const *)BASE,	   \
					   (__v4di)(__m256i)INDEX, \
					   (__v4sf)(__m128)MASK,   \
					   (int)SCALE)

#define _mm_i32gather_epi64(BASE, INDEX, SCALE)				\
  (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
					 (long long const *)BASE,	\
					 (__v4si)(__m128i)INDEX,	\
					 (__v2di)_mm_set1_epi64x (-1),	\
					 (int)SCALE)

#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	  \
  (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC,	  \
					 (long long const *)BASE, \
					 (__v4si)(__m128i)INDEX,  \
					 (__v2di)(__m128i)MASK,	  \
					 (int)SCALE)

#define _mm256_i32gather_epi64(BASE, INDEX, SCALE)			   \
  (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
					 (long long const *)BASE,	   \
					 (__v4si)(__m128i)INDEX,	   \
					 (__v4di)_mm256_set1_epi64x (-1),  \
					 (int)SCALE)

#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC,	   \
					 (long long const *)BASE,  \
					 (__v4si)(__m128i)INDEX,   \
					 (__v4di)(__m256i)MASK,	   \
					 (int)SCALE)

#define _mm_i64gather_epi64(BASE, INDEX, SCALE)				\
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
					 (long long const *)BASE,	\
					 (__v2di)(__m128i)INDEX,	\
					 (__v2di)_mm_set1_epi64x (-1),	\
					 (int)SCALE)

#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	  \
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC,	  \
					 (long long const *)BASE, \
					 (__v2di)(__m128i)INDEX,  \
					 (__v2di)(__m128i)MASK,	  \
					 (int)SCALE)

#define _mm256_i64gather_epi64(BASE, INDEX, SCALE)			   \
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
					 (long long const *)BASE,	   \
					 (__v4di)(__m256i)INDEX,	   \
					 (__v4di)_mm256_set1_epi64x (-1),  \
					 (int)SCALE)

#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC,	   \
					 (long long const *)BASE,  \
					 (__v4di)(__m256i)INDEX,   \
					 (__v4di)(__m256i)MASK,	   \
					 (int)SCALE)

#define _mm_i32gather_epi32(BASE, INDEX, SCALE)				\
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (),	\
					 (int const *)BASE,		\
					 (__v4si)(__m128i)INDEX,	\
					 (__v4si)_mm_set1_epi32 (-1),	\
					 (int)SCALE)

#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC,	\
					(int const *)BASE,	\
					(__v4si)(__m128i)INDEX, \
					(__v4si)(__m128i)MASK,	\
					(int)SCALE)

#define _mm256_i32gather_epi32(BASE, INDEX, SCALE)			   \
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
					 (int const *)BASE,		   \
					 (__v8si)(__m256i)INDEX,	   \
					 (__v8si)_mm256_set1_epi32 (-1),   \
					 (int)SCALE)

#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC,	   \
					(int const *)BASE,	   \
					(__v8si)(__m256i)INDEX,	   \
					(__v8si)(__m256i)MASK,	   \
					(int)SCALE)

#define _mm_i64gather_epi32(BASE, INDEX, SCALE)				\
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (),	\
					 (int const *)BASE,		\
					 (__v2di)(__m128i)INDEX,	\
					 (__v4si)_mm_set1_epi32 (-1),	\
					 (int)SCALE)

#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC,	\
					(int const *)BASE,	\
					(__v2di)(__m128i)INDEX, \
					(__v4si)(__m128i)MASK,	\
					(int)SCALE)

#define _mm256_i64gather_epi32(BASE, INDEX, SCALE)			   \
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
					    (int const *)BASE,		   \
					    (__v4di)(__m256i)INDEX,	   \
					    (__v4si)_mm_set1_epi32(-1),	   \
					    (int)SCALE)

#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC,  \
					   (int const *)BASE,	   \
					   (__v4di)(__m256i)INDEX, \
					   (__v4si)(__m128i)MASK,  \
					   (int)SCALE)
#endif  /* __OPTIMIZE__ */

#ifdef __DISABLE_AVX2__
#undef __DISABLE_AVX2__
#pragma GCC pop_options
#endif /* __DISABLE_AVX2__ */

#endif /* _AVX2INTRIN_H_INCLUDED */
