/* Copyright (C) 2008-2015 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVXINTRIN_H_INCLUDED
#define _AVXINTRIN_H_INCLUDED

#ifndef __AVX__
#pragma GCC push_options
#pragma GCC target("avx")
#define __DISABLE_AVX__
#endif /* __AVX__ */

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
				     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
					  __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
				       __may_alias__));
/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ	0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS	0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS	0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q	0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ	0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US	0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US	0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q	0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ	0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US	0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US	0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ	0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ	0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS	0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS	0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ	0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS	0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ	0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ	0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S	0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US	0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ	0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ	0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S	0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US	0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ	0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ	0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS	0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS	0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ	0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ	0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US	0x1f
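
/* A minimal usage sketch for the predicates above (illustrative only;
   the variable names are hypothetical).  _CMP_LT_OQ compares ordered
   and quietly, so a NaN element yields "false" without raising an
   invalid exception:

     __m256d __a  = _mm256_set1_pd (1.0);
     __m256d __b  = _mm256_set1_pd (2.0);
     __m256d __lt = _mm256_cmp_pd (__a, __b, _CMP_LT_OQ);
     int __bits   = _mm256_movemask_pd (__lt);   // 0xf: all lanes true

   Each true element is all-ones and each false element all-zeros, so
   the result is directly usable as a blend or AND mask.  The cmp and
   movemask intrinsics are defined further below in this header.  */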

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A + (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A + (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Double/single precision floating point blend instructions: select
   data from two sources using a constant or variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
					      (__v4df)__Y,
					      __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
					     (__v8sf)__Y,
					     __M);
}
#else
#define _mm256_blend_pd(X, Y, M)					\
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X),		\
					(__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M)					\
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X),		\
				       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
					       (__v4df)__Y,
					       (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
					      (__v8sf)__Y,
					      (__v8sf)__M);
}
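
/* Sketch of the two blend flavors (illustrative; names hypothetical).
   _mm256_blend_pd selects per element with a compile-time immediate,
   bit i of the mask picking __Y's element i; _mm256_blendv_pd selects
   at run time using the sign bit of each mask element:

     __m256d __x  = _mm256_set1_pd (1.0);
     __m256d __y  = _mm256_set1_pd (2.0);
     __m256d __r1 = _mm256_blend_pd (__x, __y, 0x5);  // { 2, 1, 2, 1 }
     __m256d __m  = _mm256_cmp_pd (__x, __y, _CMP_LT_OQ);
     __m256d __r2 = _mm256_blendv_pd (__x, __y, __m); // all 2.0
*/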

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A / (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A / (__v8sf)__B);
}

/* Dot product instruction, where a mask selects which products are
   summed and which parts of the result are zeroed.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
					  (__v8sf)__Y,
					  __M);
}
#else
#define _mm256_dp_ps(X, Y, M)						\
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X),		\
				    (__v8sf)(__m256)(Y), (int)(M)))
#endif
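
/* Sketch of the dot-product mask (illustrative).  Within each 128-bit
   lane, the high nibble of __M selects which element pairs are
   multiplied and summed; the low nibble selects which result elements
   receive the sum (the rest are zeroed):

     __m256 __a = _mm256_set1_ps (1.0f);
     __m256 __b = _mm256_set1_ps (2.0f);
     // Sum all four products per lane, broadcast to all four slots:
     __m256 __d = _mm256_dp_ps (__a, __b, 0xff);   // every element 8.0f
*/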

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A * (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A * (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
					     __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
					    __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N)					\
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A),		\
				      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N)					\
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),		\
				      (__v8sf)(__m256)(B), (int)(N)))
#endif
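
/* Note that the 256-bit shuffles above operate within each 128-bit
   lane separately; elements never cross the lane boundary (use
   _mm256_permute2f128_pd, below, to move data between lanes).  A
   sketch (illustrative only):

     __m256d __a = _mm256_set_pd (4.0, 3.0, 2.0, 1.0); // mem: {1,2,3,4}
     __m256d __b = _mm256_set_pd (8.0, 7.0, 6.0, 5.0); // mem: {5,6,7,8}
     // Mask bit 0 picks the low-lane element from __a, bit 1 from __b;
     // bits 2-3 do the same for the high lane:
     __m256d __r = _mm256_shuffle_pd (__a, __b, 0x0);  // {1,5,3,7}
*/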

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A - (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A - (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
					    __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
					   __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P)						\
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X),		\
				      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P)						\
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X),		\
				     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N)					\
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X),	\
						(int)(N)))

#define _mm256_extractf128_ps(X, N)					\
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X),	\
					       (int)(N)))

#define _mm256_extractf128_si256(X, N)					\
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X),	\
						(int)(N)))

#define _mm256_extract_epi32(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      _mm_extract_epi32 (__Y, (N) % 4);					\
    }))

#define _mm256_extract_epi16(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      _mm_extract_epi16 (__Y, (N) % 8);					\
    }))

#define _mm256_extract_epi8(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      _mm_extract_epi8 (__Y, (N) % 16);					\
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      _mm_extract_epi64 (__Y, (N) % 2);					\
    }))
#endif
#endif

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}
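
/* _mm256_zeroupper is typically issued before transferring control to
   legacy SSE code: leaving the upper halves of the ymm registers live
   across non-VEX SSE instructions causes expensive AVX/SSE transition
   stalls on many microarchitectures.  (The compiler normally inserts
   vzeroupper automatically at function boundaries, so explicit calls
   are only needed for hand-written transitions.)  */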

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
						(__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
						   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
					       (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
						  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C)						\
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C)						\
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C)						\
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C)						\
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
						    (__v4df)__Y,
						    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
						   (__v8sf)__Y,
						   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
						    (__v8si)__Y,
						    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C)					\
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),	\
					      (__v4df)(__m256d)(Y),	\
					      (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C)					\
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),	\
					     (__v8sf)(__m256)(Y),	\
					     (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C)				\
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),	\
					      (__v8si)(__m256i)(Y),	\
					      (int)(C)))
#endif
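
/* A common permute2f128 idiom (illustrative).  Control bits 1:0 select
   the source lane for the destination's low 128 bits and bits 5:4 for
   the high 128 bits, so 0x01 swaps the two halves of a vector:

     __m256d __x = _mm256_set_pd (4.0, 3.0, 2.0, 1.0);      // {1,2,3,4}
     __m256d __s = _mm256_permute2f128_pd (__x, __x, 0x01);  // {3,4,1,2}
*/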

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
						     (__v2df)__Y,
						     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
						    (__v4sf)__Y,
						    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
						     (__v4si)__Y,
						     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O)					\
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X),	\
					       (__v2df)(__m128d)(Y),	\
					       (int)(O)))

#define _mm256_insertf128_ps(X, Y, O)					\
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X),	\
					      (__v4sf)(__m128)(Y),	\
					      (int)(O)))

#define _mm256_insertf128_si256(X, Y, O)				\
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X),	\
					       (__v4si)(__m128i)(Y),	\
					       (int)(O)))

#define _mm256_insert_epi32(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2);			\
    }))

#define _mm256_insert_epi16(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3);			\
    }))

#define _mm256_insert_epi8(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4);			\
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1);			\
    }))
#endif
#endif
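
/* Sketch: building a 256-bit vector from two 128-bit halves
   (illustrative; names hypothetical).  The cast, defined further
   below, places __lo in the low lane for free, and insertf128 with
   offset 1 fills the high lane:

     __m128 __lo = _mm_set1_ps (1.0f);
     __m128 __hi = _mm_set1_ps (2.0f);
     __m256 __v  = _mm256_insertf128_ps (_mm256_castps128_ps256 (__lo),
					 __hi, 1);
*/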

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}
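
/* The _mm256_load/_mm256_store forms above require 32-byte aligned
   addresses and fault on misaligned ones; the *_loadu/*_storeu forms
   accept any alignment.  A sketch (illustrative; names hypothetical):

     float __buf[16] __attribute__ ((aligned (32)));
     __m256 __v = _mm256_load_ps (__buf);     // needs 32-byte alignment
     _mm256_storeu_ps (__buf + 1, __v);       // unaligned is fine here
*/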

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
					      (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
						 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
					     (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
						(__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}
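
/* For the masked loads and stores above, only the most significant bit
   of each mask element matters: elements whose mask sign bit is set
   are loaded or stored; the others are zeroed (loads) or left
   untouched in memory (stores).  A sketch (illustrative):

     double __buf[4] = { 1.0, 2.0, 3.0, 4.0 };
     __m256i __m = _mm256_set_epi64x (0, -1, 0, -1);  // lanes 0 and 2
     __m256d __v = _mm256_maskload_pd (__buf, __m);   // { 1, 0, 3, 0 }
*/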

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}
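
/* The stream (non-temporal) stores above bypass the cache hierarchy
   and are weakly ordered; data written this way should be published to
   other threads only after a store fence such as _mm_sfence (declared
   in <xmmintrin.h>, which <immintrin.h> also pulls in).  They pay off
   for large buffers that will not be re-read soon.  */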

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)
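
/* Rounding sketch (illustrative).  The __M argument takes the
   _MM_FROUND_* constants from <smmintrin.h>; the ceil/floor macros
   above are thin wrappers over _mm256_round_*:

     __m256d __x = _mm256_set1_pd (1.5);
     __m256d __f = _mm256_floor_pd (__x);                   // all 1.0
     __m256d __n = _mm256_round_pd (__x, _MM_FROUND_TO_NEAREST_INT
					 | _MM_FROUND_NO_EXC); // all 2.0
*/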

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}
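
/* Semantics of the test intrinsics above (for the integer forms; the
   pd/ps variants consider only the per-element sign bits):
   testz returns 1 iff (__M & __V) is all zeros, testc returns 1 iff
   (~__M & __V) is all zeros, and testnzc returns 1 iff both of those
   ANDs are nonzero.  For example, _mm256_testz_si256 (__a, __a) is a
   branch-free "is __a all zero?" test.  */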

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_pd (void)
{
  __m256d __Y = __Y;
  return __Y;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_ps (void)
{
  __m256 __Y = __Y;
  return __Y;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_si256 (void)
{
  __m256i __Y = __Y;
  return __Y;
}
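
/* The self-initialization in the three functions above is a deliberate
   idiom: it tells the compiler the value is intentionally
   indeterminate, so no code is emitted and no uninitialized-use
   warning fires.  Callers must overwrite the result before reading
   it.  */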

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
				 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
					  __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
		  short __q11, short __q10, short __q09, short __q08,
		  short __q07, short __q06, short __q05, short __q04,
		  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8  (char __q31, char __q30, char __q29, char __q28,
		  char __q27, char __q26, char __q25, char __q24,
		  char __q23, char __q22, char __q21, char __q20,
		  char __q19, char __q18, char __q17, char __q16,
		  char __q15, char __q14, char __q13, char __q12,
		  char __q11, char __q10, char __q09, char __q08,
		  char __q07, char __q06, char __q05, char __q04,
		  char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
		   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
				 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
					  __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
			   __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
		float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
		   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
		   short __q11, short __q10, short __q09, short __q08,
		   short __q07, short __q06, short __q05, short __q04,
		   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
			   __q04, __q05, __q06, __q07,
			   __q08, __q09, __q10, __q11,
			   __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8  (char __q31, char __q30, char __q29, char __q28,
		   char __q27, char __q26, char __q25, char __q24,
		   char __q23, char __q22, char __q21, char __q20,
		   char __q19, char __q18, char __q17, char __q16,
		   char __q15, char __q14, char __q13, char __q12,
		   char __q11, char __q10, char __q09, char __q08,
		   char __q07, char __q06, char __q05, char __q04,
		   char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
			  __q04, __q05, __q06, __q07,
			  __q08, __q09, __q10, __q11,
			  __q12, __q13, __q14, __q15,
			  __q16, __q17, __q18, __q19,
			  __q20, __q21, __q22, __q23,
			  __q24, __q25, __q26, __q27,
			  __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
		    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}
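
/* Element-order sketch (illustrative).  The _mm256_set_* functions
   take arguments from the highest element down to the lowest, so the
   first argument lands in the highest memory position; the
   _mm256_setr_* ("reversed") forms take them in memory order:

     __m256i __a = _mm256_set_epi32  (7, 6, 5, 4, 3, 2, 1, 0);
     __m256i __b = _mm256_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7);
     // __a and __b hold identical values: element i == i.
*/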

/* Casts between the various SP, DP, and INT vector types.  Note that
   these perform no conversion of values; they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256(__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When a cast is done from a 128-bit to a 256-bit type, the low 128
   bits of the 256-bit result contain the source operand's value and
   the upper 128 bits of the result are undefined.  These intrinsics
   shouldn't generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}
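
/* Cost note: all of the casts above are pure reinterpretations and
   compile to no instructions (the 128->256 forms may leave the upper
   lane undefined, as described above).  A round trip is free
   (illustrative):

     __m256d __x  = _mm256_set1_pd (1.0);
     __m128d __lo = _mm256_castpd256_pd128 (__x);   // low two elements
     __m256d __y  = _mm256_castpd128_pd256 (__lo);  // upper lane undefined
*/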

#ifdef __DISABLE_AVX__
#undef __DISABLE_AVX__
#pragma GCC pop_options
#endif /* __DISABLE_AVX__ */

#endif /* _AVXINTRIN_H_INCLUDED */